Add test_source scrapers for motousher.com and dirtstreet.in

Adds two new experimental product scrapers under test_source/, isolated from the active pipeline until verified and ready to promote. motousher/ (Shopify store — Shopify JSON API): - Scrapes 12 brands: All Balls Racing, DID Chains, EBC Brakes, Esjot Sprockets, Evans Coolant, Grip Puppies, HiFlo Filters, JT Sprockets, Maxima Racing Oils, Putoline, Ram Mount, Wunderlich - 2,446 products total scraped and verified - Uses /collections/{slug}/products.json + /products/{handle}.json - Parallel fetch (concurrency 3), paginated collection listing dirtstreet/ (WooCommerce store — HTML + JSON-LD): - Scrapes 5 brands: SC Project, Evotech Performance, DNA Air Filters, WRS, Zero Gravity Racing - 1,087 products total scraped and verified - Pure fetch with JSON-LD schema.org extraction (no browser) - Handles paginated /brand/{slug}/page/N/ archives - Price extracted from offers.priceSpecification[0].price - Stock status derived from JSON-LD availability field Both scrapers are standalone (node index.js), support --brand and --limit flags, save per-brand JSON files and a combined.json. Scraped data lives in data/sources/test_source/ (gitignored). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-04 12:17:23 +05:30 · 2026-06-04 12:17:23 +05:30 · b8d9478afa
commit b8d9478afa
parent 1d254a9009
9 changed files with 1117 additions and 0 deletions
--- a/src/business-logic/import-pipeline/test_source/.gitkeep
+++ b/src/business-logic/import-pipeline/test_source/.gitkeep
@ -0,0 +1 @@
+
--- a/src/business-logic/import-pipeline/test_source/README.md
+++ b/src/business-logic/import-pipeline/test_source/README.md
@ -0,0 +1,26 @@
+# Test Source Workspace
+
+This folder is for experimental supplier scraping work.
+
+Use one folder per supplier website or test target. Keep experiments here until
+the scraper, converter, and data shape are confirmed safe to move into the active
+`sources/` registry.
+
+Suggested folder shape:
+
+```txt
+test_source/
+  supplier-key/
+    links.txt
+    scraper.js
+    sample-output.json
+    notes.md
+```
+
+Rules for this area:
+
+- Do not register test sources in `../sources/index.js` until the scraper is ready.
+- Store only brand/filter/listing URLs in `links.txt`; avoid committing secrets.
+- Keep generated images, large raw exports, and run logs out of git.
+- Document what each URL represents in the supplier folder notes.
+
--- a/src/business-logic/import-pipeline/test_source/dirtstreet/brands.js
+++ b/src/business-logic/import-pipeline/test_source/dirtstreet/brands.js
@ -0,0 +1,35 @@
+/**
+ * Dirtstreet brand list.
+ *
+ * Brand index page: https://dirtstreet.in/shop-by-brands/
+ */
+
+const BRANDS = [
+  {
+    name: "SC Project",
+    slug: "scproject",
+    brandUrl: "https://dirtstreet.in/brand/scproject/",
+  },
+  {
+    name: "Evotech Performance",
+    slug: "evotechperformance",
+    brandUrl: "https://dirtstreet.in/brand/evotechperformance/",
+  },
+  {
+    name: "DNA Air Filters",
+    slug: "dnaairfilters",
+    brandUrl: "https://dirtstreet.in/brand/dnaairfilters/",
+  },
+  {
+    name: "WRS",
+    slug: "wrs",
+    brandUrl: "https://dirtstreet.in/brand/wrs/",
+  },
+  {
+    name: "Zero Gravity Racing",
+    slug: "zerogravityracing",
+    brandUrl: "https://dirtstreet.in/brand/zerogravityracing/",
+  },
+];
+
+module.exports = { BRANDS };
--- a/src/business-logic/import-pipeline/test_source/dirtstreet/index.js
+++ b/src/business-logic/import-pipeline/test_source/dirtstreet/index.js
@ -0,0 +1,138 @@
+/**
+ * Dirtstreet scraper runner.
+ *
+ * Usage:
+ *   node index.js                        → scrape all brands in brands.js
+ *   node index.js --brand scproject      → scrape one brand only
+ *   node index.js --limit 5             → first 5 products per brand (quick test)
+ */
+
+const fs = require("node:fs/promises");
+const path = require("node:path");
+const { BRANDS } = require("./brands");
+const { scrapeDirtstreetBrand } = require("./scraper");
+
+const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "dirtstreet");
+
+function parseArgs(argv = process.argv.slice(2)) {
+  const out = { brandSlug: null, limit: null };
+  for (let i = 0; i < argv.length; i++) {
+    if ((argv[i] === "--brand" || argv[i] === "-b") && argv[i + 1]) {
+      out.brandSlug = argv[i + 1].toLowerCase().trim();
+    }
+    if ((argv[i] === "--limit" || argv[i] === "-n") && argv[i + 1]) {
+      const n = Number.parseInt(argv[i + 1], 10);
+      if (Number.isFinite(n) && n > 0) out.limit = n;
+    }
+  }
+  return out;
+}
+
+async function run() {
+  const { brandSlug, limit } = parseArgs();
+  await fs.mkdir(OUTPUT_DIR, { recursive: true });
+
+  const brandsToRun = brandSlug
+    ? BRANDS.filter((b) => b.slug === brandSlug)
+    : BRANDS;
+
+  if (!brandsToRun.length) {
+    console.error(`No brand found matching slug: "${brandSlug}"`);
+    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
+    process.exitCode = 1;
+    return;
+  }
+
+  const combinedProducts = [];
+  const runSummary = [];
+  const startedAt = new Date().toISOString();
+
+  for (const brand of brandsToRun) {
+    console.log(`\n${"=".repeat(60)}`);
+    console.log(`[DIRTSTREET] Scraping brand: ${brand.name}`);
+    console.log(`[DIRTSTREET] Brand URL: ${brand.brandUrl}`);
+    console.log(`${"=".repeat(60)}`);
+
+    const brandStartedAt = new Date().toISOString();
+    let products = [];
+    let error = null;
+
+    try {
+      products = await scrapeDirtstreetBrand(brand, { limit });
+
+      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
+      const brandPayload = {
+        generatedAt: new Date().toISOString(),
+        source: "dirtstreet",
+        brand: brand.name,
+        brandSlug: brand.slug,
+        brandUrl: brand.brandUrl,
+        totalProducts: products.length,
+        products,
+      };
+
+      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
+      console.log(`[DIRTSTREET] Saved ${products.length} products → ${brandFile}`);
+
+      combinedProducts.push(...products);
+      runSummary.push({
+        brand: brand.name,
+        slug: brand.slug,
+        brandUrl: brand.brandUrl,
+        startedAt: brandStartedAt,
+        completedAt: new Date().toISOString(),
+        totalProducts: products.length,
+        success: true,
+      });
+    } catch (err) {
+      console.error(`[DIRTSTREET] Failed to scrape ${brand.name}: ${err.message}`);
+      error = err.message;
+      runSummary.push({
+        brand: brand.name,
+        slug: brand.slug,
+        brandUrl: brand.brandUrl,
+        startedAt: brandStartedAt,
+        completedAt: new Date().toISOString(),
+        totalProducts: 0,
+        success: false,
+        error,
+      });
+    }
+  }
+
+  // Save combined JSON
+  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
+  const combinedPayload = {
+    generatedAt: new Date().toISOString(),
+    startedAt,
+    source: "dirtstreet",
+    siteUrl: "https://dirtstreet.in",
+    brandsScraped: runSummary.length,
+    totalProducts: combinedProducts.length,
+    runSummary,
+    products: combinedProducts,
+  };
+
+  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");
+
+  console.log(`\n${"=".repeat(60)}`);
+  console.log("[DIRTSTREET] RUN COMPLETE");
+  console.log(`${"=".repeat(60)}`);
+  console.log(`Brands scraped : ${runSummary.length}`);
+  console.log(`Total products : ${combinedProducts.length}`);
+  console.log(`Combined file  : ${combinedFile}`);
+  console.log("\nBrand summary:");
+  for (const s of runSummary) {
+    const status = s.success ? "OK" : "FAILED";
+    console.log(`  [${status}] ${s.brand} → ${s.totalProducts} products`);
+  }
+
+  return combinedPayload;
+}
+
+run().catch((err) => {
+  console.error("Runner failed:", err.message);
+  process.exitCode = 1;
+});
+
+module.exports = { run };
--- a/src/business-logic/import-pipeline/test_source/dirtstreet/scraper.js
+++ b/src/business-logic/import-pipeline/test_source/dirtstreet/scraper.js
@ -0,0 +1,423 @@
+/**
+ * Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser).
+ *
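+ * Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org
+ * markup. We fetch the HTML directly, parse the JSON-LD blocks for the core
+ * product fields, and use regex matches on the HTML for the remaining fields
+ * (SKU, categories, tags, attributes, variations) and as a price fallback.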
+ *
+ *   1. Brand pages paginated at /brand/{slug}/page/N/
+ *   2. Product detail via HTML + JSON-LD
+ */
+
+// Site origin; prepended to root-relative product links found on listing pages.
+const BASE_URL = "https://dirtstreet.in";
+// Default number of product pages fetched in parallel when the caller passes no `concurrency` option.
+const CONCURRENCY = 3;
+
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+async function fetchHtml(url, attempt = 1) {
+  try {
+    const res = await fetch(url, {
+      headers: {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "accept-language": "en-US,en;q=0.9",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
+        "cache-control": "no-cache",
+      },
+    });
+
+    if (res.status === 429 || res.status >= 500) {
+      if (attempt <= 4) {
+        const wait = attempt * 2000;
+        console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
+        await sleep(wait);
+        return fetchHtml(url, attempt + 1);
+      }
+      throw new Error(`HTTP ${res.status} after retries`);
+    }
+
+    if (res.status === 404) return null;
+    if (!res.ok) throw new Error(`HTTP ${res.status}`);
+    return res.text();
+  } catch (err) {
+    if (attempt <= 3 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) {
+      await sleep(attempt * 2000);
+      return fetchHtml(url, attempt + 1);
+    }
+    throw err;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function decodeEntities(str) {
+  return String(str || "")
+    .replace(/&amp;/g, "&")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&times;/g, "x")
+    .replace(/&nbsp;/g, " ")
+    .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code)));
+}
+
+function cleanText(value) {
+  return decodeEntities(String(value || ""))
+    .replace(/<[^>]+>/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+function toNumber(value) {
+  if (!value) return null;
+  const match = String(value).match(/([\d,]+\.?\d*)/);
+  if (!match) return null;
+  const n = Number.parseFloat(match[1].replace(/,/g, ""));
+  return Number.isNaN(n) ? null : n;
+}
+
+function extractPriceFromHtml(html) {
+  // data-price attribute (WooCommerce sets this for JS cart)
+  const dp = html.match(/data-price=["']([\d.]+)["']/);
+  if (dp) { const n = Number.parseFloat(dp[1]); if (n > 0) return n; }
+
+  // Pattern on dirtstreet: &#8377;</span>198,000.00</bdi>
+  // Collect ALL such values, filter out 0, take the first valid one (the product price)
+  const bdiPattern = /(?:&#8377;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi;
+  let bdiMatch;
+  while ((bdiMatch = bdiPattern.exec(html)) !== null) {
+    const n = toNumber(bdiMatch[1]);
+    if (n > 0) return n;
+  }
+
+  // <ins> sale price block
+  const ins = html.match(/<ins>[\s\S]{0,300}?(?:&#8377;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>[\s\S]{0,300}?<\/ins>/i);
+  if (ins) { const n = toNumber(ins[1]); if (n > 0) return n; }
+
+  // Fallback: &#8377; number anywhere
+  const entity = html.match(/&#8377;[^>]*>([\d,]+(?:\.\d+)?)/);
+  if (entity) { const n = toNumber(entity[1]); if (n > 0) return n; }
+
+  return null;
+}
+
+function availabilityToStockStatus(availability) {
+  if (!availability) return null;
+  const url = String(availability).toLowerCase();
+  if (url.includes("instock")) return "instock";
+  if (url.includes("outofstock")) return "outofstock";
+  if (url.includes("backorder") || url.includes("preorder")) return "onbackorder";
+  return null;
+}
+
+// ---------------------------------------------------------------------------
+// JSON-LD extraction
+// ---------------------------------------------------------------------------
+
+function extractJsonLd(html) {
+  const results = [];
+  const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
+  let match;
+  while ((match = regex.exec(html)) !== null) {
+    try {
+      results.push(JSON.parse(match[1].trim()));
+    } catch {
+      // skip
+    }
+  }
+  return results;
+}
+
+// ---------------------------------------------------------------------------
+// Collection page: extract product URLs
+// ---------------------------------------------------------------------------
+
+function extractProductUrls(html) {
+  const urls = new Set();
+  const regex = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi;
+  let match;
+  while ((match = regex.exec(html)) !== null) {
+    let url = match[1];
+    if (url.startsWith("/")) url = `${BASE_URL}${url}`;
+    if (url.includes("/product-category/") || url.includes("/product-tag/")) continue;
+    // normalize trailing slash
+    url = url.replace(/\/?$/, "/");
+    urls.add(url);
+  }
+  return Array.from(urls);
+}
+
+function hasNextPage(html, currentPage, slug) {
+  const pattern = new RegExp(
+    `href=["'][^"']*/brand/${slug}/page/${currentPage + 1}/?["']`,
+    "i"
+  );
+  return pattern.test(html);
+}
+
+async function fetchBrandProductUrls(slug, brandUrl) {
+  const allUrls = new Set();
+  let page = 1;
+
+  while (true) {
+    const url = page === 1 ? brandUrl : `${brandUrl}page/${page}/`;
+    console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`);
+
+    const html = await fetchHtml(url);
+    if (!html) break;
+
+    const urls = extractProductUrls(html);
+    const before = allUrls.size;
+    urls.forEach((u) => allUrls.add(u));
+
+    console.log(`[DIRTSTREET] Page ${page} -> ${urls.length} product URLs (total: ${allUrls.size})`);
+
+    if (!hasNextPage(html, page, slug) || allUrls.size === before) break;
+    page++;
+    await sleep(300);
+  }
+
+  return Array.from(allUrls);
+}
+
+// ---------------------------------------------------------------------------
+// Extra WooCommerce fields from HTML
+// ---------------------------------------------------------------------------
+
+function extractExtraFields(html) {
+  const extra = {
+    sku: null,
+    categories: [],
+    tags: [],
+    shortDescription: null,
+    stockStatus: null,
+    attributes: [],
+    variations: [],
+  };
+
+  // SKU - strip "SKU:" label prefix
+  const skuMatch = html.match(/<span[^>]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
+  if (skuMatch) {
+    extra.sku = cleanText(skuMatch[1])
+      .replace(/^SKU\s*:\s*/i, "")
+      .trim();
+  }
+
+  // Stock status — read from the main product wrapper class only, not whole page
+  // The first <div/article class="product ... instock/outofstock ..."> is the current product
+  const productClassMatch = html.match(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i);
+  if (productClassMatch) {
+    const cls = productClassMatch[1].toLowerCase();
+    if (cls === "instock") extra.stockStatus = "instock";
+    else if (cls === "outofstock") extra.stockStatus = "outofstock";
+    else extra.stockStatus = "onbackorder";
+  }
+
+  // Categories
+  const catMatch = html.match(/<span[^>]+class="[^"]*posted_in[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
+  if (catMatch) {
+    const catRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
+    let m;
+    while ((m = catRegex.exec(catMatch[1])) !== null) {
+      const cat = cleanText(m[1]);
+      if (cat) extra.categories.push(cat);
+    }
+  }
+
+  // Tags
+  const tagMatch = html.match(/<span[^>]+class="[^"]*tagged_as[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
+  if (tagMatch) {
+    const tagRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
+    let m;
+    while ((m = tagRegex.exec(tagMatch[1])) !== null) {
+      const tag = cleanText(m[1]);
+      if (tag) extra.tags.push(tag);
+    }
+  }
+
+  // Short description
+  const sdMatch = html.match(
+    /<div[^>]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i
+  );
+  if (sdMatch) extra.shortDescription = cleanText(sdMatch[1]);
+
+  // Attributes table
+  const attrMatch = html.match(
+    /<table[^>]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i
+  );
+  if (attrMatch) {
+    const rows = attrMatch[1].match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || [];
+    for (const row of rows) {
+      const th = row.match(/<th[^>]*>([\s\S]*?)<\/th>/i);
+      const td = row.match(/<td[^>]*>([\s\S]*?)<\/td>/i);
+      if (th && td) {
+        const key = cleanText(th[1]);
+        const value = cleanText(td[1]);
+        if (key && value) extra.attributes.push({ name: key, value });
+      }
+    }
+  }
+
+  // WooCommerce variations JSON in data attribute
+  const formMatch = html.match(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/);
+  if (formMatch) {
+    try {
+      extra.variations = JSON.parse(
+        formMatch[1].replace(/&quot;/g, '"').replace(/&#34;/g, '"')
+      );
+    } catch { /* skip */ }
+  }
+
+  return extra;
+}
+
+// ---------------------------------------------------------------------------
+// Scrape one product detail page
+// ---------------------------------------------------------------------------
+
+async function scrapeProductDetail(productUrl, brandName, brandSlug) {
+  const html = await fetchHtml(productUrl);
+  if (!html) {
+    return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" };
+  }
+
+  const jsonLdBlocks = extractJsonLd(html);
+  // Try direct Product type first, then search every @graph block
+  let schema = jsonLdBlocks.find((b) => b["@type"] === "Product");
+  if (!schema) {
+    for (const block of jsonLdBlocks) {
+      if (Array.isArray(block["@graph"])) {
+        const found = block["@graph"].find((g) => g["@type"] === "Product");
+        if (found) { schema = found; break; }
+      }
+    }
+  }
+
+  const extra = extractExtraFields(html);
+
+  const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {});
+
+  // Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback
+  const priceSpec = Array.isArray(offer?.priceSpecification)
+    ? offer.priceSpecification[0]?.price
+    : null;
+  let price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice);
+  if (!price || price === 0) price = extractPriceFromHtml(html);
+  if (price === 0) price = null;
+
+  // Stock: derive from JSON-LD availability (most reliable), fall back to HTML class
+  const stockFromSchema = availabilityToStockStatus(offer?.availability);
+  const stockStatus = stockFromSchema || extra.stockStatus || null;
+
+  const currency = offer?.priceCurrency || "INR";
+
+  const images = Array.isArray(schema?.image)
+    ? schema.image.map((img) => (typeof img === "string" ? img : img?.url || img?.contentUrl)).filter(Boolean)
+    : schema?.image
+      ? [typeof schema.image === "string" ? schema.image : (schema.image?.url || schema.image?.contentUrl)]
+      : [];
+
+  return {
+    recordType: "product",
+    source: "dirtstreet",
+    brand: schema?.brand?.name || brandName,
+    brandSlug,
+    url: productUrl,
+    title: cleanText(schema?.name || ""),
+    sku: extra.sku || cleanText(schema?.sku || ""),
+    mpn: cleanText(schema?.mpn || ""),
+    gtin: cleanText(schema?.gtin || schema?.gtin13 || ""),
+    price,
+    currency,
+    priceRaw: price != null ? String(price) : null,
+    compareAtPrice: toNumber(offer?.highPrice) || null,
+    availability: offer?.availability || null,
+    stockStatus,
+    image: images[0] || null,
+    images,
+    description: cleanText(schema?.description || ""),
+    shortDescription: extra.shortDescription,
+    categories: extra.categories,
+    tags: extra.tags,
+    attributes: extra.attributes,
+    variations: extra.variations,
+    aggregateRating: schema?.aggregateRating
+      ? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount }
+      : null,
+    scrapeError: null,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Concurrency helper
+// ---------------------------------------------------------------------------
+
+async function mapWithConcurrency(items, concurrency, worker) {
+  const results = new Array(items.length);
+  let index = 0;
+  async function run() {
+    while (true) {
+      const i = index++;
+      if (i >= items.length) return;
+      results[i] = await worker(items[i], i);
+    }
+  }
+  await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
+  return results;
+}
+
+// ---------------------------------------------------------------------------
+// Main export: scrape one brand
+// ---------------------------------------------------------------------------
+
+async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) {
+  const limit = options.limit ? Number(options.limit) : null;
+  const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
+  const concurrency = options.concurrency || CONCURRENCY;
+
+  let productUrls = await fetchBrandProductUrls(slug, brandUrl);
+  console.log(`[DIRTSTREET:${name}] Found ${productUrls.length} products`);
+
+  if (limit && productUrls.length > limit) {
+    console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${limit} of ${productUrls.length}`);
+    productUrls = productUrls.slice(0, limit);
+  }
+
+  let done = 0;
+  const products = await mapWithConcurrency(productUrls, concurrency, async (url, i) => {
+    const product = await scrapeProductDetail(url, name, slug);
+    done++;
+    if (done % 10 === 0 || done === productUrls.length) {
+      console.log(`[DIRTSTREET:${name}] ${done}/${productUrls.length} products scraped`);
+    }
+    if (onProgress) onProgress({ done, total: productUrls.length, product: product.title || url });
+    return product;
+  });
+
+  return products;
+}
+
+// ---------------------------------------------------------------------------
+// Standalone test
+// ---------------------------------------------------------------------------
+
+if (require.main === module) {
+  const { BRANDS } = require("./brands");
+  const brand = BRANDS[0];
+  console.log(`Testing scraper for: ${brand.name}`);
+  scrapeDirtstreetBrand(brand, { limit: 3 })
+    .then((products) => {
+      console.log(`\nScraped ${products.length} products`);
+      console.log("Sample product:");
+      console.log(JSON.stringify(products[0], null, 2));
+    })
+    .catch((err) => {
+      console.error("Scrape failed:", err.message);
+      process.exitCode = 1;
+    });
+}
+
+module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail };
--- a/src/business-logic/import-pipeline/test_source/motousher/brands.js
+++ b/src/business-logic/import-pipeline/test_source/motousher/brands.js
@ -0,0 +1,73 @@
+/**
+ * Motousher brand list.
+ *
+ * Each entry maps a brand name to its collection page URL on motousher.com.
+ * Add more brands here — the scraper loops this list automatically.
+ *
+ * Brand index page: https://www.motousher.com/pages/partner-brands
+ */
+
+const BRANDS = [
+  {
+    name: "All Balls Racing",
+    slug: "all-balls-racing",
+    collectionUrl: "https://www.motousher.com/collections/all-balls-racing",
+  },
+  {
+    name: "DID Chains",
+    slug: "did-chains",
+    collectionUrl: "https://www.motousher.com/collections/did-chains",
+  },
+  {
+    name: "EBC Brakes",
+    slug: "ebc-brakes",
+    collectionUrl: "https://www.motousher.com/collections/ebc-brakes",
+  },
+  {
+    name: "Esjot Sprockets",
+    slug: "esjot-sprockets",
+    collectionUrl: "https://www.motousher.com/collections/esjot-sprockets",
+  },
+  {
+    name: "Evans Coolant",
+    slug: "evans-coolant",
+    collectionUrl: "https://www.motousher.com/collections/evans-coolant",
+  },
+  {
+    name: "Grip Puppies",
+    slug: "grip-puppies",
+    collectionUrl: "https://www.motousher.com/collections/grip-puppies",
+  },
+  {
+    name: "HiFlo Filters",
+    slug: "hi-flo",
+    collectionUrl: "https://www.motousher.com/collections/hi-flo",
+  },
+  {
+    name: "JT Sprockets",
+    slug: "jt-sprockets",
+    collectionUrl: "https://www.motousher.com/collections/jt-sprockets",
+  },
+  {
+    name: "Maxima Racing Oils",
+    slug: "maxima-racing-oils",
+    collectionUrl: "https://www.motousher.com/collections/maxima-racing-oils",
+  },
+  {
+    name: "Putoline",
+    slug: "putoline",
+    collectionUrl: "https://www.motousher.com/collections/putoline",
+  },
+  {
+    name: "Ram Mount",
+    slug: "ram-mount",
+    collectionUrl: "https://www.motousher.com/collections/ram-mount",
+  },
+  {
+    name: "Wunderlich",
+    slug: "wunderlich",
+    collectionUrl: "https://www.motousher.com/collections/wunderlich",
+  },
+];
+
+module.exports = { BRANDS };
--- a/src/business-logic/import-pipeline/test_source/motousher/index.js
+++ b/src/business-logic/import-pipeline/test_source/motousher/index.js
@ -0,0 +1,158 @@
+/**
+ * Motousher scraper runner.
+ *
+ * Loops through BRANDS in brands.js, scrapes each brand's collection page
+ * and all product detail pages, saves per-brand JSON files, then combines
+ * everything into a single combined.json.
+ *
+ * Usage:
+ *   node index.js                   → scrape all brands in brands.js
+ *   node index.js --brand all-balls-racing   → scrape one brand only
+ *   node index.js --limit 5         → only first 5 products per brand (for quick tests)
+ */
+
+const fs = require("node:fs/promises");
+const path = require("node:path");
+const { BRANDS } = require("./brands");
+const { scrapeMotousherBrand } = require("./scraper");
+
+const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "motousher");
+
+function slugify(str) {
+  return String(str || "")
+    .trim()
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, "-")
+    .replace(/^-+|-+$/g, "");
+}
+
+function parseArgs(argv = process.argv.slice(2)) {
+  const out = { brandSlug: null, limit: null };
+  for (let i = 0; i < argv.length; i++) {
+    if ((argv[i] === "--brand" || argv[i] === "-b") && argv[i + 1]) {
+      out.brandSlug = argv[i + 1].toLowerCase().trim();
+    }
+    if ((argv[i] === "--limit" || argv[i] === "-n") && argv[i + 1]) {
+      const n = Number.parseInt(argv[i + 1], 10);
+      if (Number.isFinite(n) && n > 0) out.limit = n;
+    }
+  }
+  return out;
+}
+
+async function run() {
+  const { brandSlug, limit } = parseArgs();
+  await fs.mkdir(OUTPUT_DIR, { recursive: true });
+
+  const brandsToRun = brandSlug
+    ? BRANDS.filter((b) => b.slug === brandSlug)
+    : BRANDS;
+
+  if (!brandsToRun.length) {
+    console.error(`No brand found matching slug: "${brandSlug}"`);
+    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
+    process.exitCode = 1;
+    return;
+  }
+
+  const combinedProducts = [];
+  const runSummary = [];
+  const startedAt = new Date().toISOString();
+
+  for (const brand of brandsToRun) {
+    console.log(`\n${"=".repeat(60)}`);
+    console.log(`[MOTOUSHER] Scraping brand: ${brand.name}`);
+    console.log(`[MOTOUSHER] Collection URL: ${brand.collectionUrl}`);
+    console.log(`${"=".repeat(60)}`);
+
+    const brandStartedAt = new Date().toISOString();
+    let products = [];
+    let error = null;
+
+    try {
+      products = await scrapeMotousherBrand(brand, {
+        limit,
+        onProgress({ done, total, product }) {
+          // Inline progress already logged by scraper
+        },
+      });
+
+      // Save per-brand JSON
+      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
+      const brandPayload = {
+        generatedAt: new Date().toISOString(),
+        source: "motousher",
+        brand: brand.name,
+        brandSlug: brand.slug,
+        collectionUrl: brand.collectionUrl,
+        totalProducts: products.length,
+        products,
+      };
+
+      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
+      console.log(`[MOTOUSHER] Saved ${products.length} products → ${brandFile}`);
+
+      combinedProducts.push(...products);
+
+      runSummary.push({
+        brand: brand.name,
+        slug: brand.slug,
+        collectionUrl: brand.collectionUrl,
+        startedAt: brandStartedAt,
+        completedAt: new Date().toISOString(),
+        totalProducts: products.length,
+        success: true,
+      });
+    } catch (err) {
+      console.error(`[MOTOUSHER] Failed to scrape ${brand.name}: ${err.message}`);
+      error = err.message;
+
+      runSummary.push({
+        brand: brand.name,
+        slug: brand.slug,
+        collectionUrl: brand.collectionUrl,
+        startedAt: brandStartedAt,
+        completedAt: new Date().toISOString(),
+        totalProducts: 0,
+        success: false,
+        error,
+      });
+    }
+  }
+
+  // Save combined JSON (all brands merged)
+  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
+  const combinedPayload = {
+    generatedAt: new Date().toISOString(),
+    startedAt,
+    source: "motousher",
+    siteUrl: "https://www.motousher.com",
+    brandsScraped: runSummary.length,
+    totalProducts: combinedProducts.length,
+    runSummary,
+    products: combinedProducts,
+  };
+
+  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");
+
+  console.log(`\n${"=".repeat(60)}`);
+  console.log("[MOTOUSHER] RUN COMPLETE");
+  console.log(`${"=".repeat(60)}`);
+  console.log(`Brands scraped : ${runSummary.length}`);
+  console.log(`Total products : ${combinedProducts.length}`);
+  console.log(`Combined file  : ${combinedFile}`);
+  console.log("\nBrand summary:");
+  for (const s of runSummary) {
+    const status = s.success ? "OK" : "FAILED";
+    console.log(`  [${status}] ${s.brand} → ${s.totalProducts} products`);
+  }
+
+  return combinedPayload;
+}
+
+run().catch((err) => {
+  console.error("Runner failed:", err.message);
+  process.exitCode = 1;
+});
+
+module.exports = { run };
--- a/src/business-logic/import-pipeline/test_source/motousher/links.txt
+++ b/src/business-logic/import-pipeline/test_source/motousher/links.txt
@ -0,0 +1,9 @@
+# Motousher Partner Brand Links
+# Source: https://www.motousher.com/pages/partner-brands
+# Format: Brand Name | Collection URL
+
+All Balls Racing | https://www.motousher.com/collections/all-balls-racing
+
+# Add more brands below as needed, e.g.:
+# Acerbis | https://www.motousher.com/collections/acerbis
+# Twin Air | https://www.motousher.com/collections/twin-air
--- a/src/business-logic/import-pipeline/test_source/motousher/scraper.js
+++ b/src/business-logic/import-pipeline/test_source/motousher/scraper.js
@ -0,0 +1,254 @@
+/**
+ * Motousher.com scraper — Shopify JSON API (fast, no browser needed).
+ *
+ * Motousher is a Shopify store. Every collection and product has a public
+ * JSON endpoint, so we use plain fetch instead of Playwright:
+ *
+ *   Collection listing: /collections/{handle}/products.json?limit=250&page=N
+ *   Product detail:     /products/{handle}.json
+ *
+ * This is 10-20x faster than Playwright. Note that no Playwright fallback is
+ * implemented in this file: if the JSON API returns nothing, the brand simply
+ * yields zero products.
+ */
+
+const BASE_URL = "https://www.motousher.com"; // storefront origin; every collection/product URL below is built from it
+const CONCURRENCY = 3; // parallel product detail fetches — motousher rate-limits above ~5
+
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+async function fetchJson(url, attempt = 1) {
+  try {
+    const res = await fetch(url, {
+      headers: {
+        "accept": "application/json",
+        "accept-language": "en-US,en;q=0.9",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
+      },
+    });
+
+    if (res.status === 429 || res.status >= 500) {
+      if (attempt <= 4) {
+        const wait = attempt * 1500;
+        console.log(`[RETRY] ${url} → HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
+        await sleep(wait);
+        return fetchJson(url, attempt + 1);
+      }
+      throw new Error(`HTTP ${res.status} after retries`);
+    }
+
+    if (!res.ok) throw new Error(`HTTP ${res.status}`);
+    return res.json();
+  } catch (err) {
+    if (attempt <= 4 && err.code === "UND_ERR_CONNECT_TIMEOUT") {
+      await sleep(attempt * 1500);
+      return fetchJson(url, attempt + 1);
+    }
+    throw err;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Collection: fetch ALL product handles via paginated JSON API
+// ---------------------------------------------------------------------------
+
+async function fetchCollectionProductHandles(collectionHandle) {
+  const handles = [];
+  let page = 1;
+
+  while (true) {
+    const url = `${BASE_URL}/collections/${collectionHandle}/products.json?limit=250&page=${page}`;
+    console.log(`[MOTOUSHER] Fetching collection page ${page}: ${url}`);
+
+    const data = await fetchJson(url);
+    const products = Array.isArray(data?.products) ? data.products : [];
+
+    if (!products.length) break;
+
+    for (const p of products) {
+      if (p.handle) handles.push(p.handle);
+    }
+
+    console.log(`[MOTOUSHER] Page ${page} → ${products.length} products (total so far: ${handles.length})`);
+
+    if (products.length < 250) break; // last page
+    page++;
+  }
+
+  return handles;
+}
+
+// ---------------------------------------------------------------------------
+// Product detail: fetch via /products/{handle}.json
+// ---------------------------------------------------------------------------
+
+function toNumber(value) {
+  if (!value) return null;
+  const match = String(value).match(/([\d,]+\.?\d*)/);
+  if (!match) return null;
+  const n = Number.parseFloat(match[1].replace(/,/g, ""));
+  return Number.isNaN(n) ? null : n;
+}
+
+function bestImageUrl(src) {
+  if (!src) return null;
+  // Remove Shopify width constraints to get the original full-size image
+  return src.replace(/(_\d+x\d*|_\d+x)(\.\w+)(\?|$)/, "$2$3");
+}
+
+function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) {
+  const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean);
+
+  const options = (raw.options || []).map((opt) => ({
+    label: opt.name,
+    values: (opt.values || []).map((v) => ({ label: v, value: v })),
+  }));
+
+  const variants = (raw.variants || []).map((v) => ({
+    id: v.id,
+    title: v.title,
+    sku: v.sku || null,
+    price: toNumber(v.price),
+    compareAtPrice: toNumber(v.compare_at_price),
+    available: v.available,
+    options: [v.option1, v.option2, v.option3].filter(Boolean),
+    weight: v.grams || null,
+    barcode: v.barcode || null,
+  }));
+
+  const firstVariant = variants[0] || {};
+
+  return {
+    recordType: "product",
+    source: "motousher",
+    brand: brandName,
+    brandSlug,
+    collectionUrl,
+    title: raw.title || null,
+    url: `${BASE_URL}/products/${raw.handle}`,
+    handle: raw.handle || null,
+    productType: raw.product_type || null,
+    vendor: raw.vendor || brandName,
+    tags: Array.isArray(raw.tags) ? raw.tags : [],
+    priceRaw: firstVariant.price != null ? String(firstVariant.price) : null,
+    price: firstVariant.price ?? null,
+    compareAtPriceRaw: firstVariant.compareAtPrice != null ? String(firstVariant.compareAtPrice) : null,
+    compareAtPrice: firstVariant.compareAtPrice ?? null,
+    sku: firstVariant.sku || null,
+    barcode: firstVariant.barcode || null,
+    available: raw.variants?.some((v) => v.available) ?? null,
+    image: images[0] || null,
+    images,
+    descriptionHtml: raw.body_html || null,
+    description: raw.body_html
+      ? raw.body_html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim()
+      : null,
+    options,
+    variants,
+    publishedAt: raw.published_at || null,
+    createdAt: raw.created_at || null,
+    updatedAt: raw.updated_at || null,
+    scrapeError: null,
+  };
+}
+
+async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) {
+  const url = `${BASE_URL}/products/${handle}.json`;
+  try {
+    const data = await fetchJson(url);
+    if (!data?.product) throw new Error("No product in response");
+    return normalizeShopifyProduct(data.product, brandName, brandSlug, collectionUrl);
+  } catch (err) {
+    return {
+      recordType: "product",
+      source: "motousher",
+      brand: brandName,
+      brandSlug,
+      collectionUrl,
+      handle,
+      url: `${BASE_URL}/products/${handle}`,
+      scrapeError: err.message,
+    };
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Concurrency helper
+// ---------------------------------------------------------------------------
+
+async function mapWithConcurrency(items, concurrency, worker) {
+  const results = new Array(items.length);
+  let index = 0;
+
+  async function run() {
+    while (true) {
+      const i = index++;
+      if (i >= items.length) return;
+      results[i] = await worker(items[i], i);
+    }
+  }
+
+  await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
+  return results;
+}
+
+// ---------------------------------------------------------------------------
+// Main export: scrape one brand
+// ---------------------------------------------------------------------------
+
+async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) {
+  const limit = options.limit ? Number(options.limit) : null;
+  const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
+  const concurrency = options.concurrency || CONCURRENCY;
+
+  // Step 1: get all product handles from collection pages
+  let handles = await fetchCollectionProductHandles(slug);
+  console.log(`[MOTOUSHER:${name}] Found ${handles.length} products in collection`);
+
+  if (limit && handles.length > limit) {
+    console.log(`[MOTOUSHER:${name}] Limit applied: fetching first ${limit} of ${handles.length}`);
+    handles = handles.slice(0, limit);
+  }
+
+  // Step 2: fetch all product details in parallel
+  let done = 0;
+  const products = await mapWithConcurrency(handles, concurrency, async (handle, i) => {
+    const product = await fetchProductDetail(handle, name, slug, collectionUrl);
+    done++;
+    if (done % 20 === 0 || done === handles.length) {
+      console.log(`[MOTOUSHER:${name}] ${done}/${handles.length} products fetched`);
+    }
+    if (onProgress) onProgress({ done, total: handles.length, product: product.title || handle });
+    return product;
+  });
+
+  return products;
+}
+
+// ---------------------------------------------------------------------------
+// Standalone test
+// ---------------------------------------------------------------------------
+
+if (require.main === module) {
+  const { BRANDS } = require("./brands");
+  const brand = BRANDS[0];
+
+  console.log(`Testing scraper for: ${brand.name}`);
+  scrapeMotousherBrand(brand, { limit: 5 })
+    .then((products) => {
+      console.log(`\nScraped ${products.length} products`);
+      console.log("Sample product:");
+      console.log(JSON.stringify(products[0], null, 2));
+    })
+    .catch((err) => {
+      console.error("Scrape failed:", err.message);
+      process.exitCode = 1;
+    });
+}
+
+module.exports = {
+  scrapeMotousherBrand,
+  fetchCollectionProductHandles,
+  fetchProductDetail,
+};