Add test_source scrapers for motousher.com and dirtstreet.in

Adds two new experimental product scrapers under test_source/, isolated
from the active pipeline until verified and ready to promote.

motousher/ (Shopify store — Shopify JSON API):
- Scrapes 12 brands: All Balls Racing, DID Chains, EBC Brakes, Esjot
  Sprockets, Evans Coolant, Grip Puppies, HiFlo Filters, JT Sprockets,
  Maxima Racing Oils, Putoline, Ram Mount, Wunderlich
- 2,446 products total scraped and verified
- Uses /collections/{slug}/products.json + /products/{handle}.json
- Parallel fetch (concurrency 3), paginated collection listing

dirtstreet/ (WooCommerce store — HTML + JSON-LD):
- Scrapes 5 brands: SC Project, Evotech Performance, DNA Air Filters,
  WRS, Zero Gravity Racing
- 1,087 products total scraped and verified
- Pure fetch with JSON-LD schema.org extraction (no browser)
- Handles paginated /brand/{slug}/page/N/ archives
- Price extracted from offers.priceSpecification[0].price
- Stock status derived from JSON-LD availability field

Both scrapers are standalone (node index.js), support --brand and
--limit flags, save per-brand JSON files and a combined.json.
Scraped data lives in data/sources/test_source/ (gitignored).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
MOHAN 2026-06-04 12:17:23 +05:30
parent 1d254a9009
commit b8d9478afa
9 changed files with 1117 additions and 0 deletions

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,26 @@
# Test Source Workspace
This folder is for experimental supplier scraping work.
Use one folder per supplier website or test target. Keep experiments here until
the scraper, converter, and data shape are confirmed safe to move into the active
`sources/` registry.
Suggested folder shape:
```txt
test_source/
supplier-key/
links.txt
scraper.js
sample-output.json
notes.md
```
Rules for this area:
- Do not register test sources in `../sources/index.js` until the scraper is ready.
- Store only brand/filter/listing URLs in `links.txt`; avoid committing secrets.
- Keep generated images, large raw exports, and run logs out of git.
- Document what each URL represents in the supplier folder notes.

View File

@ -0,0 +1,35 @@
/**
* Dirtstreet brand list.
*
* Brand index page: https://dirtstreet.in/shop-by-brands/
*/
// Brands scraped from dirtstreet.in. Each entry pairs a display name with the
// slug used in the site's /brand/{slug}/ archive URL; brandUrl is derived from
// the slug so the two can never drift apart.
const BRANDS = [
  ["SC Project", "scproject"],
  ["Evotech Performance", "evotechperformance"],
  ["DNA Air Filters", "dnaairfilters"],
  ["WRS", "wrs"],
  ["Zero Gravity Racing", "zerogravityracing"],
].map(([name, slug]) => ({
  name,
  slug,
  brandUrl: `https://dirtstreet.in/brand/${slug}/`,
}));
module.exports = { BRANDS };

View File

@ -0,0 +1,138 @@
/**
* Dirtstreet scraper runner.
*
* Usage:
* node index.js                    # scrape all brands in brands.js
* node index.js --brand scproject  # scrape one brand only
* node index.js --limit 5          # first 5 products per brand (quick test)
*/
const fs = require("node:fs/promises");
const path = require("node:path");
const { BRANDS } = require("./brands");
const { scrapeDirtstreetBrand } = require("./scraper");
// Folder that receives the per-brand JSON files and combined.json:
// data/sources/test_source/dirtstreet, resolved five directories above this file.
const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "dirtstreet");
/**
 * Reads the runner's command-line flags.
 *
 * Recognised flags (each needs a following value):
 *   --brand / -b <slug>   brand slug to scrape, lower-cased and trimmed
 *   --limit / -n <count>  positive integer cap on products per brand
 *
 * @param {string[]} [argv] - arguments to parse; defaults to the process arguments.
 * @returns {{ brandSlug: string|null, limit: number|null }}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const parsed = { brandSlug: null, limit: null };
  argv.forEach((flag, position) => {
    const value = argv[position + 1];
    if (!value) return;
    switch (flag) {
      case "--brand":
      case "-b":
        parsed.brandSlug = value.toLowerCase().trim();
        break;
      case "--limit":
      case "-n": {
        const count = Number.parseInt(value, 10);
        // Non-numeric, zero, or negative limits are ignored.
        if (Number.isFinite(count) && count > 0) parsed.limit = count;
        break;
      }
      default:
        break;
    }
  });
  return parsed;
}
/**
 * Scrapes the selected brands sequentially and writes the results to OUTPUT_DIR.
 *
 * For every brand a `<slug>.json` file is written; afterwards `combined.json`
 * is written with all products plus a per-brand run summary. A brand that
 * fails (scrape or file write) is recorded in the summary with
 * `success: false` and does not stop the remaining brands.
 *
 * @returns {Promise<object|undefined>} the combined payload, or undefined when
 *   the --brand flag matches no configured brand (process.exitCode is set to 1).
 */
async function run() {
  const { brandSlug, limit } = parseArgs();
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const brandsToRun = brandSlug
    ? BRANDS.filter((b) => b.slug === brandSlug)
    : BRANDS;
  if (!brandsToRun.length) {
    console.error(`No brand found matching slug: "${brandSlug}"`);
    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
    process.exitCode = 1;
    return;
  }

  const combinedProducts = [];
  const runSummary = [];
  const startedAt = new Date().toISOString();

  for (const brand of brandsToRun) {
    console.log(`\n${"=".repeat(60)}`);
    console.log(`[DIRTSTREET] Scraping brand: ${brand.name}`);
    console.log(`[DIRTSTREET] Brand URL: ${brand.brandUrl}`);
    console.log(`${"=".repeat(60)}`);

    const brandStartedAt = new Date().toISOString();
    try {
      const products = await scrapeDirtstreetBrand(brand, { limit });

      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
      const brandPayload = {
        generatedAt: new Date().toISOString(),
        source: "dirtstreet",
        brand: brand.name,
        brandSlug: brand.slug,
        brandUrl: brand.brandUrl,
        totalProducts: products.length,
        products,
      };
      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
      console.log(`[DIRTSTREET] Saved ${products.length} products → ${brandFile}`);

      // Only brands whose file was written successfully reach the combined output.
      combinedProducts.push(...products);
      runSummary.push({
        brand: brand.name,
        slug: brand.slug,
        brandUrl: brand.brandUrl,
        startedAt: brandStartedAt,
        completedAt: new Date().toISOString(),
        totalProducts: products.length,
        success: true,
      });
    } catch (err) {
      console.error(`[DIRTSTREET] Failed to scrape ${brand.name}: ${err.message}`);
      runSummary.push({
        brand: brand.name,
        slug: brand.slug,
        brandUrl: brand.brandUrl,
        startedAt: brandStartedAt,
        completedAt: new Date().toISOString(),
        totalProducts: 0,
        success: false,
        error: err.message,
      });
    }
  }

  // Save combined JSON
  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
  const combinedPayload = {
    generatedAt: new Date().toISOString(),
    startedAt,
    source: "dirtstreet",
    siteUrl: "https://dirtstreet.in",
    brandsScraped: runSummary.length,
    totalProducts: combinedProducts.length,
    runSummary,
    products: combinedProducts,
  };
  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");

  console.log(`\n${"=".repeat(60)}`);
  console.log("[DIRTSTREET] RUN COMPLETE");
  console.log(`${"=".repeat(60)}`);
  console.log(`Brands scraped : ${runSummary.length}`);
  console.log(`Total products : ${combinedProducts.length}`);
  console.log(`Combined file : ${combinedFile}`);
  console.log("\nBrand summary:");
  for (const s of runSummary) {
    const status = s.success ? "OK" : "FAILED";
    const reason = s.success ? "" : ` (${s.error})`;
    // Separator keeps the brand name and the count from running together.
    console.log(` [${status}] ${s.brand} - ${s.totalProducts} products${reason}`);
  }
  return combinedPayload;
}
// Start a scrape only when this file is executed directly (`node index.js`).
// The module also exports `run`, so merely requiring it must not kick off a
// full network scrape as a side effect.
if (require.main === module) {
  run().catch((err) => {
    console.error("Runner failed:", err.message);
    process.exitCode = 1;
  });
}
module.exports = { run };

View File

@ -0,0 +1,423 @@
/**
* Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser).
*
* Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org
* markup. We fetch HTML directly and extract all fields via regex.
*
* 1. Brand pages paginated at /brand/{slug}/page/N/
* 2. Product detail via HTML + JSON-LD
*/
// Site origin; prepended to root-relative product links found on listing pages.
const BASE_URL = "https://dirtstreet.in";
// Default number of product pages fetched in parallel when the caller gives no override.
const CONCURRENCY = 3;
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms - delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetches a page as text with browser-like headers and bounded retries.
 *
 * - HTTP 429 / 5xx: retried up to 4 times with a linear back-off (attempt * 2s).
 * - HTTP 404: returns null so callers can treat the page as missing.
 * - Any other non-2xx status: throws.
 * - Connection timeouts / resets: retried up to 3 times.
 *
 * @param {string} url - absolute URL to fetch.
 * @param {number} [attempt=1] - 1-based attempt counter (used by the retry recursion).
 * @returns {Promise<string|null>} the response body, or null on 404.
 * @throws {Error} when retries are exhausted or the status is not retryable.
 */
async function fetchHtml(url, attempt = 1) {
  try {
    const res = await fetch(url, {
      headers: {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
        "cache-control": "no-cache",
      },
    });
    if (res.status === 429 || res.status >= 500) {
      if (attempt <= 4) {
        const wait = attempt * 2000;
        console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
        await sleep(wait);
        // Deliberately not awaited: a failure of the nested attempt must not be
        // caught (and retried a second time) by this frame's catch block.
        return fetchHtml(url, attempt + 1);
      }
      throw new Error(`HTTP ${res.status} after retries`);
    }
    if (res.status === 404) return null;
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // Awaited so a connection reset while streaming the body is retried too.
    return await res.text();
  } catch (err) {
    // Node's fetch rejects with `TypeError: fetch failed`; the network error
    // code lives on `err.cause`, not on the TypeError itself.
    const code = err?.cause?.code ?? err?.code;
    if (attempt <= 3 && (code === "UND_ERR_CONNECT_TIMEOUT" || code === "ECONNRESET")) {
      await sleep(attempt * 2000);
      return fetchHtml(url, attempt + 1);
    }
    throw err;
  }
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Decodes the HTML entities this scraper cares about in a single pass.
 *
 * Handles `&amp; &lt; &gt; &quot; &nbsp;`, `&times;` (rendered as a plain "x"),
 * and decimal / hexadecimal numeric references. Decoding in one pass means
 * already-escaped text such as `&amp;lt;` becomes `&lt;` rather than being
 * decoded twice into `<`.
 *
 * @param {*} str - value to decode; falsy values yield an empty string.
 * @returns {string}
 */
function decodeEntities(str) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', times: "x", nbsp: " " };
  return String(str || "").replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|times|nbsp));/g,
    (entity, decimal, hex, name) => {
      if (name !== undefined) return named[name];
      const codePoint = decimal !== undefined
        ? Number.parseInt(decimal, 10)
        : Number.parseInt(hex, 16);
      // fromCodePoint (unlike fromCharCode) handles astral characters, but it
      // throws above U+10FFFF, so out-of-range references are left untouched.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
  );
}
/**
 * Converts an HTML fragment to single-line plain text: entities are decoded,
 * anything that looks like a tag is replaced by a space, whitespace runs are
 * collapsed, and the result is trimmed.
 *
 * @param {*} value - fragment to clean; falsy values yield an empty string.
 * @returns {string}
 */
function cleanText(value) {
  const decoded = decodeEntities(String(value || ""));
  const withoutTags = decoded.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Pulls the first number out of a value such as "₹1,98,000.00".
 * Thousands separators (commas) are removed before parsing.
 *
 * @param {*} value - string or number to parse.
 * @returns {number|null} the parsed number, or null for falsy input
 *   (including 0) or when no digits are found.
 */
function toNumber(value) {
  if (!value) return null;
  const found = /([\d,]+\.?\d*)/.exec(String(value));
  if (found === null) return null;
  const parsed = Number.parseFloat(found[1].split(",").join(""));
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * HTML fallback for the product price, used when JSON-LD carries none.
 *
 * Sources are tried in order:
 *   1. a positive `data-price` attribute;
 *   2. the first positive rupee amount rendered as `₹</span>198,000.00</bdi>`
 *      that is NOT inside a `<del>` element. On a sale product the first such
 *      amount is the struck-out regular price, and the sale price follows it
 *      inside `<ins>`;
 *   3. a struck-out amount, if that is the only kind found;
 *   4. any `&#8377;` entity followed by a number.
 *
 * @param {string} html - full product page HTML.
 * @returns {number|null} a positive price, or null when none is found.
 */
function extractPriceFromHtml(html) {
  // data-price attribute (WooCommerce sets this for JS cart)
  const dataPrice = html.match(/data-price=["']([\d.]+)["']/);
  if (dataPrice) {
    const n = Number.parseFloat(dataPrice[1]);
    if (n > 0) return n;
  }

  // Pattern on dirtstreet: &#8377;</span>198,000.00</bdi>
  const bdiPattern = /(?:&#8377;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi;
  let struckPrice = null;
  for (const match of html.matchAll(bdiPattern)) {
    const n = toNumber(match[1]);
    if (!(n > 0)) continue; // skips 0.00 amounts
    // The amount is struck out when the nearest preceding <del has not been closed yet.
    const delOpen = html.lastIndexOf("<del", match.index);
    const isStruck = delOpen !== -1 && html.lastIndexOf("</del>", match.index) < delOpen;
    if (!isStruck) return n;
    if (struckPrice === null) struckPrice = n;
  }
  // Only struck-out prices were present: better than nothing.
  if (struckPrice !== null) return struckPrice;

  // Fallback: &#8377; number anywhere
  const entity = html.match(/&#8377;[^>]*>([\d,]+(?:\.\d+)?)/);
  if (entity) {
    const n = toNumber(entity[1]);
    if (n > 0) return n;
  }
  return null;
}
/**
 * Maps a JSON-LD `availability` value (e.g. "https://schema.org/InStock") to a
 * stock status string. Matching is case-insensitive and substring-based; the
 * rules are checked in the order listed.
 *
 * @param {*} availability - raw availability value from the offer.
 * @returns {"instock"|"outofstock"|"onbackorder"|null} null when the value is
 *   missing or unrecognised.
 */
function availabilityToStockStatus(availability) {
  if (!availability) return null;
  const haystack = String(availability).toLowerCase();
  const rules = [
    { status: "instock", tokens: ["instock"] },
    { status: "outofstock", tokens: ["outofstock"] },
    { status: "onbackorder", tokens: ["backorder", "preorder"] },
  ];
  const matched = rules.find(({ tokens }) => tokens.some((token) => haystack.includes(token)));
  return matched ? matched.status : null;
}
// ---------------------------------------------------------------------------
// JSON-LD extraction
// ---------------------------------------------------------------------------
/**
 * Collects every `<script type="application/ld+json">` payload on a page.
 *
 * @param {string} html - page HTML.
 * @returns {Array<*>} parsed JSON values in document order; scripts whose
 *   content is not valid JSON are skipped (best effort).
 */
function extractJsonLd(html) {
  const scriptPattern = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const parsedBlocks = [];
  for (const [, payload] of String(html).matchAll(scriptPattern)) {
    try {
      parsedBlocks.push(JSON.parse(payload.trim()));
    } catch {
      // Malformed JSON-LD is ignored on purpose; other blocks may still be usable.
    }
  }
  return parsedBlocks;
}
// ---------------------------------------------------------------------------
// Collection page: extract product URLs
// ---------------------------------------------------------------------------
/**
 * Finds the product links on a brand listing page.
 *
 * Accepts absolute dirtstreet.in links and root-relative `/product/...` links,
 * skips anything containing a category or tag path, and normalises every URL
 * to end with a single trailing slash.
 *
 * @param {string} html - listing page HTML.
 * @returns {string[]} unique absolute product URLs in first-seen order.
 */
function extractProductUrls(html) {
  const linkPattern = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi;
  const unique = new Set();
  for (const [, href] of String(html).matchAll(linkPattern)) {
    const absolute = href.startsWith("/") ? `${BASE_URL}${href}` : href;
    const isTaxonomyLink = absolute.includes("/product-category/") || absolute.includes("/product-tag/");
    if (!isTaxonomyLink) {
      // normalize trailing slash
      unique.add(absolute.replace(/\/?$/, "/"));
    }
  }
  return [...unique];
}
/**
 * Tells whether a brand listing page links to the page after `currentPage`,
 * i.e. contains an href ending in `/brand/{slug}/page/{currentPage + 1}/`.
 *
 * @param {string} html - listing page HTML.
 * @param {number} currentPage - 1-based number of the page just fetched.
 * @param {string} slug - brand slug as used in the archive URL.
 * @returns {boolean}
 */
function hasNextPage(html, currentPage, slug) {
  // Escape regex metacharacters so the slug is always matched literally.
  const literalSlug = String(slug).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(
    `href=["'][^"']*/brand/${literalSlug}/page/${currentPage + 1}/?["']`,
    "i"
  );
  return pattern.test(html);
}
/**
 * Walks a brand's paginated archive and collects every product URL.
 *
 * Page 1 is `brandUrl` itself; later pages are `{brandUrl}/page/N/`. The walk
 * stops when a page is missing (404), when no link to the next page is found,
 * or when a page adds no new product URLs (guards against endless loops).
 *
 * @param {string} slug - brand slug, used to recognise the "next page" link.
 * @param {string} brandUrl - brand archive URL, with or without a trailing slash.
 * @returns {Promise<string[]>} unique product URLs in first-seen order.
 */
async function fetchBrandProductUrls(slug, brandUrl) {
  // Pagination URLs are built by appending to brandUrl, so make sure it ends
  // with exactly one slash even if the caller omitted it.
  const paginationBase = brandUrl.endsWith("/") ? brandUrl : `${brandUrl}/`;
  const allUrls = new Set();
  let page = 1;
  while (true) {
    const url = page === 1 ? brandUrl : `${paginationBase}page/${page}/`;
    console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`);
    const html = await fetchHtml(url);
    if (!html) break;

    const urls = extractProductUrls(html);
    const before = allUrls.size;
    urls.forEach((u) => allUrls.add(u));
    console.log(`[DIRTSTREET] Page ${page} -> ${urls.length} product URLs (total: ${allUrls.size})`);

    if (!hasNextPage(html, page, slug) || allUrls.size === before) break;
    page++;
    // Small pause between listing pages to stay polite to the server.
    await sleep(300);
  }
  return Array.from(allUrls);
}
// ---------------------------------------------------------------------------
// Extra WooCommerce fields from HTML
// ---------------------------------------------------------------------------
/**
 * Scrapes the WooCommerce-specific fields that JSON-LD does not carry, using
 * regular expressions over the raw product page HTML.
 *
 * @param {string} html - full product page HTML.
 * @returns {{
 *   sku: string|null,
 *   categories: string[],
 *   tags: string[],
 *   shortDescription: string|null,
 *   stockStatus: "instock"|"outofstock"|"onbackorder"|null,
 *   attributes: Array<{name: string, value: string}>,
 *   variations: *
 * }} fields default to null / empty arrays when their markup is absent.
 */
function extractExtraFields(html) {
  // Capture group 1 of the first match, or null when the pattern is absent.
  const firstCapture = (pattern) => {
    const found = html.match(pattern);
    return found ? found[1] : null;
  };
  // Cleaned, non-empty text of every <a> inside a fragment (null fragment -> []).
  const anchorTexts = (fragment) => {
    const texts = [];
    if (fragment === null) return texts;
    for (const [, inner] of fragment.matchAll(/<a[^>]*>([\s\S]*?)<\/a>/gi)) {
      const text = cleanText(inner);
      if (text) texts.push(text);
    }
    return texts;
  };

  // SKU - strip "SKU:" label prefix
  const skuHtml = firstCapture(/<span[^>]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
  const sku = skuHtml === null
    ? null
    : cleanText(skuHtml).replace(/^SKU\s*:\s*/i, "").trim();

  // Stock status: taken from the first class attribute that contains both
  // "product" and a stock token, expected to be the main product wrapper.
  const stockClass = firstCapture(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i);
  let stockStatus = null;
  if (stockClass !== null) {
    const token = stockClass.toLowerCase();
    stockStatus = token === "instock" || token === "outofstock" ? token : "onbackorder";
  }

  // Categories and tags are lists of links inside their wrapper spans.
  const categories = anchorTexts(
    firstCapture(/<span[^>]+class="[^"]*posted_in[^"]*"[^>]*>([\s\S]*?)<\/span>/i)
  );
  const tags = anchorTexts(
    firstCapture(/<span[^>]+class="[^"]*tagged_as[^"]*"[^>]*>([\s\S]*?)<\/span>/i)
  );

  // Short description
  const shortHtml = firstCapture(
    /<div[^>]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i
  );
  const shortDescription = shortHtml === null ? null : cleanText(shortHtml);

  // Attributes table: one { name, value } per row that has both a th and a td.
  const attributes = [];
  const tableHtml = firstCapture(
    /<table[^>]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i
  );
  if (tableHtml !== null) {
    for (const row of tableHtml.match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || []) {
      const header = row.match(/<th[^>]*>([\s\S]*?)<\/th>/i);
      const cell = row.match(/<td[^>]*>([\s\S]*?)<\/td>/i);
      if (!header || !cell) continue;
      const name = cleanText(header[1]);
      const value = cleanText(cell[1]);
      if (name && value) attributes.push({ name, value });
    }
  }

  // WooCommerce variations JSON in data attribute (quotes are entity-encoded).
  let variations = [];
  const variationsAttr = firstCapture(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/);
  if (variationsAttr !== null) {
    try {
      variations = JSON.parse(
        variationsAttr.replace(/&quot;/g, '"').replace(/&#34;/g, '"')
      );
    } catch {
      // Unparseable variation data is skipped; variations stays [].
    }
  }

  return { sku, categories, tags, shortDescription, stockStatus, attributes, variations };
}
// ---------------------------------------------------------------------------
// Scrape one product detail page
// ---------------------------------------------------------------------------
/** True when a JSON-LD node is an object whose @type is, or includes, "Product". */
function isProductNode(node) {
  if (!node || typeof node !== "object") return false;
  const type = node["@type"];
  return Array.isArray(type) ? type.includes("Product") : type === "Product";
}

/**
 * Finds the schema.org Product node among parsed JSON-LD blocks. Direct
 * top-level nodes win; otherwise every `@graph` array is searched in order.
 */
function findProductSchema(blocks) {
  // A JSON-LD script may hold a single node or an array of nodes.
  const nodes = blocks.flatMap((block) => (Array.isArray(block) ? block : [block]));
  const direct = nodes.find(isProductNode);
  if (direct) return direct;
  for (const node of nodes) {
    if (Array.isArray(node?.["@graph"])) {
      const nested = node["@graph"].find(isProductNode);
      if (nested) return nested;
    }
  }
  return undefined;
}

/**
 * Scrapes one product page into a normalised product record.
 *
 * Data comes from the page's JSON-LD Product node where available, with HTML
 * fallbacks for price and stock status and HTML-only extras (SKU, categories,
 * tags, attributes, variations).
 *
 * @param {string} productUrl - absolute product page URL.
 * @param {string} brandName - brand display name, used when JSON-LD has none.
 * @param {string} brandSlug - brand slug copied onto the record.
 * @returns {Promise<object>} the product record; a page that returns 404
 *   yields a minimal record with `scrapeError: "404"`.
 */
async function scrapeProductDetail(productUrl, brandName, brandSlug) {
  const html = await fetchHtml(productUrl);
  if (!html) {
    return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" };
  }

  const schema = findProductSchema(extractJsonLd(html));
  const extra = extractExtraFields(html);
  const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {});

  // Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback
  const priceSpec = Array.isArray(offer?.priceSpecification)
    ? offer.priceSpecification[0]?.price
    : null;
  let price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice);
  // A missing or zero price is treated as unknown and looked up in the HTML.
  if (!price) price = extractPriceFromHtml(html) || null;

  // Stock: derive from JSON-LD availability (most reliable), fall back to HTML class
  const stockFromSchema = availabilityToStockStatus(offer?.availability);
  const stockStatus = stockFromSchema || extra.stockStatus || null;
  const currency = offer?.priceCurrency || "INR";

  // schema.org allows `brand` to be plain text as well as a Brand object.
  const schemaBrand = typeof schema?.brand === "string" ? schema.brand : schema?.brand?.name;

  // `image` may be a single value or an array of URL strings / ImageObject nodes.
  const toImageUrl = (img) => (typeof img === "string" ? img : img?.url || img?.contentUrl);
  const images = Array.isArray(schema?.image)
    ? schema.image.map(toImageUrl).filter(Boolean)
    : schema?.image
      ? [toImageUrl(schema.image)]
      : [];

  return {
    recordType: "product",
    source: "dirtstreet",
    brand: schemaBrand || brandName,
    brandSlug,
    url: productUrl,
    title: cleanText(schema?.name || ""),
    sku: extra.sku || cleanText(schema?.sku || ""),
    mpn: cleanText(schema?.mpn || ""),
    gtin: cleanText(schema?.gtin || schema?.gtin13 || ""),
    price,
    currency,
    priceRaw: price != null ? String(price) : null,
    compareAtPrice: toNumber(offer?.highPrice) || null,
    availability: offer?.availability || null,
    stockStatus,
    image: images[0] || null,
    images,
    description: cleanText(schema?.description || ""),
    shortDescription: extra.shortDescription,
    categories: extra.categories,
    tags: extra.tags,
    attributes: extra.attributes,
    variations: extra.variations,
    aggregateRating: schema?.aggregateRating
      ? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount }
      : null,
    scrapeError: null,
  };
}
// ---------------------------------------------------------------------------
// Concurrency helper
// ---------------------------------------------------------------------------
/**
 * Maps `items` through an async `worker`, running at most `concurrency`
 * calls at a time. Results keep the order of `items`.
 *
 * @param {Array<*>} items - inputs to process.
 * @param {number} concurrency - maximum number of workers in flight.
 * @param {(item: *, index: number) => Promise<*>} worker - async mapper.
 * @returns {Promise<Array<*>>} results, index-aligned with `items`; rejects
 *   with the first worker error.
 */
async function mapWithConcurrency(items, concurrency, worker) {
  const output = new Array(items.length);
  let cursor = 0;
  // Each lane keeps claiming the next unprocessed index until none are left.
  const drain = async () => {
    for (let i = cursor++; i < items.length; i = cursor++) {
      output[i] = await worker(items[i], i);
    }
  };
  const laneCount = Math.min(concurrency, items.length);
  const lanes = Array.from({ length: laneCount }, () => drain());
  await Promise.all(lanes);
  return output;
}
// ---------------------------------------------------------------------------
// Main export: scrape one brand
// ---------------------------------------------------------------------------
/**
 * Scrapes every product of one Dirtstreet brand.
 *
 * @param {{ name: string, slug: string, brandUrl: string }} brand - brand entry.
 * @param {object} [options]
 * @param {number} [options.limit] - scrape only the first N product URLs.
 * @param {number} [options.concurrency] - parallel product fetches (defaults to CONCURRENCY).
 * @param {(info: { done: number, total: number, product: string }) => void} [options.onProgress]
 *   called after each product; `product` is its title, or its URL when untitled.
 * @returns {Promise<object[]>} product records in listing order.
 */
async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) {
  const maxProducts = options.limit ? Number(options.limit) : null;
  const notify = typeof options.onProgress === "function" ? options.onProgress : null;
  const parallelism = options.concurrency || CONCURRENCY;

  const discovered = await fetchBrandProductUrls(slug, brandUrl);
  console.log(`[DIRTSTREET:${name}] Found ${discovered.length} products`);

  let targets = discovered;
  if (maxProducts && discovered.length > maxProducts) {
    console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${maxProducts} of ${discovered.length}`);
    targets = discovered.slice(0, maxProducts);
  }

  const total = targets.length;
  let completed = 0;
  return mapWithConcurrency(targets, parallelism, async (url) => {
    const product = await scrapeProductDetail(url, name, slug);
    completed += 1;
    // Log every tenth product and the final one.
    if (completed % 10 === 0 || completed === total) {
      console.log(`[DIRTSTREET:${name}] ${completed}/${total} products scraped`);
    }
    if (notify) notify({ done: completed, total, product: product.title || url });
    return product;
  });
}
// ---------------------------------------------------------------------------
// Standalone test
// ---------------------------------------------------------------------------
// When this file is run directly (`node scraper.js`), scrape the first three
// products of the first configured brand and print one as a smoke test.
if (require.main === module) {
  const sampleBrand = require("./brands").BRANDS[0];
  console.log(`Testing scraper for: ${sampleBrand.name}`);

  const smokeTest = async () => {
    const products = await scrapeDirtstreetBrand(sampleBrand, { limit: 3 });
    console.log(`\nScraped ${products.length} products`);
    console.log("Sample product:");
    console.log(JSON.stringify(products[0], null, 2));
  };

  smokeTest().catch((err) => {
    console.error("Scrape failed:", err.message);
    process.exitCode = 1;
  });
}
module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail };

View File

@ -0,0 +1,73 @@
/**
* Motousher brand list.
*
* Each entry maps a brand name to its collection page URL on motousher.com.
* Add more brands here; the scraper loops over this list automatically.
*
* Brand index page: https://www.motousher.com/pages/partner-brands
*/
// Brands scraped from motousher.com. Each entry pairs a display name with the
// collection handle (slug); collectionUrl is derived from the slug so the two
// can never drift apart.
const BRANDS = [
  ["All Balls Racing", "all-balls-racing"],
  ["DID Chains", "did-chains"],
  ["EBC Brakes", "ebc-brakes"],
  ["Esjot Sprockets", "esjot-sprockets"],
  ["Evans Coolant", "evans-coolant"],
  ["Grip Puppies", "grip-puppies"],
  ["HiFlo Filters", "hi-flo"],
  ["JT Sprockets", "jt-sprockets"],
  ["Maxima Racing Oils", "maxima-racing-oils"],
  ["Putoline", "putoline"],
  ["Ram Mount", "ram-mount"],
  ["Wunderlich", "wunderlich"],
].map(([name, slug]) => ({
  name,
  slug,
  collectionUrl: `https://www.motousher.com/collections/${slug}`,
}));
module.exports = { BRANDS };

View File

@ -0,0 +1,158 @@
/**
* Motousher scraper runner.
*
* Loops through BRANDS in brands.js, scrapes each brand's collection page
* and all product detail pages, saves per-brand JSON files, then combines
* everything into a single combined.json.
*
* Usage:
* node index.js                           # scrape all brands in brands.js
* node index.js --brand all-balls-racing  # scrape one brand only
* node index.js --limit 5                 # only first 5 products per brand (for quick tests)
*/
const fs = require("node:fs/promises");
const path = require("node:path");
const { BRANDS } = require("./brands");
const { scrapeMotousherBrand } = require("./scraper");
// Folder that receives the per-brand JSON files and combined.json:
// data/sources/test_source/motousher, resolved five directories above this file.
const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "motousher");
/**
 * Turns arbitrary text into a lowercase, dash-separated slug made of
 * `a-z` and `0-9` only (e.g. "  Hello, World! " -> "hello-world").
 *
 * @param {*} str - text to slugify; falsy values yield an empty string.
 * @returns {string}
 */
function slugify(str) {
  const normalized = String(str || "").trim().toLowerCase();
  // Splitting on runs of non-alphanumerics and dropping empty edge pieces is
  // the same as collapsing those runs to "-" and trimming leading/trailing dashes.
  return normalized
    .split(/[^a-z0-9]+/)
    .filter((piece) => piece !== "")
    .join("-");
}
/**
 * Reads the runner's command-line flags.
 *
 * Recognised flags (each needs a following value):
 *   --brand / -b <slug>   brand slug to scrape, lower-cased and trimmed
 *   --limit / -n <count>  positive integer cap on products per brand
 *
 * @param {string[]} [argv] - arguments to parse; defaults to the process arguments.
 * @returns {{ brandSlug: string|null, limit: number|null }}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const brandFlags = new Set(["--brand", "-b"]);
  const limitFlags = new Set(["--limit", "-n"]);
  return argv.reduce((parsed, token, position) => {
    const following = argv[position + 1];
    if (!following) return parsed;
    if (brandFlags.has(token)) {
      parsed.brandSlug = following.toLowerCase().trim();
    }
    if (limitFlags.has(token)) {
      const count = Number.parseInt(following, 10);
      // Non-numeric, zero, or negative limits are ignored.
      if (Number.isFinite(count) && count > 0) parsed.limit = count;
    }
    return parsed;
  }, { brandSlug: null, limit: null });
}
/**
 * Scrapes the selected brands sequentially and writes the results to OUTPUT_DIR.
 *
 * For every brand a `<slug>.json` file is written; afterwards `combined.json`
 * is written with all products plus a per-brand run summary. A brand that
 * fails (scrape or file write) is recorded in the summary with
 * `success: false` and does not stop the remaining brands.
 *
 * @returns {Promise<object|undefined>} the combined payload, or undefined when
 *   the --brand flag matches no configured brand (process.exitCode is set to 1).
 */
async function run() {
  const { brandSlug, limit } = parseArgs();
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const brandsToRun = brandSlug
    ? BRANDS.filter((b) => b.slug === brandSlug)
    : BRANDS;
  if (!brandsToRun.length) {
    console.error(`No brand found matching slug: "${brandSlug}"`);
    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
    process.exitCode = 1;
    return;
  }

  const combinedProducts = [];
  const runSummary = [];
  const startedAt = new Date().toISOString();

  for (const brand of brandsToRun) {
    console.log(`\n${"=".repeat(60)}`);
    console.log(`[MOTOUSHER] Scraping brand: ${brand.name}`);
    console.log(`[MOTOUSHER] Collection URL: ${brand.collectionUrl}`);
    console.log(`${"=".repeat(60)}`);

    const brandStartedAt = new Date().toISOString();
    try {
      // Progress is logged by the scraper itself, so no onProgress callback is passed.
      const products = await scrapeMotousherBrand(brand, { limit });

      // Save per-brand JSON
      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
      const brandPayload = {
        generatedAt: new Date().toISOString(),
        source: "motousher",
        brand: brand.name,
        brandSlug: brand.slug,
        collectionUrl: brand.collectionUrl,
        totalProducts: products.length,
        products,
      };
      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
      console.log(`[MOTOUSHER] Saved ${products.length} products → ${brandFile}`);

      // Only brands whose file was written successfully reach the combined output.
      combinedProducts.push(...products);
      runSummary.push({
        brand: brand.name,
        slug: brand.slug,
        collectionUrl: brand.collectionUrl,
        startedAt: brandStartedAt,
        completedAt: new Date().toISOString(),
        totalProducts: products.length,
        success: true,
      });
    } catch (err) {
      console.error(`[MOTOUSHER] Failed to scrape ${brand.name}: ${err.message}`);
      runSummary.push({
        brand: brand.name,
        slug: brand.slug,
        collectionUrl: brand.collectionUrl,
        startedAt: brandStartedAt,
        completedAt: new Date().toISOString(),
        totalProducts: 0,
        success: false,
        error: err.message,
      });
    }
  }

  // Save combined JSON (all brands merged)
  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
  const combinedPayload = {
    generatedAt: new Date().toISOString(),
    startedAt,
    source: "motousher",
    siteUrl: "https://www.motousher.com",
    brandsScraped: runSummary.length,
    totalProducts: combinedProducts.length,
    runSummary,
    products: combinedProducts,
  };
  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");

  console.log(`\n${"=".repeat(60)}`);
  console.log("[MOTOUSHER] RUN COMPLETE");
  console.log(`${"=".repeat(60)}`);
  console.log(`Brands scraped : ${runSummary.length}`);
  console.log(`Total products : ${combinedProducts.length}`);
  console.log(`Combined file : ${combinedFile}`);
  console.log("\nBrand summary:");
  for (const s of runSummary) {
    const status = s.success ? "OK" : "FAILED";
    const reason = s.success ? "" : ` (${s.error})`;
    // Separator keeps the brand name and the count from running together.
    console.log(` [${status}] ${s.brand} - ${s.totalProducts} products${reason}`);
  }
  return combinedPayload;
}
// Start a scrape only when this file is executed directly (`node index.js`).
// The module also exports `run`, so merely requiring it must not kick off a
// full network scrape as a side effect.
if (require.main === module) {
  run().catch((err) => {
    console.error("Runner failed:", err.message);
    process.exitCode = 1;
  });
}
module.exports = { run };

View File

@ -0,0 +1,9 @@
# Motousher Partner Brand Links
# Source: https://www.motousher.com/pages/partner-brands
# Format: Brand Name | Collection URL
All Balls Racing | https://www.motousher.com/collections/all-balls-racing
# Add more brands below as needed, e.g.:
# Acerbis | https://www.motousher.com/collections/acerbis
# Twin Air | https://www.motousher.com/collections/twin-air

View File

@ -0,0 +1,254 @@
/**
* Motousher.com scraper - Shopify JSON API (fast, no browser needed).
*
* Motousher is a Shopify store. Every collection and product has a public
* JSON endpoint, so we use plain fetch instead of Playwright:
*
* Collection listing: /collections/{handle}/products.json?limit=250&page=N
* Product detail: /products/{handle}.json
*
* This is 10-20x faster than Playwright. NOTE: no Playwright fallback is
* implemented in this module; if the JSON API returns nothing, the brand
* simply yields zero products.
*/
// Store origin; every collection and product JSON URL is built from it.
const BASE_URL = "https://www.motousher.com";
// Default parallelism for product detail requests when the caller gives no override.
const CONCURRENCY = 3; // parallel product detail fetches — motousher rate-limits above ~5
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms - delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetches a URL and parses the response as JSON, with bounded retries.
 *
 * - HTTP 429 / 5xx: retried up to 4 times with a linear back-off (attempt * 1.5s).
 * - Any other non-2xx status: throws.
 * - Connection timeouts / resets: retried up to 4 times.
 *
 * @param {string} url - absolute URL to fetch.
 * @param {number} [attempt=1] - 1-based attempt counter (used by the retry recursion).
 * @returns {Promise<*>} the parsed JSON body.
 * @throws {Error} when retries are exhausted, the status is not retryable, or
 *   the body is not valid JSON.
 */
async function fetchJson(url, attempt = 1) {
  try {
    const res = await fetch(url, {
      headers: {
        "accept": "application/json",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
      },
    });
    if (res.status === 429 || res.status >= 500) {
      if (attempt <= 4) {
        const wait = attempt * 1500;
        console.log(`[RETRY] ${url} → HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
        await sleep(wait);
        // Deliberately not awaited: a failure of the nested attempt must not be
        // caught (and retried a second time) by this frame's catch block.
        return fetchJson(url, attempt + 1);
      }
      throw new Error(`HTTP ${res.status} after retries`);
    }
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // Awaited so a connection reset while streaming the body is retried too.
    return await res.json();
  } catch (err) {
    // Node's fetch rejects with `TypeError: fetch failed`; the network error
    // code lives on `err.cause`, not on the TypeError itself.
    const code = err?.cause?.code ?? err?.code;
    if (attempt <= 4 && (code === "UND_ERR_CONNECT_TIMEOUT" || code === "ECONNRESET")) {
      await sleep(attempt * 1500);
      return fetchJson(url, attempt + 1);
    }
    throw err;
  }
}
// ---------------------------------------------------------------------------
// Collection: fetch ALL product handles via paginated JSON API
// ---------------------------------------------------------------------------
/**
 * Lists the handle of every product in a collection via the paginated
 * `/collections/{handle}/products.json` endpoint (250 products per page).
 *
 * Paging stops at the first empty page or the first page holding fewer than
 * 250 products.
 *
 * @param {string} collectionHandle - collection handle (the brand slug).
 * @returns {Promise<string[]>} product handles in listing order.
 */
async function fetchCollectionProductHandles(collectionHandle) {
  const handles = [];
  let page = 1;
  while (true) {
    const url = `${BASE_URL}/collections/${collectionHandle}/products.json?limit=250&page=${page}`;
    console.log(`[MOTOUSHER] Fetching collection page ${page}: ${url}`);
    const data = await fetchJson(url);
    const products = Array.isArray(data?.products) ? data.products : [];
    if (!products.length) break;

    for (const p of products) {
      if (p.handle) handles.push(p.handle);
    }
    // The arrow keeps the page number and the count apart ("Page 1 → 250", not "Page 1250").
    console.log(`[MOTOUSHER] Page ${page} → ${products.length} products (total so far: ${handles.length})`);

    if (products.length < 250) break; // last page
    page++;
  }
  return handles;
}
// ---------------------------------------------------------------------------
// Product detail: fetch via /products/{handle}.json
// ---------------------------------------------------------------------------
/**
 * Pulls the first number out of a value such as "1,499.00".
 * Thousands separators (commas) are removed before parsing.
 *
 * @param {*} value - string or number to parse.
 * @returns {number|null} the parsed number, or null for falsy input
 *   (including 0) or when no digits are found.
 */
function toNumber(value) {
  if (!value) return null;
  const [, digits] = String(value).match(/([\d,]+\.?\d*)/) ?? [];
  if (digits === undefined) return null;
  const parsed = Number.parseFloat(digits.replace(/,/g, ""));
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Strips a Shopify size suffix such as `_600x600` or `_800x` that sits right
 * before the file extension, so the URL points at the original image.
 *
 * @param {string|null|undefined} src - image URL from the product JSON.
 * @returns {string|null} the URL without the size suffix, or null for falsy input.
 */
function bestImageUrl(src) {
  if (!src) return null;
  // `_<width>x<optional height>` followed by the extension and then "?" or end of string.
  const sizeSuffix = /_\d+x\d*(\.\w+)(\?|$)/;
  return src.replace(sizeSuffix, "$1$2");
}
/**
 * Converts a raw Shopify product object into this project's product record.
 *
 * Top-level price, SKU and barcode come from the first variant. `tags` is
 * accepted both as an array and as the comma-separated string that the
 * single-product JSON endpoint returns. `available` is true/false only when
 * at least one variant actually carries a boolean `available` flag, and null
 * when availability is unknown.
 *
 * @param {object} raw - the `product` object from the Shopify JSON response.
 * @param {string} brandName - brand display name.
 * @param {string} brandSlug - brand slug.
 * @param {string} collectionUrl - collection page the product was found in.
 * @returns {object} normalised product record with `scrapeError: null`.
 */
function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) {
  const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean);
  const options = (raw.options || []).map((opt) => ({
    label: opt.name,
    values: (opt.values || []).map((v) => ({ label: v, value: v })),
  }));
  const variants = (raw.variants || []).map((v) => ({
    id: v.id,
    title: v.title,
    sku: v.sku || null,
    price: toNumber(v.price),
    compareAtPrice: toNumber(v.compare_at_price),
    available: v.available,
    options: [v.option1, v.option2, v.option3].filter(Boolean),
    weight: v.grams || null,
    barcode: v.barcode || null,
  }));
  const firstVariant = variants[0] || {};

  // Tags arrive as an array from collection listings but as "a, b, c" from
  // /products/{handle}.json; normalise both to an array of trimmed strings.
  let tags = [];
  if (Array.isArray(raw.tags)) {
    tags = raw.tags;
  } else if (typeof raw.tags === "string") {
    tags = raw.tags.split(",").map((tag) => tag.trim()).filter(Boolean);
  }

  // Only trust availability when the payload actually includes the flag;
  // otherwise report "unknown" (null) instead of a misleading false.
  const availabilityFlags = (raw.variants || [])
    .map((v) => v.available)
    .filter((flag) => typeof flag === "boolean");
  const available = availabilityFlags.length ? availabilityFlags.some(Boolean) : null;

  return {
    recordType: "product",
    source: "motousher",
    brand: brandName,
    brandSlug,
    collectionUrl,
    title: raw.title || null,
    url: `${BASE_URL}/products/${raw.handle}`,
    handle: raw.handle || null,
    productType: raw.product_type || null,
    vendor: raw.vendor || brandName,
    tags,
    priceRaw: firstVariant.price != null ? String(firstVariant.price) : null,
    price: firstVariant.price ?? null,
    compareAtPriceRaw: firstVariant.compareAtPrice != null ? String(firstVariant.compareAtPrice) : null,
    compareAtPrice: firstVariant.compareAtPrice ?? null,
    sku: firstVariant.sku || null,
    barcode: firstVariant.barcode || null,
    available,
    image: images[0] || null,
    images,
    descriptionHtml: raw.body_html || null,
    // Plain-text description: tags replaced by spaces, whitespace collapsed.
    description: raw.body_html
      ? raw.body_html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim()
      : null,
    options,
    variants,
    publishedAt: raw.published_at || null,
    createdAt: raw.created_at || null,
    updatedAt: raw.updated_at || null,
    scrapeError: null,
  };
}
/**
 * Fetches one product through `/products/{handle}.json` and normalises it.
 * Never rejects: any failure is reported on the returned record.
 *
 * @param {string} handle - Shopify product handle.
 * @param {string} brandName - brand display name.
 * @param {string} brandSlug - brand slug.
 * @param {string} collectionUrl - collection page the product was found in.
 * @returns {Promise<object>} the normalised product, or a stub record whose
 *   `scrapeError` holds the failure message.
 */
async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) {
  const productPage = `${BASE_URL}/products/${handle}`;
  try {
    const payload = await fetchJson(`${productPage}.json`);
    if (payload?.product) {
      return normalizeShopifyProduct(payload.product, brandName, brandSlug, collectionUrl);
    }
    throw new Error("No product in response");
  } catch (failure) {
    // Keep the run going: hand back a placeholder that records what went wrong.
    return {
      recordType: "product",
      source: "motousher",
      brand: brandName,
      brandSlug,
      collectionUrl,
      handle,
      url: productPage,
      scrapeError: failure.message,
    };
  }
}
// ---------------------------------------------------------------------------
// Concurrency helper
// ---------------------------------------------------------------------------
/**
 * Maps `items` through an async `worker`, running at most `concurrency`
 * calls at a time. Results keep the order of `items`.
 *
 * @param {Array<*>} items - inputs to process.
 * @param {number} concurrency - maximum number of workers in flight.
 * @param {(item: *, index: number) => Promise<*>} worker - async mapper.
 * @returns {Promise<Array<*>>} results, index-aligned with `items`; rejects
 *   with the first worker error.
 */
async function mapWithConcurrency(items, concurrency, worker) {
  const results = new Array(items.length);
  let nextIndex = 0;
  // Each consumer claims the next free index until the list is exhausted.
  const consume = async () => {
    for (;;) {
      const current = nextIndex;
      nextIndex += 1;
      if (current >= items.length) break;
      results[current] = await worker(items[current], current);
    }
  };
  const poolSize = Math.min(concurrency, items.length);
  const pool = Array.from({ length: poolSize }, () => consume());
  await Promise.all(pool);
  return results;
}
// ---------------------------------------------------------------------------
// Main export: scrape one brand
// ---------------------------------------------------------------------------
/**
 * Scrapes every product of one Motousher brand collection.
 *
 * @param {{ name: string, slug: string, collectionUrl: string }} brand - brand
 *   entry; `slug` is used as the Shopify collection handle.
 * @param {object} [options]
 * @param {number} [options.limit] - fetch only the first N products.
 * @param {number} [options.concurrency] - parallel detail fetches (defaults to CONCURRENCY).
 * @param {(info: { done: number, total: number, product: string }) => void} [options.onProgress]
 *   called after each product; `product` is its title, or its handle when untitled.
 * @returns {Promise<object[]>} product records in collection order.
 */
async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) {
  const maxProducts = options.limit ? Number(options.limit) : null;
  const notify = typeof options.onProgress === "function" ? options.onProgress : null;
  const parallelism = options.concurrency || CONCURRENCY;

  // Step 1: get all product handles from collection pages
  const listed = await fetchCollectionProductHandles(slug);
  console.log(`[MOTOUSHER:${name}] Found ${listed.length} products in collection`);

  let targets = listed;
  if (maxProducts && listed.length > maxProducts) {
    console.log(`[MOTOUSHER:${name}] Limit applied: fetching first ${maxProducts} of ${listed.length}`);
    targets = listed.slice(0, maxProducts);
  }

  // Step 2: fetch all product details in parallel
  const total = targets.length;
  let completed = 0;
  return mapWithConcurrency(targets, parallelism, async (handle) => {
    const product = await fetchProductDetail(handle, name, slug, collectionUrl);
    completed += 1;
    // Log every twentieth product and the final one.
    if (completed % 20 === 0 || completed === total) {
      console.log(`[MOTOUSHER:${name}] ${completed}/${total} products fetched`);
    }
    if (notify) notify({ done: completed, total, product: product.title || handle });
    return product;
  });
}
// ---------------------------------------------------------------------------
// Standalone test
// ---------------------------------------------------------------------------
// When this file is run directly (`node scraper.js`), fetch the first five
// products of the first configured brand and print one as a smoke test.
if (require.main === module) {
  const sampleBrand = require("./brands").BRANDS[0];
  console.log(`Testing scraper for: ${sampleBrand.name}`);

  const smokeTest = async () => {
    const products = await scrapeMotousherBrand(sampleBrand, { limit: 5 });
    console.log(`\nScraped ${products.length} products`);
    console.log("Sample product:");
    console.log(JSON.stringify(products[0], null, 2));
  };

  smokeTest().catch((err) => {
    console.error("Scrape failed:", err.message);
    process.exitCode = 1;
  });
}
module.exports = {
scrapeMotousherBrand,
fetchCollectionProductHandles,
fetchProductDetail,
};