Add test_source scrapers for motousher.com and dirtstreet.in
Adds two new experimental product scrapers under test_source/, isolated
from the active pipeline until verified and ready to promote.
motousher/ (Shopify store — Shopify JSON API):
- Scrapes 12 brands: All Balls Racing, DID Chains, EBC Brakes, Esjot
Sprockets, Evans Coolant, Grip Puppies, HiFlo Filters, JT Sprockets,
Maxima Racing Oils, Putoline, Ram Mount, Wunderlich
- 2,446 products total scraped and verified
- Uses /collections/{slug}/products.json + /products/{handle}.json
- Parallel fetch (concurrency 3), paginated collection listing
dirtstreet/ (WooCommerce store — HTML + JSON-LD):
- Scrapes 5 brands: SC Project, Evotech Performance, DNA Air Filters,
WRS, Zero Gravity Racing
- 1,087 products total scraped and verified
- Pure fetch with JSON-LD schema.org extraction (no browser)
- Handles paginated /brand/{slug}/page/N/ archives
- Price extracted from offers.priceSpecification[0].price
- Stock status derived from JSON-LD availability field
Both scrapers are standalone (node index.js), support --brand and
--limit flags, save per-brand JSON files and a combined.json.
Scraped data lives in data/sources/test_source/ (gitignored).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1d254a9009
commit
b8d9478afa
1
src/business-logic/import-pipeline/test_source/.gitkeep
Normal file
1
src/business-logic/import-pipeline/test_source/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
26
src/business-logic/import-pipeline/test_source/README.md
Normal file
26
src/business-logic/import-pipeline/test_source/README.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Test Source Workspace

This folder is for experimental supplier scraping work.

Use one folder per supplier website or test target. Keep experiments here until
the scraper, converter, and data shape are confirmed safe to move into the active
`sources/` registry.

Suggested folder shape:

```txt
test_source/
  supplier-key/
    links.txt
    scraper.js
    sample-output.json
    notes.md
```

Rules for this area:

- Do not register test sources in `../sources/index.js` until the scraper is ready.
- Store only brand/filter/listing URLs in `links.txt`; avoid committing secrets.
- Keep generated images, large raw exports, and run logs out of git.
- Document what each URL represents in the supplier folder notes.
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
/**
|
||||
* Dirtstreet brand list.
|
||||
*
|
||||
* Brand index page: https://dirtstreet.in/shop-by-brands/
|
||||
*/
|
||||
|
||||
/**
 * Brands scraped from dirtstreet.in.
 *
 * Each entry carries the display `name`, the `slug` used in the site's
 * /brand/{slug}/ archive path (also used for --brand and output file names),
 * and the full `brandUrl` of the first archive page.
 */
const BRANDS = [
  {
    name: "SC Project",
    slug: "scproject",
    brandUrl: "https://dirtstreet.in/brand/scproject/",
  },
  {
    name: "Evotech Performance",
    slug: "evotechperformance",
    brandUrl: "https://dirtstreet.in/brand/evotechperformance/",
  },
  {
    name: "DNA Air Filters",
    slug: "dnaairfilters",
    brandUrl: "https://dirtstreet.in/brand/dnaairfilters/",
  },
  {
    name: "WRS",
    slug: "wrs",
    brandUrl: "https://dirtstreet.in/brand/wrs/",
  },
  {
    name: "Zero Gravity Racing",
    slug: "zerogravityracing",
    brandUrl: "https://dirtstreet.in/brand/zerogravityracing/",
  },
];
|
||||
|
||||
module.exports = { BRANDS };
|
||||
@ -0,0 +1,138 @@
|
||||
/**
|
||||
* Dirtstreet scraper runner.
|
||||
*
|
||||
* Usage:
|
||||
* node index.js → scrape all brands in brands.js
|
||||
* node index.js --brand scproject → scrape one brand only
|
||||
* node index.js --limit 5 → first 5 products per brand (quick test)
|
||||
*/
|
||||
|
||||
const fs = require("node:fs/promises");
|
||||
const path = require("node:path");
|
||||
const { BRANDS } = require("./brands");
|
||||
const { scrapeDirtstreetBrand } = require("./scraper");
|
||||
|
||||
// Directory that receives the per-brand JSON files and combined.json:
// five levels above this file, then data/sources/test_source/dirtstreet.
const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "dirtstreet");
|
||||
|
||||
/**
 * Parse the runner's command-line flags.
 *
 * Recognised flags (the value may be the next token or follow an `=`):
 *   --brand / -b <slug>    run a single brand; the slug is lower-cased and trimmed
 *   --limit / -n <count>   cap products per brand; ignored unless a positive integer
 *
 * Unknown tokens are skipped. When a flag is repeated the last occurrence wins.
 *
 * @param {string[]} [argv] Arguments to parse; defaults to process.argv without
 *   the node binary and script path.
 * @returns {{ brandSlug: (string|null), limit: (number|null) }}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const out = { brandSlug: null, limit: null };

  for (let i = 0; i < argv.length; i++) {
    const token = String(argv[i]);
    // Support both "--flag value" and "--flag=value".
    const eq = token.startsWith("-") ? token.indexOf("=") : -1;
    const flag = eq === -1 ? token : token.slice(0, eq);
    const value = eq === -1 ? argv[i + 1] : token.slice(eq + 1);
    if (!value) continue;

    if (flag === "--brand" || flag === "-b") {
      out.brandSlug = String(value).toLowerCase().trim();
    } else if (flag === "--limit" || flag === "-n") {
      const n = Number.parseInt(value, 10);
      if (Number.isFinite(n) && n > 0) out.limit = n;
    }
  }

  return out;
}
|
||||
|
||||
/**
 * Run the Dirtstreet scrape described by the command-line flags.
 *
 * For each selected brand: scrape its products, write `<slug>.json` into
 * OUTPUT_DIR, and record a summary entry. A brand that throws (scrape or file
 * write) is logged and recorded with `success: false`; the remaining brands
 * still run. Finally `combined.json` is written with every collected product
 * and the per-brand summary.
 *
 * When --brand matches no known slug, the available slugs are printed,
 * process.exitCode is set to 1, and nothing is scraped.
 *
 * @returns {Promise<object|undefined>} The payload written to combined.json,
 *   or undefined when no brand matched.
 */
async function run() {
  const { brandSlug, limit } = parseArgs();
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const brandsToRun = brandSlug ? BRANDS.filter((b) => b.slug === brandSlug) : BRANDS;

  if (!brandsToRun.length) {
    console.error(`No brand found matching slug: "${brandSlug}"`);
    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
    process.exitCode = 1;
    return;
  }

  const divider = "=".repeat(60);
  const combinedProducts = [];
  const runSummary = [];
  const startedAt = new Date().toISOString();

  for (const brand of brandsToRun) {
    console.log(`\n${divider}`);
    console.log(`[DIRTSTREET] Scraping brand: ${brand.name}`);
    console.log(`[DIRTSTREET] Brand URL: ${brand.brandUrl}`);
    console.log(divider);

    // Fields shared by the success and failure summary entries (key order is
    // preserved because the summary is serialised to JSON).
    const summaryBase = {
      brand: brand.name,
      slug: brand.slug,
      brandUrl: brand.brandUrl,
      startedAt: new Date().toISOString(),
    };

    try {
      const products = await scrapeDirtstreetBrand(brand, { limit });

      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
      const brandPayload = {
        generatedAt: new Date().toISOString(),
        source: "dirtstreet",
        brand: brand.name,
        brandSlug: brand.slug,
        brandUrl: brand.brandUrl,
        totalProducts: products.length,
        products,
      };

      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
      console.log(`[DIRTSTREET] Saved ${products.length} products → ${brandFile}`);

      // Only merge into the combined set once the per-brand file is on disk.
      combinedProducts.push(...products);
      runSummary.push({
        ...summaryBase,
        completedAt: new Date().toISOString(),
        totalProducts: products.length,
        success: true,
      });
    } catch (err) {
      console.error(`[DIRTSTREET] Failed to scrape ${brand.name}: ${err.message}`);
      runSummary.push({
        ...summaryBase,
        completedAt: new Date().toISOString(),
        totalProducts: 0,
        success: false,
        error: err.message,
      });
    }
  }

  // Save combined JSON
  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
  const combinedPayload = {
    generatedAt: new Date().toISOString(),
    startedAt,
    source: "dirtstreet",
    siteUrl: "https://dirtstreet.in",
    brandsScraped: runSummary.length,
    totalProducts: combinedProducts.length,
    runSummary,
    products: combinedProducts,
  };

  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");

  console.log(`\n${divider}`);
  console.log("[DIRTSTREET] RUN COMPLETE");
  console.log(divider);
  console.log(`Brands scraped : ${runSummary.length}`);
  console.log(`Total products : ${combinedProducts.length}`);
  console.log(`Combined file : ${combinedFile}`);
  console.log("\nBrand summary:");
  for (const s of runSummary) {
    const status = s.success ? "OK" : "FAILED";
    console.log(` [${status}] ${s.brand} → ${s.totalProducts} products`);
  }

  return combinedPayload;
}
|
||||
|
||||
// Start the scrape only when this file is executed directly (`node index.js`).
// `run` is also exported, so requiring this module must not kick off a scrape
// by itself; scraper.js guards its standalone entry the same way.
if (require.main === module) {
  run().catch((err) => {
    console.error("Runner failed:", err.message);
    process.exitCode = 1;
  });
}

module.exports = { run };
|
||||
@ -0,0 +1,423 @@
|
||||
/**
 * Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser).
 *
 * Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org
 * markup. We fetch the HTML directly, parse the embedded JSON-LD for the core
 * product fields, and use regexes for the remaining WooCommerce markup.
 *
 * 1. Brand pages paginated at /brand/{slug}/page/N/
 * 2. Product detail via HTML + JSON-LD
 */
|
||||
|
||||
// Site origin; prefixed onto relative product links found in archive pages.
const BASE_URL = "https://dirtstreet.in";
// Default number of product pages fetched in parallel for one brand.
const CONCURRENCY = 3;
|
||||
|
||||
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||||
|
||||
/**
 * Fetch a page and return its body as text.
 *
 * Outcomes:
 * - HTTP 429 or 5xx: wait `attempt * 2000` ms and retry while attempt <= 4,
 *   then throw `HTTP <status> after retries`.
 * - HTTP 404: resolve to null (callers treat it as "page does not exist").
 * - Any other non-OK status: throw `HTTP <status>`.
 * - Connection timeout / reset: wait `attempt * 2000` ms and retry while
 *   attempt <= 3, otherwise rethrow the original error.
 *
 * @param {string} url Absolute URL to fetch.
 * @param {number} [attempt=1] 1-based attempt counter shared by both retry paths.
 * @returns {Promise<string|null>} Response body, or null on 404.
 */
async function fetchHtml(url, attempt = 1) {
  let res;
  try {
    res = await fetch(url, {
      headers: {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
        "cache-control": "no-cache",
      },
    });
  } catch (err) {
    // Node's fetch rejects with TypeError("fetch failed") and puts the real
    // network error (and its code) on `err.cause`, so check both places.
    const code = err?.code ?? err?.cause?.code;
    if (attempt <= 3 && (code === "UND_ERR_CONNECT_TIMEOUT" || code === "ECONNRESET")) {
      await sleep(attempt * 2000);
      return fetchHtml(url, attempt + 1);
    }
    throw err;
  }

  if (res.status === 429 || res.status >= 500) {
    if (attempt <= 4) {
      const wait = attempt * 2000;
      console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
      await sleep(wait);
      return fetchHtml(url, attempt + 1);
    }
    throw new Error(`HTTP ${res.status} after retries`);
  }

  if (res.status === 404) return null;
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.text();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Decode the HTML entities that show up in scraped WooCommerce text.
 *
 * Handles the named entities &amp; &lt; &gt; &quot; &apos; &times; &nbsp;
 * plus decimal (&#8377;) and hexadecimal (&#x20B9;) numeric references.
 * `&times;` (and a literal "×") becomes a plain "x" and `&nbsp;` a plain space.
 * Unknown named entities are left untouched.
 *
 * Decoding happens in a single pass, so already-escaped text such as
 * "&amp;lt;" yields "&lt;" rather than being decoded twice.
 *
 * @param {*} str Value to decode; null/undefined/empty become "".
 * @returns {string}
 */
function decodeEntities(str) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", times: "x", nbsp: " " };

  return String(str || "")
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (whole, body) => {
      if (body[0] !== "#") {
        const key = body.toLowerCase();
        return Object.hasOwn(named, key) ? named[key] : whole;
      }
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // fromCodePoint throws above U+10FFFF; keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    })
    .replace(/×/g, "x");
}
|
||||
|
||||
/**
 * Reduce an HTML fragment to plain single-line text.
 *
 * Entities are decoded first, then every tag is replaced by a space, runs of
 * whitespace are collapsed to one space, and the result is trimmed. Because
 * decoding runs before tag stripping, entity-escaped markup (&lt;p&gt;) is
 * stripped as well.
 *
 * @param {*} value Fragment to clean; null/undefined/empty become "".
 * @returns {string}
 */
function cleanText(value) {
  const decoded = decodeEntities(String(value || ""));
  const withoutTags = decoded.replace(/<[^>]+>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Pull the first number out of a price-like value.
 *
 * Thousands separators (commas) are dropped, so "₹198,000.00" gives 198000.
 * The match must start on a digit, so stray punctuation before the number
 * ("Rs, 1,200") no longer hides it.
 *
 * @param {*} value String or number to read; falsy values (including 0) give null.
 * @returns {number|null} The parsed number, or null when none is present.
 */
function toNumber(value) {
  if (!value) return null;

  const match = String(value).match(/\d[\d,]*(?:\.\d+)?/);
  if (!match) return null;

  const n = Number.parseFloat(match[0].replace(/,/g, ""));
  return Number.isNaN(n) ? null : n;
}
|
||||
|
||||
/**
 * HTML fallback for the product price when JSON-LD has none.
 *
 * Tried in order, returning the first value greater than zero:
 *   1. a `data-price="..."` attribute;
 *   2. every `<rupee></span>198,000.00</bdi>` price fragment in page order
 *      (zero amounts are skipped);
 *   3. a rupee sign followed by the end of a tag and then a number.
 * The rupee sign is accepted both literally (₹) and as the numeric entity
 * (&#8377; / &#x20b9;), since raw HTML usually carries the entity form.
 *
 * NOTE(review): the earlier dedicated `<ins>` sale-price step was removed
 * because it could never fire — any `<ins>` price is also matched by step 2,
 * which runs first. On a sale page step 2 may therefore return the struck-out
 * regular price; confirm whether the sale price should be preferred.
 *
 * @param {string} html Full product page HTML.
 * @returns {number|null} Price, or null when nothing usable is found.
 */
function extractPriceFromHtml(html) {
  // data-price attribute (WooCommerce sets this for JS cart)
  const dataPrice = html.match(/data-price=["']([\d.]+)["']/);
  if (dataPrice) {
    const n = Number.parseFloat(dataPrice[1]);
    if (n > 0) return n;
  }

  // Price fragments in page order; the first non-zero one is taken.
  const bdiPattern = /(?:&#8377;|&#x20b9;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi;
  for (const [, amount] of html.matchAll(bdiPattern)) {
    const n = toNumber(amount);
    if (n > 0) return n;
  }

  // Fallback: rupee sign, rest of the current tag, then a number.
  const loose = html.match(/(?:&#8377;|&#x20b9;|₹)[^>]*>([\d,]+(?:\.\d+)?)/i);
  if (loose) {
    const n = toNumber(loose[1]);
    if (n > 0) return n;
  }

  return null;
}
|
||||
|
||||
/**
 * Map a schema.org availability value to a WooCommerce-style stock status.
 *
 * Matching is a case-insensitive substring test, so both the bare token
 * ("InStock") and the full URL ("https://schema.org/InStock") work.
 *
 * @param {*} availability JSON-LD `offers.availability` value.
 * @returns {"instock"|"outofstock"|"onbackorder"|null} null when the value is
 *   empty or not one of the recognised states.
 */
function availabilityToStockStatus(availability) {
  if (!availability) return null;

  const token = String(availability).toLowerCase();
  if (token.includes("instock")) return "instock";
  if (token.includes("outofstock")) return "outofstock";
  if (token.includes("backorder") || token.includes("preorder")) return "onbackorder";
  return null;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// JSON-LD extraction
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Collect every JSON-LD node embedded in a page.
 *
 * Each `<script type="application/ld+json">` body is parsed with JSON.parse.
 * Bodies that fail to parse are skipped on purpose (best effort: one broken
 * block must not lose the others). A block whose top level is an array is
 * flattened so that every node can be inspected by `@type`, and non-object
 * payloads (null, strings, numbers) are dropped.
 *
 * @param {string} html Page HTML.
 * @returns {object[]} Parsed JSON-LD nodes in document order.
 */
function extractJsonLd(html) {
  const scriptPattern = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const nodes = [];

  for (const [, body] of html.matchAll(scriptPattern)) {
    let parsed;
    try {
      parsed = JSON.parse(body.trim());
    } catch {
      continue; // malformed block: skip it, keep the rest
    }
    for (const node of Array.isArray(parsed) ? parsed : [parsed]) {
      if (node && typeof node === "object") nodes.push(node);
    }
  }

  return nodes;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Collection page: extract product URLs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Collect the distinct product page URLs linked from an archive page.
 *
 * Only hrefs whose path starts with /product/ (relative, or absolute on
 * dirtstreet.in) and that carry no query string or fragment are matched.
 * Relative links are prefixed with BASE_URL, category/tag links are skipped,
 * and every URL is normalised to end with a single trailing slash so the same
 * product is not listed twice.
 *
 * @param {string} html Archive page HTML.
 * @returns {string[]} Unique absolute product URLs in first-seen order.
 */
function extractProductUrls(html) {
  const linkPattern = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi;
  const unique = new Set();

  for (const [, href] of html.matchAll(linkPattern)) {
    const absolute = href.startsWith("/") ? `${BASE_URL}${href}` : href;
    if (absolute.includes("/product-category/") || absolute.includes("/product-tag/")) continue;
    // normalize trailing slash
    unique.add(absolute.endsWith("/") ? absolute : `${absolute}/`);
  }

  return Array.from(unique);
}
|
||||
|
||||
/**
 * Tell whether an archive page links to the following page of the same brand.
 *
 * Looks for an href ending in /brand/{slug}/page/{currentPage + 1} (with or
 * without a trailing slash), case-insensitively. The slug is escaped before
 * being placed in the pattern so characters such as "." match literally.
 *
 * @param {string} html Archive page HTML.
 * @param {number} currentPage 1-based number of the page that `html` represents.
 * @param {string} slug Brand slug as used in the archive URL.
 * @returns {boolean}
 */
function hasNextPage(html, currentPage, slug) {
  const safeSlug = String(slug).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const nextLink = new RegExp(
    `href=["'][^"']*/brand/${safeSlug}/page/${currentPage + 1}/?["']`,
    "i"
  );
  return nextLink.test(html);
}
|
||||
|
||||
/**
 * Walk a brand's paginated archive and collect every product URL.
 *
 * Page 1 is `brandUrl` itself; later pages are `{brandUrl}/page/N/`. The page
 * base is normalised to end with "/" so a brandUrl given without a trailing
 * slash still produces valid pagination URLs. Walking stops when a page is
 * missing (fetchHtml returned null), when the page has no link to the next
 * one, or when a page contributed no new URLs (guards against looping on a
 * site that keeps serving the same listing). There is a 300 ms pause between
 * pages.
 *
 * @param {string} slug Brand slug, used to recognise the "next page" link.
 * @param {string} brandUrl URL of the first archive page.
 * @returns {Promise<string[]>} Unique product URLs in first-seen order.
 */
async function fetchBrandProductUrls(slug, brandUrl) {
  const first = String(brandUrl);
  const pageBase = first.endsWith("/") ? first : `${first}/`;
  const collected = new Set();

  for (let page = 1; ; page++) {
    const url = page === 1 ? brandUrl : `${pageBase}page/${page}/`;
    console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`);

    const html = await fetchHtml(url);
    if (!html) break;

    const found = extractProductUrls(html);
    const sizeBefore = collected.size;
    for (const productUrl of found) collected.add(productUrl);

    console.log(`[DIRTSTREET] Page ${page} -> ${found.length} product URLs (total: ${collected.size})`);

    if (!hasNextPage(html, page, slug) || collected.size === sizeBefore) break;
    await sleep(300);
  }

  return Array.from(collected);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Extra WooCommerce fields from HTML
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Read the WooCommerce-specific fields that JSON-LD does not carry.
 *
 * Everything is extracted from the page HTML with regexes:
 *   - sku: text of the first `<span class="... sku ...">`, minus a "SKU:" label
 *   - stockStatus: instock / outofstock / on-backorder class found on the first
 *     element whose class list contains `product` (taken to be the main
 *     product wrapper rather than a related-product tile)
 *   - categories / tags: link texts inside the `posted_in` / `tagged_as` spans
 *   - shortDescription: text of the short-description div (up to its first `</div>`)
 *   - attributes: `{ name, value }` for each th/td row of the attributes table
 *   - variations: JSON from the `data-product_variations` attribute. The value
 *     is HTML-escaped in the page, so it is unescaped before parsing; if it
 *     still fails to parse, variations stays [].
 *
 * @param {string} html Product page HTML.
 * @returns {{ sku: (string|null), categories: string[], tags: string[],
 *   shortDescription: (string|null), stockStatus: (string|null),
 *   attributes: Array<{name: string, value: string}>, variations: Array }}
 */
function extractExtraFields(html) {
  const extra = {
    sku: null,
    categories: [],
    tags: [],
    shortDescription: null,
    stockStatus: null,
    attributes: [],
    variations: [],
  };

  // Link texts inside the first <span> whose class contains `containerClass`.
  const linkTexts = (containerClass) => {
    const container = html.match(
      new RegExp(`<span[^>]+class="[^"]*${containerClass}[^"]*"[^>]*>([\\s\\S]*?)<\\/span>`, "i")
    );
    if (!container) return [];
    return Array.from(container[1].matchAll(/<a[^>]*>([\s\S]*?)<\/a>/gi), ([, label]) => cleanText(label))
      .filter(Boolean);
  };

  // SKU - strip "SKU:" label prefix
  const skuMatch = html.match(/<span[^>]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
  if (skuMatch) {
    extra.sku = cleanText(skuMatch[1]).replace(/^SKU\s*:\s*/i, "").trim();
  }

  // Stock status — read from the main product wrapper class only, not whole page
  const productClassMatch = html.match(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i);
  if (productClassMatch) {
    const cls = productClassMatch[1].toLowerCase();
    extra.stockStatus = cls === "instock" || cls === "outofstock" ? cls : "onbackorder";
  }

  extra.categories = linkTexts("posted_in");
  extra.tags = linkTexts("tagged_as");

  // Short description
  const sdMatch = html.match(
    /<div[^>]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i
  );
  if (sdMatch) extra.shortDescription = cleanText(sdMatch[1]);

  // Attributes table
  const attrMatch = html.match(
    /<table[^>]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i
  );
  if (attrMatch) {
    for (const row of attrMatch[1].match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || []) {
      const th = row.match(/<th[^>]*>([\s\S]*?)<\/th>/i);
      const td = row.match(/<td[^>]*>([\s\S]*?)<\/td>/i);
      if (!th || !td) continue;
      const name = cleanText(th[1]);
      const value = cleanText(td[1]);
      if (name && value) extra.attributes.push({ name, value });
    }
  }

  // WooCommerce variations JSON in data attribute
  const formMatch = html.match(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/);
  if (formMatch) {
    // Undo attribute escaping; &amp; goes last so "&amp;quot;" is not decoded twice.
    const json = formMatch[1]
      .replace(/&quot;|&#0*34;/g, '"')
      .replace(/&#0*39;/g, "'")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&amp;/g, "&");
    try {
      extra.variations = JSON.parse(json);
    } catch { /* not valid JSON even after unescaping: leave variations empty */ }
  }

  return extra;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Scrape one product detail page
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scrape one product page into a flat product record.
 *
 * The schema.org Product node is looked up first among the top-level JSON-LD
 * blocks and then inside every `@graph` array; `@type` may be a string or an
 * array of types. Fields that JSON-LD lacks come from extractExtraFields.
 *
 * Price: offers.priceSpecification[0].price, else offers.price, else
 * offers.lowPrice, else the HTML fallback; a missing or zero price is null.
 * Stock: derived from offers.availability, falling back to the HTML class.
 * Currency defaults to "INR" when the offer does not name one.
 *
 * @param {string} productUrl Product page URL.
 * @param {string} brandName Brand name used when JSON-LD has no brand.name.
 * @param {string} brandSlug Brand slug copied onto the record.
 * @returns {Promise<object>} Product record; when the page is missing
 *   (fetchHtml returned null) a stub with `scrapeError: "404"` is returned.
 */
async function scrapeProductDetail(productUrl, brandName, brandSlug) {
  const html = await fetchHtml(productUrl);
  if (!html) {
    return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" };
  }

  const isProduct = (node) => {
    const type = node?.["@type"];
    return Array.isArray(type) ? type.includes("Product") : type === "Product";
  };

  const jsonLdBlocks = extractJsonLd(html);
  // Try direct Product type first, then search every @graph block
  let schema = jsonLdBlocks.find(isProduct);
  if (!schema) {
    for (const block of jsonLdBlocks) {
      const found = Array.isArray(block?.["@graph"]) ? block["@graph"].find(isProduct) : undefined;
      if (found) {
        schema = found;
        break;
      }
    }
  }

  const extra = extractExtraFields(html);
  const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {});

  // Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback
  const priceSpec = Array.isArray(offer?.priceSpecification)
    ? offer.priceSpecification[0]?.price
    : null;
  const price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice) || extractPriceFromHtml(html) || null;

  // Stock: derive from JSON-LD availability (most reliable), fall back to HTML class
  const stockStatus = availabilityToStockStatus(offer?.availability) || extra.stockStatus || null;
  const currency = offer?.priceCurrency || "INR";

  // `image` may be a string, an ImageObject, or an array of either; entries
  // without a usable URL are dropped instead of leaving undefined in the list.
  const imageUrlOf = (img) => (typeof img === "string" ? img : img?.url || img?.contentUrl);
  const rawImages = Array.isArray(schema?.image) ? schema.image : schema?.image ? [schema.image] : [];
  const images = rawImages.map(imageUrlOf).filter(Boolean);

  return {
    recordType: "product",
    source: "dirtstreet",
    brand: schema?.brand?.name || brandName,
    brandSlug,
    url: productUrl,
    title: cleanText(schema?.name || ""),
    sku: extra.sku || cleanText(schema?.sku || ""),
    mpn: cleanText(schema?.mpn || ""),
    gtin: cleanText(schema?.gtin || schema?.gtin13 || ""),
    price,
    currency,
    priceRaw: price != null ? String(price) : null,
    compareAtPrice: toNumber(offer?.highPrice) || null,
    availability: offer?.availability || null,
    stockStatus,
    image: images[0] || null,
    images,
    description: cleanText(schema?.description || ""),
    shortDescription: extra.shortDescription,
    categories: extra.categories,
    tags: extra.tags,
    attributes: extra.attributes,
    variations: extra.variations,
    aggregateRating: schema?.aggregateRating
      ? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount }
      : null,
    scrapeError: null,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Concurrency helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Map `items` through an async `worker`, running at most `concurrency` calls
 * at a time, and return the results in input order.
 *
 * A fixed set of lanes each pull the next unclaimed index until none remain.
 * `concurrency` is clamped to at least 1 (and at most items.length), so a
 * zero, negative, or non-numeric value still processes every item instead of
 * silently returning an empty-slot array. If any worker call rejects, the
 * returned promise rejects with that error.
 *
 * @param {Array} items Inputs to process.
 * @param {number} concurrency Maximum number of simultaneous worker calls.
 * @param {(item: *, index: number) => Promise<*>} worker
 * @returns {Promise<Array>} Worker results aligned with `items`.
 */
async function mapWithConcurrency(items, concurrency, worker) {
  const results = new Array(items.length);
  const lanes = Math.min(Math.max(1, Math.floor(Number(concurrency)) || 1), items.length);
  let next = 0;

  const drain = async () => {
    while (next < items.length) {
      const i = next++; // claim the index before awaiting so lanes never overlap
      results[i] = await worker(items[i], i);
    }
  };

  await Promise.all(Array.from({ length: lanes }, () => drain()));
  return results;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main export: scrape one brand
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scrape every product of one brand.
 *
 * Collects the brand's product URLs, optionally keeps only the first `limit`,
 * then scrapes the product pages in parallel. Progress is logged every 10
 * products and on the last one.
 *
 * @param {{ name: string, slug: string, brandUrl: string }} brand Entry from the brand list.
 * @param {object} [options]
 * @param {number} [options.limit] Keep only the first N product URLs.
 * @param {number} [options.concurrency] Parallel product fetches (defaults to CONCURRENCY).
 * @param {(progress: { done: number, total: number, product: string }) => void} [options.onProgress]
 *   Called after each product with the running count and the product title (or its URL when untitled).
 * @returns {Promise<object[]>} Product records in the order of the collected URLs.
 */
async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) {
  const limit = options.limit ? Number(options.limit) : null;
  const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
  const concurrency = options.concurrency || CONCURRENCY;

  let productUrls = await fetchBrandProductUrls(slug, brandUrl);
  console.log(`[DIRTSTREET:${name}] Found ${productUrls.length} products`);

  if (limit && productUrls.length > limit) {
    console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${limit} of ${productUrls.length}`);
    productUrls = productUrls.slice(0, limit);
  }

  const total = productUrls.length;
  let done = 0;

  return mapWithConcurrency(productUrls, concurrency, async (url) => {
    const product = await scrapeProductDetail(url, name, slug);
    done += 1;
    if (done % 10 === 0 || done === total) {
      console.log(`[DIRTSTREET:${name}] ${done}/${total} products scraped`);
    }
    if (onProgress) onProgress({ done, total, product: product.title || url });
    return product;
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Standalone test
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Quick manual check: when this file is run directly, scrape the first three
// products of the first brand in brands.js and print the first record.
if (require.main === module) {
  const { BRANDS } = require("./brands");
  const [brand] = BRANDS;
  console.log(`Testing scraper for: ${brand.name}`);

  scrapeDirtstreetBrand(brand, { limit: 3 })
    .then((products) => {
      console.log(`\nScraped ${products.length} products`);
      console.log("Sample product:");
      console.log(JSON.stringify(products[0], null, 2));
    })
    .catch((err) => {
      console.error("Scrape failed:", err.message);
      process.exitCode = 1;
    });
}
|
||||
|
||||
module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail };
|
||||
@ -0,0 +1,73 @@
|
||||
/**
|
||||
* Motousher brand list.
|
||||
*
|
||||
* Each entry maps a brand name to its collection page URL on motousher.com.
|
||||
* Add more brands here — the scraper loops this list automatically.
|
||||
*
|
||||
* Brand index page: https://www.motousher.com/pages/partner-brands
|
||||
*/
|
||||
|
||||
/**
 * Brands scraped from motousher.com.
 *
 * Each entry carries the display `name`, the collection `slug` (also used for
 * --brand and output file names), and the `collectionUrl` of that collection.
 */
const BRANDS = [
  {
    name: "All Balls Racing",
    slug: "all-balls-racing",
    collectionUrl: "https://www.motousher.com/collections/all-balls-racing",
  },
  {
    name: "DID Chains",
    slug: "did-chains",
    collectionUrl: "https://www.motousher.com/collections/did-chains",
  },
  {
    name: "EBC Brakes",
    slug: "ebc-brakes",
    collectionUrl: "https://www.motousher.com/collections/ebc-brakes",
  },
  {
    name: "Esjot Sprockets",
    slug: "esjot-sprockets",
    collectionUrl: "https://www.motousher.com/collections/esjot-sprockets",
  },
  {
    name: "Evans Coolant",
    slug: "evans-coolant",
    collectionUrl: "https://www.motousher.com/collections/evans-coolant",
  },
  {
    name: "Grip Puppies",
    slug: "grip-puppies",
    collectionUrl: "https://www.motousher.com/collections/grip-puppies",
  },
  {
    name: "HiFlo Filters",
    slug: "hi-flo",
    collectionUrl: "https://www.motousher.com/collections/hi-flo",
  },
  {
    name: "JT Sprockets",
    slug: "jt-sprockets",
    collectionUrl: "https://www.motousher.com/collections/jt-sprockets",
  },
  {
    name: "Maxima Racing Oils",
    slug: "maxima-racing-oils",
    collectionUrl: "https://www.motousher.com/collections/maxima-racing-oils",
  },
  {
    name: "Putoline",
    slug: "putoline",
    collectionUrl: "https://www.motousher.com/collections/putoline",
  },
  {
    name: "Ram Mount",
    slug: "ram-mount",
    collectionUrl: "https://www.motousher.com/collections/ram-mount",
  },
  {
    name: "Wunderlich",
    slug: "wunderlich",
    collectionUrl: "https://www.motousher.com/collections/wunderlich",
  },
];
|
||||
|
||||
module.exports = { BRANDS };
|
||||
@ -0,0 +1,158 @@
|
||||
/**
|
||||
* Motousher scraper runner.
|
||||
*
|
||||
* Loops through BRANDS in brands.js, scrapes each brand's collection page
|
||||
* and all product detail pages, saves per-brand JSON files, then combines
|
||||
* everything into a single combined.json.
|
||||
*
|
||||
* Usage:
|
||||
* node index.js → scrape all brands in brands.js
|
||||
* node index.js --brand all-balls-racing → scrape one brand only
|
||||
* node index.js --limit 5 → only first 5 products per brand (for quick tests)
|
||||
*/
|
||||
|
||||
const fs = require("node:fs/promises");
|
||||
const path = require("node:path");
|
||||
const { BRANDS } = require("./brands");
|
||||
const { scrapeMotousherBrand } = require("./scraper");
|
||||
|
||||
// Directory that receives the per-brand JSON files and combined.json:
// five levels above this file, then data/sources/test_source/motousher.
const OUTPUT_DIR = path.resolve(__dirname, "..", "..", "..", "..", "..", "data", "sources", "test_source", "motousher");
|
||||
|
||||
/**
 * Turn arbitrary text into a lower-case, hyphen-separated slug.
 *
 * Every run of characters other than a-z and 0-9 becomes a single "-", and
 * leading/trailing hyphens are removed.
 *
 * @param {*} str Text to convert; null/undefined/empty give "".
 * @returns {string}
 */
function slugify(str) {
  const lowered = String(str || "").trim().toLowerCase();
  return lowered.replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
}
|
||||
|
||||
/**
 * Parse the runner's command-line flags.
 *
 * Recognised flags (the value may be the next token or follow an `=`):
 *   --brand / -b <slug>    run a single brand; the slug is lower-cased and trimmed
 *   --limit / -n <count>   cap products per brand; ignored unless a positive integer
 *
 * Unknown tokens are skipped. When a flag is repeated the last occurrence wins.
 *
 * @param {string[]} [argv] Arguments to parse; defaults to process.argv without
 *   the node binary and script path.
 * @returns {{ brandSlug: (string|null), limit: (number|null) }}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const out = { brandSlug: null, limit: null };

  for (let i = 0; i < argv.length; i++) {
    const token = String(argv[i]);
    // Support both "--flag value" and "--flag=value".
    const eq = token.startsWith("-") ? token.indexOf("=") : -1;
    const flag = eq === -1 ? token : token.slice(0, eq);
    const value = eq === -1 ? argv[i + 1] : token.slice(eq + 1);
    if (!value) continue;

    if (flag === "--brand" || flag === "-b") {
      out.brandSlug = String(value).toLowerCase().trim();
    } else if (flag === "--limit" || flag === "-n") {
      const n = Number.parseInt(value, 10);
      if (Number.isFinite(n) && n > 0) out.limit = n;
    }
  }

  return out;
}
|
||||
|
||||
/**
 * Run the Motousher scrape described by the command-line flags.
 *
 * For each selected brand: scrape its products, write `<slug>.json` into
 * OUTPUT_DIR, and record a summary entry. A brand that throws (scrape or file
 * write) is logged and recorded with `success: false`; the remaining brands
 * still run. Finally `combined.json` is written with every collected product
 * and the per-brand summary.
 *
 * When --brand matches no known slug, the available slugs are printed,
 * process.exitCode is set to 1, and nothing is scraped.
 *
 * @returns {Promise<object|undefined>} The payload written to combined.json,
 *   or undefined when no brand matched.
 */
async function run() {
  const { brandSlug, limit } = parseArgs();
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const brandsToRun = brandSlug ? BRANDS.filter((b) => b.slug === brandSlug) : BRANDS;

  if (!brandsToRun.length) {
    console.error(`No brand found matching slug: "${brandSlug}"`);
    console.error(`Available: ${BRANDS.map((b) => b.slug).join(", ")}`);
    process.exitCode = 1;
    return;
  }

  const divider = "=".repeat(60);
  const combinedProducts = [];
  const runSummary = [];
  const startedAt = new Date().toISOString();

  for (const brand of brandsToRun) {
    console.log(`\n${divider}`);
    console.log(`[MOTOUSHER] Scraping brand: ${brand.name}`);
    console.log(`[MOTOUSHER] Collection URL: ${brand.collectionUrl}`);
    console.log(divider);

    // Fields shared by the success and failure summary entries (key order is
    // preserved because the summary is serialised to JSON).
    const summaryBase = {
      brand: brand.name,
      slug: brand.slug,
      collectionUrl: brand.collectionUrl,
      startedAt: new Date().toISOString(),
    };

    try {
      const products = await scrapeMotousherBrand(brand, {
        limit,
        // Inline progress is already logged by the scraper, so this hook is a no-op.
        onProgress() {},
      });

      // Save per-brand JSON
      const brandFile = path.join(OUTPUT_DIR, `${brand.slug}.json`);
      const brandPayload = {
        generatedAt: new Date().toISOString(),
        source: "motousher",
        brand: brand.name,
        brandSlug: brand.slug,
        collectionUrl: brand.collectionUrl,
        totalProducts: products.length,
        products,
      };

      await fs.writeFile(brandFile, JSON.stringify(brandPayload, null, 2), "utf8");
      console.log(`[MOTOUSHER] Saved ${products.length} products → ${brandFile}`);

      // Only merge into the combined set once the per-brand file is on disk.
      combinedProducts.push(...products);
      runSummary.push({
        ...summaryBase,
        completedAt: new Date().toISOString(),
        totalProducts: products.length,
        success: true,
      });
    } catch (err) {
      console.error(`[MOTOUSHER] Failed to scrape ${brand.name}: ${err.message}`);
      runSummary.push({
        ...summaryBase,
        completedAt: new Date().toISOString(),
        totalProducts: 0,
        success: false,
        error: err.message,
      });
    }
  }

  // Save combined JSON (all brands merged)
  const combinedFile = path.join(OUTPUT_DIR, "combined.json");
  const combinedPayload = {
    generatedAt: new Date().toISOString(),
    startedAt,
    source: "motousher",
    siteUrl: "https://www.motousher.com",
    brandsScraped: runSummary.length,
    totalProducts: combinedProducts.length,
    runSummary,
    products: combinedProducts,
  };

  await fs.writeFile(combinedFile, JSON.stringify(combinedPayload, null, 2), "utf8");

  console.log(`\n${divider}`);
  console.log("[MOTOUSHER] RUN COMPLETE");
  console.log(divider);
  console.log(`Brands scraped : ${runSummary.length}`);
  console.log(`Total products : ${combinedProducts.length}`);
  console.log(`Combined file : ${combinedFile}`);
  console.log("\nBrand summary:");
  for (const s of runSummary) {
    const status = s.success ? "OK" : "FAILED";
    console.log(` [${status}] ${s.brand} → ${s.totalProducts} products`);
  }

  return combinedPayload;
}
|
||||
|
||||
// Start the scrape as soon as this file is loaded. Any rejection that escapes
// run() is reported and turned into a non-zero exit code rather than an
// unhandled-rejection crash.
const runnerPromise = run();

runnerPromise.catch((failure) => {
  console.error("Runner failed:", failure.message);
  process.exitCode = 1;
});

// Expose run() to callers that load this file as a module.
module.exports = { run };
|
||||
@ -0,0 +1,9 @@
|
||||
# Motousher Partner Brand Links
|
||||
# Source: https://www.motousher.com/pages/partner-brands
|
||||
# Format: Brand Name | Collection URL
|
||||
|
||||
All Balls Racing | https://www.motousher.com/collections/all-balls-racing
|
||||
|
||||
# Add more brands below as needed, e.g.:
|
||||
# Acerbis | https://www.motousher.com/collections/acerbis
|
||||
# Twin Air | https://www.motousher.com/collections/twin-air
|
||||
@ -0,0 +1,254 @@
|
||||
/**
|
||||
* Motousher.com scraper — Shopify JSON API (fast, no browser needed).
|
||||
*
|
||||
* Motousher is a Shopify store. Every collection and product has a public
|
||||
* JSON endpoint, so we use plain fetch instead of Playwright:
|
||||
*
|
||||
* Collection listing: /collections/{handle}/products.json?limit=250&page=N
|
||||
* Product detail: /products/{handle}.json
|
||||
*
|
||||
* This is 10-20x faster than Playwright. Note that no Playwright fallback is
* implemented in this module: if the JSON API returns nothing, the collection
* is simply treated as empty.
|
||||
*/
|
||||
|
||||
// Storefront origin; the collection and product JSON URLs below are built on top of it.
const BASE_URL = "https://www.motousher.com";
|
||||
const CONCURRENCY = 3; // parallel product detail fetches — motousher rate-limits above ~5
|
||||
|
||||
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed straight to setTimeout.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * GET a URL and parse the response body as JSON, retrying transient failures.
 *
 * A request is retried (at most 4 times, waiting `attempt * 1500` ms before
 * each retry) when:
 *   - the server answers HTTP 429 or any 5xx status, or
 *   - the connection attempt times out (`UND_ERR_CONNECT_TIMEOUT`).
 *
 * @param {string} url - Absolute URL to request.
 * @param {number} [attempt=1] - 1-based attempt counter; callers normally omit it.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} `HTTP <status> after retries` once the 429/5xx retries are
 *   exhausted, `HTTP <status>` for any other non-OK response, or the original
 *   fetch error when it is not a retryable connect timeout.
 */
async function fetchJson(url, attempt = 1) {
  const maxRetries = 4;
  const backoffMs = attempt * 1500;

  let res;
  try {
    res = await fetch(url, {
      headers: {
        "accept": "application/json",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
      },
    });
  } catch (err) {
    // fetch() rejects with a generic `TypeError: fetch failed` and attaches the
    // underlying network error as `err.cause`, so the error code has to be
    // looked up there as well as on the error itself.
    const code = err?.code ?? err?.cause?.code;
    if (attempt <= maxRetries && code === "UND_ERR_CONNECT_TIMEOUT") {
      await sleep(backoffMs);
      return fetchJson(url, attempt + 1);
    }
    throw err;
  }

  if (res.status === 429 || res.status >= 500) {
    if (attempt > maxRetries) {
      throw new Error(`HTTP ${res.status} after retries`);
    }
    console.log(`[RETRY] ${url} → HTTP ${res.status}, retrying in ${backoffMs}ms (attempt ${attempt}/${maxRetries})`);
    await sleep(backoffMs);
    return fetchJson(url, attempt + 1);
  }

  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.json();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Collection: fetch ALL product handles via paginated JSON API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Walk a collection's paginated products.json listing and gather every
 * product handle it contains.
 *
 * Pages are requested 250 products at a time, starting at page 1. Paging stops
 * at the first page that is empty or holds fewer than 250 products.
 *
 * @param {string} collectionHandle - Collection slug used in the URL path.
 * @returns {Promise<string[]>} Handles in listing order; entries without a handle are skipped.
 */
async function fetchCollectionProductHandles(collectionHandle) {
  const collected = [];

  for (let pageNo = 1; ; pageNo += 1) {
    const pageUrl = `${BASE_URL}/collections/${collectionHandle}/products.json?limit=250&page=${pageNo}`;
    console.log(`[MOTOUSHER] Fetching collection page ${pageNo}: ${pageUrl}`);

    const payload = await fetchJson(pageUrl);
    const batch = Array.isArray(payload?.products) ? payload.products : [];
    if (batch.length === 0) {
      break;
    }

    batch.forEach((item) => {
      if (item.handle) {
        collected.push(item.handle);
      }
    });

    console.log(`[MOTOUSHER] Page ${pageNo} → ${batch.length} products (total so far: ${collected.length})`);

    // A short page means there is nothing left to request.
    if (batch.length < 250) {
      break;
    }
  }

  return collected;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Product detail: fetch via /products/{handle}.json
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Extract the first numeric amount from a value such as "1,299.50" or 1299.5.
 *
 * Thousands separators (commas) are stripped before parsing. Only `null` and
 * `undefined` are treated as "no value"; a numeric `0` is a valid amount and is
 * returned as `0`.
 *
 * @param {*} value - Price-like value, typically a string or number.
 * @returns {number|null} The parsed number, or null when no digits are present.
 */
function toNumber(value) {
  if (value == null) return null;
  // Anchor the match on a digit so stray punctuation ahead of the amount
  // (for example ", 5") cannot be captured as an empty number.
  const match = String(value).match(/\d[\d,]*\.?\d*/);
  if (!match) return null;
  const n = Number.parseFloat(match[0].replace(/,/g, ""));
  return Number.isNaN(n) ? null : n;
}
|
||||
|
||||
/**
 * Strip a Shopify size suffix (e.g. `_600x600` or `_800x`) that sits directly
 * before the file extension, so the URL points at the original image.
 *
 * @param {string|null|undefined} src - Image URL as delivered by the API.
 * @returns {string|null} The URL without the size suffix, or null for an empty input.
 */
function bestImageUrl(src) {
  const sizeSuffix = /(_\d+x\d*|_\d+x)(\.\w+)(\?|$)/;
  return src ? src.replace(sizeSuffix, "$2$3") : null;
}
|
||||
|
||||
/**
 * Turn a Shopify `tags` value into an array of tag strings.
 * Accepts an array (returned as is) or a comma-separated string.
 */
function parseShopifyTags(tags) {
  if (Array.isArray(tags)) return tags;
  if (typeof tags !== "string") return [];
  return tags
    .split(",")
    .map((tag) => tag.trim())
    .filter(Boolean);
}

/**
 * Convert a raw Shopify product object into the scraper's flat product record.
 *
 * Top-level price, SKU and barcode fields are taken from the first variant.
 * `priceRaw` / `compareAtPriceRaw` hold the value exactly as delivered by the
 * API (stringified), while `price` / `compareAtPrice` hold the parsed numbers.
 *
 * @param {object} raw - Product object from the Shopify JSON response.
 * @param {string} brandName - Display name of the brand being scraped.
 * @param {string} brandSlug - Brand/collection slug.
 * @param {string} collectionUrl - URL of the collection the product was found in.
 * @returns {object} Normalised product record with `scrapeError: null`.
 */
function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) {
  const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean);

  const options = (raw.options || []).map((opt) => ({
    label: opt.name,
    values: (opt.values || []).map((v) => ({ label: v, value: v })),
  }));

  const rawVariants = raw.variants || [];
  const variants = rawVariants.map((v) => ({
    id: v.id,
    title: v.title,
    sku: v.sku || null,
    price: toNumber(v.price),
    compareAtPrice: toNumber(v.compare_at_price),
    available: v.available,
    options: [v.option1, v.option2, v.option3].filter(Boolean),
    weight: v.grams || null,
    barcode: v.barcode || null,
  }));

  const firstVariant = variants[0] || {};
  const firstRawVariant = rawVariants[0] || {};

  // Only report availability when at least one variant actually carries the
  // flag; otherwise the answer is "unknown" (null), not "out of stock" (false).
  const hasAvailability = rawVariants.some((v) => v.available != null);

  return {
    recordType: "product",
    source: "motousher",
    brand: brandName,
    brandSlug,
    collectionUrl,
    title: raw.title || null,
    url: `${BASE_URL}/products/${raw.handle}`,
    handle: raw.handle || null,
    productType: raw.product_type || null,
    vendor: raw.vendor || brandName,
    // Tags may arrive as an array or as a comma-separated string; keep both.
    tags: parseShopifyTags(raw.tags),
    // "Raw" fields preserve the source formatting (e.g. "1299.50"), so they are
    // read from the untouched variant rather than from the parsed number.
    priceRaw: firstRawVariant.price != null ? String(firstRawVariant.price) : null,
    price: firstVariant.price ?? null,
    compareAtPriceRaw:
      firstRawVariant.compare_at_price != null ? String(firstRawVariant.compare_at_price) : null,
    compareAtPrice: firstVariant.compareAtPrice ?? null,
    sku: firstVariant.sku || null,
    barcode: firstVariant.barcode || null,
    available: hasAvailability ? rawVariants.some((v) => Boolean(v.available)) : null,
    image: images[0] || null,
    images,
    descriptionHtml: raw.body_html || null,
    // Plain-text description: drop tags, then collapse the leftover whitespace.
    description: raw.body_html
      ? raw.body_html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim()
      : null,
    options,
    variants,
    publishedAt: raw.published_at || null,
    createdAt: raw.created_at || null,
    updatedAt: raw.updated_at || null,
    scrapeError: null,
  };
}
|
||||
|
||||
/**
 * Load one product through /products/{handle}.json and normalise it.
 *
 * This function never rejects: any failure (request error, missing `product`
 * key, normalisation error) is folded into a stub record whose `scrapeError`
 * carries the error message.
 *
 * @param {string} handle - Product handle.
 * @param {string} brandName - Display name of the brand being scraped.
 * @param {string} brandSlug - Brand/collection slug.
 * @param {string} collectionUrl - URL of the collection the product was found in.
 * @returns {Promise<object>} Normalised product record, or an error stub.
 */
async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) {
  const endpoint = `${BASE_URL}/products/${handle}.json`;

  try {
    const payload = await fetchJson(endpoint);
    if (payload?.product) {
      return normalizeShopifyProduct(payload.product, brandName, brandSlug, collectionUrl);
    }
    throw new Error("No product in response");
  } catch (failure) {
    const stub = {
      recordType: "product",
      source: "motousher",
      brand: brandName,
      brandSlug,
      collectionUrl,
      handle,
      url: `${BASE_URL}/products/${handle}`,
      scrapeError: failure.message,
    };
    return stub;
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Concurrency helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Map `items` through an async `worker`, running at most `concurrency` calls
 * at the same time while keeping results in input order.
 *
 * A concurrency that is not a number >= 1 (0, negative, NaN, undefined) falls
 * back to 1, so every item is still processed instead of silently yielding an
 * array of empty slots.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrency - Maximum number of worker calls in flight.
 * @param {(item: any, index: number) => Promise<any>} worker - Async mapper.
 * @returns {Promise<Array>} Results, where results[i] belongs to items[i].
 *   Rejects with the first worker error.
 */
async function mapWithConcurrency(items, concurrency, worker) {
  const results = new Array(items.length);
  let nextIndex = 0;

  const requested = Number(concurrency);
  const lanes = Math.min(requested >= 1 ? requested : 1, items.length);

  // Each lane keeps claiming the next unprocessed index until none are left.
  // Claiming is a synchronous increment, so no two lanes take the same item.
  async function drain() {
    while (true) {
      const i = nextIndex++;
      if (i >= items.length) return;
      results[i] = await worker(items[i], i);
    }
  }

  await Promise.all(Array.from({ length: lanes }, () => drain()));
  return results;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main export: scrape one brand
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scrape every product of one brand collection.
 *
 * @param {object} brand - Brand descriptor.
 * @param {string} brand.name - Display name, used in log lines and records.
 * @param {string} brand.slug - Collection slug to list products from.
 * @param {string} brand.collectionUrl - Collection URL stored on each record.
 * @param {object} [options]
 * @param {number|string} [options.limit] - Keep only the first N handles.
 * @param {(info: {done: number, total: number, product: string}) => void} [options.onProgress]
 *   Invoked after each product finishes.
 * @param {number} [options.concurrency] - Parallel detail fetches; defaults to CONCURRENCY.
 * @returns {Promise<object[]>} One record per handle, in collection order.
 */
async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) {
  const maxProducts = options.limit ? Number(options.limit) : null;
  const notify = typeof options.onProgress === "function" ? options.onProgress : null;
  const parallelism = options.concurrency || CONCURRENCY;

  // Step 1: list the collection.
  const allHandles = await fetchCollectionProductHandles(slug);
  console.log(`[MOTOUSHER:${name}] Found ${allHandles.length} products in collection`);

  let targets = allHandles;
  if (maxProducts && allHandles.length > maxProducts) {
    console.log(`[MOTOUSHER:${name}] Limit applied: fetching first ${maxProducts} of ${allHandles.length}`);
    targets = allHandles.slice(0, maxProducts);
  }

  // Step 2: fetch the details, logging every 20th completion and the last one.
  const total = targets.length;
  let completed = 0;

  const fetchOne = async (handle) => {
    const record = await fetchProductDetail(handle, name, slug, collectionUrl);
    completed += 1;

    const isMilestone = completed % 20 === 0 || completed === total;
    if (isMilestone) {
      console.log(`[MOTOUSHER:${name}] ${completed}/${total} products fetched`);
    }
    if (notify) {
      notify({ done: completed, total, product: record.title || handle });
    }
    return record;
  };

  return mapWithConcurrency(targets, parallelism, fetchOne);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Standalone test
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
if (require.main === module) {
|
||||
const { BRANDS } = require("./brands");
|
||||
const brand = BRANDS[0];
|
||||
|
||||
console.log(`Testing scraper for: ${brand.name}`);
|
||||
scrapeMotousherBrand(brand, { limit: 5 })
|
||||
.then((products) => {
|
||||
console.log(`\nScraped ${products.length} products`);
|
||||
console.log("Sample product:");
|
||||
console.log(JSON.stringify(products[0], null, 2));
|
||||
})
|
||||
.catch((err) => {
|
||||
console.error("Scrape failed:", err.message);
|
||||
process.exitCode = 1;
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
scrapeMotousherBrand,
|
||||
fetchCollectionProductHandles,
|
||||
fetchProductDetail,
|
||||
};
|
||||
Loading…
x
Reference in New Issue
Block a user