Add motousher and dirtstreet as active import pipeline sources

Both sources are now registered in sources/index.js and fully wired
into the 6-stage pipeline (fetch → download → watermark → upload →
convert → upsert). The frontend will automatically show them as tabs
via GET /pipeline/sources without any frontend changes needed.

motousher/ (Shopify JSON API — 12 brands, ~2,446 products):
- scraper.js: fetches /collections/{slug}/products.json + /products/{handle}.json
- converter.js: maps scraped products to standard pipeline format
- index.js: fetchWebsiteData() loops all brands, normalises to
  productSummary.img format for shared download/upload utilities
- Supports MOTOUSHER_BRANDS env var to filter brands on a run

dirtstreet/ (WooCommerce HTML + JSON-LD — 5 brands, ~1,087 products):
- scraper.js: pure fetch, paginates /brand/{slug}/page/N/,
  extracts price from offers.priceSpecification[0].price,
  stock from JSON-LD availability field
- converter.js: maps scraped products to standard pipeline format,
  builds descriptionHtml from body + short desc + attributes table
- index.js: fetchWebsiteData() loops all brands, normalises to
  productSummary.img format
- Supports DIRTSTREET_BRANDS env var to filter brands on a run

sources/index.js: registered all 4 sources (kyt, brocks-performance,
motousher, dirtstreet). GET /pipeline/sources now returns all 4.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
MOHAN 2026-06-04 12:25:31 +05:30
parent b8d9478afa
commit c0132ab0aa
9 changed files with 1175 additions and 0 deletions

View File

@ -0,0 +1,14 @@
/**
* Dirtstreet.in brand list production config.
* Source: https://dirtstreet.in/shop-by-brands/
*/
const BRANDS = [
{ name: "SC Project", slug: "scproject", brandUrl: "https://dirtstreet.in/brand/scproject/" },
{ name: "Evotech Performance", slug: "evotechperformance", brandUrl: "https://dirtstreet.in/brand/evotechperformance/" },
{ name: "DNA Air Filters", slug: "dnaairfilters", brandUrl: "https://dirtstreet.in/brand/dnaairfilters/" },
{ name: "WRS", slug: "wrs", brandUrl: "https://dirtstreet.in/brand/wrs/" },
{ name: "Zero Gravity Racing", slug: "zerogravityracing", brandUrl: "https://dirtstreet.in/brand/zerogravityracing/" },
];
module.exports = { BRANDS };

View File

@ -0,0 +1,143 @@
const path = require("node:path");
function slugify(str) {
return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
}
function getUploadedImageUrl(imgPath, uploadedImageMap) {
if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null;
const bySourcePath = uploadedImageMap.bySourcePath || {};
return bySourcePath[String(imgPath)]?.url || null;
}
function getImageFileName(imagePath) {
try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); }
}
function availabilityToStockStatus(availability) {
if (!availability) return null;
const url = String(availability).toLowerCase();
if (url.includes("instock")) return "instock";
if (url.includes("outofstock")) return "outofstock";
if (url.includes("backorder") || url.includes("preorder")) return "onbackorder";
return null;
}
function buildDescriptionHtml(record) {
const parts = [];
if (record.descriptionHtml) {
parts.push(record.descriptionHtml);
} else if (record.description) {
parts.push(`<p>${record.description.replace(/\n/g, "<br/>")}</p>`);
}
if (record.shortDescription && !parts.join("").includes(record.shortDescription)) {
parts.unshift(`<p><strong>${record.shortDescription}</strong></p>`);
}
if (Array.isArray(record.attributes) && record.attributes.length) {
const rows = record.attributes
.map((attr) => `<tr><th>${attr.name}</th><td>${attr.value}</td></tr>`)
.join("");
parts.push(`<table><tbody>${rows}</tbody></table>`);
}
return parts.join("\n");
}
function convertDirtstreetRecordToShopifyReady(record, options = {}) {
const brand = options.brand || record.brand || "Dirtstreet";
const uploadedImageMap = options.uploadedImageMap || null;
const title = record.title || "Untitled Product";
const sku = record.sku || slugify(title);
const productId = sku || slugify(`${brand}-${title}`);
const price = Number(record.price ?? 0);
const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null;
const imagePaths = Array.isArray(record.images) ? record.images : [];
// Stock: prefer schema availability, fall back to stockStatus
const stockStatus = availabilityToStockStatus(record.availability) || record.stockStatus || null;
const quantity = stockStatus === "instock" ? 1 : 0;
const descriptionHtml = buildDescriptionHtml(record);
const files = imagePaths
.map((imgPath) => ({
type: "Image",
url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath,
media_content: getImageFileName(imgPath),
source_path: imgPath,
}))
.filter((f) => f.url);
// Categories become subcategory / collections
const categories = Array.isArray(record.categories) ? record.categories : [];
const [primaryCategory, ...subCategories] = categories;
const category = primaryCategory || "Motorcycle Parts";
const subcategory = subCategories.join(", ");
// Variants — dirtstreet products are typically simple (single variant)
const variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }];
const tags = [
brand,
category,
...categories,
sku,
record.stockStatus || "",
...(Array.isArray(record.tags) ? record.tags : []),
].filter(Boolean).map((t) => String(t).trim());
return {
id: productId,
source: {
productId,
sourceKey: "dirtstreet",
url: record.url || null,
image_paths: imagePaths,
},
attributes: {
product_name: title,
brand,
category,
subcategory,
part_number: sku,
mfr_part_number: record.mpn || sku,
price,
compare_price: compareAtPrice,
purchase_cost: null,
barcode: record.gtin || record.barcode || "",
price_group: null,
units_per_sku: null,
part_description: descriptionHtml,
descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [],
files,
image_paths: imagePaths,
dimensions: [{ weight: 0 }],
total_quantity: quantity,
inventorydata: { inventory: { main: quantity } },
options: [],
variants,
fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] },
tags,
handle: slugify(`${brand}-${title}-${productId}`),
source_url: record.url || null,
},
};
}
function convertDirtstreetJsonToShopifyProducts(input, options = {}) {
const records = Array.isArray(input?.products) ? input.products : [];
return records.map((record) => {
// Each record in aggregated JSON has raw scraped data in record.scraped
const scraped = record.scraped || record;
return convertDirtstreetRecordToShopifyReady(scraped, {
brand: options.brand || scraped.brand || record.brand,
uploadedImageMap: options.uploadedImageMap,
});
});
}
module.exports = { convertDirtstreetRecordToShopifyReady, convertDirtstreetJsonToShopifyProducts };

View File

@ -0,0 +1,135 @@
const fs = require("node:fs/promises");
const path = require("node:path");
const { BRANDS } = require("./brands");
const { scrapeDirtstreetBrand } = require("./scraper");
const { convertDirtstreetJsonToShopifyProducts } = require("./converter");
const sourceKey = "dirtstreet";
const label = "Dirtstreet";
const dataDir = path.join("data", "sources", sourceKey);
function paths() {
return {
aggregatedJson: path.join(dataDir, "01_products_aggregated.json"),
historyJson: path.join(dataDir, "01_products_run_history.json"),
downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"),
watermarkState: path.join(dataDir, "03_watermark_state.json"),
imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"),
uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"),
shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"),
logsDir: path.join(dataDir, "99_run_logs"),
};
}
async function fetchWebsiteData({ paths: runPaths }) {
const brandsToScrape = process.env.DIRTSTREET_BRANDS
? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
: BRANDS;
const allProducts = [];
const runSummary = [];
const now = new Date().toISOString();
for (const brand of brandsToScrape) {
console.log(`[DIRTSTREET] Scraping brand: ${brand.name} (${brand.brandUrl})`);
const brandStartedAt = new Date().toISOString();
try {
const products = await scrapeDirtstreetBrand(brand);
for (const product of products) {
allProducts.push({
productId: product.sku || product.url,
sourceKey,
brand: product.brand,
brandSlug: product.brandSlug,
// productSummary.img is read by shared downloadImages + uploadImages utilities
productSummary: {
id: product.sku || product.url,
name: product.title,
img: product.images || [],
cost: { mrp: product.price || 0 },
},
scraped: product,
});
}
runSummary.push({
brand: brand.name,
slug: brand.slug,
startedAt: brandStartedAt,
completedAt: new Date().toISOString(),
totalProducts: products.length,
success: true,
});
console.log(`[DIRTSTREET] ${brand.name}: ${products.length} products scraped`);
} catch (err) {
console.log(`[DIRTSTREET] ${brand.name} failed: ${err.message}`);
runSummary.push({
brand: brand.name,
slug: brand.slug,
startedAt: brandStartedAt,
completedAt: new Date().toISOString(),
totalProducts: 0,
success: false,
error: err.message,
});
}
}
const analysis = {
timestamp: now,
totalBrands: brandsToScrape.length,
totalProductsUnique: allProducts.length,
detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length,
detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length,
runSummary,
};
const payload = {
generatedAt: now,
sourceKey,
sourceLabel: label,
analysis,
products: allProducts,
};
await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true });
await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8");
let history = [];
try {
const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8");
history = JSON.parse(raw);
if (!Array.isArray(history)) history = [];
} catch { history = []; }
history.push(analysis);
await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8");
console.log(`[DIRTSTREET] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`);
return {
analysis,
outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson),
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
};
}
function convertToShopifyProducts(input, options = {}) {
return convertDirtstreetJsonToShopifyProducts(input, {
brand: options.brand || label,
uploadedImageMap: options.uploadedImageMap,
});
}
module.exports = {
sourceKey,
label,
defaultBrand: "Dirtstreet",
defaultImageBaseUrl: "",
envImageBaseUrl: "DIRTSTREET_IMAGE_BASE_URL",
paths,
fetchWebsiteData,
convertToShopifyProducts,
};

View File

@ -0,0 +1,423 @@
/**
* Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser).
*
* Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org
* markup. We fetch HTML directly and extract all fields via regex.
*
* 1. Brand pages paginated at /brand/{slug}/page/N/
* 2. Product detail via HTML + JSON-LD
*/
const BASE_URL = "https://dirtstreet.in";
const CONCURRENCY = 3;
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
async function fetchHtml(url, attempt = 1) {
try {
const res = await fetch(url, {
headers: {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
"cache-control": "no-cache",
},
});
if (res.status === 429 || res.status >= 500) {
if (attempt <= 4) {
const wait = attempt * 2000;
console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
await sleep(wait);
return fetchHtml(url, attempt + 1);
}
throw new Error(`HTTP ${res.status} after retries`);
}
if (res.status === 404) return null;
if (!res.ok) throw new Error(`HTTP ${res.status}`);
return res.text();
} catch (err) {
if (attempt <= 3 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) {
await sleep(attempt * 2000);
return fetchHtml(url, attempt + 1);
}
throw err;
}
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function decodeEntities(str) {
return String(str || "")
.replace(/&amp;/g, "&")
.replace(/&lt;/g, "<")
.replace(/&gt;/g, ">")
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&times;/g, "x")
.replace(/&nbsp;/g, " ")
.replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code)));
}
function cleanText(value) {
return decodeEntities(String(value || ""))
.replace(/<[^>]+>/g, " ")
.replace(/\s+/g, " ")
.trim();
}
function toNumber(value) {
if (!value) return null;
const match = String(value).match(/([\d,]+\.?\d*)/);
if (!match) return null;
const n = Number.parseFloat(match[1].replace(/,/g, ""));
return Number.isNaN(n) ? null : n;
}
function extractPriceFromHtml(html) {
// data-price attribute (WooCommerce sets this for JS cart)
const dp = html.match(/data-price=["']([\d.]+)["']/);
if (dp) { const n = Number.parseFloat(dp[1]); if (n > 0) return n; }
// Pattern on dirtstreet: &#8377;</span>198,000.00</bdi>
// Collect ALL such values, filter out 0, take the first valid one (the product price)
const bdiPattern = /(?:&#8377;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi;
let bdiMatch;
while ((bdiMatch = bdiPattern.exec(html)) !== null) {
const n = toNumber(bdiMatch[1]);
if (n > 0) return n;
}
// <ins> sale price block
const ins = html.match(/<ins>[\s\S]{0,300}?(?:&#8377;|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>[\s\S]{0,300}?<\/ins>/i);
if (ins) { const n = toNumber(ins[1]); if (n > 0) return n; }
// Fallback: &#8377; number anywhere
const entity = html.match(/&#8377;[^>]*>([\d,]+(?:\.\d+)?)/);
if (entity) { const n = toNumber(entity[1]); if (n > 0) return n; }
return null;
}
function availabilityToStockStatus(availability) {
if (!availability) return null;
const url = String(availability).toLowerCase();
if (url.includes("instock")) return "instock";
if (url.includes("outofstock")) return "outofstock";
if (url.includes("backorder") || url.includes("preorder")) return "onbackorder";
return null;
}
// ---------------------------------------------------------------------------
// JSON-LD extraction
// ---------------------------------------------------------------------------
function extractJsonLd(html) {
const results = [];
const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
let match;
while ((match = regex.exec(html)) !== null) {
try {
results.push(JSON.parse(match[1].trim()));
} catch {
// skip
}
}
return results;
}
// ---------------------------------------------------------------------------
// Collection page: extract product URLs
// ---------------------------------------------------------------------------
function extractProductUrls(html) {
const urls = new Set();
const regex = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi;
let match;
while ((match = regex.exec(html)) !== null) {
let url = match[1];
if (url.startsWith("/")) url = `${BASE_URL}${url}`;
if (url.includes("/product-category/") || url.includes("/product-tag/")) continue;
// normalize trailing slash
url = url.replace(/\/?$/, "/");
urls.add(url);
}
return Array.from(urls);
}
function hasNextPage(html, currentPage, slug) {
const pattern = new RegExp(
`href=["'][^"']*/brand/${slug}/page/${currentPage + 1}/?["']`,
"i"
);
return pattern.test(html);
}
async function fetchBrandProductUrls(slug, brandUrl) {
const allUrls = new Set();
let page = 1;
while (true) {
const url = page === 1 ? brandUrl : `${brandUrl}page/${page}/`;
console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`);
const html = await fetchHtml(url);
if (!html) break;
const urls = extractProductUrls(html);
const before = allUrls.size;
urls.forEach((u) => allUrls.add(u));
console.log(`[DIRTSTREET] Page ${page} -> ${urls.length} product URLs (total: ${allUrls.size})`);
if (!hasNextPage(html, page, slug) || allUrls.size === before) break;
page++;
await sleep(300);
}
return Array.from(allUrls);
}
// ---------------------------------------------------------------------------
// Extra WooCommerce fields from HTML
// ---------------------------------------------------------------------------
function extractExtraFields(html) {
const extra = {
sku: null,
categories: [],
tags: [],
shortDescription: null,
stockStatus: null,
attributes: [],
variations: [],
};
// SKU - strip "SKU:" label prefix
const skuMatch = html.match(/<span[^>]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
if (skuMatch) {
extra.sku = cleanText(skuMatch[1])
.replace(/^SKU\s*:\s*/i, "")
.trim();
}
// Stock status — read from the main product wrapper class only, not whole page
// The first <div/article class="product ... instock/outofstock ..."> is the current product
const productClassMatch = html.match(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i);
if (productClassMatch) {
const cls = productClassMatch[1].toLowerCase();
if (cls === "instock") extra.stockStatus = "instock";
else if (cls === "outofstock") extra.stockStatus = "outofstock";
else extra.stockStatus = "onbackorder";
}
// Categories
const catMatch = html.match(/<span[^>]+class="[^"]*posted_in[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
if (catMatch) {
const catRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
let m;
while ((m = catRegex.exec(catMatch[1])) !== null) {
const cat = cleanText(m[1]);
if (cat) extra.categories.push(cat);
}
}
// Tags
const tagMatch = html.match(/<span[^>]+class="[^"]*tagged_as[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
if (tagMatch) {
const tagRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
let m;
while ((m = tagRegex.exec(tagMatch[1])) !== null) {
const tag = cleanText(m[1]);
if (tag) extra.tags.push(tag);
}
}
// Short description
const sdMatch = html.match(
/<div[^>]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i
);
if (sdMatch) extra.shortDescription = cleanText(sdMatch[1]);
// Attributes table
const attrMatch = html.match(
/<table[^>]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i
);
if (attrMatch) {
const rows = attrMatch[1].match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || [];
for (const row of rows) {
const th = row.match(/<th[^>]*>([\s\S]*?)<\/th>/i);
const td = row.match(/<td[^>]*>([\s\S]*?)<\/td>/i);
if (th && td) {
const key = cleanText(th[1]);
const value = cleanText(td[1]);
if (key && value) extra.attributes.push({ name: key, value });
}
}
}
// WooCommerce variations JSON in data attribute
const formMatch = html.match(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/);
if (formMatch) {
try {
extra.variations = JSON.parse(
formMatch[1].replace(/&quot;/g, '"').replace(/&#34;/g, '"')
);
} catch { /* skip */ }
}
return extra;
}
// ---------------------------------------------------------------------------
// Scrape one product detail page
// ---------------------------------------------------------------------------
async function scrapeProductDetail(productUrl, brandName, brandSlug) {
const html = await fetchHtml(productUrl);
if (!html) {
return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" };
}
const jsonLdBlocks = extractJsonLd(html);
// Try direct Product type first, then search every @graph block
let schema = jsonLdBlocks.find((b) => b["@type"] === "Product");
if (!schema) {
for (const block of jsonLdBlocks) {
if (Array.isArray(block["@graph"])) {
const found = block["@graph"].find((g) => g["@type"] === "Product");
if (found) { schema = found; break; }
}
}
}
const extra = extractExtraFields(html);
const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {});
// Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback
const priceSpec = Array.isArray(offer?.priceSpecification)
? offer.priceSpecification[0]?.price
: null;
let price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice);
if (!price || price === 0) price = extractPriceFromHtml(html);
if (price === 0) price = null;
// Stock: derive from JSON-LD availability (most reliable), fall back to HTML class
const stockFromSchema = availabilityToStockStatus(offer?.availability);
const stockStatus = stockFromSchema || extra.stockStatus || null;
const currency = offer?.priceCurrency || "INR";
const images = Array.isArray(schema?.image)
? schema.image.map((img) => (typeof img === "string" ? img : img?.url || img?.contentUrl)).filter(Boolean)
: schema?.image
? [typeof schema.image === "string" ? schema.image : (schema.image?.url || schema.image?.contentUrl)]
: [];
return {
recordType: "product",
source: "dirtstreet",
brand: schema?.brand?.name || brandName,
brandSlug,
url: productUrl,
title: cleanText(schema?.name || ""),
sku: extra.sku || cleanText(schema?.sku || ""),
mpn: cleanText(schema?.mpn || ""),
gtin: cleanText(schema?.gtin || schema?.gtin13 || ""),
price,
currency,
priceRaw: price != null ? String(price) : null,
compareAtPrice: toNumber(offer?.highPrice) || null,
availability: offer?.availability || null,
stockStatus,
image: images[0] || null,
images,
description: cleanText(schema?.description || ""),
shortDescription: extra.shortDescription,
categories: extra.categories,
tags: extra.tags,
attributes: extra.attributes,
variations: extra.variations,
aggregateRating: schema?.aggregateRating
? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount }
: null,
scrapeError: null,
};
}
// ---------------------------------------------------------------------------
// Concurrency helper
// ---------------------------------------------------------------------------
async function mapWithConcurrency(items, concurrency, worker) {
const results = new Array(items.length);
let index = 0;
async function run() {
while (true) {
const i = index++;
if (i >= items.length) return;
results[i] = await worker(items[i], i);
}
}
await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
return results;
}
// ---------------------------------------------------------------------------
// Main export: scrape one brand
// ---------------------------------------------------------------------------
async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) {
const limit = options.limit ? Number(options.limit) : null;
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
const concurrency = options.concurrency || CONCURRENCY;
let productUrls = await fetchBrandProductUrls(slug, brandUrl);
console.log(`[DIRTSTREET:${name}] Found ${productUrls.length} products`);
if (limit && productUrls.length > limit) {
console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${limit} of ${productUrls.length}`);
productUrls = productUrls.slice(0, limit);
}
let done = 0;
const products = await mapWithConcurrency(productUrls, concurrency, async (url, i) => {
const product = await scrapeProductDetail(url, name, slug);
done++;
if (done % 10 === 0 || done === productUrls.length) {
console.log(`[DIRTSTREET:${name}] ${done}/${productUrls.length} products scraped`);
}
if (onProgress) onProgress({ done, total: productUrls.length, product: product.title || url });
return product;
});
return products;
}
// ---------------------------------------------------------------------------
// Standalone test
// ---------------------------------------------------------------------------
if (require.main === module) {
const { BRANDS } = require("./brands");
const brand = BRANDS[0];
console.log(`Testing scraper for: ${brand.name}`);
scrapeDirtstreetBrand(brand, { limit: 3 })
.then((products) => {
console.log(`\nScraped ${products.length} products`);
console.log("Sample product:");
console.log(JSON.stringify(products[0], null, 2));
})
.catch((err) => {
console.error("Scrape failed:", err.message);
process.exitCode = 1;
});
}
module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail };

View File

@ -1,9 +1,13 @@
const kyt = require("./kyt");
const brocksPerformance = require("./brocks-performance");
const motousher = require("./motousher");
const dirtstreet = require("./dirtstreet");
const sources = {
[kyt.sourceKey]: kyt,
[brocksPerformance.sourceKey]: brocksPerformance,
[motousher.sourceKey]: motousher,
[dirtstreet.sourceKey]: dirtstreet,
};
function normalizeSourceKey(sourceKey) {

View File

@ -0,0 +1,21 @@
/**
* Motousher.com brand list production config.
* Source: https://www.motousher.com/pages/partner-brands
*/
const BRANDS = [
{ name: "All Balls Racing", slug: "all-balls-racing", collectionUrl: "https://www.motousher.com/collections/all-balls-racing" },
{ name: "DID Chains", slug: "did-chains", collectionUrl: "https://www.motousher.com/collections/did-chains" },
{ name: "EBC Brakes", slug: "ebc-brakes", collectionUrl: "https://www.motousher.com/collections/ebc-brakes" },
{ name: "Esjot Sprockets", slug: "esjot-sprockets", collectionUrl: "https://www.motousher.com/collections/esjot-sprockets" },
{ name: "Evans Coolant", slug: "evans-coolant", collectionUrl: "https://www.motousher.com/collections/evans-coolant" },
{ name: "Grip Puppies", slug: "grip-puppies", collectionUrl: "https://www.motousher.com/collections/grip-puppies" },
{ name: "HiFlo Filters", slug: "hi-flo", collectionUrl: "https://www.motousher.com/collections/hi-flo" },
{ name: "JT Sprockets", slug: "jt-sprockets", collectionUrl: "https://www.motousher.com/collections/jt-sprockets" },
{ name: "Maxima Racing Oils", slug: "maxima-racing-oils", collectionUrl: "https://www.motousher.com/collections/maxima-racing-oils" },
{ name: "Putoline", slug: "putoline", collectionUrl: "https://www.motousher.com/collections/putoline" },
{ name: "Ram Mount", slug: "ram-mount", collectionUrl: "https://www.motousher.com/collections/ram-mount" },
{ name: "Wunderlich", slug: "wunderlich", collectionUrl: "https://www.motousher.com/collections/wunderlich" },
];
module.exports = { BRANDS };

View File

@ -0,0 +1,113 @@
const path = require("node:path");
function slugify(str) {
return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
}
function getUploadedImageUrl(imgPath, uploadedImageMap) {
if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null;
const bySourcePath = uploadedImageMap.bySourcePath || {};
return bySourcePath[String(imgPath)]?.url || null;
}
function getImageFileName(imagePath) {
try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); }
}
function convertMotousherRecordToShopifyReady(record, options = {}) {
const brand = options.brand || record.brand || "Motousher";
const uploadedImageMap = options.uploadedImageMap || null;
const productId = record.handle || slugify(`${record.title || ""}-${record.sku || ""}`);
const title = record.title || "Untitled Product";
const sku = record.sku || productId;
const price = Number(record.price ?? 0);
const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null;
const quantity = record.variants?.reduce((sum, v) => sum + (v.quantity || 0), 0) || 0;
const descriptionHtml = record.descriptionHtml || "";
const imagePaths = Array.isArray(record.images) ? record.images : [];
const files = imagePaths
.map((imgPath) => ({
type: "Image",
url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath,
media_content: getImageFileName(imgPath),
source_path: imgPath,
}))
.filter((f) => f.url);
// Options: filter out "Title"/"Default Title" Shopify placeholder
const sourceOptions = Array.isArray(record.options)
? record.options.filter((opt) => opt.name !== "Title" || !opt.values?.every((v) => v === "Default Title"))
: [];
// Variants
let variants;
if (Array.isArray(record.variants) && record.variants.length) {
variants = record.variants.map((v, index) => ({
sku: v.sku || (index === 0 ? sku : `${sku}-${index + 1}`),
price: Number(v.price ?? price),
compare_price: v.compare_price != null ? Number(v.compare_price) : compareAtPrice,
quantity: Number(v.quantity ?? 0),
optionValues: Array.isArray(v.optionValues) ? v.optionValues : [],
}));
} else {
variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }];
}
const category = record.productType || "Motorcycle Parts";
const tags = [
brand,
category,
record.brandSlug,
sku,
...(Array.isArray(record.tags) ? record.tags : []),
].filter(Boolean).map((t) => String(t).trim());
return {
id: productId,
source: {
productId,
sourceKey: "motousher",
url: record.url || null,
image_paths: imagePaths,
},
attributes: {
product_name: title,
brand,
category,
subcategory: record.productType || "",
part_number: sku,
mfr_part_number: sku,
price,
compare_price: compareAtPrice,
purchase_cost: null,
barcode: record.barcode || "",
price_group: null,
units_per_sku: null,
part_description: descriptionHtml,
descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [],
files,
image_paths: imagePaths,
dimensions: [{ weight: record.weight ? record.weight / 1000 : 0 }],
total_quantity: quantity,
inventorydata: { inventory: { main: quantity } },
options: sourceOptions.map((opt) => ({
name: opt.name,
values: Array.isArray(opt.values) ? opt.values : [],
})),
variants,
fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] },
tags,
handle: slugify(`${brand}-${title}-${productId}`),
source_url: record.url || null,
},
};
}
function convertMotousherJsonToShopifyProducts(input, options = {}) {
const records = Array.isArray(input?.products) ? input.products : [];
return records.map((record) => convertMotousherRecordToShopifyReady(record, options));
}
module.exports = { convertMotousherRecordToShopifyReady, convertMotousherJsonToShopifyProducts };

View File

@ -0,0 +1,137 @@
const fs = require("node:fs/promises");
const path = require("node:path");
const { BRANDS } = require("./brands");
const { scrapeMotousherBrand } = require("./scraper");
const { convertMotousherJsonToShopifyProducts } = require("./converter");
const sourceKey = "motousher";
const label = "Motousher";
const dataDir = path.join("data", "sources", sourceKey);
function paths() {
return {
aggregatedJson: path.join(dataDir, "01_products_aggregated.json"),
historyJson: path.join(dataDir, "01_products_run_history.json"),
downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"),
watermarkState: path.join(dataDir, "03_watermark_state.json"),
imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"),
uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"),
shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"),
logsDir: path.join(dataDir, "99_run_logs"),
};
}
async function fetchWebsiteData({ paths: runPaths }) {
const brandsToScrape = process.env.MOTOUSHER_BRANDS
? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
: BRANDS;
const allProducts = [];
const runSummary = [];
const now = new Date().toISOString();
for (const brand of brandsToScrape) {
console.log(`[MOTOUSHER] Scraping brand: ${brand.name} (${brand.collectionUrl})`);
const brandStartedAt = new Date().toISOString();
try {
const products = await scrapeMotousherBrand(brand);
// Normalize into pipeline-compatible aggregated format
for (const product of products) {
allProducts.push({
productId: product.handle || product.url,
sourceKey,
brand: product.brand,
brandSlug: product.brandSlug,
// productSummary.img is read by shared downloadImages + uploadImages utilities
productSummary: {
id: product.handle,
name: product.title,
img: product.images || [],
cost: { mrp: product.price || 0 },
},
scraped: product,
});
}
runSummary.push({
brand: brand.name,
slug: brand.slug,
startedAt: brandStartedAt,
completedAt: new Date().toISOString(),
totalProducts: products.length,
success: true,
});
console.log(`[MOTOUSHER] ${brand.name}: ${products.length} products fetched`);
} catch (err) {
console.log(`[MOTOUSHER] ${brand.name} failed: ${err.message}`);
runSummary.push({
brand: brand.name,
slug: brand.slug,
startedAt: brandStartedAt,
completedAt: new Date().toISOString(),
totalProducts: 0,
success: false,
error: err.message,
});
}
}
const analysis = {
timestamp: now,
totalBrands: brandsToScrape.length,
totalProductsUnique: allProducts.length,
detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length,
detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length,
runSummary,
};
const payload = {
generatedAt: now,
sourceKey,
sourceLabel: label,
analysis,
products: allProducts,
};
await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true });
await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8");
// Append to history
let history = [];
try {
const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8");
history = JSON.parse(raw);
if (!Array.isArray(history)) history = [];
} catch { history = []; }
history.push(analysis);
await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8");
console.log(`[MOTOUSHER] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`);
return {
analysis,
outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson),
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
};
}
function convertToShopifyProducts(input, options = {}) {
return convertMotousherJsonToShopifyProducts(input, {
brand: options.brand || label,
uploadedImageMap: options.uploadedImageMap,
});
}
module.exports = {
sourceKey,
label,
defaultBrand: "Motousher",
defaultImageBaseUrl: "",
envImageBaseUrl: "MOTOUSHER_IMAGE_BASE_URL",
paths,
fetchWebsiteData,
convertToShopifyProducts,
};

View File

@ -0,0 +1,185 @@
/**
* Motousher.com scraper Shopify JSON API.
* Uses /collections/{slug}/products.json (paginated) + /products/{handle}.json
*/
const BASE_URL = "https://www.motousher.com";
const CONCURRENCY = 3;
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
async function fetchJson(url, attempt = 1) {
try {
const res = await fetch(url, {
headers: {
accept: "application/json",
"accept-language": "en-US,en;q=0.9",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
},
});
if (res.status === 429 || res.status >= 500) {
if (attempt <= 4) {
const wait = attempt * 1500;
console.log(`[RETRY] ${url} HTTP ${res.status}, retrying in ${wait}ms (${attempt}/4)`);
await sleep(wait);
return fetchJson(url, attempt + 1);
}
throw new Error(`HTTP ${res.status} after retries`);
}
if (!res.ok) throw new Error(`HTTP ${res.status}`);
return res.json();
} catch (err) {
if (attempt <= 4 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) {
await sleep(attempt * 1500);
return fetchJson(url, attempt + 1);
}
throw err;
}
}
async function fetchCollectionHandles(collectionSlug) {
const handles = [];
let page = 1;
while (true) {
const url = `${BASE_URL}/collections/${collectionSlug}/products.json?limit=250&page=${page}`;
const data = await fetchJson(url);
const products = Array.isArray(data?.products) ? data.products : [];
if (!products.length) break;
for (const p of products) {
if (p.handle) handles.push(p.handle);
}
if (products.length < 250) break;
page++;
}
return handles;
}
function toNumber(value) {
if (!value) return null;
const match = String(value).match(/([\d,]+\.?\d*)/);
if (!match) return null;
const n = Number.parseFloat(match[1].replace(/,/g, ""));
return Number.isNaN(n) ? null : n;
}
function bestImageUrl(src) {
if (!src) return null;
return src.replace(/(_\d+x\d*|_\d+x)(\.\w+)(\?|$)/, "$2$3");
}
function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) {
const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean);
const firstVariant = raw.variants?.[0] || {};
const price = toNumber(firstVariant.price);
const compareAtPrice = toNumber(firstVariant.compare_at_price) || null;
const quantity = raw.variants?.reduce((sum, v) => sum + (v.inventory_quantity || 0), 0) || 0;
const options = (raw.options || [])
.filter((opt) => opt.name !== "Title" || (opt.values || []).join("") !== "Default Title")
.map((opt) => ({
name: opt.name,
values: opt.values || [],
}));
const hasRealOptions = options.length > 0;
const variants = (raw.variants || []).map((v, index) => ({
sku: v.sku || firstVariant.sku || raw.handle || "",
price: toNumber(v.price) ?? price,
compare_price: toNumber(v.compare_at_price) || null,
quantity: v.inventory_quantity || 0,
optionValues: hasRealOptions
? [v.option1, v.option2, v.option3]
.filter(Boolean)
.filter((val) => val !== "Default Title")
.map((val, i) => ({
name: options[i]?.name || `Option${i + 1}`,
value: val,
}))
: [],
}));
return {
recordType: "product",
source: "motousher",
brand: brandName,
brandSlug,
collectionUrl,
handle: raw.handle || null,
title: raw.title || null,
url: `${BASE_URL}/products/${raw.handle}`,
productType: raw.product_type || null,
vendor: raw.vendor || brandName,
tags: Array.isArray(raw.tags) ? raw.tags : typeof raw.tags === "string" ? raw.tags.split(",").map((t) => t.trim()).filter(Boolean) : [],
price,
compareAtPrice,
sku: firstVariant.sku || null,
barcode: firstVariant.barcode || null,
weight: firstVariant.grams || null,
available: raw.variants?.some((v) => v.available) ?? null,
image: images[0] || null,
images,
descriptionHtml: raw.body_html || null,
options,
variants,
publishedAt: raw.published_at || null,
scrapeError: null,
};
}
async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) {
const url = `${BASE_URL}/products/${handle}.json`;
try {
const data = await fetchJson(url);
if (!data?.product) throw new Error("No product in response");
return normalizeShopifyProduct(data.product, brandName, brandSlug, collectionUrl);
} catch (err) {
return { recordType: "product", source: "motousher", brand: brandName, brandSlug, collectionUrl, handle, url: `${BASE_URL}/products/${handle}`, scrapeError: err.message };
}
}
async function mapWithConcurrency(items, concurrency, worker) {
const results = new Array(items.length);
let index = 0;
async function run() {
while (true) {
const i = index++;
if (i >= items.length) return;
results[i] = await worker(items[i], i);
}
}
await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
return results;
}
async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) {
const limit = options.limit ? Number(options.limit) : null;
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
const concurrency = options.concurrency || CONCURRENCY;
let handles = await fetchCollectionHandles(slug);
if (limit && handles.length > limit) {
handles = handles.slice(0, limit);
}
let done = 0;
const products = await mapWithConcurrency(handles, concurrency, async (handle, i) => {
const product = await fetchProductDetail(handle, name, slug, collectionUrl);
done++;
if (done % 20 === 0 || done === handles.length) {
console.log(`[MOTOUSHER:${name}] ${done}/${handles.length} fetched`);
}
if (onProgress) onProgress({ done, total: handles.length, product: product.title || handle });
return product;
});
return products;
}
module.exports = { scrapeMotousherBrand, fetchCollectionHandles, fetchProductDetail };