Add motousher and dirtstreet as active import pipeline sources
Both sources are now registered in sources/index.js and fully wired
into the 6-stage pipeline (fetch → download → watermark → upload →
convert → upsert). The frontend will automatically show them as tabs
via GET /pipeline/sources without any frontend changes needed.
motousher/ (Shopify JSON API — 12 brands, ~2,446 products):
- scraper.js: fetches /collections/{slug}/products.json + /products/{handle}.json
- converter.js: maps scraped products to standard pipeline format
- index.js: fetchWebsiteData() loops all brands, normalises to
productSummary.img format for shared download/upload utilities
- Supports MOTOUSHER_BRANDS env var to filter brands on a run
dirtstreet/ (WooCommerce HTML + JSON-LD — 5 brands, ~1,087 products):
- scraper.js: pure fetch, paginates /brand/{slug}/page/N/,
extracts price from offers.priceSpecification[0].price,
stock from JSON-LD availability field
- converter.js: maps scraped products to standard pipeline format,
builds descriptionHtml from body + short desc + attributes table
- index.js: fetchWebsiteData() loops all brands, normalises to
productSummary.img format
- Supports DIRTSTREET_BRANDS env var to filter brands on a run
sources/index.js: registered all 4 sources (kyt, brocks-performance,
motousher, dirtstreet). GET /pipeline/sources now returns all 4.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b8d9478afa
commit
c0132ab0aa
@ -0,0 +1,14 @@
|
|||||||
|
/**
|
||||||
|
* Dirtstreet.in brand list — production config.
|
||||||
|
* Source: https://dirtstreet.in/shop-by-brands/
|
||||||
|
*/
|
||||||
|
|
||||||
|
const BRANDS = [
|
||||||
|
{ name: "SC Project", slug: "scproject", brandUrl: "https://dirtstreet.in/brand/scproject/" },
|
||||||
|
{ name: "Evotech Performance", slug: "evotechperformance", brandUrl: "https://dirtstreet.in/brand/evotechperformance/" },
|
||||||
|
{ name: "DNA Air Filters", slug: "dnaairfilters", brandUrl: "https://dirtstreet.in/brand/dnaairfilters/" },
|
||||||
|
{ name: "WRS", slug: "wrs", brandUrl: "https://dirtstreet.in/brand/wrs/" },
|
||||||
|
{ name: "Zero Gravity Racing", slug: "zerogravityracing", brandUrl: "https://dirtstreet.in/brand/zerogravityracing/" },
|
||||||
|
];
|
||||||
|
|
||||||
|
module.exports = { BRANDS };
|
||||||
@ -0,0 +1,143 @@
|
|||||||
|
const path = require("node:path");
|
||||||
|
|
||||||
|
function slugify(str) {
|
||||||
|
return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
function getUploadedImageUrl(imgPath, uploadedImageMap) {
|
||||||
|
if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null;
|
||||||
|
const bySourcePath = uploadedImageMap.bySourcePath || {};
|
||||||
|
return bySourcePath[String(imgPath)]?.url || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getImageFileName(imagePath) {
|
||||||
|
try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); }
|
||||||
|
}
|
||||||
|
|
||||||
|
function availabilityToStockStatus(availability) {
|
||||||
|
if (!availability) return null;
|
||||||
|
const url = String(availability).toLowerCase();
|
||||||
|
if (url.includes("instock")) return "instock";
|
||||||
|
if (url.includes("outofstock")) return "outofstock";
|
||||||
|
if (url.includes("backorder") || url.includes("preorder")) return "onbackorder";
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildDescriptionHtml(record) {
|
||||||
|
const parts = [];
|
||||||
|
|
||||||
|
if (record.descriptionHtml) {
|
||||||
|
parts.push(record.descriptionHtml);
|
||||||
|
} else if (record.description) {
|
||||||
|
parts.push(`<p>${record.description.replace(/\n/g, "<br/>")}</p>`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (record.shortDescription && !parts.join("").includes(record.shortDescription)) {
|
||||||
|
parts.unshift(`<p><strong>${record.shortDescription}</strong></p>`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Array.isArray(record.attributes) && record.attributes.length) {
|
||||||
|
const rows = record.attributes
|
||||||
|
.map((attr) => `<tr><th>${attr.name}</th><td>${attr.value}</td></tr>`)
|
||||||
|
.join("");
|
||||||
|
parts.push(`<table><tbody>${rows}</tbody></table>`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return parts.join("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertDirtstreetRecordToShopifyReady(record, options = {}) {
|
||||||
|
const brand = options.brand || record.brand || "Dirtstreet";
|
||||||
|
const uploadedImageMap = options.uploadedImageMap || null;
|
||||||
|
|
||||||
|
const title = record.title || "Untitled Product";
|
||||||
|
const sku = record.sku || slugify(title);
|
||||||
|
const productId = sku || slugify(`${brand}-${title}`);
|
||||||
|
const price = Number(record.price ?? 0);
|
||||||
|
const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null;
|
||||||
|
const imagePaths = Array.isArray(record.images) ? record.images : [];
|
||||||
|
|
||||||
|
// Stock: prefer schema availability, fall back to stockStatus
|
||||||
|
const stockStatus = availabilityToStockStatus(record.availability) || record.stockStatus || null;
|
||||||
|
const quantity = stockStatus === "instock" ? 1 : 0;
|
||||||
|
|
||||||
|
const descriptionHtml = buildDescriptionHtml(record);
|
||||||
|
|
||||||
|
const files = imagePaths
|
||||||
|
.map((imgPath) => ({
|
||||||
|
type: "Image",
|
||||||
|
url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath,
|
||||||
|
media_content: getImageFileName(imgPath),
|
||||||
|
source_path: imgPath,
|
||||||
|
}))
|
||||||
|
.filter((f) => f.url);
|
||||||
|
|
||||||
|
// Categories become subcategory / collections
|
||||||
|
const categories = Array.isArray(record.categories) ? record.categories : [];
|
||||||
|
const [primaryCategory, ...subCategories] = categories;
|
||||||
|
const category = primaryCategory || "Motorcycle Parts";
|
||||||
|
const subcategory = subCategories.join(", ");
|
||||||
|
|
||||||
|
// Variants — dirtstreet products are typically simple (single variant)
|
||||||
|
const variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }];
|
||||||
|
|
||||||
|
const tags = [
|
||||||
|
brand,
|
||||||
|
category,
|
||||||
|
...categories,
|
||||||
|
sku,
|
||||||
|
record.stockStatus || "",
|
||||||
|
...(Array.isArray(record.tags) ? record.tags : []),
|
||||||
|
].filter(Boolean).map((t) => String(t).trim());
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: productId,
|
||||||
|
source: {
|
||||||
|
productId,
|
||||||
|
sourceKey: "dirtstreet",
|
||||||
|
url: record.url || null,
|
||||||
|
image_paths: imagePaths,
|
||||||
|
},
|
||||||
|
attributes: {
|
||||||
|
product_name: title,
|
||||||
|
brand,
|
||||||
|
category,
|
||||||
|
subcategory,
|
||||||
|
part_number: sku,
|
||||||
|
mfr_part_number: record.mpn || sku,
|
||||||
|
price,
|
||||||
|
compare_price: compareAtPrice,
|
||||||
|
purchase_cost: null,
|
||||||
|
barcode: record.gtin || record.barcode || "",
|
||||||
|
price_group: null,
|
||||||
|
units_per_sku: null,
|
||||||
|
part_description: descriptionHtml,
|
||||||
|
descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [],
|
||||||
|
files,
|
||||||
|
image_paths: imagePaths,
|
||||||
|
dimensions: [{ weight: 0 }],
|
||||||
|
total_quantity: quantity,
|
||||||
|
inventorydata: { inventory: { main: quantity } },
|
||||||
|
options: [],
|
||||||
|
variants,
|
||||||
|
fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] },
|
||||||
|
tags,
|
||||||
|
handle: slugify(`${brand}-${title}-${productId}`),
|
||||||
|
source_url: record.url || null,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertDirtstreetJsonToShopifyProducts(input, options = {}) {
|
||||||
|
const records = Array.isArray(input?.products) ? input.products : [];
|
||||||
|
return records.map((record) => {
|
||||||
|
// Each record in aggregated JSON has raw scraped data in record.scraped
|
||||||
|
const scraped = record.scraped || record;
|
||||||
|
return convertDirtstreetRecordToShopifyReady(scraped, {
|
||||||
|
brand: options.brand || scraped.brand || record.brand,
|
||||||
|
uploadedImageMap: options.uploadedImageMap,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = { convertDirtstreetRecordToShopifyReady, convertDirtstreetJsonToShopifyProducts };
|
||||||
135
src/business-logic/import-pipeline/sources/dirtstreet/index.js
Normal file
135
src/business-logic/import-pipeline/sources/dirtstreet/index.js
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
const fs = require("node:fs/promises");
|
||||||
|
const path = require("node:path");
|
||||||
|
const { BRANDS } = require("./brands");
|
||||||
|
const { scrapeDirtstreetBrand } = require("./scraper");
|
||||||
|
const { convertDirtstreetJsonToShopifyProducts } = require("./converter");
|
||||||
|
|
||||||
|
const sourceKey = "dirtstreet";
|
||||||
|
const label = "Dirtstreet";
|
||||||
|
const dataDir = path.join("data", "sources", sourceKey);
|
||||||
|
|
||||||
|
function paths() {
|
||||||
|
return {
|
||||||
|
aggregatedJson: path.join(dataDir, "01_products_aggregated.json"),
|
||||||
|
historyJson: path.join(dataDir, "01_products_run_history.json"),
|
||||||
|
downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"),
|
||||||
|
watermarkState: path.join(dataDir, "03_watermark_state.json"),
|
||||||
|
imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"),
|
||||||
|
uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"),
|
||||||
|
shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"),
|
||||||
|
logsDir: path.join(dataDir, "99_run_logs"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchWebsiteData({ paths: runPaths }) {
|
||||||
|
const brandsToScrape = process.env.DIRTSTREET_BRANDS
|
||||||
|
? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
||||||
|
: BRANDS;
|
||||||
|
|
||||||
|
const allProducts = [];
|
||||||
|
const runSummary = [];
|
||||||
|
const now = new Date().toISOString();
|
||||||
|
|
||||||
|
for (const brand of brandsToScrape) {
|
||||||
|
console.log(`[DIRTSTREET] Scraping brand: ${brand.name} (${brand.brandUrl})`);
|
||||||
|
const brandStartedAt = new Date().toISOString();
|
||||||
|
|
||||||
|
try {
|
||||||
|
const products = await scrapeDirtstreetBrand(brand);
|
||||||
|
|
||||||
|
for (const product of products) {
|
||||||
|
allProducts.push({
|
||||||
|
productId: product.sku || product.url,
|
||||||
|
sourceKey,
|
||||||
|
brand: product.brand,
|
||||||
|
brandSlug: product.brandSlug,
|
||||||
|
// productSummary.img is read by shared downloadImages + uploadImages utilities
|
||||||
|
productSummary: {
|
||||||
|
id: product.sku || product.url,
|
||||||
|
name: product.title,
|
||||||
|
img: product.images || [],
|
||||||
|
cost: { mrp: product.price || 0 },
|
||||||
|
},
|
||||||
|
scraped: product,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
runSummary.push({
|
||||||
|
brand: brand.name,
|
||||||
|
slug: brand.slug,
|
||||||
|
startedAt: brandStartedAt,
|
||||||
|
completedAt: new Date().toISOString(),
|
||||||
|
totalProducts: products.length,
|
||||||
|
success: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`[DIRTSTREET] ${brand.name}: ${products.length} products scraped`);
|
||||||
|
} catch (err) {
|
||||||
|
console.log(`[DIRTSTREET] ${brand.name} failed: ${err.message}`);
|
||||||
|
runSummary.push({
|
||||||
|
brand: brand.name,
|
||||||
|
slug: brand.slug,
|
||||||
|
startedAt: brandStartedAt,
|
||||||
|
completedAt: new Date().toISOString(),
|
||||||
|
totalProducts: 0,
|
||||||
|
success: false,
|
||||||
|
error: err.message,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const analysis = {
|
||||||
|
timestamp: now,
|
||||||
|
totalBrands: brandsToScrape.length,
|
||||||
|
totalProductsUnique: allProducts.length,
|
||||||
|
detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length,
|
||||||
|
detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length,
|
||||||
|
runSummary,
|
||||||
|
};
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
generatedAt: now,
|
||||||
|
sourceKey,
|
||||||
|
sourceLabel: label,
|
||||||
|
analysis,
|
||||||
|
products: allProducts,
|
||||||
|
};
|
||||||
|
|
||||||
|
await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true });
|
||||||
|
await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8");
|
||||||
|
|
||||||
|
let history = [];
|
||||||
|
try {
|
||||||
|
const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8");
|
||||||
|
history = JSON.parse(raw);
|
||||||
|
if (!Array.isArray(history)) history = [];
|
||||||
|
} catch { history = []; }
|
||||||
|
history.push(analysis);
|
||||||
|
await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8");
|
||||||
|
|
||||||
|
console.log(`[DIRTSTREET] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
analysis,
|
||||||
|
outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson),
|
||||||
|
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertToShopifyProducts(input, options = {}) {
|
||||||
|
return convertDirtstreetJsonToShopifyProducts(input, {
|
||||||
|
brand: options.brand || label,
|
||||||
|
uploadedImageMap: options.uploadedImageMap,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
sourceKey,
|
||||||
|
label,
|
||||||
|
defaultBrand: "Dirtstreet",
|
||||||
|
defaultImageBaseUrl: "",
|
||||||
|
envImageBaseUrl: "DIRTSTREET_IMAGE_BASE_URL",
|
||||||
|
paths,
|
||||||
|
fetchWebsiteData,
|
||||||
|
convertToShopifyProducts,
|
||||||
|
};
|
||||||
423
src/business-logic/import-pipeline/sources/dirtstreet/scraper.js
Normal file
423
src/business-logic/import-pipeline/sources/dirtstreet/scraper.js
Normal file
@ -0,0 +1,423 @@
|
|||||||
|
/**
|
||||||
|
* Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser).
|
||||||
|
*
|
||||||
|
* Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org
|
||||||
|
* markup. We fetch HTML directly and extract all fields via regex.
|
||||||
|
*
|
||||||
|
* 1. Brand pages paginated at /brand/{slug}/page/N/
|
||||||
|
* 2. Product detail via HTML + JSON-LD
|
||||||
|
*/
|
||||||
|
|
||||||
|
const BASE_URL = "https://dirtstreet.in";
|
||||||
|
const CONCURRENCY = 3;
|
||||||
|
|
||||||
|
function sleep(ms) {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchHtml(url, attempt = 1) {
|
||||||
|
try {
|
||||||
|
const res = await fetch(url, {
|
||||||
|
headers: {
|
||||||
|
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
|
||||||
|
"cache-control": "no-cache",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (res.status === 429 || res.status >= 500) {
|
||||||
|
if (attempt <= 4) {
|
||||||
|
const wait = attempt * 2000;
|
||||||
|
console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`);
|
||||||
|
await sleep(wait);
|
||||||
|
return fetchHtml(url, attempt + 1);
|
||||||
|
}
|
||||||
|
throw new Error(`HTTP ${res.status} after retries`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (res.status === 404) return null;
|
||||||
|
if (!res.ok) throw new Error(`HTTP ${res.status}`);
|
||||||
|
return res.text();
|
||||||
|
} catch (err) {
|
||||||
|
if (attempt <= 3 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) {
|
||||||
|
await sleep(attempt * 2000);
|
||||||
|
return fetchHtml(url, attempt + 1);
|
||||||
|
}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function decodeEntities(str) {
|
||||||
|
return String(str || "")
|
||||||
|
.replace(/&/g, "&")
|
||||||
|
.replace(/</g, "<")
|
||||||
|
.replace(/>/g, ">")
|
||||||
|
.replace(/"/g, '"')
|
||||||
|
.replace(/'/g, "'")
|
||||||
|
.replace(/×/g, "x")
|
||||||
|
.replace(/ /g, " ")
|
||||||
|
.replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code)));
|
||||||
|
}
|
||||||
|
|
||||||
|
function cleanText(value) {
|
||||||
|
return decodeEntities(String(value || ""))
|
||||||
|
.replace(/<[^>]+>/g, " ")
|
||||||
|
.replace(/\s+/g, " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function toNumber(value) {
|
||||||
|
if (!value) return null;
|
||||||
|
const match = String(value).match(/([\d,]+\.?\d*)/);
|
||||||
|
if (!match) return null;
|
||||||
|
const n = Number.parseFloat(match[1].replace(/,/g, ""));
|
||||||
|
return Number.isNaN(n) ? null : n;
|
||||||
|
}
|
||||||
|
|
||||||
|
function extractPriceFromHtml(html) {
|
||||||
|
// data-price attribute (WooCommerce sets this for JS cart)
|
||||||
|
const dp = html.match(/data-price=["']([\d.]+)["']/);
|
||||||
|
if (dp) { const n = Number.parseFloat(dp[1]); if (n > 0) return n; }
|
||||||
|
|
||||||
|
// Pattern on dirtstreet: ₹</span>198,000.00</bdi>
|
||||||
|
// Collect ALL such values, filter out 0, take the first valid one (the product price)
|
||||||
|
const bdiPattern = /(?:₹|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi;
|
||||||
|
let bdiMatch;
|
||||||
|
while ((bdiMatch = bdiPattern.exec(html)) !== null) {
|
||||||
|
const n = toNumber(bdiMatch[1]);
|
||||||
|
if (n > 0) return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
// <ins> sale price block
|
||||||
|
const ins = html.match(/<ins>[\s\S]{0,300}?(?:₹|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>[\s\S]{0,300}?<\/ins>/i);
|
||||||
|
if (ins) { const n = toNumber(ins[1]); if (n > 0) return n; }
|
||||||
|
|
||||||
|
// Fallback: ₹ number anywhere
|
||||||
|
const entity = html.match(/₹[^>]*>([\d,]+(?:\.\d+)?)/);
|
||||||
|
if (entity) { const n = toNumber(entity[1]); if (n > 0) return n; }
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function availabilityToStockStatus(availability) {
|
||||||
|
if (!availability) return null;
|
||||||
|
const url = String(availability).toLowerCase();
|
||||||
|
if (url.includes("instock")) return "instock";
|
||||||
|
if (url.includes("outofstock")) return "outofstock";
|
||||||
|
if (url.includes("backorder") || url.includes("preorder")) return "onbackorder";
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// JSON-LD extraction
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function extractJsonLd(html) {
|
||||||
|
const results = [];
|
||||||
|
const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
||||||
|
let match;
|
||||||
|
while ((match = regex.exec(html)) !== null) {
|
||||||
|
try {
|
||||||
|
results.push(JSON.parse(match[1].trim()));
|
||||||
|
} catch {
|
||||||
|
// skip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Collection page: extract product URLs
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function extractProductUrls(html) {
|
||||||
|
const urls = new Set();
|
||||||
|
const regex = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi;
|
||||||
|
let match;
|
||||||
|
while ((match = regex.exec(html)) !== null) {
|
||||||
|
let url = match[1];
|
||||||
|
if (url.startsWith("/")) url = `${BASE_URL}${url}`;
|
||||||
|
if (url.includes("/product-category/") || url.includes("/product-tag/")) continue;
|
||||||
|
// normalize trailing slash
|
||||||
|
url = url.replace(/\/?$/, "/");
|
||||||
|
urls.add(url);
|
||||||
|
}
|
||||||
|
return Array.from(urls);
|
||||||
|
}
|
||||||
|
|
||||||
|
function hasNextPage(html, currentPage, slug) {
|
||||||
|
const pattern = new RegExp(
|
||||||
|
`href=["'][^"']*/brand/${slug}/page/${currentPage + 1}/?["']`,
|
||||||
|
"i"
|
||||||
|
);
|
||||||
|
return pattern.test(html);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchBrandProductUrls(slug, brandUrl) {
|
||||||
|
const allUrls = new Set();
|
||||||
|
let page = 1;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const url = page === 1 ? brandUrl : `${brandUrl}page/${page}/`;
|
||||||
|
console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`);
|
||||||
|
|
||||||
|
const html = await fetchHtml(url);
|
||||||
|
if (!html) break;
|
||||||
|
|
||||||
|
const urls = extractProductUrls(html);
|
||||||
|
const before = allUrls.size;
|
||||||
|
urls.forEach((u) => allUrls.add(u));
|
||||||
|
|
||||||
|
console.log(`[DIRTSTREET] Page ${page} -> ${urls.length} product URLs (total: ${allUrls.size})`);
|
||||||
|
|
||||||
|
if (!hasNextPage(html, page, slug) || allUrls.size === before) break;
|
||||||
|
page++;
|
||||||
|
await sleep(300);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Array.from(allUrls);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Extra WooCommerce fields from HTML
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function extractExtraFields(html) {
|
||||||
|
const extra = {
|
||||||
|
sku: null,
|
||||||
|
categories: [],
|
||||||
|
tags: [],
|
||||||
|
shortDescription: null,
|
||||||
|
stockStatus: null,
|
||||||
|
attributes: [],
|
||||||
|
variations: [],
|
||||||
|
};
|
||||||
|
|
||||||
|
// SKU - strip "SKU:" label prefix
|
||||||
|
const skuMatch = html.match(/<span[^>]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||||
|
if (skuMatch) {
|
||||||
|
extra.sku = cleanText(skuMatch[1])
|
||||||
|
.replace(/^SKU\s*:\s*/i, "")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stock status — read from the main product wrapper class only, not whole page
|
||||||
|
// The first <div/article class="product ... instock/outofstock ..."> is the current product
|
||||||
|
const productClassMatch = html.match(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i);
|
||||||
|
if (productClassMatch) {
|
||||||
|
const cls = productClassMatch[1].toLowerCase();
|
||||||
|
if (cls === "instock") extra.stockStatus = "instock";
|
||||||
|
else if (cls === "outofstock") extra.stockStatus = "outofstock";
|
||||||
|
else extra.stockStatus = "onbackorder";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Categories
|
||||||
|
const catMatch = html.match(/<span[^>]+class="[^"]*posted_in[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||||
|
if (catMatch) {
|
||||||
|
const catRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
|
||||||
|
let m;
|
||||||
|
while ((m = catRegex.exec(catMatch[1])) !== null) {
|
||||||
|
const cat = cleanText(m[1]);
|
||||||
|
if (cat) extra.categories.push(cat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tags
|
||||||
|
const tagMatch = html.match(/<span[^>]+class="[^"]*tagged_as[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||||
|
if (tagMatch) {
|
||||||
|
const tagRegex = /<a[^>]*>([\s\S]*?)<\/a>/gi;
|
||||||
|
let m;
|
||||||
|
while ((m = tagRegex.exec(tagMatch[1])) !== null) {
|
||||||
|
const tag = cleanText(m[1]);
|
||||||
|
if (tag) extra.tags.push(tag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Short description
|
||||||
|
const sdMatch = html.match(
|
||||||
|
/<div[^>]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i
|
||||||
|
);
|
||||||
|
if (sdMatch) extra.shortDescription = cleanText(sdMatch[1]);
|
||||||
|
|
||||||
|
// Attributes table
|
||||||
|
const attrMatch = html.match(
|
||||||
|
/<table[^>]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i
|
||||||
|
);
|
||||||
|
if (attrMatch) {
|
||||||
|
const rows = attrMatch[1].match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || [];
|
||||||
|
for (const row of rows) {
|
||||||
|
const th = row.match(/<th[^>]*>([\s\S]*?)<\/th>/i);
|
||||||
|
const td = row.match(/<td[^>]*>([\s\S]*?)<\/td>/i);
|
||||||
|
if (th && td) {
|
||||||
|
const key = cleanText(th[1]);
|
||||||
|
const value = cleanText(td[1]);
|
||||||
|
if (key && value) extra.attributes.push({ name: key, value });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WooCommerce variations JSON in data attribute
|
||||||
|
const formMatch = html.match(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/);
|
||||||
|
if (formMatch) {
|
||||||
|
try {
|
||||||
|
extra.variations = JSON.parse(
|
||||||
|
formMatch[1].replace(/"/g, '"').replace(/"/g, '"')
|
||||||
|
);
|
||||||
|
} catch { /* skip */ }
|
||||||
|
}
|
||||||
|
|
||||||
|
return extra;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Scrape one product detail page
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function scrapeProductDetail(productUrl, brandName, brandSlug) {
|
||||||
|
const html = await fetchHtml(productUrl);
|
||||||
|
if (!html) {
|
||||||
|
return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" };
|
||||||
|
}
|
||||||
|
|
||||||
|
const jsonLdBlocks = extractJsonLd(html);
|
||||||
|
// Try direct Product type first, then search every @graph block
|
||||||
|
let schema = jsonLdBlocks.find((b) => b["@type"] === "Product");
|
||||||
|
if (!schema) {
|
||||||
|
for (const block of jsonLdBlocks) {
|
||||||
|
if (Array.isArray(block["@graph"])) {
|
||||||
|
const found = block["@graph"].find((g) => g["@type"] === "Product");
|
||||||
|
if (found) { schema = found; break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const extra = extractExtraFields(html);
|
||||||
|
|
||||||
|
const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {});
|
||||||
|
|
||||||
|
// Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback
|
||||||
|
const priceSpec = Array.isArray(offer?.priceSpecification)
|
||||||
|
? offer.priceSpecification[0]?.price
|
||||||
|
: null;
|
||||||
|
let price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice);
|
||||||
|
if (!price || price === 0) price = extractPriceFromHtml(html);
|
||||||
|
if (price === 0) price = null;
|
||||||
|
|
||||||
|
// Stock: derive from JSON-LD availability (most reliable), fall back to HTML class
|
||||||
|
const stockFromSchema = availabilityToStockStatus(offer?.availability);
|
||||||
|
const stockStatus = stockFromSchema || extra.stockStatus || null;
|
||||||
|
|
||||||
|
const currency = offer?.priceCurrency || "INR";
|
||||||
|
|
||||||
|
const images = Array.isArray(schema?.image)
|
||||||
|
? schema.image.map((img) => (typeof img === "string" ? img : img?.url || img?.contentUrl)).filter(Boolean)
|
||||||
|
: schema?.image
|
||||||
|
? [typeof schema.image === "string" ? schema.image : (schema.image?.url || schema.image?.contentUrl)]
|
||||||
|
: [];
|
||||||
|
|
||||||
|
return {
|
||||||
|
recordType: "product",
|
||||||
|
source: "dirtstreet",
|
||||||
|
brand: schema?.brand?.name || brandName,
|
||||||
|
brandSlug,
|
||||||
|
url: productUrl,
|
||||||
|
title: cleanText(schema?.name || ""),
|
||||||
|
sku: extra.sku || cleanText(schema?.sku || ""),
|
||||||
|
mpn: cleanText(schema?.mpn || ""),
|
||||||
|
gtin: cleanText(schema?.gtin || schema?.gtin13 || ""),
|
||||||
|
price,
|
||||||
|
currency,
|
||||||
|
priceRaw: price != null ? String(price) : null,
|
||||||
|
compareAtPrice: toNumber(offer?.highPrice) || null,
|
||||||
|
availability: offer?.availability || null,
|
||||||
|
stockStatus,
|
||||||
|
image: images[0] || null,
|
||||||
|
images,
|
||||||
|
description: cleanText(schema?.description || ""),
|
||||||
|
shortDescription: extra.shortDescription,
|
||||||
|
categories: extra.categories,
|
||||||
|
tags: extra.tags,
|
||||||
|
attributes: extra.attributes,
|
||||||
|
variations: extra.variations,
|
||||||
|
aggregateRating: schema?.aggregateRating
|
||||||
|
? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount }
|
||||||
|
: null,
|
||||||
|
scrapeError: null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Concurrency helper
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function mapWithConcurrency(items, concurrency, worker) {
|
||||||
|
const results = new Array(items.length);
|
||||||
|
let index = 0;
|
||||||
|
async function run() {
|
||||||
|
while (true) {
|
||||||
|
const i = index++;
|
||||||
|
if (i >= items.length) return;
|
||||||
|
results[i] = await worker(items[i], i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Main export: scrape one brand
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) {
|
||||||
|
const limit = options.limit ? Number(options.limit) : null;
|
||||||
|
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
||||||
|
const concurrency = options.concurrency || CONCURRENCY;
|
||||||
|
|
||||||
|
let productUrls = await fetchBrandProductUrls(slug, brandUrl);
|
||||||
|
console.log(`[DIRTSTREET:${name}] Found ${productUrls.length} products`);
|
||||||
|
|
||||||
|
if (limit && productUrls.length > limit) {
|
||||||
|
console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${limit} of ${productUrls.length}`);
|
||||||
|
productUrls = productUrls.slice(0, limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
let done = 0;
|
||||||
|
const products = await mapWithConcurrency(productUrls, concurrency, async (url, i) => {
|
||||||
|
const product = await scrapeProductDetail(url, name, slug);
|
||||||
|
done++;
|
||||||
|
if (done % 10 === 0 || done === productUrls.length) {
|
||||||
|
console.log(`[DIRTSTREET:${name}] ${done}/${productUrls.length} products scraped`);
|
||||||
|
}
|
||||||
|
if (onProgress) onProgress({ done, total: productUrls.length, product: product.title || url });
|
||||||
|
return product;
|
||||||
|
});
|
||||||
|
|
||||||
|
return products;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Standalone test
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if (require.main === module) {
|
||||||
|
const { BRANDS } = require("./brands");
|
||||||
|
const brand = BRANDS[0];
|
||||||
|
console.log(`Testing scraper for: ${brand.name}`);
|
||||||
|
scrapeDirtstreetBrand(brand, { limit: 3 })
|
||||||
|
.then((products) => {
|
||||||
|
console.log(`\nScraped ${products.length} products`);
|
||||||
|
console.log("Sample product:");
|
||||||
|
console.log(JSON.stringify(products[0], null, 2));
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
console.error("Scrape failed:", err.message);
|
||||||
|
process.exitCode = 1;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail };
|
||||||
@ -1,9 +1,13 @@
|
|||||||
const kyt = require("./kyt");
|
const kyt = require("./kyt");
|
||||||
const brocksPerformance = require("./brocks-performance");
|
const brocksPerformance = require("./brocks-performance");
|
||||||
|
const motousher = require("./motousher");
|
||||||
|
const dirtstreet = require("./dirtstreet");
|
||||||
|
|
||||||
const sources = {
|
const sources = {
|
||||||
[kyt.sourceKey]: kyt,
|
[kyt.sourceKey]: kyt,
|
||||||
[brocksPerformance.sourceKey]: brocksPerformance,
|
[brocksPerformance.sourceKey]: brocksPerformance,
|
||||||
|
[motousher.sourceKey]: motousher,
|
||||||
|
[dirtstreet.sourceKey]: dirtstreet,
|
||||||
};
|
};
|
||||||
|
|
||||||
function normalizeSourceKey(sourceKey) {
|
function normalizeSourceKey(sourceKey) {
|
||||||
|
|||||||
@ -0,0 +1,21 @@
|
|||||||
|
/**
|
||||||
|
* Motousher.com brand list — production config.
|
||||||
|
* Source: https://www.motousher.com/pages/partner-brands
|
||||||
|
*/
|
||||||
|
|
||||||
|
const BRANDS = [
|
||||||
|
{ name: "All Balls Racing", slug: "all-balls-racing", collectionUrl: "https://www.motousher.com/collections/all-balls-racing" },
|
||||||
|
{ name: "DID Chains", slug: "did-chains", collectionUrl: "https://www.motousher.com/collections/did-chains" },
|
||||||
|
{ name: "EBC Brakes", slug: "ebc-brakes", collectionUrl: "https://www.motousher.com/collections/ebc-brakes" },
|
||||||
|
{ name: "Esjot Sprockets", slug: "esjot-sprockets", collectionUrl: "https://www.motousher.com/collections/esjot-sprockets" },
|
||||||
|
{ name: "Evans Coolant", slug: "evans-coolant", collectionUrl: "https://www.motousher.com/collections/evans-coolant" },
|
||||||
|
{ name: "Grip Puppies", slug: "grip-puppies", collectionUrl: "https://www.motousher.com/collections/grip-puppies" },
|
||||||
|
{ name: "HiFlo Filters", slug: "hi-flo", collectionUrl: "https://www.motousher.com/collections/hi-flo" },
|
||||||
|
{ name: "JT Sprockets", slug: "jt-sprockets", collectionUrl: "https://www.motousher.com/collections/jt-sprockets" },
|
||||||
|
{ name: "Maxima Racing Oils", slug: "maxima-racing-oils", collectionUrl: "https://www.motousher.com/collections/maxima-racing-oils" },
|
||||||
|
{ name: "Putoline", slug: "putoline", collectionUrl: "https://www.motousher.com/collections/putoline" },
|
||||||
|
{ name: "Ram Mount", slug: "ram-mount", collectionUrl: "https://www.motousher.com/collections/ram-mount" },
|
||||||
|
{ name: "Wunderlich", slug: "wunderlich", collectionUrl: "https://www.motousher.com/collections/wunderlich" },
|
||||||
|
];
|
||||||
|
|
||||||
|
module.exports = { BRANDS };
|
||||||
@ -0,0 +1,113 @@
|
|||||||
|
const path = require("node:path");
|
||||||
|
|
||||||
|
function slugify(str) {
|
||||||
|
return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
function getUploadedImageUrl(imgPath, uploadedImageMap) {
|
||||||
|
if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null;
|
||||||
|
const bySourcePath = uploadedImageMap.bySourcePath || {};
|
||||||
|
return bySourcePath[String(imgPath)]?.url || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getImageFileName(imagePath) {
|
||||||
|
try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); }
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertMotousherRecordToShopifyReady(record, options = {}) {
|
||||||
|
const brand = options.brand || record.brand || "Motousher";
|
||||||
|
const uploadedImageMap = options.uploadedImageMap || null;
|
||||||
|
|
||||||
|
const productId = record.handle || slugify(`${record.title || ""}-${record.sku || ""}`);
|
||||||
|
const title = record.title || "Untitled Product";
|
||||||
|
const sku = record.sku || productId;
|
||||||
|
const price = Number(record.price ?? 0);
|
||||||
|
const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null;
|
||||||
|
const quantity = record.variants?.reduce((sum, v) => sum + (v.quantity || 0), 0) || 0;
|
||||||
|
const descriptionHtml = record.descriptionHtml || "";
|
||||||
|
const imagePaths = Array.isArray(record.images) ? record.images : [];
|
||||||
|
|
||||||
|
const files = imagePaths
|
||||||
|
.map((imgPath) => ({
|
||||||
|
type: "Image",
|
||||||
|
url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath,
|
||||||
|
media_content: getImageFileName(imgPath),
|
||||||
|
source_path: imgPath,
|
||||||
|
}))
|
||||||
|
.filter((f) => f.url);
|
||||||
|
|
||||||
|
// Options: filter out "Title"/"Default Title" Shopify placeholder
|
||||||
|
const sourceOptions = Array.isArray(record.options)
|
||||||
|
? record.options.filter((opt) => opt.name !== "Title" || !opt.values?.every((v) => v === "Default Title"))
|
||||||
|
: [];
|
||||||
|
|
||||||
|
// Variants
|
||||||
|
let variants;
|
||||||
|
if (Array.isArray(record.variants) && record.variants.length) {
|
||||||
|
variants = record.variants.map((v, index) => ({
|
||||||
|
sku: v.sku || (index === 0 ? sku : `${sku}-${index + 1}`),
|
||||||
|
price: Number(v.price ?? price),
|
||||||
|
compare_price: v.compare_price != null ? Number(v.compare_price) : compareAtPrice,
|
||||||
|
quantity: Number(v.quantity ?? 0),
|
||||||
|
optionValues: Array.isArray(v.optionValues) ? v.optionValues : [],
|
||||||
|
}));
|
||||||
|
} else {
|
||||||
|
variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }];
|
||||||
|
}
|
||||||
|
|
||||||
|
const category = record.productType || "Motorcycle Parts";
|
||||||
|
const tags = [
|
||||||
|
brand,
|
||||||
|
category,
|
||||||
|
record.brandSlug,
|
||||||
|
sku,
|
||||||
|
...(Array.isArray(record.tags) ? record.tags : []),
|
||||||
|
].filter(Boolean).map((t) => String(t).trim());
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: productId,
|
||||||
|
source: {
|
||||||
|
productId,
|
||||||
|
sourceKey: "motousher",
|
||||||
|
url: record.url || null,
|
||||||
|
image_paths: imagePaths,
|
||||||
|
},
|
||||||
|
attributes: {
|
||||||
|
product_name: title,
|
||||||
|
brand,
|
||||||
|
category,
|
||||||
|
subcategory: record.productType || "",
|
||||||
|
part_number: sku,
|
||||||
|
mfr_part_number: sku,
|
||||||
|
price,
|
||||||
|
compare_price: compareAtPrice,
|
||||||
|
purchase_cost: null,
|
||||||
|
barcode: record.barcode || "",
|
||||||
|
price_group: null,
|
||||||
|
units_per_sku: null,
|
||||||
|
part_description: descriptionHtml,
|
||||||
|
descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [],
|
||||||
|
files,
|
||||||
|
image_paths: imagePaths,
|
||||||
|
dimensions: [{ weight: record.weight ? record.weight / 1000 : 0 }],
|
||||||
|
total_quantity: quantity,
|
||||||
|
inventorydata: { inventory: { main: quantity } },
|
||||||
|
options: sourceOptions.map((opt) => ({
|
||||||
|
name: opt.name,
|
||||||
|
values: Array.isArray(opt.values) ? opt.values : [],
|
||||||
|
})),
|
||||||
|
variants,
|
||||||
|
fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] },
|
||||||
|
tags,
|
||||||
|
handle: slugify(`${brand}-${title}-${productId}`),
|
||||||
|
source_url: record.url || null,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertMotousherJsonToShopifyProducts(input, options = {}) {
|
||||||
|
const records = Array.isArray(input?.products) ? input.products : [];
|
||||||
|
return records.map((record) => convertMotousherRecordToShopifyReady(record, options));
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = { convertMotousherRecordToShopifyReady, convertMotousherJsonToShopifyProducts };
|
||||||
137
src/business-logic/import-pipeline/sources/motousher/index.js
Normal file
137
src/business-logic/import-pipeline/sources/motousher/index.js
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
const fs = require("node:fs/promises");
|
||||||
|
const path = require("node:path");
|
||||||
|
const { BRANDS } = require("./brands");
|
||||||
|
const { scrapeMotousherBrand } = require("./scraper");
|
||||||
|
const { convertMotousherJsonToShopifyProducts } = require("./converter");
|
||||||
|
|
||||||
|
const sourceKey = "motousher";
|
||||||
|
const label = "Motousher";
|
||||||
|
const dataDir = path.join("data", "sources", sourceKey);
|
||||||
|
|
||||||
|
function paths() {
|
||||||
|
return {
|
||||||
|
aggregatedJson: path.join(dataDir, "01_products_aggregated.json"),
|
||||||
|
historyJson: path.join(dataDir, "01_products_run_history.json"),
|
||||||
|
downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"),
|
||||||
|
watermarkState: path.join(dataDir, "03_watermark_state.json"),
|
||||||
|
imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"),
|
||||||
|
uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"),
|
||||||
|
shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"),
|
||||||
|
logsDir: path.join(dataDir, "99_run_logs"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchWebsiteData({ paths: runPaths }) {
|
||||||
|
const brandsToScrape = process.env.MOTOUSHER_BRANDS
|
||||||
|
? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
||||||
|
: BRANDS;
|
||||||
|
|
||||||
|
const allProducts = [];
|
||||||
|
const runSummary = [];
|
||||||
|
const now = new Date().toISOString();
|
||||||
|
|
||||||
|
for (const brand of brandsToScrape) {
|
||||||
|
console.log(`[MOTOUSHER] Scraping brand: ${brand.name} (${brand.collectionUrl})`);
|
||||||
|
const brandStartedAt = new Date().toISOString();
|
||||||
|
|
||||||
|
try {
|
||||||
|
const products = await scrapeMotousherBrand(brand);
|
||||||
|
|
||||||
|
// Normalize into pipeline-compatible aggregated format
|
||||||
|
for (const product of products) {
|
||||||
|
allProducts.push({
|
||||||
|
productId: product.handle || product.url,
|
||||||
|
sourceKey,
|
||||||
|
brand: product.brand,
|
||||||
|
brandSlug: product.brandSlug,
|
||||||
|
// productSummary.img is read by shared downloadImages + uploadImages utilities
|
||||||
|
productSummary: {
|
||||||
|
id: product.handle,
|
||||||
|
name: product.title,
|
||||||
|
img: product.images || [],
|
||||||
|
cost: { mrp: product.price || 0 },
|
||||||
|
},
|
||||||
|
scraped: product,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
runSummary.push({
|
||||||
|
brand: brand.name,
|
||||||
|
slug: brand.slug,
|
||||||
|
startedAt: brandStartedAt,
|
||||||
|
completedAt: new Date().toISOString(),
|
||||||
|
totalProducts: products.length,
|
||||||
|
success: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`[MOTOUSHER] ${brand.name}: ${products.length} products fetched`);
|
||||||
|
} catch (err) {
|
||||||
|
console.log(`[MOTOUSHER] ${brand.name} failed: ${err.message}`);
|
||||||
|
runSummary.push({
|
||||||
|
brand: brand.name,
|
||||||
|
slug: brand.slug,
|
||||||
|
startedAt: brandStartedAt,
|
||||||
|
completedAt: new Date().toISOString(),
|
||||||
|
totalProducts: 0,
|
||||||
|
success: false,
|
||||||
|
error: err.message,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const analysis = {
|
||||||
|
timestamp: now,
|
||||||
|
totalBrands: brandsToScrape.length,
|
||||||
|
totalProductsUnique: allProducts.length,
|
||||||
|
detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length,
|
||||||
|
detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length,
|
||||||
|
runSummary,
|
||||||
|
};
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
generatedAt: now,
|
||||||
|
sourceKey,
|
||||||
|
sourceLabel: label,
|
||||||
|
analysis,
|
||||||
|
products: allProducts,
|
||||||
|
};
|
||||||
|
|
||||||
|
await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true });
|
||||||
|
await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8");
|
||||||
|
|
||||||
|
// Append to history
|
||||||
|
let history = [];
|
||||||
|
try {
|
||||||
|
const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8");
|
||||||
|
history = JSON.parse(raw);
|
||||||
|
if (!Array.isArray(history)) history = [];
|
||||||
|
} catch { history = []; }
|
||||||
|
history.push(analysis);
|
||||||
|
await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8");
|
||||||
|
|
||||||
|
console.log(`[MOTOUSHER] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
analysis,
|
||||||
|
outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson),
|
||||||
|
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function convertToShopifyProducts(input, options = {}) {
|
||||||
|
return convertMotousherJsonToShopifyProducts(input, {
|
||||||
|
brand: options.brand || label,
|
||||||
|
uploadedImageMap: options.uploadedImageMap,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
sourceKey,
|
||||||
|
label,
|
||||||
|
defaultBrand: "Motousher",
|
||||||
|
defaultImageBaseUrl: "",
|
||||||
|
envImageBaseUrl: "MOTOUSHER_IMAGE_BASE_URL",
|
||||||
|
paths,
|
||||||
|
fetchWebsiteData,
|
||||||
|
convertToShopifyProducts,
|
||||||
|
};
|
||||||
185
src/business-logic/import-pipeline/sources/motousher/scraper.js
Normal file
185
src/business-logic/import-pipeline/sources/motousher/scraper.js
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
/**
|
||||||
|
* Motousher.com scraper — Shopify JSON API.
|
||||||
|
* Uses /collections/{slug}/products.json (paginated) + /products/{handle}.json
|
||||||
|
*/
|
||||||
|
|
||||||
|
const BASE_URL = "https://www.motousher.com";
|
||||||
|
const CONCURRENCY = 3;
|
||||||
|
|
||||||
|
function sleep(ms) {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchJson(url, attempt = 1) {
|
||||||
|
try {
|
||||||
|
const res = await fetch(url, {
|
||||||
|
headers: {
|
||||||
|
accept: "application/json",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (res.status === 429 || res.status >= 500) {
|
||||||
|
if (attempt <= 4) {
|
||||||
|
const wait = attempt * 1500;
|
||||||
|
console.log(`[RETRY] ${url} HTTP ${res.status}, retrying in ${wait}ms (${attempt}/4)`);
|
||||||
|
await sleep(wait);
|
||||||
|
return fetchJson(url, attempt + 1);
|
||||||
|
}
|
||||||
|
throw new Error(`HTTP ${res.status} after retries`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!res.ok) throw new Error(`HTTP ${res.status}`);
|
||||||
|
return res.json();
|
||||||
|
} catch (err) {
|
||||||
|
if (attempt <= 4 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) {
|
||||||
|
await sleep(attempt * 1500);
|
||||||
|
return fetchJson(url, attempt + 1);
|
||||||
|
}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchCollectionHandles(collectionSlug) {
|
||||||
|
const handles = [];
|
||||||
|
let page = 1;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const url = `${BASE_URL}/collections/${collectionSlug}/products.json?limit=250&page=${page}`;
|
||||||
|
const data = await fetchJson(url);
|
||||||
|
const products = Array.isArray(data?.products) ? data.products : [];
|
||||||
|
if (!products.length) break;
|
||||||
|
for (const p of products) {
|
||||||
|
if (p.handle) handles.push(p.handle);
|
||||||
|
}
|
||||||
|
if (products.length < 250) break;
|
||||||
|
page++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return handles;
|
||||||
|
}
|
||||||
|
|
||||||
|
function toNumber(value) {
|
||||||
|
if (!value) return null;
|
||||||
|
const match = String(value).match(/([\d,]+\.?\d*)/);
|
||||||
|
if (!match) return null;
|
||||||
|
const n = Number.parseFloat(match[1].replace(/,/g, ""));
|
||||||
|
return Number.isNaN(n) ? null : n;
|
||||||
|
}
|
||||||
|
|
||||||
|
function bestImageUrl(src) {
|
||||||
|
if (!src) return null;
|
||||||
|
return src.replace(/(_\d+x\d*|_\d+x)(\.\w+)(\?|$)/, "$2$3");
|
||||||
|
}
|
||||||
|
|
||||||
|
function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) {
|
||||||
|
const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean);
|
||||||
|
const firstVariant = raw.variants?.[0] || {};
|
||||||
|
const price = toNumber(firstVariant.price);
|
||||||
|
const compareAtPrice = toNumber(firstVariant.compare_at_price) || null;
|
||||||
|
const quantity = raw.variants?.reduce((sum, v) => sum + (v.inventory_quantity || 0), 0) || 0;
|
||||||
|
|
||||||
|
const options = (raw.options || [])
|
||||||
|
.filter((opt) => opt.name !== "Title" || (opt.values || []).join("") !== "Default Title")
|
||||||
|
.map((opt) => ({
|
||||||
|
name: opt.name,
|
||||||
|
values: opt.values || [],
|
||||||
|
}));
|
||||||
|
|
||||||
|
const hasRealOptions = options.length > 0;
|
||||||
|
|
||||||
|
const variants = (raw.variants || []).map((v, index) => ({
|
||||||
|
sku: v.sku || firstVariant.sku || raw.handle || "",
|
||||||
|
price: toNumber(v.price) ?? price,
|
||||||
|
compare_price: toNumber(v.compare_at_price) || null,
|
||||||
|
quantity: v.inventory_quantity || 0,
|
||||||
|
optionValues: hasRealOptions
|
||||||
|
? [v.option1, v.option2, v.option3]
|
||||||
|
.filter(Boolean)
|
||||||
|
.filter((val) => val !== "Default Title")
|
||||||
|
.map((val, i) => ({
|
||||||
|
name: options[i]?.name || `Option${i + 1}`,
|
||||||
|
value: val,
|
||||||
|
}))
|
||||||
|
: [],
|
||||||
|
}));
|
||||||
|
|
||||||
|
return {
|
||||||
|
recordType: "product",
|
||||||
|
source: "motousher",
|
||||||
|
brand: brandName,
|
||||||
|
brandSlug,
|
||||||
|
collectionUrl,
|
||||||
|
handle: raw.handle || null,
|
||||||
|
title: raw.title || null,
|
||||||
|
url: `${BASE_URL}/products/${raw.handle}`,
|
||||||
|
productType: raw.product_type || null,
|
||||||
|
vendor: raw.vendor || brandName,
|
||||||
|
tags: Array.isArray(raw.tags) ? raw.tags : typeof raw.tags === "string" ? raw.tags.split(",").map((t) => t.trim()).filter(Boolean) : [],
|
||||||
|
price,
|
||||||
|
compareAtPrice,
|
||||||
|
sku: firstVariant.sku || null,
|
||||||
|
barcode: firstVariant.barcode || null,
|
||||||
|
weight: firstVariant.grams || null,
|
||||||
|
available: raw.variants?.some((v) => v.available) ?? null,
|
||||||
|
image: images[0] || null,
|
||||||
|
images,
|
||||||
|
descriptionHtml: raw.body_html || null,
|
||||||
|
options,
|
||||||
|
variants,
|
||||||
|
publishedAt: raw.published_at || null,
|
||||||
|
scrapeError: null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) {
|
||||||
|
const url = `${BASE_URL}/products/${handle}.json`;
|
||||||
|
try {
|
||||||
|
const data = await fetchJson(url);
|
||||||
|
if (!data?.product) throw new Error("No product in response");
|
||||||
|
return normalizeShopifyProduct(data.product, brandName, brandSlug, collectionUrl);
|
||||||
|
} catch (err) {
|
||||||
|
return { recordType: "product", source: "motousher", brand: brandName, brandSlug, collectionUrl, handle, url: `${BASE_URL}/products/${handle}`, scrapeError: err.message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function mapWithConcurrency(items, concurrency, worker) {
|
||||||
|
const results = new Array(items.length);
|
||||||
|
let index = 0;
|
||||||
|
async function run() {
|
||||||
|
while (true) {
|
||||||
|
const i = index++;
|
||||||
|
if (i >= items.length) return;
|
||||||
|
results[i] = await worker(items[i], i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run));
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) {
|
||||||
|
const limit = options.limit ? Number(options.limit) : null;
|
||||||
|
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
||||||
|
const concurrency = options.concurrency || CONCURRENCY;
|
||||||
|
|
||||||
|
let handles = await fetchCollectionHandles(slug);
|
||||||
|
if (limit && handles.length > limit) {
|
||||||
|
handles = handles.slice(0, limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
let done = 0;
|
||||||
|
const products = await mapWithConcurrency(handles, concurrency, async (handle, i) => {
|
||||||
|
const product = await fetchProductDetail(handle, name, slug, collectionUrl);
|
||||||
|
done++;
|
||||||
|
if (done % 20 === 0 || done === handles.length) {
|
||||||
|
console.log(`[MOTOUSHER:${name}] ${done}/${handles.length} fetched`);
|
||||||
|
}
|
||||||
|
if (onProgress) onProgress({ done, total: handles.length, product: product.title || handle });
|
||||||
|
return product;
|
||||||
|
});
|
||||||
|
|
||||||
|
return products;
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = { scrapeMotousherBrand, fetchCollectionHandles, fetchProductDetail };
|
||||||
Loading…
x
Reference in New Issue
Block a user