From c0132ab0aae5f2d363de87966eaea19c7921d704 Mon Sep 17 00:00:00 2001 From: MOHAN Date: Thu, 4 Jun 2026 12:25:31 +0530 Subject: [PATCH] Add motousher and dirtstreet as active import pipeline sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both sources are now registered in sources/index.js and fully wired into the 6-stage pipeline (fetch → download → watermark → upload → convert → upsert). The frontend will automatically show them as tabs via GET /pipeline/sources without any frontend changes needed. motousher/ (Shopify JSON API — 12 brands, ~2,446 products): - scraper.js: fetches /collections/{slug}/products.json + /products/{handle}.json - converter.js: maps scraped products to standard pipeline format - index.js: fetchWebsiteData() loops all brands, normalises to productSummary.img format for shared download/upload utilities - Supports MOTOUSHER_BRANDS env var to filter brands on a run dirtstreet/ (WooCommerce HTML + JSON-LD — 5 brands, ~1,087 products): - scraper.js: pure fetch, paginates /brand/{slug}/page/N/, extracts price from offers.priceSpecification[0].price, stock from JSON-LD availability field - converter.js: maps scraped products to standard pipeline format, builds descriptionHtml from body + short desc + attributes table - index.js: fetchWebsiteData() loops all brands, normalises to productSummary.img format - Supports DIRTSTREET_BRANDS env var to filter brands on a run sources/index.js: registered all 4 sources (kyt, brocks-performance, motousher, dirtstreet). GET /pipeline/sources now returns all 4. Co-Authored-By: Claude Sonnet 4.6 --- .../sources/dirtstreet/brands.js | 14 + .../sources/dirtstreet/converter.js | 143 ++++++ .../sources/dirtstreet/index.js | 135 ++++++ .../sources/dirtstreet/scraper.js | 423 ++++++++++++++++++ .../import-pipeline/sources/index.js | 4 + .../sources/motousher/brands.js | 21 + .../sources/motousher/converter.js | 113 +++++ .../sources/motousher/index.js | 137 ++++++ .../sources/motousher/scraper.js | 185 ++++++++ 9 files changed, 1175 insertions(+) create mode 100644 src/business-logic/import-pipeline/sources/dirtstreet/brands.js create mode 100644 src/business-logic/import-pipeline/sources/dirtstreet/converter.js create mode 100644 src/business-logic/import-pipeline/sources/dirtstreet/index.js create mode 100644 src/business-logic/import-pipeline/sources/dirtstreet/scraper.js create mode 100644 src/business-logic/import-pipeline/sources/motousher/brands.js create mode 100644 src/business-logic/import-pipeline/sources/motousher/converter.js create mode 100644 src/business-logic/import-pipeline/sources/motousher/index.js create mode 100644 src/business-logic/import-pipeline/sources/motousher/scraper.js diff --git a/src/business-logic/import-pipeline/sources/dirtstreet/brands.js b/src/business-logic/import-pipeline/sources/dirtstreet/brands.js new file mode 100644 index 0000000..73a978f --- /dev/null +++ b/src/business-logic/import-pipeline/sources/dirtstreet/brands.js @@ -0,0 +1,14 @@ +/** + * Dirtstreet.in brand list — production config. + * Source: https://dirtstreet.in/shop-by-brands/ + */ + +const BRANDS = [ + { name: "SC Project", slug: "scproject", brandUrl: "https://dirtstreet.in/brand/scproject/" }, + { name: "Evotech Performance", slug: "evotechperformance", brandUrl: "https://dirtstreet.in/brand/evotechperformance/" }, + { name: "DNA Air Filters", slug: "dnaairfilters", brandUrl: "https://dirtstreet.in/brand/dnaairfilters/" }, + { name: "WRS", slug: "wrs", brandUrl: "https://dirtstreet.in/brand/wrs/" }, + { name: "Zero Gravity Racing", slug: "zerogravityracing", brandUrl: "https://dirtstreet.in/brand/zerogravityracing/" }, +]; + +module.exports = { BRANDS }; diff --git a/src/business-logic/import-pipeline/sources/dirtstreet/converter.js b/src/business-logic/import-pipeline/sources/dirtstreet/converter.js new file mode 100644 index 0000000..e99de09 --- /dev/null +++ b/src/business-logic/import-pipeline/sources/dirtstreet/converter.js @@ -0,0 +1,143 @@ +const path = require("node:path"); + +function slugify(str) { + return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, ""); +} + +function getUploadedImageUrl(imgPath, uploadedImageMap) { + if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null; + const bySourcePath = uploadedImageMap.bySourcePath || {}; + return bySourcePath[String(imgPath)]?.url || null; +} + +function getImageFileName(imagePath) { + try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); } +} + +function availabilityToStockStatus(availability) { + if (!availability) return null; + const url = String(availability).toLowerCase(); + if (url.includes("instock")) return "instock"; + if (url.includes("outofstock")) return "outofstock"; + if (url.includes("backorder") || url.includes("preorder")) return "onbackorder"; + return null; +} + +function buildDescriptionHtml(record) { + const parts = []; + + if (record.descriptionHtml) { + parts.push(record.descriptionHtml); + } else if (record.description) { + parts.push(`

${record.description.replace(/\n/g, "
")}

`); + } + + if (record.shortDescription && !parts.join("").includes(record.shortDescription)) { + parts.unshift(`

${record.shortDescription}

`); + } + + if (Array.isArray(record.attributes) && record.attributes.length) { + const rows = record.attributes + .map((attr) => `${attr.name}${attr.value}`) + .join(""); + parts.push(`${rows}
`); + } + + return parts.join("\n"); +} + +function convertDirtstreetRecordToShopifyReady(record, options = {}) { + const brand = options.brand || record.brand || "Dirtstreet"; + const uploadedImageMap = options.uploadedImageMap || null; + + const title = record.title || "Untitled Product"; + const sku = record.sku || slugify(title); + const productId = sku || slugify(`${brand}-${title}`); + const price = Number(record.price ?? 0); + const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null; + const imagePaths = Array.isArray(record.images) ? record.images : []; + + // Stock: prefer schema availability, fall back to stockStatus + const stockStatus = availabilityToStockStatus(record.availability) || record.stockStatus || null; + const quantity = stockStatus === "instock" ? 1 : 0; + + const descriptionHtml = buildDescriptionHtml(record); + + const files = imagePaths + .map((imgPath) => ({ + type: "Image", + url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath, + media_content: getImageFileName(imgPath), + source_path: imgPath, + })) + .filter((f) => f.url); + + // Categories become subcategory / collections + const categories = Array.isArray(record.categories) ? record.categories : []; + const [primaryCategory, ...subCategories] = categories; + const category = primaryCategory || "Motorcycle Parts"; + const subcategory = subCategories.join(", "); + + // Variants — dirtstreet products are typically simple (single variant) + const variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }]; + + const tags = [ + brand, + category, + ...categories, + sku, + record.stockStatus || "", + ...(Array.isArray(record.tags) ? record.tags : []), + ].filter(Boolean).map((t) => String(t).trim()); + + return { + id: productId, + source: { + productId, + sourceKey: "dirtstreet", + url: record.url || null, + image_paths: imagePaths, + }, + attributes: { + product_name: title, + brand, + category, + subcategory, + part_number: sku, + mfr_part_number: record.mpn || sku, + price, + compare_price: compareAtPrice, + purchase_cost: null, + barcode: record.gtin || record.barcode || "", + price_group: null, + units_per_sku: null, + part_description: descriptionHtml, + descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [], + files, + image_paths: imagePaths, + dimensions: [{ weight: 0 }], + total_quantity: quantity, + inventorydata: { inventory: { main: quantity } }, + options: [], + variants, + fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] }, + tags, + handle: slugify(`${brand}-${title}-${productId}`), + source_url: record.url || null, + }, + }; +} + +function convertDirtstreetJsonToShopifyProducts(input, options = {}) { + const records = Array.isArray(input?.products) ? input.products : []; + return records.map((record) => { + // Each record in aggregated JSON has raw scraped data in record.scraped + const scraped = record.scraped || record; + return convertDirtstreetRecordToShopifyReady(scraped, { + brand: options.brand || scraped.brand || record.brand, + uploadedImageMap: options.uploadedImageMap, + }); + }); +} + +module.exports = { convertDirtstreetRecordToShopifyReady, convertDirtstreetJsonToShopifyProducts }; diff --git a/src/business-logic/import-pipeline/sources/dirtstreet/index.js b/src/business-logic/import-pipeline/sources/dirtstreet/index.js new file mode 100644 index 0000000..0143a1c --- /dev/null +++ b/src/business-logic/import-pipeline/sources/dirtstreet/index.js @@ -0,0 +1,135 @@ +const fs = require("node:fs/promises"); +const path = require("node:path"); +const { BRANDS } = require("./brands"); +const { scrapeDirtstreetBrand } = require("./scraper"); +const { convertDirtstreetJsonToShopifyProducts } = require("./converter"); + +const sourceKey = "dirtstreet"; +const label = "Dirtstreet"; +const dataDir = path.join("data", "sources", sourceKey); + +function paths() { + return { + aggregatedJson: path.join(dataDir, "01_products_aggregated.json"), + historyJson: path.join(dataDir, "01_products_run_history.json"), + downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"), + watermarkState: path.join(dataDir, "03_watermark_state.json"), + imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"), + uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"), + shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"), + logsDir: path.join(dataDir, "99_run_logs"), + }; +} + +async function fetchWebsiteData({ paths: runPaths }) { + const brandsToScrape = process.env.DIRTSTREET_BRANDS + ? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug)) + : BRANDS; + + const allProducts = []; + const runSummary = []; + const now = new Date().toISOString(); + + for (const brand of brandsToScrape) { + console.log(`[DIRTSTREET] Scraping brand: ${brand.name} (${brand.brandUrl})`); + const brandStartedAt = new Date().toISOString(); + + try { + const products = await scrapeDirtstreetBrand(brand); + + for (const product of products) { + allProducts.push({ + productId: product.sku || product.url, + sourceKey, + brand: product.brand, + brandSlug: product.brandSlug, + // productSummary.img is read by shared downloadImages + uploadImages utilities + productSummary: { + id: product.sku || product.url, + name: product.title, + img: product.images || [], + cost: { mrp: product.price || 0 }, + }, + scraped: product, + }); + } + + runSummary.push({ + brand: brand.name, + slug: brand.slug, + startedAt: brandStartedAt, + completedAt: new Date().toISOString(), + totalProducts: products.length, + success: true, + }); + + console.log(`[DIRTSTREET] ${brand.name}: ${products.length} products scraped`); + } catch (err) { + console.log(`[DIRTSTREET] ${brand.name} failed: ${err.message}`); + runSummary.push({ + brand: brand.name, + slug: brand.slug, + startedAt: brandStartedAt, + completedAt: new Date().toISOString(), + totalProducts: 0, + success: false, + error: err.message, + }); + } + } + + const analysis = { + timestamp: now, + totalBrands: brandsToScrape.length, + totalProductsUnique: allProducts.length, + detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length, + detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length, + runSummary, + }; + + const payload = { + generatedAt: now, + sourceKey, + sourceLabel: label, + analysis, + products: allProducts, + }; + + await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true }); + await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8"); + + let history = []; + try { + const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8"); + history = JSON.parse(raw); + if (!Array.isArray(history)) history = []; + } catch { history = []; } + history.push(analysis); + await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8"); + + console.log(`[DIRTSTREET] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`); + + return { + analysis, + outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson), + historyPath: path.resolve(process.cwd(), runPaths.historyJson), + }; +} + +function convertToShopifyProducts(input, options = {}) { + return convertDirtstreetJsonToShopifyProducts(input, { + brand: options.brand || label, + uploadedImageMap: options.uploadedImageMap, + }); +} + +module.exports = { + sourceKey, + label, + defaultBrand: "Dirtstreet", + defaultImageBaseUrl: "", + envImageBaseUrl: "DIRTSTREET_IMAGE_BASE_URL", + paths, + fetchWebsiteData, + convertToShopifyProducts, +}; diff --git a/src/business-logic/import-pipeline/sources/dirtstreet/scraper.js b/src/business-logic/import-pipeline/sources/dirtstreet/scraper.js new file mode 100644 index 0000000..82653ce --- /dev/null +++ b/src/business-logic/import-pipeline/sources/dirtstreet/scraper.js @@ -0,0 +1,423 @@ +/** + * Dirtstreet.in scraper - plain fetch + JSON-LD extraction (no browser). + * + * Dirtstreet is a WooCommerce store. Product pages contain JSON-LD schema.org + * markup. We fetch HTML directly and extract all fields via regex. + * + * 1. Brand pages paginated at /brand/{slug}/page/N/ + * 2. Product detail via HTML + JSON-LD + */ + +const BASE_URL = "https://dirtstreet.in"; +const CONCURRENCY = 3; + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchHtml(url, attempt = 1) { + try { + const res = await fetch(url, { + headers: { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "accept-language": "en-US,en;q=0.9", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36", + "cache-control": "no-cache", + }, + }); + + if (res.status === 429 || res.status >= 500) { + if (attempt <= 4) { + const wait = attempt * 2000; + console.log(`[RETRY] ${url} -> HTTP ${res.status}, retrying in ${wait}ms (attempt ${attempt}/4)`); + await sleep(wait); + return fetchHtml(url, attempt + 1); + } + throw new Error(`HTTP ${res.status} after retries`); + } + + if (res.status === 404) return null; + if (!res.ok) throw new Error(`HTTP ${res.status}`); + return res.text(); + } catch (err) { + if (attempt <= 3 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) { + await sleep(attempt * 2000); + return fetchHtml(url, attempt + 1); + } + throw err; + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function decodeEntities(str) { + return String(str || "") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/×/g, "x") + .replace(/ /g, " ") + .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code))); +} + +function cleanText(value) { + return decodeEntities(String(value || "")) + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function toNumber(value) { + if (!value) return null; + const match = String(value).match(/([\d,]+\.?\d*)/); + if (!match) return null; + const n = Number.parseFloat(match[1].replace(/,/g, "")); + return Number.isNaN(n) ? null : n; +} + +function extractPriceFromHtml(html) { + // data-price attribute (WooCommerce sets this for JS cart) + const dp = html.match(/data-price=["']([\d.]+)["']/); + if (dp) { const n = Number.parseFloat(dp[1]); if (n > 0) return n; } + + // Pattern on dirtstreet: ₹198,000.00 + // Collect ALL such values, filter out 0, take the first valid one (the product price) + const bdiPattern = /(?:₹|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>/gi; + let bdiMatch; + while ((bdiMatch = bdiPattern.exec(html)) !== null) { + const n = toNumber(bdiMatch[1]); + if (n > 0) return n; + } + + // sale price block + const ins = html.match(/[\s\S]{0,300}?(?:₹|₹)<\/span>([\d,]+(?:\.\d+)?)<\/bdi>[\s\S]{0,300}?<\/ins>/i); + if (ins) { const n = toNumber(ins[1]); if (n > 0) return n; } + + // Fallback: ₹ number anywhere + const entity = html.match(/₹[^>]*>([\d,]+(?:\.\d+)?)/); + if (entity) { const n = toNumber(entity[1]); if (n > 0) return n; } + + return null; +} + +function availabilityToStockStatus(availability) { + if (!availability) return null; + const url = String(availability).toLowerCase(); + if (url.includes("instock")) return "instock"; + if (url.includes("outofstock")) return "outofstock"; + if (url.includes("backorder") || url.includes("preorder")) return "onbackorder"; + return null; +} + +// --------------------------------------------------------------------------- +// JSON-LD extraction +// --------------------------------------------------------------------------- + +function extractJsonLd(html) { + const results = []; + const regex = /]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi; + let match; + while ((match = regex.exec(html)) !== null) { + try { + results.push(JSON.parse(match[1].trim())); + } catch { + // skip + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Collection page: extract product URLs +// --------------------------------------------------------------------------- + +function extractProductUrls(html) { + const urls = new Set(); + const regex = /href=["']((?:https?:\/\/dirtstreet\.in)?\/product\/[^"'#?]+)["']/gi; + let match; + while ((match = regex.exec(html)) !== null) { + let url = match[1]; + if (url.startsWith("/")) url = `${BASE_URL}${url}`; + if (url.includes("/product-category/") || url.includes("/product-tag/")) continue; + // normalize trailing slash + url = url.replace(/\/?$/, "/"); + urls.add(url); + } + return Array.from(urls); +} + +function hasNextPage(html, currentPage, slug) { + const pattern = new RegExp( + `href=["'][^"']*/brand/${slug}/page/${currentPage + 1}/?["']`, + "i" + ); + return pattern.test(html); +} + +async function fetchBrandProductUrls(slug, brandUrl) { + const allUrls = new Set(); + let page = 1; + + while (true) { + const url = page === 1 ? brandUrl : `${brandUrl}page/${page}/`; + console.log(`[DIRTSTREET] Fetching brand page ${page}: ${url}`); + + const html = await fetchHtml(url); + if (!html) break; + + const urls = extractProductUrls(html); + const before = allUrls.size; + urls.forEach((u) => allUrls.add(u)); + + console.log(`[DIRTSTREET] Page ${page} -> ${urls.length} product URLs (total: ${allUrls.size})`); + + if (!hasNextPage(html, page, slug) || allUrls.size === before) break; + page++; + await sleep(300); + } + + return Array.from(allUrls); +} + +// --------------------------------------------------------------------------- +// Extra WooCommerce fields from HTML +// --------------------------------------------------------------------------- + +function extractExtraFields(html) { + const extra = { + sku: null, + categories: [], + tags: [], + shortDescription: null, + stockStatus: null, + attributes: [], + variations: [], + }; + + // SKU - strip "SKU:" label prefix + const skuMatch = html.match(/]+class="[^"]*\bsku\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + if (skuMatch) { + extra.sku = cleanText(skuMatch[1]) + .replace(/^SKU\s*:\s*/i, "") + .trim(); + } + + // Stock status — read from the main product wrapper class only, not whole page + // The first
is the current product + const productClassMatch = html.match(/class="[^"]*\bproduct\b[^"]*\b(instock|outofstock|on-backorder)\b[^"]*"/i); + if (productClassMatch) { + const cls = productClassMatch[1].toLowerCase(); + if (cls === "instock") extra.stockStatus = "instock"; + else if (cls === "outofstock") extra.stockStatus = "outofstock"; + else extra.stockStatus = "onbackorder"; + } + + // Categories + const catMatch = html.match(/]+class="[^"]*posted_in[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + if (catMatch) { + const catRegex = /]*>([\s\S]*?)<\/a>/gi; + let m; + while ((m = catRegex.exec(catMatch[1])) !== null) { + const cat = cleanText(m[1]); + if (cat) extra.categories.push(cat); + } + } + + // Tags + const tagMatch = html.match(/]+class="[^"]*tagged_as[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + if (tagMatch) { + const tagRegex = /]*>([\s\S]*?)<\/a>/gi; + let m; + while ((m = tagRegex.exec(tagMatch[1])) !== null) { + const tag = cleanText(m[1]); + if (tag) extra.tags.push(tag); + } + } + + // Short description + const sdMatch = html.match( + /]+class="[^"]*woocommerce-product-details__short-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i + ); + if (sdMatch) extra.shortDescription = cleanText(sdMatch[1]); + + // Attributes table + const attrMatch = html.match( + /]+class="[^"]*woocommerce-product-attributes[^"]*"[^>]*>([\s\S]*?)<\/table>/i + ); + if (attrMatch) { + const rows = attrMatch[1].match(/]*>([\s\S]*?)<\/tr>/gi) || []; + for (const row of rows) { + const th = row.match(/]*>([\s\S]*?)<\/th>/i); + const td = row.match(/]*>([\s\S]*?)<\/td>/i); + if (th && td) { + const key = cleanText(th[1]); + const value = cleanText(td[1]); + if (key && value) extra.attributes.push({ name: key, value }); + } + } + } + + // WooCommerce variations JSON in data attribute + const formMatch = html.match(/data-product_variations=["']([\[{][\s\S]*?)["']\s*>/); + if (formMatch) { + try { + extra.variations = JSON.parse( + formMatch[1].replace(/"/g, '"').replace(/"/g, '"') + ); + } catch { /* skip */ } + } + + return extra; +} + +// --------------------------------------------------------------------------- +// Scrape one product detail page +// --------------------------------------------------------------------------- + +async function scrapeProductDetail(productUrl, brandName, brandSlug) { + const html = await fetchHtml(productUrl); + if (!html) { + return { recordType: "product", source: "dirtstreet", brand: brandName, brandSlug, url: productUrl, scrapeError: "404" }; + } + + const jsonLdBlocks = extractJsonLd(html); + // Try direct Product type first, then search every @graph block + let schema = jsonLdBlocks.find((b) => b["@type"] === "Product"); + if (!schema) { + for (const block of jsonLdBlocks) { + if (Array.isArray(block["@graph"])) { + const found = block["@graph"].find((g) => g["@type"] === "Product"); + if (found) { schema = found; break; } + } + } + } + + const extra = extractExtraFields(html); + + const offer = Array.isArray(schema?.offers) ? schema.offers[0] : (schema?.offers || {}); + + // Price: JSON-LD offers.priceSpecification[0].price OR offers.price, then HTML fallback + const priceSpec = Array.isArray(offer?.priceSpecification) + ? offer.priceSpecification[0]?.price + : null; + let price = toNumber(priceSpec ?? offer?.price ?? offer?.lowPrice); + if (!price || price === 0) price = extractPriceFromHtml(html); + if (price === 0) price = null; + + // Stock: derive from JSON-LD availability (most reliable), fall back to HTML class + const stockFromSchema = availabilityToStockStatus(offer?.availability); + const stockStatus = stockFromSchema || extra.stockStatus || null; + + const currency = offer?.priceCurrency || "INR"; + + const images = Array.isArray(schema?.image) + ? schema.image.map((img) => (typeof img === "string" ? img : img?.url || img?.contentUrl)).filter(Boolean) + : schema?.image + ? [typeof schema.image === "string" ? schema.image : (schema.image?.url || schema.image?.contentUrl)] + : []; + + return { + recordType: "product", + source: "dirtstreet", + brand: schema?.brand?.name || brandName, + brandSlug, + url: productUrl, + title: cleanText(schema?.name || ""), + sku: extra.sku || cleanText(schema?.sku || ""), + mpn: cleanText(schema?.mpn || ""), + gtin: cleanText(schema?.gtin || schema?.gtin13 || ""), + price, + currency, + priceRaw: price != null ? String(price) : null, + compareAtPrice: toNumber(offer?.highPrice) || null, + availability: offer?.availability || null, + stockStatus, + image: images[0] || null, + images, + description: cleanText(schema?.description || ""), + shortDescription: extra.shortDescription, + categories: extra.categories, + tags: extra.tags, + attributes: extra.attributes, + variations: extra.variations, + aggregateRating: schema?.aggregateRating + ? { ratingValue: schema.aggregateRating.ratingValue, reviewCount: schema.aggregateRating.reviewCount } + : null, + scrapeError: null, + }; +} + +// --------------------------------------------------------------------------- +// Concurrency helper +// --------------------------------------------------------------------------- + +async function mapWithConcurrency(items, concurrency, worker) { + const results = new Array(items.length); + let index = 0; + async function run() { + while (true) { + const i = index++; + if (i >= items.length) return; + results[i] = await worker(items[i], i); + } + } + await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run)); + return results; +} + +// --------------------------------------------------------------------------- +// Main export: scrape one brand +// --------------------------------------------------------------------------- + +async function scrapeDirtstreetBrand({ name, slug, brandUrl }, options = {}) { + const limit = options.limit ? Number(options.limit) : null; + const onProgress = typeof options.onProgress === "function" ? options.onProgress : null; + const concurrency = options.concurrency || CONCURRENCY; + + let productUrls = await fetchBrandProductUrls(slug, brandUrl); + console.log(`[DIRTSTREET:${name}] Found ${productUrls.length} products`); + + if (limit && productUrls.length > limit) { + console.log(`[DIRTSTREET:${name}] Limit applied: fetching first ${limit} of ${productUrls.length}`); + productUrls = productUrls.slice(0, limit); + } + + let done = 0; + const products = await mapWithConcurrency(productUrls, concurrency, async (url, i) => { + const product = await scrapeProductDetail(url, name, slug); + done++; + if (done % 10 === 0 || done === productUrls.length) { + console.log(`[DIRTSTREET:${name}] ${done}/${productUrls.length} products scraped`); + } + if (onProgress) onProgress({ done, total: productUrls.length, product: product.title || url }); + return product; + }); + + return products; +} + +// --------------------------------------------------------------------------- +// Standalone test +// --------------------------------------------------------------------------- + +if (require.main === module) { + const { BRANDS } = require("./brands"); + const brand = BRANDS[0]; + console.log(`Testing scraper for: ${brand.name}`); + scrapeDirtstreetBrand(brand, { limit: 3 }) + .then((products) => { + console.log(`\nScraped ${products.length} products`); + console.log("Sample product:"); + console.log(JSON.stringify(products[0], null, 2)); + }) + .catch((err) => { + console.error("Scrape failed:", err.message); + process.exitCode = 1; + }); +} + +module.exports = { scrapeDirtstreetBrand, fetchBrandProductUrls, scrapeProductDetail }; diff --git a/src/business-logic/import-pipeline/sources/index.js b/src/business-logic/import-pipeline/sources/index.js index 691b0d1..b78f398 100644 --- a/src/business-logic/import-pipeline/sources/index.js +++ b/src/business-logic/import-pipeline/sources/index.js @@ -1,9 +1,13 @@ const kyt = require("./kyt"); const brocksPerformance = require("./brocks-performance"); +const motousher = require("./motousher"); +const dirtstreet = require("./dirtstreet"); const sources = { [kyt.sourceKey]: kyt, [brocksPerformance.sourceKey]: brocksPerformance, + [motousher.sourceKey]: motousher, + [dirtstreet.sourceKey]: dirtstreet, }; function normalizeSourceKey(sourceKey) { diff --git a/src/business-logic/import-pipeline/sources/motousher/brands.js b/src/business-logic/import-pipeline/sources/motousher/brands.js new file mode 100644 index 0000000..a197d10 --- /dev/null +++ b/src/business-logic/import-pipeline/sources/motousher/brands.js @@ -0,0 +1,21 @@ +/** + * Motousher.com brand list — production config. + * Source: https://www.motousher.com/pages/partner-brands + */ + +const BRANDS = [ + { name: "All Balls Racing", slug: "all-balls-racing", collectionUrl: "https://www.motousher.com/collections/all-balls-racing" }, + { name: "DID Chains", slug: "did-chains", collectionUrl: "https://www.motousher.com/collections/did-chains" }, + { name: "EBC Brakes", slug: "ebc-brakes", collectionUrl: "https://www.motousher.com/collections/ebc-brakes" }, + { name: "Esjot Sprockets", slug: "esjot-sprockets", collectionUrl: "https://www.motousher.com/collections/esjot-sprockets" }, + { name: "Evans Coolant", slug: "evans-coolant", collectionUrl: "https://www.motousher.com/collections/evans-coolant" }, + { name: "Grip Puppies", slug: "grip-puppies", collectionUrl: "https://www.motousher.com/collections/grip-puppies" }, + { name: "HiFlo Filters", slug: "hi-flo", collectionUrl: "https://www.motousher.com/collections/hi-flo" }, + { name: "JT Sprockets", slug: "jt-sprockets", collectionUrl: "https://www.motousher.com/collections/jt-sprockets" }, + { name: "Maxima Racing Oils", slug: "maxima-racing-oils", collectionUrl: "https://www.motousher.com/collections/maxima-racing-oils" }, + { name: "Putoline", slug: "putoline", collectionUrl: "https://www.motousher.com/collections/putoline" }, + { name: "Ram Mount", slug: "ram-mount", collectionUrl: "https://www.motousher.com/collections/ram-mount" }, + { name: "Wunderlich", slug: "wunderlich", collectionUrl: "https://www.motousher.com/collections/wunderlich" }, +]; + +module.exports = { BRANDS }; diff --git a/src/business-logic/import-pipeline/sources/motousher/converter.js b/src/business-logic/import-pipeline/sources/motousher/converter.js new file mode 100644 index 0000000..6337892 --- /dev/null +++ b/src/business-logic/import-pipeline/sources/motousher/converter.js @@ -0,0 +1,113 @@ +const path = require("node:path"); + +function slugify(str) { + return String(str || "").trim().toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, ""); +} + +function getUploadedImageUrl(imgPath, uploadedImageMap) { + if (!uploadedImageMap || typeof uploadedImageMap !== "object") return null; + const bySourcePath = uploadedImageMap.bySourcePath || {}; + return bySourcePath[String(imgPath)]?.url || null; +} + +function getImageFileName(imagePath) { + try { return path.basename(new URL(imagePath).pathname); } catch { return path.basename(String(imagePath || "").split(/[?#]/)[0]); } +} + +function convertMotousherRecordToShopifyReady(record, options = {}) { + const brand = options.brand || record.brand || "Motousher"; + const uploadedImageMap = options.uploadedImageMap || null; + + const productId = record.handle || slugify(`${record.title || ""}-${record.sku || ""}`); + const title = record.title || "Untitled Product"; + const sku = record.sku || productId; + const price = Number(record.price ?? 0); + const compareAtPrice = record.compareAtPrice ? Number(record.compareAtPrice) : null; + const quantity = record.variants?.reduce((sum, v) => sum + (v.quantity || 0), 0) || 0; + const descriptionHtml = record.descriptionHtml || ""; + const imagePaths = Array.isArray(record.images) ? record.images : []; + + const files = imagePaths + .map((imgPath) => ({ + type: "Image", + url: getUploadedImageUrl(imgPath, uploadedImageMap) || imgPath, + media_content: getImageFileName(imgPath), + source_path: imgPath, + })) + .filter((f) => f.url); + + // Options: filter out "Title"/"Default Title" Shopify placeholder + const sourceOptions = Array.isArray(record.options) + ? record.options.filter((opt) => opt.name !== "Title" || !opt.values?.every((v) => v === "Default Title")) + : []; + + // Variants + let variants; + if (Array.isArray(record.variants) && record.variants.length) { + variants = record.variants.map((v, index) => ({ + sku: v.sku || (index === 0 ? sku : `${sku}-${index + 1}`), + price: Number(v.price ?? price), + compare_price: v.compare_price != null ? Number(v.compare_price) : compareAtPrice, + quantity: Number(v.quantity ?? 0), + optionValues: Array.isArray(v.optionValues) ? v.optionValues : [], + })); + } else { + variants = [{ sku, price, compare_price: compareAtPrice, quantity, optionValues: [] }]; + } + + const category = record.productType || "Motorcycle Parts"; + const tags = [ + brand, + category, + record.brandSlug, + sku, + ...(Array.isArray(record.tags) ? record.tags : []), + ].filter(Boolean).map((t) => String(t).trim()); + + return { + id: productId, + source: { + productId, + sourceKey: "motousher", + url: record.url || null, + image_paths: imagePaths, + }, + attributes: { + product_name: title, + brand, + category, + subcategory: record.productType || "", + part_number: sku, + mfr_part_number: sku, + price, + compare_price: compareAtPrice, + purchase_cost: null, + barcode: record.barcode || "", + price_group: null, + units_per_sku: null, + part_description: descriptionHtml, + descriptions: descriptionHtml ? [{ type: "Market Description", description: descriptionHtml }] : [], + files, + image_paths: imagePaths, + dimensions: [{ weight: record.weight ? record.weight / 1000 : 0 }], + total_quantity: quantity, + inventorydata: { inventory: { main: quantity } }, + options: sourceOptions.map((opt) => ({ + name: opt.name, + values: Array.isArray(opt.values) ? opt.values : [], + })), + variants, + fitmentTags: { make: [], model: [], year: [], drive: [], baseModel: [] }, + tags, + handle: slugify(`${brand}-${title}-${productId}`), + source_url: record.url || null, + }, + }; +} + +function convertMotousherJsonToShopifyProducts(input, options = {}) { + const records = Array.isArray(input?.products) ? input.products : []; + return records.map((record) => convertMotousherRecordToShopifyReady(record, options)); +} + +module.exports = { convertMotousherRecordToShopifyReady, convertMotousherJsonToShopifyProducts }; diff --git a/src/business-logic/import-pipeline/sources/motousher/index.js b/src/business-logic/import-pipeline/sources/motousher/index.js new file mode 100644 index 0000000..4659642 --- /dev/null +++ b/src/business-logic/import-pipeline/sources/motousher/index.js @@ -0,0 +1,137 @@ +const fs = require("node:fs/promises"); +const path = require("node:path"); +const { BRANDS } = require("./brands"); +const { scrapeMotousherBrand } = require("./scraper"); +const { convertMotousherJsonToShopifyProducts } = require("./converter"); + +const sourceKey = "motousher"; +const label = "Motousher"; +const dataDir = path.join("data", "sources", sourceKey); + +function paths() { + return { + aggregatedJson: path.join(dataDir, "01_products_aggregated.json"), + historyJson: path.join(dataDir, "01_products_run_history.json"), + downloadedImagesDir: path.join(dataDir, "02_downloaded_product_images"), + watermarkState: path.join(dataDir, "03_watermark_state.json"), + imageUploadState: path.join(dataDir, "04_shopify_image_upload_state.json"), + uploadedMapJson: path.join(dataDir, "04_shopify_uploaded_images_map.json"), + shopifyReadyJson: path.join(dataDir, "05_shopify_products_ready.json"), + logsDir: path.join(dataDir, "99_run_logs"), + }; +} + +async function fetchWebsiteData({ paths: runPaths }) { + const brandsToScrape = process.env.MOTOUSHER_BRANDS + ? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug)) + : BRANDS; + + const allProducts = []; + const runSummary = []; + const now = new Date().toISOString(); + + for (const brand of brandsToScrape) { + console.log(`[MOTOUSHER] Scraping brand: ${brand.name} (${brand.collectionUrl})`); + const brandStartedAt = new Date().toISOString(); + + try { + const products = await scrapeMotousherBrand(brand); + + // Normalize into pipeline-compatible aggregated format + for (const product of products) { + allProducts.push({ + productId: product.handle || product.url, + sourceKey, + brand: product.brand, + brandSlug: product.brandSlug, + // productSummary.img is read by shared downloadImages + uploadImages utilities + productSummary: { + id: product.handle, + name: product.title, + img: product.images || [], + cost: { mrp: product.price || 0 }, + }, + scraped: product, + }); + } + + runSummary.push({ + brand: brand.name, + slug: brand.slug, + startedAt: brandStartedAt, + completedAt: new Date().toISOString(), + totalProducts: products.length, + success: true, + }); + + console.log(`[MOTOUSHER] ${brand.name}: ${products.length} products fetched`); + } catch (err) { + console.log(`[MOTOUSHER] ${brand.name} failed: ${err.message}`); + runSummary.push({ + brand: brand.name, + slug: brand.slug, + startedAt: brandStartedAt, + completedAt: new Date().toISOString(), + totalProducts: 0, + success: false, + error: err.message, + }); + } + } + + const analysis = { + timestamp: now, + totalBrands: brandsToScrape.length, + totalProductsUnique: allProducts.length, + detailSuccess: allProducts.filter((p) => !p.scraped?.scrapeError).length, + detailFailed: allProducts.filter((p) => p.scraped?.scrapeError).length, + runSummary, + }; + + const payload = { + generatedAt: now, + sourceKey, + sourceLabel: label, + analysis, + products: allProducts, + }; + + await fs.mkdir(path.dirname(path.resolve(process.cwd(), runPaths.aggregatedJson)), { recursive: true }); + await fs.writeFile(path.resolve(process.cwd(), runPaths.aggregatedJson), JSON.stringify(payload, null, 2), "utf8"); + + // Append to history + let history = []; + try { + const raw = await fs.readFile(path.resolve(process.cwd(), runPaths.historyJson), "utf8"); + history = JSON.parse(raw); + if (!Array.isArray(history)) history = []; + } catch { history = []; } + history.push(analysis); + await fs.writeFile(path.resolve(process.cwd(), runPaths.historyJson), JSON.stringify(history, null, 2), "utf8"); + + console.log(`[MOTOUSHER] Saved ${allProducts.length} products to ${runPaths.aggregatedJson}`); + + return { + analysis, + outputJsonPath: path.resolve(process.cwd(), runPaths.aggregatedJson), + historyPath: path.resolve(process.cwd(), runPaths.historyJson), + }; +} + +function convertToShopifyProducts(input, options = {}) { + return convertMotousherJsonToShopifyProducts(input, { + brand: options.brand || label, + uploadedImageMap: options.uploadedImageMap, + }); +} + +module.exports = { + sourceKey, + label, + defaultBrand: "Motousher", + defaultImageBaseUrl: "", + envImageBaseUrl: "MOTOUSHER_IMAGE_BASE_URL", + paths, + fetchWebsiteData, + convertToShopifyProducts, +}; diff --git a/src/business-logic/import-pipeline/sources/motousher/scraper.js b/src/business-logic/import-pipeline/sources/motousher/scraper.js new file mode 100644 index 0000000..e8df557 --- /dev/null +++ b/src/business-logic/import-pipeline/sources/motousher/scraper.js @@ -0,0 +1,185 @@ +/** + * Motousher.com scraper — Shopify JSON API. + * Uses /collections/{slug}/products.json (paginated) + /products/{handle}.json + */ + +const BASE_URL = "https://www.motousher.com"; +const CONCURRENCY = 3; + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchJson(url, attempt = 1) { + try { + const res = await fetch(url, { + headers: { + accept: "application/json", + "accept-language": "en-US,en;q=0.9", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36", + }, + }); + + if (res.status === 429 || res.status >= 500) { + if (attempt <= 4) { + const wait = attempt * 1500; + console.log(`[RETRY] ${url} HTTP ${res.status}, retrying in ${wait}ms (${attempt}/4)`); + await sleep(wait); + return fetchJson(url, attempt + 1); + } + throw new Error(`HTTP ${res.status} after retries`); + } + + if (!res.ok) throw new Error(`HTTP ${res.status}`); + return res.json(); + } catch (err) { + if (attempt <= 4 && (err.code === "UND_ERR_CONNECT_TIMEOUT" || err.code === "ECONNRESET")) { + await sleep(attempt * 1500); + return fetchJson(url, attempt + 1); + } + throw err; + } +} + +async function fetchCollectionHandles(collectionSlug) { + const handles = []; + let page = 1; + + while (true) { + const url = `${BASE_URL}/collections/${collectionSlug}/products.json?limit=250&page=${page}`; + const data = await fetchJson(url); + const products = Array.isArray(data?.products) ? data.products : []; + if (!products.length) break; + for (const p of products) { + if (p.handle) handles.push(p.handle); + } + if (products.length < 250) break; + page++; + } + + return handles; +} + +function toNumber(value) { + if (!value) return null; + const match = String(value).match(/([\d,]+\.?\d*)/); + if (!match) return null; + const n = Number.parseFloat(match[1].replace(/,/g, "")); + return Number.isNaN(n) ? null : n; +} + +function bestImageUrl(src) { + if (!src) return null; + return src.replace(/(_\d+x\d*|_\d+x)(\.\w+)(\?|$)/, "$2$3"); +} + +function normalizeShopifyProduct(raw, brandName, brandSlug, collectionUrl) { + const images = (raw.images || []).map((img) => bestImageUrl(img.src)).filter(Boolean); + const firstVariant = raw.variants?.[0] || {}; + const price = toNumber(firstVariant.price); + const compareAtPrice = toNumber(firstVariant.compare_at_price) || null; + const quantity = raw.variants?.reduce((sum, v) => sum + (v.inventory_quantity || 0), 0) || 0; + + const options = (raw.options || []) + .filter((opt) => opt.name !== "Title" || (opt.values || []).join("") !== "Default Title") + .map((opt) => ({ + name: opt.name, + values: opt.values || [], + })); + + const hasRealOptions = options.length > 0; + + const variants = (raw.variants || []).map((v, index) => ({ + sku: v.sku || firstVariant.sku || raw.handle || "", + price: toNumber(v.price) ?? price, + compare_price: toNumber(v.compare_at_price) || null, + quantity: v.inventory_quantity || 0, + optionValues: hasRealOptions + ? [v.option1, v.option2, v.option3] + .filter(Boolean) + .filter((val) => val !== "Default Title") + .map((val, i) => ({ + name: options[i]?.name || `Option${i + 1}`, + value: val, + })) + : [], + })); + + return { + recordType: "product", + source: "motousher", + brand: brandName, + brandSlug, + collectionUrl, + handle: raw.handle || null, + title: raw.title || null, + url: `${BASE_URL}/products/${raw.handle}`, + productType: raw.product_type || null, + vendor: raw.vendor || brandName, + tags: Array.isArray(raw.tags) ? raw.tags : typeof raw.tags === "string" ? raw.tags.split(",").map((t) => t.trim()).filter(Boolean) : [], + price, + compareAtPrice, + sku: firstVariant.sku || null, + barcode: firstVariant.barcode || null, + weight: firstVariant.grams || null, + available: raw.variants?.some((v) => v.available) ?? null, + image: images[0] || null, + images, + descriptionHtml: raw.body_html || null, + options, + variants, + publishedAt: raw.published_at || null, + scrapeError: null, + }; +} + +async function fetchProductDetail(handle, brandName, brandSlug, collectionUrl) { + const url = `${BASE_URL}/products/${handle}.json`; + try { + const data = await fetchJson(url); + if (!data?.product) throw new Error("No product in response"); + return normalizeShopifyProduct(data.product, brandName, brandSlug, collectionUrl); + } catch (err) { + return { recordType: "product", source: "motousher", brand: brandName, brandSlug, collectionUrl, handle, url: `${BASE_URL}/products/${handle}`, scrapeError: err.message }; + } +} + +async function mapWithConcurrency(items, concurrency, worker) { + const results = new Array(items.length); + let index = 0; + async function run() { + while (true) { + const i = index++; + if (i >= items.length) return; + results[i] = await worker(items[i], i); + } + } + await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, run)); + return results; +} + +async function scrapeMotousherBrand({ name, slug, collectionUrl }, options = {}) { + const limit = options.limit ? Number(options.limit) : null; + const onProgress = typeof options.onProgress === "function" ? options.onProgress : null; + const concurrency = options.concurrency || CONCURRENCY; + + let handles = await fetchCollectionHandles(slug); + if (limit && handles.length > limit) { + handles = handles.slice(0, limit); + } + + let done = 0; + const products = await mapWithConcurrency(handles, concurrency, async (handle, i) => { + const product = await fetchProductDetail(handle, name, slug, collectionUrl); + done++; + if (done % 20 === 0 || done === handles.length) { + console.log(`[MOTOUSHER:${name}] ${done}/${handles.length} fetched`); + } + if (onProgress) onProgress({ done, total: handles.length, product: product.title || handle }); + return product; + }); + + return products; +} + +module.exports = { scrapeMotousherBrand, fetchCollectionHandles, fetchProductDetail };