diff --git a/src/business-logic/import-pipeline/sources/dirtstreet/index.js b/src/business-logic/import-pipeline/sources/dirtstreet/index.js index 0143a1c..b63d419 100644 --- a/src/business-logic/import-pipeline/sources/dirtstreet/index.js +++ b/src/business-logic/import-pipeline/sources/dirtstreet/index.js @@ -22,6 +22,35 @@ function paths() { } async function fetchWebsiteData({ paths: runPaths }) { + // Use cached aggregated JSON if it exists and is fresh (default 24h). + // Set DIRTSTREET_CACHE_HOURS=0 to always force a full re-scrape. + const cacheMaxHours = Number(process.env.DIRTSTREET_CACHE_HOURS ?? 24); + const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson); + + if (cacheMaxHours > 0) { + try { + const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8")); + if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) { + const ageMs = Date.now() - new Date(existing.generatedAt).getTime(); + const ageHours = ageMs / (1000 * 60 * 60); + if (ageHours < cacheMaxHours) { + console.log( + `[DIRTSTREET] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set DIRTSTREET_CACHE_HOURS=0 to force re-scrape.` + ); + return { + analysis: existing.analysis || {}, + outputJsonPath: absAggregatedPath, + historyPath: path.resolve(process.cwd(), runPaths.historyJson), + cached: true, + }; + } + console.log(`[DIRTSTREET] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`); + } + } catch { + // no cache file yet, proceed with scraping + } + } + const brandsToScrape = process.env.DIRTSTREET_BRANDS ? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug)) : BRANDS; diff --git a/src/business-logic/import-pipeline/sources/motousher/index.js b/src/business-logic/import-pipeline/sources/motousher/index.js index 4659642..63b5884 100644 --- a/src/business-logic/import-pipeline/sources/motousher/index.js +++ b/src/business-logic/import-pipeline/sources/motousher/index.js @@ -22,6 +22,35 @@ function paths() { } async function fetchWebsiteData({ paths: runPaths }) { + // Use cached aggregated JSON if it exists and is fresh (default 24h). + // Set MOTOUSHER_CACHE_HOURS=0 to always force a full re-scrape. + const cacheMaxHours = Number(process.env.MOTOUSHER_CACHE_HOURS ?? 24); + const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson); + + if (cacheMaxHours > 0) { + try { + const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8")); + if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) { + const ageMs = Date.now() - new Date(existing.generatedAt).getTime(); + const ageHours = ageMs / (1000 * 60 * 60); + if (ageHours < cacheMaxHours) { + console.log( + `[MOTOUSHER] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set MOTOUSHER_CACHE_HOURS=0 to force re-scrape.` + ); + return { + analysis: existing.analysis || {}, + outputJsonPath: absAggregatedPath, + historyPath: path.resolve(process.cwd(), runPaths.historyJson), + cached: true, + }; + } + console.log(`[MOTOUSHER] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`); + } + } catch { + // no cache file yet, proceed with scraping + } + } + const brandsToScrape = process.env.MOTOUSHER_BRANDS ? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug)) : BRANDS;