Add 24h cache to motousher and dirtstreet fetchWebsiteData

Subsequent pipeline runs within 24 hours reuse the existing
01_products_aggregated.json instead of re-scraping all brands,
eliminating redundant HTTP requests and 429 rate-limit retries.

Cache lifetime is controlled per source via environment variables (in hours):
  MOTOUSHER_CACHE_HOURS=0  → always re-scrape
  DIRTSTREET_CACHE_HOURS=0 → always re-scrape
  (default when unset: 24h)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
MOHAN 2026-06-04 16:44:38 +05:30
parent 6eac0b92ed
commit 4e536f08b3
2 changed files with 58 additions and 0 deletions

View File

@ -22,6 +22,35 @@ function paths() {
}
async function fetchWebsiteData({ paths: runPaths }) {
// Use cached aggregated JSON if it exists and is fresh (default 24h).
// Set DIRTSTREET_CACHE_HOURS=0 to always force a full re-scrape.
const cacheMaxHours = Number(process.env.DIRTSTREET_CACHE_HOURS ?? 24);
const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson);
if (cacheMaxHours > 0) {
try {
const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8"));
if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) {
const ageMs = Date.now() - new Date(existing.generatedAt).getTime();
const ageHours = ageMs / (1000 * 60 * 60);
if (ageHours < cacheMaxHours) {
console.log(
`[DIRTSTREET] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set DIRTSTREET_CACHE_HOURS=0 to force re-scrape.`
);
return {
analysis: existing.analysis || {},
outputJsonPath: absAggregatedPath,
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
cached: true,
};
}
console.log(`[DIRTSTREET] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`);
}
} catch {
// cache file missing, unreadable, or not valid JSON — fall through and scrape
}
}
const brandsToScrape = process.env.DIRTSTREET_BRANDS
? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
: BRANDS;

View File

@ -22,6 +22,35 @@ function paths() {
}
async function fetchWebsiteData({ paths: runPaths }) {
// Use cached aggregated JSON if it exists and is fresh (default 24h).
// Set MOTOUSHER_CACHE_HOURS=0 to always force a full re-scrape.
const cacheMaxHours = Number(process.env.MOTOUSHER_CACHE_HOURS ?? 24);
const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson);
if (cacheMaxHours > 0) {
try {
const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8"));
if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) {
const ageMs = Date.now() - new Date(existing.generatedAt).getTime();
const ageHours = ageMs / (1000 * 60 * 60);
if (ageHours < cacheMaxHours) {
console.log(
`[MOTOUSHER] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set MOTOUSHER_CACHE_HOURS=0 to force re-scrape.`
);
return {
analysis: existing.analysis || {},
outputJsonPath: absAggregatedPath,
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
cached: true,
};
}
console.log(`[MOTOUSHER] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`);
}
} catch {
// cache file missing, unreadable, or not valid JSON — fall through and scrape
}
}
const brandsToScrape = process.env.MOTOUSHER_BRANDS
? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
: BRANDS;