Add 24h cache to motousher and dirtstreet fetchWebsiteData
Subsequent pipeline runs within 24 hours reuse the existing 01_products_aggregated.json instead of re-scraping all brands, eliminating redundant HTTP requests and 429 rate-limit retries. Cache lifetime is controlled per source via environment variables (default: 24h): MOTOUSHER_CACHE_HOURS=0 → always re-scrape; DIRTSTREET_CACHE_HOURS=0 → always re-scrape. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
6eac0b92ed
commit
4e536f08b3
@ -22,6 +22,35 @@ function paths() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function fetchWebsiteData({ paths: runPaths }) {
|
async function fetchWebsiteData({ paths: runPaths }) {
|
||||||
|
// Use cached aggregated JSON if it exists and is fresh (default 24h).
|
||||||
|
// Set DIRTSTREET_CACHE_HOURS=0 to always force a full re-scrape.
|
||||||
|
const cacheMaxHours = Number(process.env.DIRTSTREET_CACHE_HOURS ?? 24);
|
||||||
|
const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson);
|
||||||
|
|
||||||
|
if (cacheMaxHours > 0) {
|
||||||
|
try {
|
||||||
|
const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8"));
|
||||||
|
if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) {
|
||||||
|
const ageMs = Date.now() - new Date(existing.generatedAt).getTime();
|
||||||
|
const ageHours = ageMs / (1000 * 60 * 60);
|
||||||
|
if (ageHours < cacheMaxHours) {
|
||||||
|
console.log(
|
||||||
|
`[DIRTSTREET] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set DIRTSTREET_CACHE_HOURS=0 to force re-scrape.`
|
||||||
|
);
|
||||||
|
return {
|
||||||
|
analysis: existing.analysis || {},
|
||||||
|
outputJsonPath: absAggregatedPath,
|
||||||
|
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
|
||||||
|
cached: true,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
console.log(`[DIRTSTREET] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// no cache file yet, proceed with scraping
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const brandsToScrape = process.env.DIRTSTREET_BRANDS
|
const brandsToScrape = process.env.DIRTSTREET_BRANDS
|
||||||
? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
? BRANDS.filter((b) => process.env.DIRTSTREET_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
||||||
: BRANDS;
|
: BRANDS;
|
||||||
|
|||||||
@ -22,6 +22,35 @@ function paths() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function fetchWebsiteData({ paths: runPaths }) {
|
async function fetchWebsiteData({ paths: runPaths }) {
|
||||||
|
// Use cached aggregated JSON if it exists and is fresh (default 24h).
|
||||||
|
// Set MOTOUSHER_CACHE_HOURS=0 to always force a full re-scrape.
|
||||||
|
const cacheMaxHours = Number(process.env.MOTOUSHER_CACHE_HOURS ?? 24);
|
||||||
|
const absAggregatedPath = path.resolve(process.cwd(), runPaths.aggregatedJson);
|
||||||
|
|
||||||
|
if (cacheMaxHours > 0) {
|
||||||
|
try {
|
||||||
|
const existing = JSON.parse(await fs.readFile(absAggregatedPath, "utf8"));
|
||||||
|
if (existing?.generatedAt && Array.isArray(existing?.products) && existing.products.length > 0) {
|
||||||
|
const ageMs = Date.now() - new Date(existing.generatedAt).getTime();
|
||||||
|
const ageHours = ageMs / (1000 * 60 * 60);
|
||||||
|
if (ageHours < cacheMaxHours) {
|
||||||
|
console.log(
|
||||||
|
`[MOTOUSHER] Using cached data (${ageHours.toFixed(1)}h old, limit ${cacheMaxHours}h) — ${existing.products.length} products. Set MOTOUSHER_CACHE_HOURS=0 to force re-scrape.`
|
||||||
|
);
|
||||||
|
return {
|
||||||
|
analysis: existing.analysis || {},
|
||||||
|
outputJsonPath: absAggregatedPath,
|
||||||
|
historyPath: path.resolve(process.cwd(), runPaths.historyJson),
|
||||||
|
cached: true,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
console.log(`[MOTOUSHER] Cache expired (${ageHours.toFixed(1)}h old). Re-scraping all brands...`);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// no cache file yet, proceed with scraping
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const brandsToScrape = process.env.MOTOUSHER_BRANDS
|
const brandsToScrape = process.env.MOTOUSHER_BRANDS
|
||||||
? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
? BRANDS.filter((b) => process.env.MOTOUSHER_BRANDS.split(",").map((s) => s.trim()).includes(b.slug))
|
||||||
: BRANDS;
|
: BRANDS;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user