import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import { crawl } from "../crawler.js";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

/** Page limit used when `max` is absent or not a number. */
const DEFAULT_LIMIT = 50;
/** Upper bound applied to the requested page limit. */
const MAX_LIMIT = 500;
/** Interval between SSE keep-alive comment frames, in milliseconds. */
const HEARTBEAT_MS = 15000;

/**
 * Build a filename-safe timestamp: the current ISO-8601 time with every
 * ":" and "." replaced by "-".
 *
 * @returns {string}
 */
const ts = () => new Date().toISOString().replaceAll(":", "-").replaceAll(".", "-");

/**
 * Send `obj` as a pretty-printed JSON attachment.
 *
 * @param {object} res - Express-style response.
 * @param {string} filename - Name placed in the Content-Disposition header.
 * @param {*} obj - Value to serialize.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  return res.send(json);
}

/**
 * Report whether `p` is an absolute path. `path.isAbsolute` throws on
 * non-string input; that case is reported as `false`.
 *
 * @param {*} p
 * @returns {boolean}
 */
function isAbs(p) {
  try {
    return path.isAbsolute(p);
  } catch {
    return false;
  }
}

/**
 * Turn the raw `max` query value into a page limit clamped to
 * [1, MAX_LIMIT]. Absent or non-numeric input yields DEFAULT_LIMIT, so NaN
 * never reaches the crawler.
 *
 * @param {*} max - Raw query value.
 * @returns {number}
 */
function parseLimit(max) {
  const parsed = Number.parseInt(String(max ?? DEFAULT_LIMIT), 10);
  if (Number.isNaN(parsed)) return DEFAULT_LIMIT;
  return Math.min(Math.max(parsed, 1), MAX_LIMIT);
}

/**
 * Parse the raw `url` query value.
 *
 * @param {*} raw - Raw query value.
 * @returns {URL | null} The parsed URL, or `null` when it is malformed.
 */
function parseTarget(raw) {
  try {
    return new URL(String(raw));
  } catch {
    return null;
  }
}

/**
 * Run the crawl while reporting over Server-Sent Events.
 *
 * Events emitted: "started" once, "tick" for each progress callback, then
 * either "done" (with the crawl result) or "crawl_error" (when the crawl
 * rejects). The response is always ended and the heartbeat always cleared.
 *
 * @param {object} res - Express-style response.
 * @param {URL} target - Validated crawl target.
 * @param {number} limit - Page limit passed to the crawler.
 * @returns {Promise<object>} The response object.
 */
async function streamCrawl(res, target, limit) {
  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache, no-transform");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders?.();

  // Progress ticks can keep arriving after the client has gone away, so
  // every write is guarded.
  const canWrite = () => !res.writableEnded && !res.destroyed;
  const send = (obj, evt) => {
    if (!canWrite()) return;
    if (evt) res.write(`event: ${evt}\n`);
    res.write(`data: ${JSON.stringify(obj)}\n\n`);
  };

  const heartbeat = setInterval(() => {
    if (canWrite()) res.write(":\n\n");
  }, HEARTBEAT_MS);

  let finished = false;
  // Listen on the response rather than the request: the response's "close"
  // tracks the connection for the whole lifetime of the stream.
  res.on("close", () => {
    clearInterval(heartbeat);
    if (!finished) console.warn("SSE client disconnected.");
  });

  try {
    send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");
    const result = await crawl(target.toString(), limit, (tick) => send(tick, "tick"), {
      persistReports: false,
      collectPages: true,
    });
    send({ ok: true, done: true, result }, "done");
  } catch (err) {
    // Headers are already on the wire, so a status code cannot be sent;
    // report the failure in-band instead.
    console.error("Crawl error:", err);
    send(
      { ok: false, error: "Crawl failed", details: String(err?.message ?? err) },
      "crawl_error"
    );
  } finally {
    finished = true;
    clearInterval(heartbeat);
    if (canWrite()) res.end();
  }
  return res;
}

/**
 * Reply with the crawl output as a JSON attachment.
 *
 * Preference order: the in-memory `result.results` array, then the file at
 * `result.files.json` (relative paths are resolved against this module's
 * directory), then the whole `result` object.
 *
 * @param {object} res - Express-style response.
 * @param {*} result - Value the crawler resolved with.
 * @returns {*} The value returned by the underlying send/pipe call.
 */
function sendDownload(res, result) {
  const filename = `crawl-${ts()}.json`;

  if (Array.isArray(result?.results)) {
    return attachJson(res, filename, result.results);
  }

  const jsonPath = result?.files?.json;
  if (jsonPath) {
    const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
    if (fs.existsSync(abs)) {
      res.setHeader("Content-Type", "application/json; charset=utf-8");
      res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);

      const fileStream = fs.createReadStream(abs);
      // Without a listener a read failure is an unhandled "error" event,
      // which would take the whole process down.
      fileStream.on("error", (err) => {
        console.error("Crawl download error:", err);
        if (res.headersSent) {
          res.destroy(err);
          return;
        }
        res.removeHeader("Content-Disposition");
        res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
      });
      return fileStream.pipe(res);
    }
  }

  return attachJson(res, filename, result ?? {});
}

/**
 * HTTP handler that crawls the site given in the `url` query parameter.
 *
 * Query parameters:
 * - `url` (required): absolute URL to crawl. Missing or malformed -> 400.
 * - `max`: page limit, clamped to 1..500; defaults to 50 when absent or
 *   not a number.
 * - `stream=1` (or an `Accept` header containing `text/event-stream`):
 *   report progress over Server-Sent Events. Cannot be combined with
 *   `download=1` (400).
 * - `download=1`: reply with the results as a JSON attachment.
 * - `nostore=1`: pass `persistReports: false` to the crawler (also implied
 *   by `download=1`).
 *
 * Failures before any response bytes are sent produce a 500 JSON body.
 *
 * @param {object} req - Express-style request (`query`, `get`).
 * @param {object} res - Express-style response.
 * @returns {Promise<*>}
 */
export async function crawlHandler(req, res) {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    const target = parseTarget(url);
    if (!target) return res.status(400).json({ error: "Invalid url param" });

    const limit = parseLimit(max);
    const wantsDownload = String(download) === "1";
    const wantsStream =
      String(stream) === "1" || (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (wantsDownload) {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      return await streamCrawl(res, target, limit);
    }

    /* ---------- Non-streaming mode ---------- */
    const preferMemory = String(nostore) === "1" || wantsDownload;
    const result = await crawl(target.toString(), limit, undefined, {
      persistReports: !preferMemory,
      collectPages: true,
    });

    if (wantsDownload) return sendDownload(res, result);

    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      ...result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    // Once headers are out, setting a status would throw; just close.
    if (res.headersSent) {
      if (!res.writableEnded) res.end();
      return;
    }
    res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
}