// 2025-10-09 10:10:50 +05:30 — 116 lines, 3.5 KiB, JavaScript
// NOTE(review): the line above is viewer/paste metadata, not source; kept as a comment so the module parses.

import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import { crawl } from "../crawler.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Filename-safe timestamp: ISO-8601 with every ":" and "." turned into "-".
const ts = () => {
  const iso = new Date().toISOString();
  return iso.replace(/[:.]/g, "-");
};
/**
 * Send `obj` as a pretty-printed JSON download attachment.
 *
 * @param {object} res - Express-style response (needs setHeader/send).
 * @param {string} filename - Suggested download filename, quoted into the
 *   Content-Disposition header.
 * @param {unknown} obj - Any JSON-serializable value.
 * @returns {*} Whatever `res.send` returns (the response object in Express).
 */
function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  // Fix: original interpolated the literal text "$(unknown)" — `$()` is not
  // template-literal syntax — so the filename parameter was never used.
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  return res.send(json);
}
/**
 * True when `p` is an absolute path. Anything path.isAbsolute rejects
 * (non-string input, etc.) yields false instead of throwing.
 */
function isAbs(p) {
  let absolute = false;
  try {
    absolute = path.isAbsolute(p);
  } catch {
    // Non-path input (undefined, objects, …) simply counts as "not absolute".
  }
  return absolute;
}
/**
 * HTTP handler that crawls a target URL and returns the results.
 *
 * Query params:
 *   url      (required) absolute URL to crawl; a malformed value throws in
 *            `new URL` and surfaces as a 500 via the catch below.
 *   max      page limit, clamped to [1, 500]; defaults to 50 (also when the
 *            value is not numeric).
 *   stream   "1" (or an Accept header containing text/event-stream) switches
 *            to SSE progress mode.
 *   download "1" returns the result as a JSON file attachment; rejected in
 *            combination with stream=1.
 *   nostore  "1" keeps reports in memory instead of persisting them.
 *
 * @param {object} req - Express-style request.
 * @param {object} res - Express-style response.
 */
export async function crawlHandler(req, res) {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    const target = new URL(String(url));

    // Clamp the page limit to [1, 500], defaulting to 50 on non-numeric
    // input. (The original let NaN flow through Math.min/Math.max, producing
    // limit === NaN for e.g. max=abc.)
    const parsed = Number.parseInt(max ?? "50", 10);
    const limit = Number.isNaN(parsed) ? 50 : Math.min(Math.max(parsed, 1), 500);

    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      let closed = false;
      const send = (obj, evt) => {
        if (closed) return; // don't write to a disconnected client
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };

      // SSE comment lines (":") keep idle intermediaries from timing out.
      const heartbeat = setInterval(() => {
        if (!closed) res.write(":\n\n");
      }, 15000);

      let finished = false;
      req.on("close", () => {
        closed = true;
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const onProgress = (tick) => send(tick, "tick");
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      const result = await crawl(target.toString(), limit, onProgress, {
        persistReports: false,
        collectPages: true,
      });
      finished = true;
      clearInterval(heartbeat);
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    // Skip disk persistence when the caller opted out (nostore=1) or wants
    // the payload inline as a download.
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory
        ? { persistReports: false, collectPages: true }
        : { persistReports: true, collectPages: true }
    );

    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;
      // Preferred: results already collected in memory.
      if (Array.isArray(result?.results)) {
        return attachJson(res, filename, result.results);
      }
      // Fallback: stream a previously persisted report from disk.
      const jsonPath = result?.files?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          // Fix: original emitted the literal text "$(unknown)" as the
          // download filename (`$()` is not template-literal syntax).
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }
      // Last resort: send whatever the crawler returned.
      return attachJson(res, filename, result ?? {});
    }

    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      ...result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    // If headers were already flushed (e.g. SSE mode failed mid-stream), a
    // JSON error response is impossible — just terminate the connection.
    if (res.headersSent) return res.end();
    res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
}