116 lines
3.5 KiB
JavaScript
116 lines
3.5 KiB
JavaScript
import fs from "node:fs";
import path from "node:path";
import { pipeline } from "node:stream/promises";
import { fileURLToPath } from "node:url";

import { crawl } from "../crawler.js";
|
|
|
|
// ES modules have no built-in __dirname; derive this module's directory from its URL.
const __dirname = path.dirname(fileURLToPath(new URL(import.meta.url)));
|
|
|
|
/**
 * Build a timestamp string from the current time that is safe to embed in a
 * file name: the ISO-8601 UTC form with every ":" and "." replaced by "-".
 *
 * @returns {string} e.g. "2024-01-31T12-34-56-789Z"
 */
const ts = () => new Date().toISOString().replace(/[:.]/g, "-");
|
|
|
|
/**
 * Send `obj` to the client as a pretty-printed JSON file download.
 *
 * @param {object} res - Response object exposing `setHeader` and `send`.
 * @param {string} filename - Name offered to the client for the download.
 * @param {*} obj - Value serialized with `JSON.stringify(obj, null, 2)`.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  // The name sits inside a quoted-string, so a raw `"` or `\` would end or
  // corrupt the header parameter; escape them as quoted-pairs.
  const safeName = String(filename).replace(/["\\]/g, "\\$&");
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${safeName}"`);
  return res.send(json);
}
|
|
/**
 * Report whether `p` is an absolute filesystem path.
 *
 * Non-string input yields `false` instead of the TypeError that
 * `path.isAbsolute` raises for it.
 *
 * @param {*} p - Candidate path.
 * @returns {boolean} `true` only for a string that `path.isAbsolute` accepts.
 */
function isAbs(p) {
  return typeof p === "string" && path.isAbsolute(p);
}
|
|
|
|
/** Page limit used when `max` is absent or not a number. */
const DEFAULT_CRAWL_LIMIT = 50;

/** Largest page limit a caller may request. */
const MAX_CRAWL_LIMIT = 500;

/**
 * Route handler that crawls `req.query.url` and reports the outcome.
 *
 * Query parameters:
 *  - `url`      (required) start URL; must parse with `new URL`.
 *  - `max`      page limit, clamped to 1..500, default 50.
 *  - `stream`   "1" selects Server-Sent Events mode (also selected when the
 *               Accept header contains "text/event-stream").
 *  - `download` "1" returns the result as a JSON file attachment.
 *  - `nostore`  "1" runs the crawl with `persistReports: false`.
 *
 * Responses:
 *  - 400 when `url` is missing or malformed, or when `download=1` is combined
 *    with streaming.
 *  - 500 with `{ error, details }` when the crawl fails before any bytes were
 *    sent; if headers are already out the connection is torn down instead.
 *
 * @param {object} req - Express-style request (`query`, `get`).
 * @param {object} res - Express-style response.
 * @returns {Promise<*>}
 */
export async function crawlHandler(req, res) {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    // A malformed URL is a client mistake, not a server failure.
    let target;
    try {
      target = new URL(String(url));
    } catch {
      return res.status(400).json({ error: "Invalid url param" });
    }

    const limit = parseLimit(max);
    const wantsDownload = String(download) === "1";
    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (wantsDownload) {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      return await streamCrawl(res, target, limit);
    }

    /* ---------- Non-streaming mode ---------- */
    const preferMemory = String(nostore) === "1" || wantsDownload;
    const result = await crawl(target.toString(), limit, undefined, {
      persistReports: !preferMemory,
      collectPages: true,
    });

    if (wantsDownload) return await sendDownload(res, result);

    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      ...result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    if (res.headersSent) {
      // Part of a response is already on the wire; a JSON error can no longer
      // be sent, so drop the connection rather than fake a clean ending.
      if (!res.destroyed) res.destroy();
      return;
    }
    res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
}

/**
 * Turn the raw `max` query value into a page limit within
 * 1..MAX_CRAWL_LIMIT, using DEFAULT_CRAWL_LIMIT for absent or non-numeric
 * input.
 */
function parseLimit(max) {
  const parsed = Number.parseInt(String(max ?? DEFAULT_CRAWL_LIMIT), 10);
  if (Number.isNaN(parsed)) return DEFAULT_CRAWL_LIMIT;
  return Math.min(Math.max(parsed, 1), MAX_CRAWL_LIMIT);
}

/**
 * Run the crawl and relay it as Server-Sent Events: "started", one "tick" per
 * progress callback, then "done" or "error". Always stops the heartbeat and
 * ends the response.
 */
async function streamCrawl(res, target, limit) {
  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache, no-transform");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders?.();

  let finished = false;
  let closed = false;

  // Never write once the client is gone or the response has been ended.
  const writable = () => !closed && !res.writableEnded;

  const send = (obj, evt) => {
    if (!writable()) return;
    if (evt) res.write(`event: ${evt}\n`);
    res.write(`data: ${JSON.stringify(obj)}\n\n`);
  };

  // SSE comment line; keeps idle connections from being timed out.
  const heartbeat = setInterval(() => {
    if (writable()) res.write(":\n\n");
  }, 15000);

  // Listen on the response: its "close" fires when the connection goes away,
  // whereas the request's "close" signals that the request was fully received.
  res.on("close", () => {
    closed = true;
    clearInterval(heartbeat);
    if (!finished) console.warn("SSE client disconnected.");
  });

  try {
    send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

    // NOTE(review): the crawl keeps running after a disconnect; cancelling it
    // would need support from crawl() that is not visible here.
    const result = await crawl(target.toString(), limit, (tick) => send(tick, "tick"), {
      persistReports: false,
      collectPages: true,
    });

    finished = true;
    send({ ok: true, done: true, result }, "done");
  } catch (err) {
    // Headers are already flushed, so report the failure in-band.
    finished = true;
    console.error("Crawl error:", err);
    send({ ok: false, error: "Crawl failed", details: String(err?.message ?? err) }, "error");
  } finally {
    clearInterval(heartbeat);
  }

  return res.end();
}

/**
 * Deliver a finished crawl as a JSON attachment: the in-memory `results`
 * array when present, else the report file named by `result.files.json` when
 * it exists on disk, else the whole result object.
 */
async function sendDownload(res, result) {
  const filename = `crawl-${ts()}.json`;

  if (Array.isArray(result?.results)) {
    return attachJson(res, filename, result.results);
  }

  const jsonPath = result?.files?.json;
  if (jsonPath) {
    const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
    if (fs.existsSync(abs)) {
      res.setHeader("Content-Type", "application/json; charset=utf-8");
      res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
      // pipeline() rejects on a read or write error, so the failure reaches
      // the caller's catch instead of surfacing as an unhandled "error" event.
      await pipeline(fs.createReadStream(abs), res);
      return res;
    }
  }

  return attachJson(res, filename, result ?? {});
}
|