// // server.js
// import express from "express";
// import { Queue } from "bullmq";
// import { connection } from "./redis.js";
// import crypto from "crypto";

// const app = express();
// app.use(express.json());

// const crawlQueue = new Queue("crawl", { connection });

// // Start a new crawl
// app.post("/crawl", async (req, res) => {
//   const { startUrl } = req.body;
//   if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });

//   const crawlId = crypto.randomUUID();
//   await crawlQueue.add("fetch", { crawlId, url: startUrl });

//   res.json({ crawlId, message: "Crawl started" });
// });

// // (Optional) Check progress
// app.get("/status/:id", async (req, res) => {
//   // For now just reply with "in progress"
//   res.json({ crawlId: req.params.id, status: "in progress" });
// });

// app.listen(3000, () => {
//   console.log("Crawler API running at http://localhost:3000");
// });

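/*
 * The first iteration above enqueues "fetch" jobs but never defines a
 * consumer. A minimal BullMQ worker sketch (hypothetical; it assumes the
 * same `connection` export and a page-fetching step you would supply):
 *
 *   // worker.js
 *   import { Worker } from "bullmq";
 *   import { connection } from "./redis.js";
 *
 *   const worker = new Worker("crawl", async (job) => {
 *     const { crawlId, url } = job.data;
 *     console.log(`[${crawlId}] fetching ${url}`); // fetch, parse, enqueue links...
 *   }, { connection });
 *
 *   worker.on("failed", (job, err) => console.error("job failed:", job?.id, err));
 *
 * Example call against that iteration (it listened on port 3000):
 *   curl -X POST http://localhost:3000/crawl \
 *     -H "Content-Type: application/json" \
 *     -d '{"startUrl":"https://example.com"}'
 */
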
// // server.js
// import express from "express";
// import cors from "cors"; // ← optional but recommended
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";

// const app = express();
// const PORT = process.env.PORT || 3010;

// /* Parse JSON BEFORE any middleware that might read req.body */
// app.use(express.json());

// /* CORS (adjust origins as needed) */
// app.use(cors({
//   origin: [
//     "http://localhost:3000",
//     "https://your-frontend.example" // ← replace or remove
//   ],
// }));

// /* Safe request logger */
// app.use((req, res, next) => {
//   console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
//   if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
//   if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
//   next();
// });

// /* GET /crawl?url=https://site.com&max=50 */
// app.get("/crawl", async (req, res) => {
//   try {
//     const { url, max } = req.query;
//     if (!url) return res.status(400).json({ error: "Missing url param" });

//     // validate & normalize
//     const target = new URL(String(url)); // throws if invalid
//     const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);

//     await crawl(target.toString(), limit);
//     res.json({ ok: true, message: "Crawl started", url: target.toString(), limit });
//   } catch (err) {
//     console.error("Crawl error:", err);
//     res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
//   }
// });

// /* Global safety nets so crashes don’t become silent restart loops */
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));

// /* Bind to all interfaces so remote calls work */
// app.listen(PORT, "0.0.0.0", () => {
//   console.log(`🚀 Server running at http://localhost:${PORT}`);
// });

// server.js
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)

const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));

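/**
 * Assumed result shape — a sketch inferred from how `result` is probed in the
 * download branch below; crawler.js may expose only some of these fields.
 * @typedef {Object} CrawlResult
 * @property {Array|Object} [jsonData]   in-memory report data
 * @property {Array}        [pages]      in-memory page list (collectPages: true)
 * @property {Object}       [report]     alternative in-memory report shape
 * @property {{json?: string}} [reports] paths of report files written to disk
 */
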
/* ------------ Middleware ------------ */
app.use(express.json());
app.use(cors({ origin: ["http://localhost:3000", "https://app.crawlerx.co"] }));
app.use(express.static(path.join(__dirname, "public")));

app.get("/", (_req, res) => {
  const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");
  return fs.existsSync(viewer)
    ? res.sendFile(viewer)
    : res.type("text/plain").send("CrawlerX backend is running.");
});

app.get("/healthz", (_req, res) => res.json({ ok: true, time: new Date().toISOString() }));

/* ------------ Helpers ------------ */
const ts = () =>
  new Date().toISOString().replaceAll(":", "-").replaceAll(".", "-"); // filename-safe timestamp

function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  return res.send(json);
}

function isAbs(p) {
  try { return path.isAbsolute(p); } catch { return false; }
}

/* ------------ Crawl endpoint ------------ */
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 * - stream=1   : SSE live progress (no download)
 * - download=1 : respond with the report as a JSON attachment
 * - nostore=1  : ask the crawler not to write files (if supported by your crawler)
 */
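// Example calls (a sketch; assumes the default PORT of 3010 and a reachable target):
//   curl "http://localhost:3010/crawl?url=https://example.com&max=25"
//   curl -N "http://localhost:3010/crawl?url=https://example.com&stream=1"
//   curl -OJ "http://localhost:3010/crawl?url=https://example.com&download=1"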
app.get("/crawl", async (req, res) => {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    const target = new URL(String(url)); // validate; throws on a malformed URL
    const parsed = parseInt(max ?? "50", 10);
    const limit = Math.min(Math.max(Number.isNaN(parsed) ? 50 : parsed, 1), 500); // clamp to 1..500
    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      // Heartbeat: an SSE comment line (leading ":") keeps idle connections
      // open through proxies without delivering an event to the client.
      const heartbeat = setInterval(() => res.write(":\n\n"), 15000);
      const send = (obj, evt) => {
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      let finished = false;
      req.on("close", () => {
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const result = await crawl(
        target.toString(),
        limit,
        (tick) => send(tick),
        // If your crawler supports it, this avoids writing files during SSE runs:
        { persistReports: false, collectPages: true }
      );

      finished = true;
      clearInterval(heartbeat);
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    // Ask the crawler (if it supports options) to avoid writing files when
    // nostore=1 or a download is requested.
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If the caller wants a downloadable JSON file...
    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: the crawler returned in-memory data (no disk IO).
      //    Use whichever property your crawler exposes; we try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);
      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: the crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not an attachment)
    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      result
    });
  } catch (err) {
    console.error("Crawl error:", err);
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});

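/*
 * Consuming the SSE mode from a browser — a sketch, assuming the endpoint and
 * the "started"/"done" event names emitted above:
 *
 *   const es = new EventSource("/crawl?url=https://example.com&stream=1");
 *   es.addEventListener("started", (e) => console.log(JSON.parse(e.data)));
 *   es.onmessage = (e) => console.log("tick:", JSON.parse(e.data)); // progress ticks
 *   es.addEventListener("done", (e) => { console.log(JSON.parse(e.data)); es.close(); });
 */
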
/* ------------ Safety nets ------------ */
process.on("unhandledRejection", (e) => console.error("unhandledRejection:", e));
process.on("uncaughtException", (e) => console.error("uncaughtException:", e));

/* ------------ Start server ------------ */
app.listen(PORT, "0.0.0.0", () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
});