// // server.js
// import express from "express";
// import { Queue } from "bullmq";
// import { connection } from "./redis.js";
// import crypto from "crypto";
//
// const app = express();
// app.use(express.json());
//
// const crawlQueue = new Queue("crawl", { connection });
//
// // Start a new crawl
// app.post("/crawl", async (req, res) => {
//   const { startUrl } = req.body;
//   if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });
//
//   const crawlId = crypto.randomUUID();
//   await crawlQueue.add("fetch", { crawlId, url: startUrl });
//
//   res.json({ crawlId, message: "Crawl started" });
// });
//
// // (Optional) Check progress
// app.get("/status/:id", async (req, res) => {
//   // For now just reply with "in progress"
//   res.json({ crawlId: req.params.id, status: "in progress" });
// });
//
// app.listen(3000, () => {
//   console.log("Crawler API running at http://localhost:3000");
// });

// // server.js
// import express from "express";
// import cors from "cors"; // ← optional but recommended
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";
//
// const app = express();
// const PORT = process.env.PORT || 3010;
//
// /* Parse JSON BEFORE any middleware that might read req.body */
// app.use(express.json());
//
// /* CORS (adjust origins as needed) */
// app.use(cors({
//   origin: [
//     "http://localhost:3000",
//     "https://your-frontend.example" // ← replace or remove
//   ],
// }));
//
// /* Safe request logger */
// app.use((req, res, next) => {
//   console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
//   if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
//   if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
//   next();
// });
//
// /* GET /crawl?url=https://site.com&max=50 */
// app.get("/crawl", async (req, res) => {
//   try {
//     const { url, max } = req.query;
//     if (!url) return res.status(400).json({ error: "Missing url param" });
//
//     // validate & normalize
//     const target = new URL(String(url)); // throws if invalid
//     const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);
//
//     await crawl(target.toString(), limit);
//     res.json({ ok: true, message: `Crawl started`, url: target.toString(), limit });
//   } catch (err) {
//     console.error("Crawl error:", err);
//     res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
//   }
// });
//
// /* Global safety nets so crashes don’t become silent restart loops */
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));
//
// /* Bind to all interfaces so remote calls work */
// app.listen(PORT, "0.0.0.0", () => {
//   console.log(`🚀 Server running at http://localhost:${PORT}`);
// });

// server.js
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { fileURLToPath } from "node:url";
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)
const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/* ------------ Middleware ------------ */
app.use(express.json());
app.use(cors({ origin: ["http://localhost:3000", "https://app.crawlerx.co"] }));
app.use(express.static(path.join(__dirname, "public")));

app.get("/", (_req, res) => {
  const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");
  return fs.existsSync(viewer)
    ? res.sendFile(viewer)
    : res.type("text/plain").send("CrawlerX backend is running.");
});

app.get("/healthz", (_req, res) => res.json({ ok: true, time: new Date().toISOString() }));

/* ------------ Helpers ------------ */
const ts = () => new Date().toISOString().replaceAll(":", "-").replaceAll(".", "-"); // safe filename

function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  return res.send(json);
}

function isAbs(p) {
  try { return path.isAbsolute(p); } catch { return false; }
}

/* ------------ Crawl endpoint ------------ */
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 *  - stream=1   : SSE live progress (no download)
 *  - download=1 : respond as a JSON download (attachment)
 *  - nostore=1  : ask crawler not to write files (if supported by your crawler)
 */
app.get("/crawl", async (req, res) => {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    const target = new URL(String(url)); // validate
    const parsedMax = Number.parseInt(String(max ?? "50"), 10);
    const limit = Math.min(Math.max(Number.isNaN(parsedMax) ? 50 : parsedMax, 1), 500); // clamp to 1..500, default 50

    const wantsStream =
      String(stream) === "1" || (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }

      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      const heartbeat = setInterval(() => res.write(":\n\n"), 15000);
      const send = (obj, evt) => {
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };

      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      let finished = false;
      req.on("close", () => {
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const result = await crawl(
        target.toString(),
        limit,
        (tick) => send(tick),
        // If your crawler supports it, this avoids writing files during SSE runs:
        { persistReports: false, collectPages: true }
      );

      finished = true;
      clearInterval(heartbeat);
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    // Ask crawler (if it supports options) to avoid writing files when nostore=1 or download requested.
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If caller wants a downloadable JSON file...
    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: crawler returned in-memory data (no disk IO).
      //    Use whichever property your crawler exposes. We try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);

      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (jsonPath && fs.existsSync(isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath))) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        res.setHeader("Content-Type", "application/json; charset=utf-8");
        res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
        return fs.createReadStream(abs).pipe(res);
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not attachment)
    return res.json({ ok: true, message: "Crawl completed", url: target.toString(), limit, result });
  } catch (err) {
    console.error("Crawl error:", err);
    // If SSE headers were already sent we can't switch to a JSON error response; just end the stream.
    if (res.headersSent) return res.end();
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});

/* ------------ Safety nets ------------ */
process.on("unhandledRejection", (e) => console.error("unhandledRejection:", e));
process.on("uncaughtException", (e) => console.error("uncaughtException:", e));

/* ------------ Start server ------------ */
app.listen(PORT, "0.0.0.0", () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
});
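
/* ------------ Usage notes (illustrative only) ------------
 * A minimal sketch of how the /crawl endpoint above can be exercised, assuming
 * the server runs locally on PORT 3010 and https://example.com stands in for a
 * real start URL:
 *
 *   # Inline JSON result
 *   curl "http://localhost:3010/crawl?url=https://example.com&max=25"
 *
 *   # Live progress via Server-Sent Events (-N disables curl's output buffering)
 *   curl -N "http://localhost:3010/crawl?url=https://example.com&max=25&stream=1"
 *
 *   # Save the result to a file named from the Content-Disposition header
 *   curl -OJ "http://localhost:3010/crawl?url=https://example.com&max=25&download=1"
 *
 * The handler assumes ./crawler.js exposes roughly this contract; adjust it to
 * whatever your crawler actually exports:
 *
 *   // crawl(startUrl, limit, onProgress?, options?) -> Promise<result>
 *   //  - onProgress(tick)  : called with progress objects when provided (used for SSE)
 *   //  - options           : optional hints such as { persistReports, collectPages }
 *   //  - result            : may expose jsonData, pages, report, or reports.json (a saved file path)
 */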