/*
 * Metatron_Admin_Backend/server copy.js
 * 2025-10-09 10:10:50 +05:30
 *
 * 237 lines
 * 8.5 KiB
 * JavaScript
 * Raw Blame History
 *
 * This file contains ambiguous Unicode characters
 *
 * This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
 */

// // server.js
// import express from "express";
// import { Queue } from "bullmq";
// import { connection } from "./redis.js";
// import crypto from "crypto";
// const app = express();
// app.use(express.json());
// const crawlQueue = new Queue("crawl", { connection });
// // Start a new crawl
// app.post("/crawl", async (req, res) => {
// const { startUrl } = req.body;
// if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });
// const crawlId = crypto.randomUUID();
// await crawlQueue.add("fetch", { crawlId, url: startUrl });
// res.json({ crawlId, message: "Crawl started" });
// });
// // (Optional) Check progress
// app.get("/status/:id", async (req, res) => {
// // For now just reply with "in progress"
// res.json({ crawlId: req.params.id, status: "in progress" });
// });
// app.listen(3000, () => {
// console.log("Crawler API running at http://localhost:3000");
// });
// // server.js
// import express from "express";
// import cors from "cors"; // ← optional but recommended
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";
// const app = express();
// const PORT = process.env.PORT || 3010;
// /* Parse JSON BEFORE any middleware that might read req.body */
// app.use(express.json());
// /* CORS (adjust origins as needed) */
// app.use(cors({
// origin: [
// "http://localhost:3000",
// "https://your-frontend.example" // ← replace or remove
// ],
// }));
// /* Safe request logger */
// app.use((req, res, next) => {
// console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
// if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
// if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
// next();
// });
// /* GET /crawl?url=https://site.com&max=50 */
// app.get("/crawl", async (req, res) => {
// try {
// const { url, max } = req.query;
// if (!url) return res.status(400).json({ error: "Missing url param" });
// // validate & normalize
// const target = new URL(String(url)); // throws if invalid
// const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);
// await crawl(target.toString(), limit);
// res.json({ ok: true, message: `Crawl started`, url: target.toString(), limit });
// } catch (err) {
// console.error("Crawl error:", err);
// res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
// }
// });
// /* Global safety nets so crashes don't become silent restart loops */
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));
// /* Bind to all interfaces so remote calls work */
// app.listen(PORT, "0.0.0.0", () => {
// console.log(`🚀 Server running at http://localhost:${PORT}`);
// });
// server.js
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { fileURLToPath } from "node:url";
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)
// Express application instance shared by every route below.
const app = express();
// Listening port: taken from the environment when set, otherwise 3010.
const PORT = process.env.PORT || 3010;
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/* ------------ Middleware ------------ */
// Browser origins permitted by the CORS middleware.
const corsOptions = {
  origin: ["http://localhost:3000", "https://app.crawlerx.co"],
};
// Directory whose files are served as static assets.
const publicDir = path.join(__dirname, "public");

// Registration order is preserved: JSON body parsing, then CORS, then static files.
app.use(express.json());
app.use(cors(corsOptions));
app.use(express.static(publicDir));
// GET / : serve the viewer page from public/ when the file exists on disk;
// otherwise reply with a plain-text "backend is running" message.
app.get("/", (_req, res) => {
  const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");

  if (!fs.existsSync(viewer)) {
    return res.type("text/plain").send("CrawlerX backend is running.");
  }

  return res.sendFile(viewer);
});
// GET /healthz : liveness probe returning { ok: true, time: <current ISO-8601 timestamp> }.
app.get("/healthz", (_req, res) => {
  const time = new Date().toISOString();
  return res.json({ ok: true, time });
});
/* ------------ Helpers ------------ */
/**
 * Returns the current time as an ISO-8601 string with every ":" and "."
 * replaced by "-", so the value can be embedded in a filename.
 *
 * @returns {string} e.g. "2025-10-09T04-40-50-123Z"
 */
const ts = () => {
  const iso = new Date().toISOString();
  return iso.replace(/[:.]/g, "-");
};
/**
 * Sends `obj` to the client as a pretty-printed (2-space) JSON attachment.
 *
 * The payload is serialized before any header is touched, so a value that
 * cannot be stringified throws without modifying the response.
 *
 * @param {object} res - Response object exposing `setHeader` and `send`.
 * @param {string} filename - Name placed in the Content-Disposition header.
 * @param {*} obj - Value to serialize.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  const body = JSON.stringify(obj, null, 2);

  const headers = [
    ["Content-Type", "application/json; charset=utf-8"],
    ["Content-Disposition", `attachment; filename="${filename}"`],
  ];
  for (const [name, value] of headers) {
    res.setHeader(name, value);
  }

  return res.send(body);
}
/**
 * Reports whether `p` is an absolute filesystem path.
 *
 * `path.isAbsolute` throws on non-string input; that case is treated as
 * "not absolute" instead of propagating the error.
 *
 * @param {*} p - Candidate path.
 * @returns {boolean} true only when `p` is an absolute path string.
 */
function isAbs(p) {
  let absolute = false;
  try {
    absolute = path.isAbsolute(p);
  } catch {
    // Non-string input: keep the default of false.
  }
  return absolute;
}
/* ------------ Crawl endpoint ------------ */
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 * - url        : required; must parse as an absolute http(s) URL, otherwise 400
 * - max        : page limit, clamped to 1..500; falls back to 50 when missing or not a number
 * - stream=1   : SSE live progress (no download); also enabled by "Accept: text/event-stream"
 * - download=1 : respond as a JSON download (attachment)
 * - nostore=1  : ask crawler not to write files (if supported by your crawler)
 *
 * Responses:
 * - 400 for a missing/invalid url or for download=1 combined with streaming
 * - 500 with { error, details } when the crawl fails before any output was sent
 * - in SSE mode, failures are reported as an "error" event because the
 *   status line has already been flushed
 */
app.get("/crawl", async (req, res) => {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    // A malformed URL is a client error, not a server failure.
    let target;
    try {
      target = new URL(String(url));
    } catch {
      return res.status(400).json({ error: "Invalid url param" });
    }
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return res.status(400).json({ error: "Only http(s) urls are supported" });
    }

    // parseInt yields NaN for "" or "abc"; NaN would survive Math.min/Math.max
    // and reach the crawler, so fall back to the default instead.
    const requested = Number.parseInt(String(max ?? ""), 10);
    const limit = Number.isNaN(requested)
      ? 50
      : Math.min(Math.max(requested, 1), 500);

    const wantsDownload = String(download) === "1";
    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (wantsDownload) {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }

      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      let finished = false;
      let clientGone = false;
      const canWrite = () => !clientGone && !res.writableEnded;

      // Writes one SSE message; silently skipped once the client is gone.
      const send = (obj, evt) => {
        if (!canWrite()) return;
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };

      // Comment-only frames keep intermediaries from timing out the idle stream.
      const heartbeat = setInterval(() => {
        if (canWrite()) res.write(":\n\n");
      }, 15000);

      // Listen on the response: on Node 16+ the request's "close" event signals
      // request completion, not a client disconnect.
      res.on("close", () => {
        clientGone = true;
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      try {
        send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");
        const result = await crawl(
          target.toString(),
          limit,
          (tick) => send(tick),
          // If your crawler supports it, this avoids writing files during SSE runs:
          { persistReports: false, collectPages: true }
        );
        send({ ok: true, done: true, result }, "done");
      } catch (err) {
        // Headers are already sent, so the failure has to travel as an event.
        console.error("Crawl error:", err);
        send({ ok: false, error: "Crawl failed", details: String(err?.message ?? err) }, "error");
      } finally {
        finished = true;
        clearInterval(heartbeat);
        if (!res.writableEnded) res.end();
      }
      return;
    }

    /* ---------- Non-streaming mode ---------- */
    // Ask crawler (if it supports options) to avoid writing files when nostore=1 or download requested.
    const preferMemory = String(nostore) === "1" || wantsDownload;
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If caller wants a downloadable JSON file...
    if (wantsDownload) {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: crawler returned in-memory data (no disk IO).
      //    Use whichever property your crawler exposes. We try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);
      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (typeof jsonPath === "string" && jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);

          const fileStream = fs.createReadStream(abs);
          // Without a listener, a read error would surface as an uncaught exception.
          fileStream.on("error", (streamErr) => {
            console.error("Crawl error:", streamErr);
            if (res.headersSent) {
              res.destroy(streamErr);
              return;
            }
            res.removeHeader("Content-Disposition");
            res.status(500).json({ error: "Crawl failed", details: String(streamErr?.message ?? streamErr) });
          });
          return fileStream.pipe(res);
        }
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not attachment)
    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      result
    });
  } catch (err) {
    console.error("Crawl error:", err);
    if (res.headersSent) {
      // Too late for a JSON error body; just terminate the response.
      return res.end();
    }
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});
/* ------------ Safety nets ------------ */
// Log process-level failures instead of letting them terminate the server.
// Each handler prints the event name followed by the error/reason it received.
// NOTE(review): Node's documentation advises against resuming normal operation
// after "uncaughtException" — confirm that log-and-continue is intended here.
for (const eventName of ["unhandledRejection", "uncaughtException"]) {
  process.on(eventName, (e) => {
    console.error(`${eventName}:`, e);
  });
}
/* ------------ Start server ------------ */
// Listen on PORT at address 0.0.0.0 and log the local URL once the server is up.
app.listen(PORT, "0.0.0.0", function onListening() {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
});