// // server.js
// import express from "express";
// import { Queue } from "bullmq";
// import { connection } from "./redis.js";
// import crypto from "crypto";

// const app = express();
// app.use(express.json());

// const crawlQueue = new Queue("crawl", { connection });

// // Start a new crawl
// app.post("/crawl", async (req, res) => {
//   const { startUrl } = req.body;
//   if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });

//   const crawlId = crypto.randomUUID();
//   await crawlQueue.add("fetch", { crawlId, url: startUrl });

//   res.json({ crawlId, message: "Crawl started" });
// });

// // (Optional) Check progress
// app.get("/status/:id", async (req, res) => {
//   // For now just reply with "in progress"
//   res.json({ crawlId: req.params.id, status: "in progress" });
// });

// app.listen(3000, () => {
//   console.log("Crawler API running at http://localhost:3000");
// });

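/*
 * The first iteration above enqueues "fetch" jobs but never defines a
 * consumer. A minimal BullMQ worker sketch (hypothetical; it assumes the
 * same `connection` export and a page-fetching step you would supply):
 *
 *   // worker.js
 *   import { Worker } from "bullmq";
 *   import { connection } from "./redis.js";
 *
 *   const worker = new Worker("crawl", async (job) => {
 *     const { crawlId, url } = job.data;
 *     console.log(`[${crawlId}] fetching ${url}`); // fetch, parse, enqueue links...
 *   }, { connection });
 *
 *   worker.on("failed", (job, err) => console.error("job failed:", job?.id, err));
 *
 * Example call against that iteration (it listened on port 3000):
 *   curl -X POST http://localhost:3000/crawl \
 *     -H "Content-Type: application/json" \
 *     -d '{"startUrl":"https://example.com"}'
 */
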
// // server.js
// import express from "express";
// import cors from "cors"; // ← optional but recommended
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";

// const app = express();
// const PORT = process.env.PORT || 3010;

// /* Parse JSON BEFORE any middleware that might read req.body */
// app.use(express.json());

// /* CORS (adjust origins as needed) */
// app.use(cors({
//   origin: [
//     "http://localhost:3000",
//     "https://your-frontend.example" // ← replace or remove
//   ],
// }));

// /* Safe request logger */
// app.use((req, res, next) => {
//   console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
//   if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
//   if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
//   next();
// });

// /* GET /crawl?url=https://site.com&max=50 */
// app.get("/crawl", async (req, res) => {
//   try {
//     const { url, max } = req.query;
//     if (!url) return res.status(400).json({ error: "Missing url param" });

//     // validate & normalize
//     const target = new URL(String(url)); // throws if invalid
//     const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);

//     await crawl(target.toString(), limit);
//     res.json({ ok: true, message: "Crawl started", url: target.toString(), limit });
//   } catch (err) {
//     console.error("Crawl error:", err);
//     res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
//   }
// });

// /* Global safety nets so crashes don’t become silent restart loops */
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));

// /* Bind to all interfaces so remote calls work */
// app.listen(PORT, "0.0.0.0", () => {
//   console.log(`🚀 Server running at http://localhost:${PORT}`);
// });

// server.js
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)

const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));

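/**
 * Assumed result shape — a sketch inferred from how `result` is probed in the
 * download branch below; crawler.js may expose only some of these fields.
 * @typedef {Object} CrawlResult
 * @property {Array|Object} [jsonData]   in-memory report data
 * @property {Array}        [pages]      in-memory page list (collectPages: true)
 * @property {Object}       [report]     alternative in-memory report shape
 * @property {{json?: string}} [reports] paths of report files written to disk
 */
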
/* ------------ Middleware ------------ */
app.use(express.json());
app.use(cors({ origin: ["http://localhost:3000", "https://app.crawlerx.co"] }));
app.use(express.static(path.join(__dirname, "public")));

app.get("/", (_req, res) => {
  const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");
  return fs.existsSync(viewer)
    ? res.sendFile(viewer)
    : res.type("text/plain").send("CrawlerX backend is running.");
});

app.get("/healthz", (_req, res) => res.json({ ok: true, time: new Date().toISOString() }));

/* ------------ Helpers ------------ */
const ts = () =>
  new Date().toISOString().replaceAll(":", "-").replaceAll(".", "-"); // filename-safe timestamp

function attachJson(res, filename, obj) {
  const json = JSON.stringify(obj, null, 2);
  res.setHeader("Content-Type", "application/json; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  return res.send(json);
}

function isAbs(p) {
  try { return path.isAbsolute(p); } catch { return false; }
}

/* ------------ Crawl endpoint ------------ */
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 * - stream=1   : SSE live progress (no download)
 * - download=1 : respond with the report as a JSON attachment
 * - nostore=1  : ask the crawler not to write files (if supported by your crawler)
 */
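// Example calls (a sketch; assumes the default PORT of 3010 and a reachable target):
//   curl "http://localhost:3010/crawl?url=https://example.com&max=25"
//   curl -N "http://localhost:3010/crawl?url=https://example.com&stream=1"
//   curl -OJ "http://localhost:3010/crawl?url=https://example.com&download=1"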
app.get("/crawl", async (req, res) => {
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    const target = new URL(String(url)); // validate; throws on a malformed URL
    const parsed = parseInt(max ?? "50", 10);
    const limit = Math.min(Math.max(Number.isNaN(parsed) ? 50 : parsed, 1), 500); // clamp to 1..500
    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      // Heartbeat: an SSE comment line (leading ":") keeps idle connections
      // open through proxies without delivering an event to the client.
      const heartbeat = setInterval(() => res.write(":\n\n"), 15000);
      const send = (obj, evt) => {
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      let finished = false;
      req.on("close", () => {
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const result = await crawl(
        target.toString(),
        limit,
        (tick) => send(tick),
        // If your crawler supports it, this avoids writing files during SSE runs:
        { persistReports: false, collectPages: true }
      );

      finished = true;
      clearInterval(heartbeat);
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    // Ask the crawler (if it supports options) to avoid writing files when
    // nostore=1 or a download is requested.
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If the caller wants a downloadable JSON file...
    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: the crawler returned in-memory data (no disk IO).
      //    Use whichever property your crawler exposes; we try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);
      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: the crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not an attachment)
    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      result
    });
  } catch (err) {
    console.error("Crawl error:", err);
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});

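/*
 * Consuming the SSE mode from a browser — a sketch, assuming the endpoint and
 * the "started"/"done" event names emitted above:
 *
 *   const es = new EventSource("/crawl?url=https://example.com&stream=1");
 *   es.addEventListener("started", (e) => console.log(JSON.parse(e.data)));
 *   es.onmessage = (e) => console.log("tick:", JSON.parse(e.data)); // progress ticks
 *   es.addEventListener("done", (e) => { console.log(JSON.parse(e.data)); es.close(); });
 */
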
/* ------------ Safety nets ------------ */
process.on("unhandledRejection", (e) => console.error("unhandledRejection:", e));
process.on("uncaughtException", (e) => console.error("uncaughtException:", e));

/* ------------ Start server ------------ */
app.listen(PORT, "0.0.0.0", () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
});