Metatron_Admin_Backend/crawler copy.js
2025-10-09 10:10:50 +05:30

710 lines
26 KiB
JavaScript

import got from "got";
import * as cheerio from "cheerio";
import normalizeUrl from "normalize-url";
import { isInternal } from "./utils/urlHelpers.js";
import { getSitemapUrls } from "./utils/sitemap.js";
import fs from "node:fs";
import path from "node:path";
import { chromium } from "playwright";
// NEW libs
import pixelWidth from "string-pixel-width";
import * as readability from "text-readability";
import stringSimilarity from "string-similarity";
/* ------------------------------ globals --------------------------------- */
// Module-level crawl state, shared by every function in this file.
// URLs (normalized) that crawl() has already taken off the queue.
const visited = new Set();
// FIFO frontier of URLs still waiting to be fetched.
const queue = [];
// One row object per crawled (or failed) page; consumed by the report writers.
const results = [];
// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }
// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>
// Desktop Chrome user-agent string used by both the got and Playwright fetchers.
const REAL_UA =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// Browser-like request headers sent with every got request and set as
// extraHTTPHeaders on the Playwright context.
const REAL_HEADERS = {
"user-agent": REAL_UA,
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"upgrade-insecure-requests": "1",
};
/* ------------------------------ utils ----------------------------------- */
/**
 * Serialises a single value as one CSV field.
 *
 * `null` and `undefined` become an empty field. Any other value is stringified
 * and, when it contains a double quote, a comma, or a line break (LF or CR),
 * wrapped in double quotes with embedded quotes doubled.
 *
 * @param {*} v - Value to serialise.
 * @returns {string} The CSV-safe field text.
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const text = String(v);
  // A bare carriage return must trigger quoting too: many CSV readers treat
  // CR as a record separator, which would split the row in the middle.
  if (!/[",\r\n]/.test(text)) return text;
  return `"${text.replace(/"/g, '""')}"`;
}
/**
 * Makes sure a directory exists, creating it and any missing parents.
 * Does nothing when something already exists at the given path.
 *
 * @param {string} dir - Directory path to create if absent.
 */
function ensureDir(dir) {
  const alreadyThere = fs.existsSync(dir);
  if (alreadyThere) return;
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Writes the per-page crawl rows to `reports/crawl-<timestamp>.json`.
 *
 * CSV lines for the same rows are also assembled in memory, but the statement
 * that would write them to disk is commented out at the end of the function,
 * so only the JSON file is produced.
 *
 * @param {Array<object>} results - Page rows; serialised as-is into the JSON report.
 */
function writePageReports(results) {
ensureDir("reports");
// ISO timestamp with ":" and "T" replaced by "-" and cut to seconds,
// e.g. 2025-10-09-04-40-50 (toISOString() is always UTC).
const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
const base = path.join("reports", `crawl-${stamp}`);
fs.writeFileSync(`${base}.json`, JSON.stringify(results, null, 2), "utf8");
console.log(`\n📝 Full JSON report saved: ${base}.json`);
// Columns (a Screaming-Frog-ish shape with our extras)
// NOTE: the order here must match the order of the values pushed per row below.
const headers = [
"url", "status", "status_text", "time_ms", "bytes", "content_type", "http_version",
"title", "title_length", "title_pixel_width",
"meta_description", "meta_description_length", "meta_description_pixel_width",
"h1_1", "h1_1_length", "h1_1_pixel_width", "h1_2", "h1_2_length", "h1_2_pixel_width",
"h2_1", "h2_2",
"canonical", "robots_meta", "x_robots_tag", "noindex", "nofollow",
"lang", "word_count", "flesch_reading_ease", "flesch_kincaid_grade",
"gunning_fog", "coleman_liau", "ari", "smog",
"schema_types", "inlinks", "outlinks", "render_mode",
"last_modified", "set_cookie", "crawl_timestamp",
"duplicate_title_exact", "nearest_title_similarity", "nearest_title_url",
"duplicate_description_exact", "nearest_description_similarity", "nearest_description_url"
];
const lines = [headers.join(",")];
for (const r of results) {
// One CSV line per page; missing optional fields fall back to "" or 0.
lines.push([
r.url,
r.status,
r.status_text ?? "",
r.time_ms,
r.bytes,
r.content_type,
r.http_version ?? "",
r.title,
r.title_length,
r.title_pixel_width,
r.meta_description,
r.meta_description_length,
r.meta_description_pixel_width,
r.h1_1 ?? "",
r.h1_1_length ?? 0,
r.h1_1_pixel_width ?? "",
r.h1_2 ?? "",
r.h1_2_length ?? 0,
r.h1_2_pixel_width ?? "",
r.h2_1 ?? "",
r.h2_2 ?? "",
r.canonical,
r.robots_meta,
r.x_robots_tag ?? "",
r.noindex,
r.nofollow,
r.lang ?? "",
r.word_count ?? "",
r.flesch_reading_ease ?? "",
r.flesch_kincaid_grade ?? "",
r.gunning_fog ?? "",
r.coleman_liau ?? "",
r.ari ?? "",
r.smog ?? "",
// Schema types are flattened into a single pipe-separated cell.
Array.isArray(r.schema_types) ? r.schema_types.join("|") : "",
r.inlinks ?? 0,
r.outlinks ?? 0,
r.render_mode,
r.last_modified ?? "",
r.set_cookie ? "yes" : "no",
r.crawl_timestamp ?? "",
r.duplicate_title_exact ?? "",
r.nearest_title_similarity ?? "",
r.nearest_title_url ?? "",
r.duplicate_description_exact ?? "",
r.nearest_description_similarity ?? "",
r.nearest_description_url ?? ""
].map(csvEscape).join(","));
}
// CSV output is currently disabled: `lines` is built above but never written.
//fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
//console.log(`\n📝 Page reports saved:\n - ${base}.csv\n - ${base}.json`);
}
/**
 * Dumps the link-provenance edges to `reports/links-<timestamp>.csv`.
 *
 * @param {Array<{from: string, raw_href: string, to: string, discovered_by: string}>} edges
 *   Edges to write, one CSV row each, after a header row.
 */
function writeLinkEdges(edges) {
  ensureDir("reports");

  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const headerLine = ["from", "raw_href", "to", "discovered_by"].join(",");
  const rowLines = Array.from(edges, (edge) => {
    const cells = [edge.from, edge.raw_href, edge.to, edge.discovered_by];
    return cells.map(csvEscape).join(",");
  });

  fs.writeFileSync(file, [headerLine, ...rowLines].join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
}
/**
 * Writes `reports/errors-<timestamp>.csv` listing every page whose HTTP status
 * is 400 or above, together with the pages that linked to it.
 *
 * A failing page with known referrers produces one row per referrer; a failing
 * page with none produces a single row with the referrer columns left empty.
 *
 * @param {Array<object>} results - Page rows (reads `url`, `status`, `title`).
 */
function writeErrors(results) {
  ensureDir("reports");

  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);
  const toLine = (cells) => cells.map(csvEscape).join(",");

  const headers = ["url", "status", "title", "from_page", "raw_href", "discovered_by"];
  const lines = [headers.join(",")];

  for (const row of results) {
    const isHttpError = row && row.status !== null && row.status >= 400;
    if (!isHttpError) continue;

    const refs = referrers.get(row.url) || [];
    if (refs.length === 0) {
      lines.push(toLine([row.url, row.status, row.title, "", "", ""]));
      continue;
    }
    for (const ref of refs) {
      lines.push(toLine([row.url, row.status, row.title, ref.from, ref.raw_href, ref.discovered_by]));
    }
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
}
/**
 * Records one discovered link in both provenance structures: the flat `edges`
 * list and the per-target `referrers` map.
 *
 * @param {string} from - Page (or source label) the link was found on.
 * @param {string} rawHref - The href exactly as written; falsy values are stored as "".
 * @param {string} to - Target URL of the link.
 * @param {string} discovered_by - How the link was found (e.g. the fetch mode).
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  let incoming = referrers.get(to);
  if (incoming === undefined) {
    incoming = [];
    referrers.set(to, incoming);
  }
  incoming.push({ from, raw_href, discovered_by });
}
/* ---------------------- parse HTML without JS --------------------------- */
/**
 * Parses JSON text without throwing.
 *
 * @param {string} txt - Candidate JSON text.
 * @returns {*} The parsed value, or `null` when the text is not valid JSON.
 */
function safeJsonParse(txt) {
  let value = null;
  try {
    value = JSON.parse(txt);
  } catch {
    value = null;
  }
  return value;
}
/**
 * Collects the distinct `@type` values declared in a page's JSON-LD scripts.
 *
 * Every `<script type="application/ld+json">` is parsed; scripts that are not
 * valid JSON are skipped. The parsed value is walked recursively so nested
 * objects and arrays contribute their types as well.
 *
 * @param {Function} $ - Cheerio root for the loaded document.
 * @returns {string[]} Unique type names in first-seen order.
 */
function parseSchemaTypes($) {
  const found = new Set();

  const walk = (node) => {
    if (!node) return;
    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    if (typeof node !== "object") return;

    const declared = node["@type"];
    if (typeof declared === "string") {
      found.add(declared);
    } else if (Array.isArray(declared)) {
      for (const entry of declared) {
        if (typeof entry === "string") found.add(entry);
      }
    }
    // Descend into every property value, whatever its key.
    for (const child of Object.values(node)) walk(child);
  };

  $('script[type="application/ld+json"]').each((_, el) => {
    const parsed = safeJsonParse($(el).contents().text());
    if (parsed) walk(parsed);
  });

  return Array.from(found);
}
/**
 * Extracts SEO-relevant fields from raw (un-rendered) HTML using cheerio.
 *
 * The title falls back to og:title, twitter:title, then the first H1 when the
 * `<title>` element is empty. Despite its name, `internalLinks` holds every
 * resolvable `<a href>` as an absolute URL; no internal/external filtering
 * happens here.
 *
 * @param {string} html - HTML source of the page.
 * @param {string} url - Page URL, used as the base for resolving relative hrefs.
 * @returns {object} Title, description, first two H1/H2 texts, heading count,
 *   canonical, robots directives, links, language, word count, schema types
 *   and the whitespace-collapsed body text.
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);
  const trimmedAttr = (selector, name) => ($(selector).attr(name) || "").trim();
  const headingTexts = (selector) => $(selector).map((_, el) => $(el).text().trim()).get();

  // Headings (top two H1s and H2s are reported)
  const [h1_1 = "", h1_2 = ""] = headingTexts("h1").map((text) => text || "");
  const [h2_1 = "", h2_2 = ""] = headingTexts("h2").map((text) => text || "");
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // Title with fallbacks; the social titles are only trimmed once chosen.
  let title = ($("title").first().text() || "").trim();
  if (!title) {
    const ogTitle = $('meta[property="og:title"]').attr("content") || "";
    const twTitle = $('meta[name="twitter:title"]').attr("content") || "";
    title = (ogTitle || twTitle || h1_1 || "").trim();
  }

  const metaDesc = trimmedAttr('meta[name="description"]', "content");
  const canonical = trimmedAttr('link[rel="canonical"]', "href");
  const robotsMeta = trimmedAttr('meta[name="robots"]', "content");
  const lang = trimmedAttr("html", "lang");

  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);

  // Text used for word count / readability: <main> if it has text, else <body>.
  const rawText = $("main").text() || $("body").text() || "";
  const bodyText = rawText.replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Anchor targets, both as written and resolved against the page URL.
  const rawLinks = [];
  const internalLinks = new Set();
  for (const anchor of $("a[href]").toArray()) {
    const href = $(anchor).attr("href");
    if (!href) continue;
    let abs;
    try {
      abs = new URL(href, url).toString();
    } catch {
      continue; // unresolvable href: ignore it
    }
    rawLinks.push({ raw: href, abs });
    internalLinks.add(abs);
  }

  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1, h1_2, h2_1, h2_2,
    totalHeadings,
    canonical, robotsMeta, noindex, nofollow,
    internalLinks, rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText
  };
}
/* ------------------------------ fetchers -------------------------------- */
/**
 * Fetches a URL over plain HTTP with got and returns a normalised response summary.
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`); network errors
 * and the 20 s request timeout still reject.
 *
 * @param {string} url - URL to fetch.
 * @returns {Promise<object>} Status, timing, lower-cased content type, body,
 *   byte size (Content-Length header when present, otherwise the UTF-8 length
 *   of the body), HTTP version and the raw response headers.
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const res = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false
  });
  const elapsedMs = Date.now() - startedAt;

  const { headers } = res;
  const declaredLength = headers["content-length"];
  let bytes;
  if (declaredLength) {
    bytes = Number(declaredLength);
  } else {
    bytes = Buffer.byteLength(res.body || "", "utf8");
  }

  return {
    status: res.statusCode ?? null,
    status_text: res.statusMessage ?? "",
    time_ms: elapsedMs,
    contentType: (headers["content-type"] || "").toLowerCase(),
    body: res.body,
    bytes,
    render_mode: "http",
    httpVersion: res.httpVersion ?? "",
    headers
  };
}
/**
 * Launches headless Chromium and opens one browser context configured to look
 * like a regular desktop Chrome session (user agent, viewport, locale, headers,
 * plus an init script that overrides a few `navigator` properties).
 *
 * If anything fails after the browser has been launched, the browser is closed
 * before the error is re-thrown so no Chromium process is left behind.
 *
 * @returns {Promise<{browser: object, context: object}>} The launched browser
 *   and its context; the caller is responsible for closing the browser.
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"]
  });
  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS
    });
    // Runs in every page before its own scripts.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });
    return { browser, context };
  } catch (err) {
    // Setup failed: release the browser. A failure while closing is ignored so
    // the original setup error is the one the caller sees.
    await browser.close().catch(() => {});
    throw err;
  }
}
/**
 * Loads a URL in a fresh Playwright page, waits (best-effort) for client-side
 * content to appear, and extracts the same SEO fields the static parser reports.
 *
 * Navigation failures (including the 30 s timeout) reject; the page is always
 * closed. The three follow-up waits are best-effort: their timeouts are
 * swallowed and extraction proceeds with whatever has rendered.
 *
 * @param {string} url - URL to render.
 * @param {{context: object}} shared - Holder of the Playwright browser context to open the page in.
 * @returns {Promise<object>} Response summary (`status`, `status_text`, `time_ms`,
 *   `contentType`, `bytes`, `render_mode: "rendered"`, `headers`) plus a
 *   `domExtract` object with the extracted fields. `bytes` is the character
 *   length of the rendered document's outerHTML, not a wire size.
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const t0 = Date.now();
  let status = null;
  let statusText = "";
  let mainHeaders = {};
  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    status = resp?.status() ?? null;
    statusText = resp?.statusText() ?? "";
    try { mainHeaders = resp ? await resp.headers() : {}; } catch { /* keep {} */ }

    // Best-effort: give the network a chance to go quiet.
    try { await page.waitForLoadState("networkidle", { timeout: 12000 }); } catch { /* proceed anyway */ }
    // Best-effort: wait until the page shows a heading-like element or some text.
    try {
      await page.waitForFunction(() => {
        const main = document.querySelector("main") || document.body;
        const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
        const hasHeading = !!document.querySelector("h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']");
        return textLen > 160 || hasHeading;
      }, { timeout: 8000 });
    } catch { /* proceed anyway */ }

    // This callback is serialised and executed inside the page, so it must be
    // self-contained and may not reference anything from this module's scope.
    const dom = await page.evaluate(() => {
      const clean = s => (s || "").replace(/\s+/g, " ").trim();
      const getTextList = sel => Array.from(document.querySelectorAll(sel))
        .map(el => clean(el.textContent)).filter(Boolean);
      const title = document.title || "";
      const ogTitle = document.querySelector('meta[property="og:title"]')?.content || "";
      const twTitle = document.querySelector('meta[name="twitter:title"]')?.content || "";
      const metaDesc = document.querySelector('meta[name="description"]')?.content || "";
      const canonical = document.querySelector('link[rel="canonical"]')?.href || "";
      const robotsMeta = document.querySelector('meta[name="robots"]')?.content || "";
      const lang = document.documentElement.getAttribute("lang") || "";
      const h1 = getTextList("h1");
      const h2 = getTextList("h2");
      const totalHeadings = document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length;
      const links = Array.from(document.querySelectorAll("a[href]"))
        .map(a => {
          const raw = a.getAttribute("href");
          try { return { raw, abs: new URL(raw, location.href).toString() }; }
          catch { return null; }
        })
        .filter(Boolean);
      const bodyText = clean((document.querySelector("main") || document.body).innerText || "");
      const schemaScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]')).map(s => s.textContent || "");
      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title, ogTitle, twTitle, metaDesc, canonical, robotsMeta, lang,
        h1, h2, totalHeadings,
        links,
        bodyText,
        schemaScripts
      };
    });

    // Parse schema types from the raw JSON-LD strings (outside of the page).
    // A Set keeps first-seen order while dropping duplicates.
    const schemaTypes = new Set();
    const collect = (obj) => {
      if (!obj) return;
      if (Array.isArray(obj)) { obj.forEach(collect); return; }
      if (typeof obj === "object") {
        const t = obj["@type"];
        if (typeof t === "string") schemaTypes.add(t);
        else if (Array.isArray(t)) t.forEach(x => typeof x === "string" && schemaTypes.add(x));
        Object.values(obj).forEach(collect);
      }
    };
    for (const raw of dom.schemaScripts || []) {
      try { collect(JSON.parse(raw)); } catch { /* invalid JSON-LD: skip this script */ }
    }

    const dt = Date.now() - t0;
    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();

    return {
      status,
      status_text: statusText,
      time_ms: dt,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract: {
        title: finalTitle,
        metaDesc: dom.metaDesc || "",
        canonical: dom.canonical || "",
        robotsMeta: dom.robotsMeta || "",
        lang: dom.lang || "",
        noindex, nofollow,
        h1_1: dom.h1?.[0] || "",
        h1_2: dom.h1?.[1] || "",
        h2_1: dom.h2?.[0] || "",
        h2_2: dom.h2?.[1] || "",
        totalHeadings: dom.totalHeadings || 0,
        links: new Set((dom.links || []).map(l => l.abs)),
        rawLinks: dom.links || [],
        bodyText: dom.bodyText || "",
        schemaTypes: [...schemaTypes]
      }
    };
  } finally {
    await page.close();
  }
}
/* ------------------------- render decision ------------------------------ */
/**
 * Decides whether a page fetched over plain HTTP should be re-fetched in a
 * real browser.
 *
 * Rendering is requested when the response is under 4000 bytes, when no
 * headings were found, or when a non-root page carries exactly the home page's
 * title.
 *
 * @param {string} currentUrl - Absolute URL of the page (must be parseable).
 * @param {{bytes?: number}} httpRes - HTTP fetch summary.
 * @param {{totalHeadings: number, title: string}} parsed - Static parse result.
 * @param {string|null} homeTitle - Title of the site root, if known.
 * @returns {boolean} `true` when the page should be rendered.
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const { pathname } = new URL(currentUrl);

  // Tiny HTML shell, or nothing heading-like in the static markup.
  if ((httpRes.bytes ?? 0) < 4000 || parsed.totalHeadings === 0) return true;

  const repeatsHomeTitle = Boolean(homeTitle && parsed.title && parsed.title === homeTitle);
  return repeatsHomeTitle && pathname !== "/";
}
/**
 * Returns the URL with a `www.` prefix on its hostname, unless it already has one.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string} The serialised URL with the `www.` hostname, or the input
 *   unchanged when it cannot be parsed as a URL.
 */
function withWWW(urlStr) {
  try {
    const parsed = new URL(urlStr);
    const hasPrefix = parsed.hostname.startsWith("www.");
    if (!hasPrefix) {
      parsed.hostname = `www.${parsed.hostname}`;
    }
    return parsed.toString();
  } catch {
    return urlStr;
  }
}
/* ------------------------ per-page enrichers ---------------------------- */
/**
 * Estimates the rendered width of a string in pixels.
 *
 * Uses `string-pixel-width`; if that call throws, falls back to a rough
 * estimate of half the font size per character.
 *
 * @param {string} text - Text to measure; falsy input measures 0.
 * @param {number} [size=16] - Font size in pixels.
 * @param {string} [font="arial"] - Font name passed to the measuring library.
 * @returns {number} Estimated width in pixels.
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) return 0;

  let width;
  try {
    width = pixelWidth(text, { font, size });
  } catch {
    width = Math.round(text.length * size * 0.5);
  }
  return width;
}
/**
 * Computes readability scores for a block of text.
 *
 * Each metric is attempted independently; a metric whose computation throws is
 * simply left out of the result. Only the first 200000 characters are scored.
 *
 * NOTE(review): because failures are swallowed, a wrong import shape for
 * `text-readability` would silently yield an empty object — worth confirming
 * the scores are actually populated.
 *
 * @param {string} text - Text to score; falsy input yields `{}`.
 * @returns {object} Any of `flesch_reading_ease`, `flesch_kincaid_grade`,
 *   `gunning_fog`, `coleman_liau`, `ari`, `smog` that could be computed.
 */
function computeReadability(text) {
  if (!text) return {};

  const sample = text.slice(0, 200000); // cap
  const scores = {};
  const record = (key, compute) => {
    try {
      scores[key] = compute();
    } catch {
      // metric unavailable: leave the key unset
    }
  };

  record("flesch_reading_ease", () => readability.fleschReadingEase(sample));
  record("flesch_kincaid_grade", () => readability.fleschKincaidGrade(sample));
  record("gunning_fog", () => readability.gunningFog(sample));
  record("coleman_liau", () => readability.colemanLiauIndex(sample));
  record("ari", () => readability.automatedReadabilityIndex(sample));
  record("smog", () => readability.smogIndex(sample));

  return scores;
}
/* -------------------------------- main ---------------------------------- */
// async function crawl(startUrl, maxPages = 50) {
/**
 * Crawls a site breadth-first starting at `startUrl`, then writes the page,
 * link and error reports into the `reports/` directory.
 *
 * Per URL: fetch over HTTP, parse the HTML, re-fetch in a headless browser
 * when the static HTML looks like an empty shell, retry once on the `www.`
 * host if the page still has no headings, enqueue internal links, and record a
 * result row. A URL whose processing throws is recorded as an error row.
 * After the crawl, inlink counts, exact-duplicate flags and nearest-neighbour
 * similarity for titles and meta descriptions are added to every row.
 *
 * The module-level crawl state is cleared at the start of each call, so
 * consecutive crawls in one process do not see each other's data. Crawls must
 * not run concurrently in the same process: they share that state.
 *
 * @param {string} startUrl - Absolute URL to start from.
 * @param {number} [maxPages=50] - Maximum number of URLs to visit.
 * @returns {Promise<void>} Resolves once all reports have been written.
 */
export async function crawl(startUrl, maxPages = 50) {
  // Start from a clean slate; otherwise a second call would skip everything
  // already in `visited` and re-report rows from the previous run.
  visited.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;
  referrers.clear();

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      addEdge("sitemap.xml", u, u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is launched lazily, only if some page needs rendering.
  let shared = null;
  async function getShared() {
    if (!shared) shared = await createBrowserContext();
    return shared;
  }

  let homeTitle = null;

  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;
      const normUrl = normalizeUrl(url, { stripHash: true });
      if (visited.has(normUrl)) continue;
      visited.add(normUrl);

      // The list can grow by one entry: the www. variant of the URL.
      const attemptUrls = [normUrl];
      let usedWWWRetry = false;

      for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
        const currentUrl = attemptUrls[attempt];
        try {
          // 1) HTTP fetch
          let pageRes = await fetchWithGot(currentUrl);
          let parsed = {
            title: "", metaDesc: "", h1_1: "", h1_2: "", h2_1: "", h2_2: "",
            totalHeadings: 0, canonical: "", robotsMeta: "", noindex: false, nofollow: false,
            internalLinks: new Set(), rawLinks: [],
            lang: "", wordCount: 0, bodyText: "", schemaTypes: []
          };
          if (pageRes.contentType.includes("text/html")) {
            const p = parseHtml(pageRes.body || "", currentUrl);
            parsed = { ...parsed, ...p };
          }
          if (!homeTitle && new URL(currentUrl).pathname === "/") {
            homeTitle = parsed.title || "";
          }

          // 2) Render if needed
          if (pageRes.contentType.includes("text/html") && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
            const s = await getShared();
            const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
            if (rendered.domExtract) {
              pageRes = { ...rendered, body: null };
              parsed = {
                ...parsed,
                title: rendered.domExtract.title,
                metaDesc: rendered.domExtract.metaDesc,
                h1_1: rendered.domExtract.h1_1,
                h1_2: rendered.domExtract.h1_2,
                h2_1: rendered.domExtract.h2_1,
                h2_2: rendered.domExtract.h2_2,
                totalHeadings: rendered.domExtract.totalHeadings,
                canonical: rendered.domExtract.canonical,
                robotsMeta: rendered.domExtract.robotsMeta,
                noindex: rendered.domExtract.noindex,
                nofollow: rendered.domExtract.nofollow,
                internalLinks: rendered.domExtract.links,
                rawLinks: rendered.domExtract.rawLinks,
                lang: rendered.domExtract.lang || parsed.lang,
                bodyText: rendered.domExtract.bodyText || parsed.bodyText,
                wordCount: (rendered.domExtract.bodyText || "").split(/\s+/).filter(Boolean).length,
                schemaTypes: rendered.domExtract.schemaTypes
              };
            }
          }

          // If still looks empty, try www once
          if (!usedWWWRetry && parsed.totalHeadings === 0 && !parsed.h1_1) {
            attemptUrls.push(withWWW(currentUrl));
            usedWWWRetry = true;
            continue;
          }

          // Enqueue internal links + record provenance
          for (const link of parsed.internalLinks) {
            if (isInternal(start, link)) {
              const ln = normalizeUrl(link, { stripHash: true });
              const rawMatch = (parsed.rawLinks || []).find(r => r.abs === link)?.raw ?? "";
              addEdge(currentUrl, rawMatch, ln, pageRes.render_mode);
              if (!visited.has(ln)) queue.push(ln);
            }
          }

          // ---- Per-page metrics & enrichers ----
          const title = parsed.title || "";
          const metaDesc = parsed.metaDesc || "";
          const h1_1 = parsed.h1_1 || "";
          const h1_2 = parsed.h1_2 || "";
          const lang = parsed.lang || "";
          const bodyText = parsed.bodyText || "";
          const wordCount = parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
          const titlePx = measurePixelWidth(title, 16, "arial");
          const descPx = measurePixelWidth(metaDesc, 14, "arial");
          const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
          const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
          const read = computeReadability(bodyText);
          const headers = pageRes.headers || {};
          const xRobots = headers["x-robots-tag"] ?? "";
          const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
          const setCookie = !!headers["set-cookie"];
          const outlinks = parsed.internalLinks.size;

          // Save page row
          results.push({
            url: currentUrl,
            status: pageRes.status,
            status_text: pageRes.status_text ?? "",
            time_ms: pageRes.time_ms,
            bytes: pageRes.bytes,
            content_type: pageRes.contentType,
            http_version: pageRes.httpVersion ?? "",
            title,
            title_length: title.length,
            title_pixel_width: titlePx,
            meta_description: metaDesc,
            meta_description_length: metaDesc.length,
            meta_description_pixel_width: descPx,
            h1_1,
            h1_1_length: h1_1.length,
            h1_1_pixel_width: h1_1_px,
            h1_2,
            h1_2_length: h1_2.length,
            h1_2_pixel_width: h1_2_px,
            h2_1: parsed.h2_1 || "",
            h2_2: parsed.h2_2 || "",
            canonical: parsed.canonical,
            robots_meta: parsed.robotsMeta,
            x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
            noindex: parsed.noindex,
            nofollow: parsed.nofollow,
            lang,
            word_count: wordCount,
            flesch_reading_ease: read.flesch_reading_ease ?? "",
            flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
            gunning_fog: read.gunning_fog ?? "",
            coleman_liau: read.coleman_liau ?? "",
            ari: read.ari ?? "",
            smog: read.smog ?? "",
            schema_types: parsed.schemaTypes || [],
            inlinks: 0, // placeholder; filled in once every edge is known
            outlinks,
            render_mode: pageRes.render_mode,
            last_modified: lastModified,
            set_cookie: setCookie,
            crawl_timestamp: new Date().toISOString()
          });
          console.log(
            `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${title || h1_1}`
          );
          break; // success for this URL; stop attempts
        } catch (err) {
          console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
          results.push({
            url: currentUrl,
            status: null, status_text: "", time_ms: null, bytes: null, content_type: "",
            http_version: "", title: "", title_length: 0, title_pixel_width: "",
            meta_description: "", meta_description_length: 0, meta_description_pixel_width: "",
            h1_1: "", h1_1_length: 0, h1_1_pixel_width: "", h1_2: "", h1_2_length: 0, h1_2_pixel_width: "",
            h2_1: "", h2_2: "",
            canonical: "", robots_meta: "", x_robots_tag: "", noindex: false, nofollow: false,
            lang: "", word_count: "", flesch_reading_ease: "", flesch_kincaid_grade: "",
            gunning_fog: "", coleman_liau: "", ari: "", smog: "",
            schema_types: [], inlinks: 0, outlinks: 0, render_mode: "error",
            last_modified: "", set_cookie: "", crawl_timestamp: new Date().toISOString()
          });
        }
      }
    }
  } finally {
    // Always release Chromium, even if the loop above threw.
    if (shared) await shared.browser.close();
  }

  // -------------------- Post-process: inlinks -----------------------------
  // Counted only now, because pages crawled later also contribute referrers.
  for (const r of results) {
    r.inlinks = (referrers.get(r.url) || []).length;
  }

  // -------------------- Post-process: duplicates & similarity -------------
  // Titles
  const titleMap = new Map();
  for (const r of results) {
    const key = (r.title || "").trim();
    if (!titleMap.has(key)) titleMap.set(key, []);
    titleMap.get(key).push(r);
  }
  for (const [t, arr] of titleMap.entries()) {
    if (!t) continue; // rows without a title get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
  }

  // Meta descriptions
  const descMap = new Map();
  for (const r of results) {
    const key = (r.meta_description || "").trim();
    if (!descMap.has(key)) descMap.set(key, []);
    descMap.get(key).push(r);
  }
  for (const [d, arr] of descMap.entries()) {
    if (!d) continue; // rows without a description get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
  }

  // Nearest neighbor similarities (within site, lightweight)
  const titleList = results.map(r => ({ url: r.url, text: (r.title || "").trim() }));
  const descList = results.map(r => ({ url: r.url, text: (r.meta_description || "").trim() }));
  for (const r of results) {
    // titles
    const others = titleList.filter(x => x.url !== r.url && x.text);
    const bestT = { rating: 0, target: "" };
    if (r.title && others.length) {
      const ratings = stringSimilarity.findBestMatch(r.title, others.map(x => x.text));
      const best = ratings.bestMatch;
      bestT.rating = best.rating;
      const idx = ratings.ratings.findIndex(x => x.rating === best.rating);
      bestT.target = others[idx]?.url || "";
    }
    r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
    r.nearest_title_url = bestT.target;

    // descriptions
    const othersD = descList.filter(x => x.url !== r.url && x.text);
    const bestD = { rating: 0, target: "" };
    if (r.meta_description && othersD.length) {
      const ratingsD = stringSimilarity.findBestMatch(r.meta_description, othersD.map(x => x.text));
      const best = ratingsD.bestMatch;
      bestD.rating = best.rating;
      const idx = ratingsD.ratings.findIndex(x => x.rating === best.rating);
      bestD.target = othersD[idx]?.url || "";
    }
    r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
    r.nearest_description_url = bestD.target;
  }

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);
  writePageReports(results);
  writeLinkEdges(edges);
  writeErrors(results);
}
// // CLI: node crawler.js https://site.com 200
// const START_URL = process.argv[2] || "https://example.com";
// const MAX_PAGES = Number(process.argv[3] || 100);
// crawl(START_URL, MAX_PAGES);