2025-10-09 10:10:50 +05:30

922 lines
29 KiB
JavaScript

// crawler.js
import got from "got";
import * as cheerio from "cheerio";
import normalizeUrl from "normalize-url";
import { isInternal } from "./utils/urlHelpers.js";
import { getSitemapUrls } from "./utils/sitemap.js";
import fs from "node:fs";
import path from "node:path";
import { chromium } from "playwright";
// NEW libs
import pixelWidth from "string-pixel-width";
import * as readability from "text-readability";
import stringSimilarity from "string-similarity";
/* ------------------------------ globals --------------------------------- */
// NOTE: We'll reset these at the start of crawl() so repeated runs don't share state.
// They are module-level, so every crawl() call in this process works on the same objects.
// Normalized URLs that have already been taken off the queue.
const visited = new Set();
// URLs waiting to be crawled; crawl() appends with push() and consumes with shift().
const queue = [];
// One row object per crawled page (or per failed fetch attempt).
const results = [];
// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }
// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>
// User-agent string used for both the got requests (via REAL_HEADERS) and the Playwright context.
const REAL_UA =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// Request headers sent with every got request and installed as extra HTTP headers
// on the Playwright browser context.
const REAL_HEADERS = {
"user-agent": REAL_UA,
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"upgrade-insecure-requests": "1",
};
/* ------------------------------ utils ----------------------------------- */
/**
 * Serialize a single value as one CSV field.
 *
 * `null` and `undefined` become an empty field. Every other value is
 * stringified, and the field is wrapped in double quotes (with embedded quotes
 * doubled) whenever it contains a quote, a comma, a line feed or a carriage
 * return, as RFC 4180 requires.
 *
 * @param {*} v - value to serialize
 * @returns {string} the escaped CSV field
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // A bare "\r" must be quoted too, otherwise readers that treat CR as a
  // record separator split the row in the middle of the field.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
/**
 * Make sure `dir` exists, creating it (and any missing parents) when nothing
 * exists at that path yet.
 *
 * @param {string} dir - directory path to create if absent
 * @returns {void}
 */
function ensureDir(dir) {
  const alreadyThere = fs.existsSync(dir);
  if (alreadyThere) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Persist the page-level crawl report under `reports/`.
 *
 * The JSON report (`reports/crawl-<stamp>.json`) is always written. A CSV with
 * a Screaming-Frog-ish column layout is written next to it only when
 * `options.csv` is `true`; the CSV rows are not built at all otherwise.
 *
 * @param {object[]} rows - page rows as produced by crawl()
 * @param {{csv?: boolean}} [options] - set `csv: true` to also write the CSV
 * @returns {{json: string, csv?: string}} absolute paths of the files written
 */
function writePageReports(rows, options = {}) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);

  fs.writeFileSync(`${base}.json`, JSON.stringify(rows, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);
  const files = { json: path.resolve(`${base}.json`) };

  if (options.csv === true) {
    // Columns (a Screaming-Frog-ish shape with our extras). A plain string is
    // a row property emitted as-is (csvEscape turns null/undefined into "");
    // a [header, getter] pair is used where the cell needs a default or a
    // transformation.
    const columns = [
      "url",
      "status",
      "status_text",
      "time_ms",
      "bytes",
      "content_type",
      "http_version",
      "title",
      "title_length",
      "title_pixel_width",
      "meta_description",
      "meta_description_length",
      "meta_description_pixel_width",
      "h1_1",
      ["h1_1_length", (r) => r.h1_1_length ?? 0],
      "h1_1_pixel_width",
      "h1_2",
      ["h1_2_length", (r) => r.h1_2_length ?? 0],
      "h1_2_pixel_width",
      "h2_1",
      "h2_2",
      "canonical",
      "robots_meta",
      "x_robots_tag",
      "noindex",
      "nofollow",
      "lang",
      "word_count",
      "flesch_reading_ease",
      "flesch_kincaid_grade",
      "gunning_fog",
      "coleman_liau",
      "ari",
      "smog",
      ["schema_types", (r) => (Array.isArray(r.schema_types) ? r.schema_types.join("|") : "")],
      ["inlinks", (r) => r.inlinks ?? 0],
      ["outlinks", (r) => r.outlinks ?? 0],
      "render_mode",
      "last_modified",
      ["set_cookie", (r) => (r.set_cookie ? "yes" : "no")],
      "crawl_timestamp",
      "duplicate_title_exact",
      "nearest_title_similarity",
      "nearest_title_url",
      "duplicate_description_exact",
      "nearest_description_similarity",
      "nearest_description_url",
    ].map((col) => (typeof col === "string" ? [col, (r) => r[col]] : col));

    const lines = [columns.map(([header]) => header).join(",")];
    for (const r of rows) {
      lines.push(columns.map(([, read]) => csvEscape(read(r))).join(","));
    }
    fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
    console.log(`📝 CSV report saved: ${base}.csv`);
    files.csv = path.resolve(`${base}.csv`);
  }

  return files;
}
/**
 * Write the link-provenance CSV (`reports/links-<stamp>.csv`), one line per
 * discovered edge, preceded by a header line.
 *
 * @param {Iterable<{from: *, raw_href: *, to: *, discovered_by: *}>} edges
 * @returns {{linksCsv: string}} absolute path of the CSV that was written
 */
function writeLinkEdges(edges) {
  ensureDir("reports");

  const timestamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const target = path.join("reports", `links-${timestamp}.csv`);

  const toCsvLine = (cells) => cells.map(csvEscape).join(",");
  const headerLine = ["from", "raw_href", "to", "discovered_by"].join(",");
  const edgeLines = [...edges].map((edge) =>
    toCsvLine([edge.from, edge.raw_href, edge.to, edge.discovered_by])
  );

  fs.writeFileSync(target, [headerLine, ...edgeLines].join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${target}`);

  return { linksCsv: path.resolve(target) };
}
/**
 * Write the error report (`reports/errors-<stamp>.csv`).
 *
 * Only rows whose `status` is non-null and >= 400 are reported. Each such row
 * is emitted once per entry in the module-level `referrers` map for its URL,
 * or once with empty referrer columns when no referrer is known.
 *
 * @param {object[]} rows - page rows as produced by crawl()
 * @returns {{errorsCsv: string}} absolute path of the CSV that was written
 */
function writeErrors(rows) {
  ensureDir("reports");

  const timestamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const target = path.join("reports", `errors-${timestamp}.csv`);

  const toCsvLine = (cells) => cells.map(csvEscape).join(",");
  const out = [["url", "status", "title", "from_page", "raw_href", "discovered_by"].join(",")];

  for (const row of rows) {
    const isHttpError = row && row.status !== null && row.status >= 400;
    if (!isHttpError) continue;

    const sources = referrers.get(row.url) || [];
    if (sources.length === 0) {
      out.push(toCsvLine([row.url, row.status, row.title, "", "", ""]));
      continue;
    }
    for (const src of sources) {
      out.push(
        toCsvLine([row.url, row.status, row.title, src.from, src.raw_href, src.discovered_by])
      );
    }
  }

  fs.writeFileSync(target, out.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${target}`);

  return { errorsCsv: path.resolve(target) };
}
/**
 * Record one discovered link in both module-level provenance structures: the
 * flat `edges` list and the `referrers` map keyed by the target URL.
 *
 * @param {string} from - page (or source label) the link was found on
 * @param {string} [rawHref] - href as written in the source; falsy values are stored as ""
 * @param {string} to - target URL
 * @param {string} discovered_by - how the link was found
 * @returns {void}
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  const incoming = referrers.has(to) ? referrers.get(to) : referrers.set(to, []).get(to);
  incoming.push({ from, raw_href, discovered_by });
}
/* ---------------------- parse HTML without JS --------------------------- */
/**
 * Parse JSON without throwing.
 *
 * @param {string} txt - JSON text
 * @returns {*} the parsed value, or `null` when `txt` is not valid JSON
 */
function safeJsonParse(txt) {
  let value = null;
  try {
    value = JSON.parse(txt);
  } catch {
    // Invalid JSON is an expected input here; the caller gets null.
  }
  return value;
}
/**
 * Collect the distinct `@type` values found anywhere inside the page's
 * `<script type="application/ld+json">` blocks.
 *
 * Scripts that do not parse as JSON (or parse to a falsy value) are ignored.
 * Types are returned in first-seen order of a depth-first walk.
 *
 * @param {Function} $ - cheerio-style selector function for the document
 * @returns {string[]} unique `@type` strings
 */
function parseSchemaTypes($) {
  const seen = new Set();

  // Depth-first walk: record this node's own @type, then descend into values.
  const walk = (node) => {
    if (!node || typeof node !== "object") return;
    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    const declared = node["@type"];
    if (typeof declared === "string") {
      seen.add(declared);
    } else if (Array.isArray(declared)) {
      for (const entry of declared) {
        if (typeof entry === "string") seen.add(entry);
      }
    }
    // nested
    for (const child of Object.values(node)) walk(child);
  };

  $('script[type="application/ld+json"]').each((_, el) => {
    const data = safeJsonParse($(el).contents().text());
    if (data) walk(data);
  });

  return Array.from(seen);
}
/**
 * Extract SEO-relevant fields from raw (non-rendered) HTML.
 *
 * @param {string} html - HTML source of the page
 * @param {string} url - page URL, used as the base for resolving link hrefs
 * @returns {object} title, metaDesc, the first two H1/H2 texts, totalHeadings,
 *   canonical, robotsMeta plus noindex/nofollow flags, internalLinks (a Set of
 *   every resolvable absolute link; no host filtering happens here), rawLinks
 *   (`{raw, abs}` pairs), lang, wordCount, schemaTypes and bodyText
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);

  const trimmedAttr = (selector, attr) => ($(selector).attr(attr) || "").trim();
  const textsOf = (selector) =>
    $(selector)
      .map((_, el) => $(el).text().trim())
      .get();

  // Headings (capture top two H1s and H2s)
  const h1Texts = textsOf("h1");
  const h2Texts = textsOf("h2");
  const h1_1 = h1Texts[0] || "";
  const h1_2 = h1Texts[1] || "";
  const h2_1 = h2Texts[0] || "";
  const h2_2 = h2Texts[1] || "";
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // Title: <title> first, then og:title, twitter:title, and finally the first H1.
  const docTitle = ($("title").first().text() || "").trim();
  const ogTitle = $('meta[property="og:title"]').attr("content") || "";
  const twTitle = $('meta[name="twitter:title"]').attr("content") || "";
  const title = docTitle || (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = trimmedAttr('meta[name="description"]', "content");
  const canonical = trimmedAttr('link[rel="canonical"]', "href");
  const robotsMeta = trimmedAttr('meta[name="robots"]', "content");
  const lang = trimmedAttr("html", "lang");

  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);

  // Basic text body for word count / readability
  const bodyText = ($("main").text() || $("body").text() || "").replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Links + raw href; hrefs that cannot be resolved against `url` are dropped.
  const internalLinks = new Set();
  const rawLinks = [];
  for (const anchor of $("a[href]").get()) {
    const href = $(anchor).attr("href");
    if (!href) continue;
    let abs;
    try {
      abs = new URL(href, url).toString();
    } catch {
      continue;
    }
    rawLinks.push({ raw: href, abs });
    internalLinks.add(abs);
  }

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1,
    h1_2,
    h2_1,
    h2_2,
    totalHeadings,
    canonical,
    robotsMeta,
    noindex,
    nofollow,
    internalLinks,
    rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText,
  };
}
/* ------------------------------ fetchers -------------------------------- */
/**
 * Fetch `url` over plain HTTP with got, using the browser-like REAL_HEADERS.
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`); network-level
 * failures and the 20 s request timeout still reject.
 *
 * @param {string} url - URL to fetch
 * @returns {Promise<object>} status, status_text, time_ms, contentType
 *   (lower-cased header value), body, bytes (Content-Length when present,
 *   otherwise the UTF-8 byte length of the body), render_mode "http",
 *   httpVersion and the raw response headers
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const response = await got(url, {
    headers: REAL_HEADERS,
    http2: false,
    throwHttpErrors: false,
    timeout: { request: 20000 },
  });
  const elapsedMs = Date.now() - startedAt;

  const declaredLength = response.headers["content-length"];
  let bytes;
  if (declaredLength) {
    bytes = Number(declaredLength);
  } else {
    bytes = Buffer.byteLength(response.body || "", "utf8");
  }

  return {
    status: response.statusCode ?? null,
    status_text: response.statusMessage ?? "",
    time_ms: elapsedMs,
    contentType: (response.headers["content-type"] || "").toLowerCase(),
    body: response.body,
    bytes,
    render_mode: "http",
    httpVersion: response.httpVersion ?? "",
    headers: response.headers,
  };
}
/**
 * Launch headless Chromium and create the single browser context shared by all
 * rendered fetches of a crawl.
 *
 * The context uses the same user agent and headers as the plain HTTP fetcher
 * and installs an init script that overrides a few `navigator` properties.
 * If setting up the context fails, the freshly launched browser is closed
 * before the error is rethrown, so no Chromium process is left behind.
 *
 * @returns {Promise<{browser: object, context: object}>} the launched browser
 *   and its context; the caller is responsible for closing the browser
 * @throws whatever `chromium.launch`, `browser.newContext` or
 *   `context.addInitScript` throws
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });
  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });
    return { browser, context };
  } catch (err) {
    // Best-effort cleanup: a failure to close must not mask the original error.
    await browser.close().catch(() => {});
    throw err;
  }
}
/**
 * Load `url` in a new page of the shared Playwright context and extract the
 * SEO fields from the rendered DOM.
 *
 * Errors thrown by `page.goto` or `page.evaluate` propagate to the caller; the
 * later waits are best-effort and their failures are ignored. The page is
 * always closed before returning or throwing.
 *
 * @param {string} url - URL to navigate to
 * @param {{context: object}} shared - object whose `context.newPage()` is used
 * @returns {Promise<object>} status, status_text, time_ms, contentType, bytes,
 *   render_mode "rendered", headers (main response headers, or {}), and
 *   `domExtract` with the fields read from the DOM
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
const page = await shared.context.newPage();
const t0 = Date.now();
let status = null,
mainHeaders = {},
statusText = "";
try {
// Navigation only waits for DOMContentLoaded; the waits below give client-side rendering time to finish.
const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
status = resp?.status() ?? null;
statusText = resp?.statusText() ?? "";
try {
mainHeaders = resp ? await resp.headers() : {};
} catch {}
// Best-effort: if the network never goes idle within 12 s, continue with whatever has loaded.
try {
await page.waitForLoadState("networkidle", { timeout: 12000 });
} catch {}
// Best-effort (8 s): wait until <main> (or <body>) has more than 160 characters of text,
// or a heading-like element exists.
try {
await page.waitForFunction(
() => {
const main = document.querySelector("main") || document.body;
const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
const hasHeading = !!document.querySelector(
"h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']"
);
return textLen > 160 || hasHeading;
},
{ timeout: 8000 }
);
} catch {}
// Everything inside this callback runs in the page, so it can only use browser globals
// and must return plain serializable data.
const dom = await page.evaluate(() => {
const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
const getTextList = (sel) =>
Array.from(document.querySelectorAll(sel))
.map((el) => clean(el.textContent))
.filter(Boolean);
const title = document.title || "";
const ogTitle = document.querySelector('meta[property="og:title"]')?.content || "";
const twTitle = document.querySelector('meta[name="twitter:title"]')?.content || "";
const metaDesc = document.querySelector('meta[name="description"]')?.content || "";
const canonical = document.querySelector('link[rel="canonical"]')?.href || "";
const robotsMeta = document.querySelector('meta[name="robots"]')?.content || "";
const lang = document.documentElement.getAttribute("lang") || "";
const h1 = getTextList("h1");
const h2 = getTextList("h2");
const totalHeadings = document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length;
// Keep both the href as written and its absolute form; unresolvable hrefs are dropped.
const links = Array.from(document.querySelectorAll("a[href]"))
.map((a) => {
const raw = a.getAttribute("href");
try {
return { raw, abs: new URL(raw, location.href).toString() };
} catch {
return null;
}
})
.filter(Boolean);
const bodyText = clean((document.querySelector("main") || document.body).innerText || "");
// JSON-LD is returned as raw text and parsed outside the page.
const schemaScripts = Array.from(
document.querySelectorAll('script[type="application/ld+json"]')
).map((s) => s.textContent || "");
return {
htmlLen: (document.documentElement.outerHTML || "").length,
title,
ogTitle,
twTitle,
metaDesc,
canonical,
robotsMeta,
lang,
h1,
h2,
totalHeadings,
links,
bodyText,
schemaScripts,
};
});
// Parse schema types from strings (outside of page)
const schemaTypes = [];
for (const raw of dom.schemaScripts || []) {
try {
const parsed = JSON.parse(raw);
// Recursively collect every "@type" (string or array of strings) in the JSON-LD tree.
const collect = (obj) => {
if (!obj) return;
if (Array.isArray(obj)) {
obj.forEach(collect);
return;
}
if (typeof obj === "object") {
const t = obj["@type"];
if (typeof t === "string") schemaTypes.push(t);
else if (Array.isArray(t)) t.forEach((x) => typeof x === "string" && schemaTypes.push(x));
Object.values(obj).forEach(collect);
}
};
collect(parsed);
} catch {}
}
// Elapsed time covers navigation, the best-effort waits and the DOM extraction.
const dt = Date.now() - t0;
const robotsLower = (dom.robotsMeta || "").toLowerCase();
const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
// Title fallback order: document.title, og:title, twitter:title, first H1.
const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();
return {
status,
status_text: statusText,
time_ms: dt,
// NOTE(review): hard-coded rather than read from the response's content-type header.
contentType: "text/html",
// Character length of the serialized DOM (outerHTML.length), not a byte count.
bytes: dom.htmlLen || 0,
render_mode: "rendered",
headers: mainHeaders,
domExtract: {
title: finalTitle,
metaDesc: dom.metaDesc || "",
canonical: dom.canonical || "",
robotsMeta: dom.robotsMeta || "",
lang: dom.lang || "",
noindex,
nofollow,
h1_1: dom.h1?.[0] || "",
h1_2: dom.h1?.[1] || "",
h2_1: dom.h2?.[0] || "",
h2_2: dom.h2?.[1] || "",
totalHeadings: dom.totalHeadings || 0,
links: new Set((dom.links || []).map((l) => l.abs)),
rawLinks: dom.links || [],
bodyText: dom.bodyText || "",
schemaTypes: Array.from(new Set(schemaTypes)),
},
};
} finally {
// Always release the page, including when navigation or extraction throws.
await page.close();
}
}
/* ------------------------- render decision ------------------------------ */
/**
 * Decide whether a page fetched over plain HTTP should be re-fetched with the
 * headless browser.
 *
 * Rendering is requested when the response is under 4000 bytes (tiny HTML
 * shell), when no headings were parsed, or when a non-root page carries
 * exactly the home page's title.
 *
 * @param {string} currentUrl - absolute URL of the page (must be parseable)
 * @param {{bytes?: number}} httpRes - result of the HTTP fetch
 * @param {{totalHeadings: number, title?: string}} parsed - fields parsed from the raw HTML
 * @param {string|null} homeTitle - title of the home page, if known
 * @returns {boolean} true when the page should be rendered
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const onRootPath = new URL(currentUrl).pathname === "/";
  return (
    (httpRes.bytes ?? 0) < 4000 || // tiny HTML shell
    parsed.totalHeadings === 0 ||
    Boolean(homeTitle && parsed.title && parsed.title === homeTitle && !onRootPath)
  );
}
/**
 * Return `urlStr` with a `www.` prefix on its hostname.
 *
 * URLs whose hostname already starts with `www.` keep their hostname (the
 * string is still re-serialized). Input that cannot be parsed as a URL is
 * returned unchanged.
 *
 * @param {string} urlStr - absolute URL
 * @returns {string} the serialized `www.` URL, or `urlStr` itself when unparseable
 */
function withWWW(urlStr) {
  try {
    const target = new URL(urlStr);
    const hasPrefix = target.hostname.startsWith("www.");
    if (!hasPrefix) {
      target.hostname = `www.${target.hostname}`;
    }
    return target.href;
  } catch {
    return urlStr;
  }
}
/* ------------------------ per-page enrichers ---------------------------- */
/**
 * Estimate the rendered width of `text` in pixels.
 *
 * Delegates to `pixelWidth`; if that call throws, falls back to a rough
 * estimate of half the font size per character.
 *
 * @param {string} text - text to measure; falsy input measures as 0
 * @param {number} [size=16] - font size passed to the measurer
 * @param {string} [font="arial"] - font name passed to the measurer
 * @returns {number} estimated width in pixels
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) return 0;
  let width;
  try {
    width = pixelWidth(text, { font, size });
  } catch {
    // Measurer failed: approximate every character as half the font size wide.
    width = Math.round(text.length * size * 0.5);
  }
  return width;
}
/**
 * Compute readability scores for `text`.
 *
 * The readability API is looked up on the imported namespace first and, when
 * the functions are not there, on its `default` export. A CommonJS package
 * imported with `import * as ns` exposes `module.exports` only as
 * `ns.default`; without this lookup every call would throw and every metric
 * would silently come out empty.
 *
 * Each metric is computed independently: one that throws is simply left out of
 * the result.
 *
 * @param {string} text - body text; only the first 200000 characters are scored
 * @returns {{flesch_reading_ease?: number, flesch_kincaid_grade?: number,
 *   gunning_fog?: number, coleman_liau?: number, ari?: number, smog?: number}}
 *   the metrics that could be computed (`{}` for empty input)
 */
function computeReadability(text) {
  if (!text) return {};
  const safe = text.slice(0, 200000); // cap

  const api =
    typeof readability.fleschReadingEase === "function"
      ? readability
      : readability.default ?? readability;

  // [result key, method name on the readability API]
  const metrics = [
    ["flesch_reading_ease", "fleschReadingEase"],
    ["flesch_kincaid_grade", "fleschKincaidGrade"],
    ["gunning_fog", "gunningFog"],
    ["coleman_liau", "colemanLiauIndex"],
    ["ari", "automatedReadabilityIndex"],
    ["smog", "smogIndex"],
  ];

  const out = {};
  for (const [key, method] of metrics) {
    try {
      // Called as a method so implementations that rely on `this` keep working.
      out[key] = api[method](safe);
    } catch {
      // Metric unavailable for this text (or missing from the API): leave it unset.
    }
  }
  return out;
}
/* -------------------------------- main ---------------------------------- */
/**
 * Crawl a site and return a structured report.
 *
 * Starting from `startUrl` (plus any URLs found in the sitemap), pages are
 * fetched over plain HTTP, re-fetched with a headless browser when the raw
 * HTML looks like an empty shell, and their internal links are queued until
 * `maxPages` URLs have been visited. After the crawl, inlink counts are
 * finalized and exact/nearest duplicates of titles and meta descriptions are
 * annotated on each row.
 *
 * The crawl uses module-level state that is reset on entry, so concurrent
 * calls in the same process would interfere with each other, and the returned
 * `results` array is reused by the next call.
 *
 * @param {string} startUrl - URL to start from; must be a valid URL
 * @param {number} [maxPages=50] - maximum number of URLs to visit
 * @param {(tick:any)=>void} [onProgress] optional callback for progress events;
 *   exceptions thrown by it are ignored
 * @param {{persistReports?: boolean, collectPages?: boolean}} [options]
 *   `persistReports` (default true) writes the report files under `reports/`;
 *   `collectPages` is not read by this function
 * @returns {Promise<{ results: any[], files: Record<string,string>, total: number }>}
 */
export async function crawl(startUrl, maxPages = 50, onProgress, options = {}) {
  const persistReports = options.persistReports !== false; // default true

  // Reset global state per run
  visited.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;
  referrers.clear();

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      addEdge("sitemap.xml", u, u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is launched lazily, only if some page actually needs rendering.
  let shared = null;
  async function getShared() {
    if (!shared) shared = await createBrowserContext();
    return shared;
  }

  let homeTitle = null;

  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;

      // A malformed queued URL (e.g. from the sitemap) must not abort the whole crawl.
      let normUrl;
      try {
        normUrl = normalizeUrl(url, { stripHash: true });
      } catch (e) {
        console.error(`[SKIP] invalid URL in queue: ${url} -> ${e.message}`);
        continue;
      }
      if (visited.has(normUrl)) continue;
      visited.add(normUrl);

      const attemptUrls = [normUrl];
      let usedWWWRetry = false;

      for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
        const currentUrl = attemptUrls[attempt];
        try {
          // 1) HTTP fetch
          let pageRes = await fetchWithGot(currentUrl);
          let parsed = {
            title: "",
            metaDesc: "",
            h1_1: "",
            h1_2: "",
            h2_1: "",
            h2_2: "",
            totalHeadings: 0,
            canonical: "",
            robotsMeta: "",
            noindex: false,
            nofollow: false,
            internalLinks: new Set(),
            rawLinks: [],
            lang: "",
            wordCount: 0,
            bodyText: "",
            schemaTypes: [],
          };
          if (pageRes.contentType.includes("text/html")) {
            const p = parseHtml(pageRes.body || "", currentUrl);
            parsed = { ...parsed, ...p };
          }
          if (!homeTitle && new URL(currentUrl).pathname === "/") {
            homeTitle = parsed.title || "";
          }

          // 2) Render if needed
          if (
            pageRes.contentType.includes("text/html") &&
            shouldRender(currentUrl, pageRes, parsed, homeTitle)
          ) {
            const s = await getShared();
            const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
            if (rendered.domExtract) {
              pageRes = { ...rendered, body: null };
              parsed = {
                ...parsed,
                title: rendered.domExtract.title,
                metaDesc: rendered.domExtract.metaDesc,
                h1_1: rendered.domExtract.h1_1,
                h1_2: rendered.domExtract.h1_2,
                h2_1: rendered.domExtract.h2_1,
                h2_2: rendered.domExtract.h2_2,
                totalHeadings: rendered.domExtract.totalHeadings,
                canonical: rendered.domExtract.canonical,
                robotsMeta: rendered.domExtract.robotsMeta,
                noindex: rendered.domExtract.noindex,
                nofollow: rendered.domExtract.nofollow,
                internalLinks: rendered.domExtract.links,
                rawLinks: rendered.domExtract.rawLinks,
                lang: rendered.domExtract.lang || parsed.lang,
                bodyText: rendered.domExtract.bodyText || parsed.bodyText,
                wordCount: (rendered.domExtract.bodyText || "")
                  .split(/\s+/)
                  .filter(Boolean).length,
                schemaTypes: rendered.domExtract.schemaTypes,
              };
            }
          }

          // If still looks empty, try www once. When the host already starts
          // with "www." the retry URL is identical, so refetching is pointless.
          if (!usedWWWRetry && parsed.totalHeadings === 0 && !parsed.h1_1) {
            usedWWWRetry = true;
            const wwwUrl = withWWW(currentUrl);
            if (wwwUrl !== currentUrl) {
              attemptUrls.push(wwwUrl);
              continue;
            }
          }

          // Enqueue internal links + record provenance.
          // Index raw hrefs by absolute URL once (first occurrence wins).
          const rawByAbs = new Map();
          for (const r of parsed.rawLinks || []) {
            if (!rawByAbs.has(r.abs)) rawByAbs.set(r.abs, r.raw);
          }
          for (const link of parsed.internalLinks) {
            if (isInternal(start, link)) {
              const ln = normalizeUrl(link, { stripHash: true });
              const rawMatch = rawByAbs.get(link) ?? "";
              addEdge(currentUrl, rawMatch, ln, pageRes.render_mode);
              if (!visited.has(ln)) queue.push(ln);
            }
          }

          // ---- Per-page metrics & enrichers ----
          const title = parsed.title || "";
          const metaDesc = parsed.metaDesc || "";
          const h1_1 = parsed.h1_1 || "";
          const h1_2 = parsed.h1_2 || "";
          const lang = parsed.lang || "";
          const bodyText = parsed.bodyText || "";
          const wordCount =
            parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
          const titlePx = measurePixelWidth(title, 16, "arial");
          const descPx = measurePixelWidth(metaDesc, 14, "arial");
          const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
          const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
          const read = computeReadability(bodyText);
          const headers = pageRes.headers || {};
          const xRobots = headers["x-robots-tag"] ?? "";
          const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
          const setCookie = !!headers["set-cookie"];
          const outlinks = parsed.internalLinks.size;
          // Provisional count (used for progress events); finalized after the crawl.
          const inlinks = (referrers.get(currentUrl) || []).length;

          // Save page row
          results.push({
            url: currentUrl,
            status: pageRes.status,
            status_text: pageRes.status_text ?? "",
            time_ms: pageRes.time_ms,
            bytes: pageRes.bytes,
            content_type: pageRes.contentType,
            http_version: pageRes.httpVersion ?? "",
            title,
            title_length: title.length,
            title_pixel_width: titlePx,
            meta_description: metaDesc,
            meta_description_length: metaDesc.length,
            meta_description_pixel_width: descPx,
            h1_1,
            h1_1_length: h1_1.length,
            h1_1_pixel_width: h1_1_px,
            h1_2,
            h1_2_length: h1_2.length,
            h1_2_pixel_width: h1_2_px,
            h2_1: parsed.h2_1 || "",
            h2_2: parsed.h2_2 || "",
            canonical: parsed.canonical,
            robots_meta: parsed.robotsMeta,
            x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
            noindex: parsed.noindex,
            nofollow: parsed.nofollow,
            lang,
            word_count: wordCount,
            flesch_reading_ease: read.flesch_reading_ease ?? "",
            flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
            gunning_fog: read.gunning_fog ?? "",
            coleman_liau: read.coleman_liau ?? "",
            ari: read.ari ?? "",
            smog: read.smog ?? "",
            schema_types: parsed.schemaTypes || [],
            inlinks,
            outlinks,
            render_mode: pageRes.render_mode,
            last_modified: lastModified,
            set_cookie: setCookie,
            crawl_timestamp: new Date().toISOString(),
          });
          console.log(
            `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${
              title || h1_1
            }`
          );

          // optional progress callback (non-fatal)
          try {
            onProgress?.({
              url: currentUrl,
              status: pageRes.status,
              title,
              inlinks,
              outlinks,
              visited: visited.size,
              queued: queue.length,
            });
          } catch {}
          break; // success for this URL; stop attempts
        } catch (err) {
          console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
          results.push({
            url: currentUrl,
            status: null,
            status_text: "",
            time_ms: null,
            bytes: null,
            content_type: "",
            http_version: "",
            title: "",
            title_length: 0,
            title_pixel_width: "",
            meta_description: "",
            meta_description_length: 0,
            meta_description_pixel_width: "",
            h1_1: "",
            h1_1_length: 0,
            h1_1_pixel_width: "",
            h1_2: "",
            h1_2_length: 0,
            h1_2_pixel_width: "",
            h2_1: "",
            h2_2: "",
            canonical: "",
            robots_meta: "",
            x_robots_tag: "",
            noindex: false,
            nofollow: false,
            lang: "",
            word_count: "",
            flesch_reading_ease: "",
            flesch_kincaid_grade: "",
            gunning_fog: "",
            coleman_liau: "",
            ari: "",
            smog: "",
            schema_types: [],
            inlinks: 0,
            outlinks: 0,
            render_mode: "error",
            last_modified: "",
            set_cookie: "",
            crawl_timestamp: new Date().toISOString(),
          });
          try {
            onProgress?.({
              url: currentUrl,
              error: String(err?.message || err),
              visited: visited.size,
              queued: queue.length,
            });
          } catch {}
        }
      }
    }
  } finally {
    // Always release the browser, even when the crawl loop throws.
    if (shared) await shared.browser.close();
  }

  // Inlinks recorded while crawling only reflect the edges known at that
  // moment; now that every edge has been discovered, store the final counts.
  for (const r of results) {
    r.inlinks = (referrers.get(r.url) || []).length;
  }

  // -------------------- Post-process: duplicates & similarity -------------
  // Titles
  const titleMap = new Map();
  for (const r of results) {
    const key = (r.title || "").trim();
    if (!titleMap.has(key)) titleMap.set(key, []);
    titleMap.get(key).push(r);
  }
  for (const [t, arr] of titleMap.entries()) {
    if (!t) continue; // rows without a title are not flagged either way
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
  }

  // Meta descriptions
  const descMap = new Map();
  for (const r of results) {
    const key = (r.meta_description || "").trim();
    if (!descMap.has(key)) descMap.set(key, []);
    descMap.get(key).push(r);
  }
  for (const [d, arr] of descMap.entries()) {
    if (!d) continue; // rows without a description are not flagged either way
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
  }

  // Nearest neighbor similarities (within site, lightweight)
  const titleList = results.map((r) => ({ url: r.url, text: (r.title || "").trim() }));
  const descList = results.map((r) => ({ url: r.url, text: (r.meta_description || "").trim() }));
  for (const r of results) {
    // titles
    const others = titleList.filter((x) => x.url !== r.url && x.text);
    const bestT = { rating: 0, target: "" };
    if (r.title && others.length) {
      const ratings = stringSimilarity.findBestMatch(r.title, others.map((x) => x.text));
      const best = ratings.bestMatch;
      bestT.rating = best.rating;
      const idx = ratings.ratings.findIndex((x) => x.rating === best.rating);
      bestT.target = others[idx]?.url || "";
    }
    r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
    r.nearest_title_url = bestT.target;

    // descriptions
    const othersD = descList.filter((x) => x.url !== r.url && x.text);
    const bestD = { rating: 0, target: "" };
    if (r.meta_description && othersD.length) {
      const ratingsD = stringSimilarity.findBestMatch(
        r.meta_description,
        othersD.map((x) => x.text)
      );
      const best = ratingsD.bestMatch;
      bestD.rating = best.rating;
      const idx = ratingsD.ratings.findIndex((x) => x.rating === best.rating);
      bestD.target = othersD[idx]?.url || "";
    }
    r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
    r.nearest_description_url = bestD.target;
  }

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);

  let files = {};
  if (persistReports) {
    const a = writePageReports(results);
    const b = writeLinkEdges(edges);
    const c = writeErrors(results);
    files = { ...a, ...b, ...c };
  }
  return { results, files, total: results.length };
}
// // CLI: node crawler.js https://site.com 200
// const START_URL = process.argv[2] || "https://example.com";
// const MAX_PAGES = Number(process.argv[3] || 100);
// crawl(START_URL, MAX_PAGES);