// 710 lines · 26 KiB · JavaScript
import got from "got";
|
|
import * as cheerio from "cheerio";
|
|
import normalizeUrl from "normalize-url";
|
|
import { isInternal } from "./utils/urlHelpers.js";
|
|
import { getSitemapUrls } from "./utils/sitemap.js";
|
|
import fs from "node:fs";
|
|
import path from "node:path";
|
|
import { chromium } from "playwright";
|
|
|
|
// NEW libs
|
|
import pixelWidth from "string-pixel-width";
|
|
import * as readability from "text-readability";
|
|
import stringSimilarity from "string-similarity";
|
|
|
|
/* ------------------------------ globals --------------------------------- */
|
|
// Normalized URLs that have already been taken off the queue.
const visited = new Set();

// FIFO frontier of URLs still waiting to be crawled.
const queue = [];

// One row per crawled (or failed) page; this is what the reports are built from.
const results = [];

// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }

// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>

// Desktop Chrome user agent sent by both the HTTP fetcher and the browser context.
const REAL_UA =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

// Request headers shared by every fetcher. Frozen so that one consumer cannot
// accidentally change what the other one sends.
const REAL_HEADERS = Object.freeze({
  "user-agent": REAL_UA,
  "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "upgrade-insecure-requests": "1",
});
|
|
|
|
/* ------------------------------ utils ----------------------------------- */
|
|
/**
 * Serializes one value as a CSV field.
 *
 * `null`/`undefined` become an empty field. Any value containing a double
 * quote, comma, carriage return or line feed is wrapped in double quotes with
 * embedded quotes doubled; everything else is emitted as `String(v)`.
 *
 * @param {*} v - value to serialize
 * @returns {string} the CSV-safe field text
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // A bare "\r" is a record separator for many CSV readers, so it needs quoting too.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
|
|
/**
 * Creates `dir` (and any missing parents) if it does not exist yet.
 *
 * `mkdirSync` with `recursive: true` is already a no-op for an existing
 * directory, so no separate existence check is needed (and none can race with
 * another process creating the directory in between).
 *
 * @param {string} dir - directory path to create
 * @throws {Error} if the path exists but is not a directory, or cannot be created
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
|
// Columns (a Screaming-Frog-ish shape with our extras), in output order.
const PAGE_CSV_COLUMNS = [
  "url", "status", "status_text", "time_ms", "bytes", "content_type", "http_version",
  "title", "title_length", "title_pixel_width",
  "meta_description", "meta_description_length", "meta_description_pixel_width",
  "h1_1", "h1_1_length", "h1_1_pixel_width", "h1_2", "h1_2_length", "h1_2_pixel_width",
  "h2_1", "h2_2",
  "canonical", "robots_meta", "x_robots_tag", "noindex", "nofollow",
  "lang", "word_count", "flesch_reading_ease", "flesch_kincaid_grade",
  "gunning_fog", "coleman_liau", "ari", "smog",
  "schema_types", "inlinks", "outlinks", "render_mode",
  "last_modified", "set_cookie", "crawl_timestamp",
  "duplicate_title_exact", "nearest_title_similarity", "nearest_title_url",
  "duplicate_description_exact", "nearest_description_similarity", "nearest_description_url"
];

// Numeric columns that default to 0 (rather than an empty field) when missing.
const PAGE_CSV_ZERO_DEFAULTS = new Set(["h1_1_length", "h1_2_length", "inlinks", "outlinks"]);

// Renders one result row as a CSV line following PAGE_CSV_COLUMNS.
function pageRowToCsvLine(r) {
  return PAGE_CSV_COLUMNS.map((column) => {
    if (column === "schema_types") {
      return Array.isArray(r.schema_types) ? r.schema_types.join("|") : "";
    }
    if (column === "set_cookie") return r.set_cookie ? "yes" : "no";
    if (PAGE_CSV_ZERO_DEFAULTS.has(column)) return r[column] ?? 0;
    return r[column]; // csvEscape turns null/undefined into an empty field
  }).map(csvEscape).join(",");
}

/**
 * Writes the page-level crawl report into `reports/crawl-<timestamp>.*`.
 *
 * The JSON report is always written. The CSV variant used to be assembled on
 * every call and then thrown away (its write was commented out); it is now
 * only built when explicitly requested, so the default output is unchanged.
 *
 * @param {Array<object>} results - one row object per crawled page
 * @param {object} [options]
 * @param {boolean} [options.csv=false] - also write `crawl-<timestamp>.csv`
 */
function writePageReports(results, { csv = false } = {}) {
  ensureDir("reports");
  // e.g. 2024-05-01T12:34:56.789Z -> 2024-05-01-12-34-56
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);

  fs.writeFileSync(`${base}.json`, JSON.stringify(results, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);

  if (!csv) return;

  const lines = [PAGE_CSV_COLUMNS.join(","), ...results.map(pageRowToCsvLine)];
  fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  console.log(`📝 Page CSV report saved: ${base}.csv`);
}
|
|
/**
 * Dumps every discovered link edge to `reports/links-<timestamp>.csv`.
 *
 * @param {Array<{from: string, raw_href: string, to: string, discovered_by: string}>} edges
 */
function writeLinkEdges(edges) {
  ensureDir("reports");

  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const columns = ["from", "raw_href", "to", "discovered_by"];
  const rows = edges.map((edge) =>
    [edge.from, edge.raw_href, edge.to, edge.discovered_by].map(csvEscape).join(",")
  );
  const csv = [columns.join(","), ...rows].join("\n");

  fs.writeFileSync(file, csv, "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
}
|
|
/**
 * Writes `reports/errors-<timestamp>.csv`: one line per (failing page, referrer)
 * pair for every result whose HTTP status is 400 or above. A failing page with
 * no recorded referrer still gets a single line with blank referrer columns.
 * Rows whose status is `null` (the fetch itself failed) are not included.
 *
 * @param {Array<object>} results - page rows produced by the crawl
 */
function writeErrors(results) {
  ensureDir("reports");

  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);

  const columns = ["url", "status", "title", "from_page", "raw_href", "discovered_by"];
  const noReferrer = [{ from: "", raw_href: "", discovered_by: "" }];

  const failing = results.filter((row) => row && row.status !== null && row.status >= 400);
  const body = failing.flatMap((row) => {
    const known = referrers.get(row.url) || [];
    const sources = known.length === 0 ? noReferrer : known;
    return sources.map((src) =>
      [row.url, row.status, row.title, src.from, src.raw_href, src.discovered_by]
        .map(csvEscape)
        .join(",")
    );
  });

  fs.writeFileSync(file, [columns.join(","), ...body].join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
}
|
|
/**
 * Records one link edge, both in the flat `edges` list and in the
 * `referrers` index keyed by the target URL.
 *
 * @param {string} from - page (or pseudo-source such as "sitemap.xml") holding the link
 * @param {string} rawHref - href exactly as written in the source; stored as "" when falsy
 * @param {string} to - resolved target URL
 * @param {string} discovered_by - how the link was found (e.g. "http", "rendered", "sitemap")
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  let incoming = referrers.get(to);
  if (!incoming) {
    incoming = [];
    referrers.set(to, incoming);
  }
  incoming.push({ from, raw_href, discovered_by });
}
|
|
|
|
/* ---------------------- parse HTML without JS --------------------------- */
|
|
/**
 * Parses JSON text without throwing.
 *
 * @param {string} txt - JSON source text
 * @returns {*} the parsed value, or `null` when `txt` is not a string or is not valid JSON
 */
function safeJsonParse(txt) {
  // JSON.parse coerces its argument, so e.g. the number 123 would "parse"
  // successfully; only real text is accepted here.
  if (typeof txt !== "string") return null;
  try {
    return JSON.parse(txt);
  } catch {
    return null;
  }
}
|
|
/**
 * Collects the distinct `@type` names declared anywhere inside the page's
 * JSON-LD `<script type="application/ld+json">` blocks.
 *
 * @param {Function} $ - cheerio instance loaded with the page
 * @returns {string[]} unique type names in first-seen order
 */
function parseSchemaTypes($) {
  const found = new Set();

  // Depth-first walk: record this node's @type, then descend into every value.
  const visit = (node) => {
    if (!node) return;
    if (Array.isArray(node)) {
      for (const item of node) visit(item);
      return;
    }
    if (typeof node !== "object") return;

    const declared = node["@type"];
    if (typeof declared === "string") {
      found.add(declared);
    } else if (Array.isArray(declared)) {
      for (const name of declared) {
        if (typeof name === "string") found.add(name);
      }
    }
    // nested
    for (const child of Object.values(node)) visit(child);
  };

  $('script[type="application/ld+json"]').each((_, el) => {
    const doc = safeJsonParse($(el).contents().text());
    if (doc) visit(doc);
  });

  return Array.from(found);
}
|
|
/**
 * Extracts SEO-relevant fields from raw (un-rendered) HTML.
 *
 * @param {string} html - HTML source of the page
 * @param {string} url - URL the HTML was fetched from; base for resolving hrefs
 * @returns {object} title, metaDesc, first two H1/H2 texts, totalHeadings,
 *   canonical, robotsMeta, noindex/nofollow flags, `internalLinks` (Set of
 *   absolute URLs of every resolvable `<a href>` — callers filter by host),
 *   `rawLinks` ({raw, abs} pairs), lang, wordCount, schemaTypes and bodyText
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);

  const firstAttr = (selector, name) => ($(selector).attr(name) || "").trim();
  const textList = (selector) => $(selector).map((_, el) => $(el).text().trim()).get();

  // cheerio's .text() also returns the contents of <script>/<style>, which
  // would pollute word count and readability (and disagree with the rendered
  // path, which reads innerText). Strip them from a clone before reading text.
  const visibleText = (selector) => {
    const scope = $(selector).clone();
    scope.find("script, style, noscript, template").remove();
    return scope.text().replace(/\s+/g, " ").trim();
  };

  // Headings (capture top two H1s and H2s)
  const [h1_1 = "", h1_2 = ""] = textList("h1");
  const [h2_1 = "", h2_2 = ""] = textList("h2");
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // Title falls back to social titles, then the first H1. Each candidate is
  // trimmed first so a whitespace-only value does not block the next fallback.
  const title =
    ($("title").first().text() || "").trim() ||
    firstAttr('meta[property="og:title"]', "content") ||
    firstAttr('meta[name="twitter:title"]', "content") ||
    h1_1;

  const metaDesc = firstAttr('meta[name="description"]', "content");
  const canonical = firstAttr('link[rel="canonical"]', "href");
  const robotsMeta = firstAttr('meta[name="robots"]', "content");
  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);

  const lang = firstAttr("html", "lang");

  // Basic text body for word count / readability
  const bodyText = visibleText("main") || visibleText("body");
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Links + raw href (hrefs that cannot be resolved against `url` are skipped)
  const internalLinks = new Set();
  const rawLinks = [];
  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;
    try {
      const abs = new URL(href, url).toString();
      rawLinks.push({ raw: href, abs });
      internalLinks.add(abs);
    } catch {
      // unparseable href: ignore this link
    }
  });

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1, h1_2, h2_1, h2_2,
    totalHeadings,
    canonical, robotsMeta, noindex, nofollow,
    internalLinks, rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText
  };
}
|
|
|
|
/* ------------------------------ fetchers -------------------------------- */
|
|
/**
 * Fetches `url` over plain HTTP(S) with got, without executing any JavaScript.
 * HTTP error statuses do not throw (`throwHttpErrors: false`); network
 * failures and the 20 s request timeout do.
 *
 * @param {string} url - URL to fetch
 * @returns {Promise<object>} status, status_text, time_ms, lower-cased
 *   contentType, body, bytes, render_mode "http", httpVersion and headers
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const res = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false
  });
  const elapsed = Date.now() - startedAt;

  const contentType = (res.headers["content-length"] === undefined && res.headers["content-type"] === undefined
    ? ""
    : res.headers["content-type"] || "").toLowerCase();

  // Prefer the declared Content-Length; fall back to measuring the body when
  // the header is missing or is not a usable number.
  const declared = Number(res.headers["content-length"]);
  const bytes = res.headers["content-length"] && Number.isFinite(declared) && declared >= 0
    ? declared
    : Buffer.byteLength(res.body || "", "utf8");

  return {
    status: res.statusCode ?? null,
    status_text: res.statusMessage ?? "",
    time_ms: elapsed,
    contentType,
    body: res.body,
    bytes,
    render_mode: "http",
    httpVersion: res.httpVersion ?? "",
    headers: res.headers
  };
}
|
|
|
|
/**
 * Launches headless Chromium and opens one desktop-like browser context that
 * is shared by all rendered fetches.
 *
 * @returns {Promise<{browser: object, context: object}>} the launched browser
 *   and its context; the caller is responsible for closing the browser
 * @throws rethrows any launch/setup error (after closing the browser if it
 *   had already been launched)
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"]
  });

  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS
    });

    // Runs in every page before its own scripts: override a few navigator
    // properties that differ under automation.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });

    return { browser, context };
  } catch (err) {
    // Do not leave a Chromium process behind when context setup fails.
    await browser.close().catch(() => {});
    throw err;
  }
}
|
|
|
|
// Walks a parsed JSON-LD value depth-first and adds every string `@type` to `sink`.
function collectJsonLdTypes(node, sink) {
  if (!node) return;
  if (Array.isArray(node)) {
    for (const item of node) collectJsonLdTypes(item, sink);
    return;
  }
  if (typeof node !== "object") return;

  const declared = node["@type"];
  if (typeof declared === "string") {
    sink.add(declared);
  } else if (Array.isArray(declared)) {
    for (const name of declared) {
      if (typeof name === "string") sink.add(name);
    }
  }
  for (const child of Object.values(node)) collectJsonLdTypes(child, sink);
}

/**
 * Loads `url` in a fresh page of the shared browser context, waits for the
 * page to settle, and extracts SEO fields from the live DOM.
 *
 * @param {string} url - URL to render
 * @param {{context: object}} shared - object holding the Playwright browser context
 * @returns {Promise<object>} status, status_text, time_ms, contentType, bytes
 *   (length of the serialized DOM), render_mode "rendered", headers of the
 *   main response, and `domExtract` with the extracted fields
 * @throws if navigation or DOM extraction fails; the page is closed either way
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const startedAt = Date.now();
  let status = null;
  let statusText = "";
  let mainHeaders = {};

  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    status = resp?.status() ?? null;
    statusText = resp?.statusText() ?? "";
    try { mainHeaders = resp ? await resp.headers() : {}; } catch { }

    // Both waits are best-effort: a page that never goes idle or never shows
    // content is still extracted in whatever state it reached.
    try { await page.waitForLoadState("networkidle", { timeout: 12000 }); } catch { }
    try {
      await page.waitForFunction(() => {
        const main = document.querySelector("main") || document.body;
        const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
        const hasHeading = !!document.querySelector("h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']");
        return textLen > 160 || hasHeading;
      }, { timeout: 8000 });
    } catch { }

    // Executed inside the page, so it must be self-contained.
    const dom = await page.evaluate(() => {
      const clean = s => (s || "").replace(/\s+/g, " ").trim();
      const getTextList = sel => Array.from(document.querySelectorAll(sel))
        .map(el => clean(el.textContent)).filter(Boolean);

      const title = document.title || "";
      const ogTitle = document.querySelector('meta[property="og:title"]')?.content || "";
      const twTitle = document.querySelector('meta[name="twitter:title"]')?.content || "";
      const metaDesc = document.querySelector('meta[name="description"]')?.content || "";
      const canonical = document.querySelector('link[rel="canonical"]')?.href || "";
      const robotsMeta = document.querySelector('meta[name="robots"]')?.content || "";
      const lang = document.documentElement.getAttribute("lang") || "";

      const h1 = getTextList("h1");
      const h2 = getTextList("h2");
      const totalHeadings = document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length;

      const links = Array.from(document.querySelectorAll("a[href]"))
        .map(a => {
          const raw = a.getAttribute("href");
          try { return { raw, abs: new URL(raw, location.href).toString() }; }
          catch { return null; }
        })
        .filter(Boolean);

      const bodyText = clean((document.querySelector("main") || document.body).innerText || "");

      const schemaScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]')).map(s => s.textContent || "");

      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title, ogTitle, twTitle, metaDesc, canonical, robotsMeta, lang,
        h1, h2, totalHeadings,
        links,
        bodyText,
        schemaScripts
      };
    });

    // Parse schema types from strings (outside of page)
    const schemaTypes = new Set();
    for (const raw of dom.schemaScripts || []) {
      try {
        collectJsonLdTypes(JSON.parse(raw), schemaTypes);
      } catch {
        // invalid JSON-LD block: skip it
      }
    }

    const elapsed = Date.now() - startedAt;
    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();

    return {
      status,
      status_text: statusText,
      time_ms: elapsed,
      // Report the real header like the HTTP path does; "text/html" only when unknown.
      contentType: String(mainHeaders["content-type"] || "text/html").toLowerCase(),
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract: {
        title: finalTitle,
        metaDesc: dom.metaDesc || "",
        canonical: dom.canonical || "",
        robotsMeta: dom.robotsMeta || "",
        lang: dom.lang || "",
        noindex, nofollow,
        h1_1: dom.h1?.[0] || "",
        h1_2: dom.h1?.[1] || "",
        h2_1: dom.h2?.[0] || "",
        h2_2: dom.h2?.[1] || "",
        totalHeadings: dom.totalHeadings || 0,
        links: new Set((dom.links || []).map(l => l.abs)),
        rawLinks: dom.links || [],
        bodyText: dom.bodyText || "",
        schemaTypes: Array.from(schemaTypes)
      }
    };
  } finally {
    await page.close();
  }
}
|
|
|
|
/* ------------------------- render decision ------------------------------ */
|
|
/**
 * Decides whether the plain-HTTP result looks like an unrendered shell that
 * should be re-fetched with a real browser.
 *
 * Rendering is requested when the response is tiny, when no headings were
 * found, or when a non-root page carries exactly the home page's title.
 *
 * @param {string} currentUrl - absolute URL of the page
 * @param {{bytes?: number}} httpRes - HTTP fetch result
 * @param {{totalHeadings: number, title: string}} parsed - fields parsed from the raw HTML
 * @param {?string} homeTitle - title of the site's root page, if known
 * @param {object} [options]
 * @param {number} [options.minBytes=4000] - responses smaller than this are treated as a shell
 * @returns {boolean} true when the page should be rendered
 * @throws {TypeError} if `currentUrl` is not a valid absolute URL
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle, { minBytes = 4000 } = {}) {
  const { pathname } = new URL(currentUrl);

  const isTinyShell = (httpRes.bytes ?? 0) < minBytes;
  const hasNoHeadings = parsed.totalHeadings === 0;
  const reusesHomeTitle =
    Boolean(homeTitle) && Boolean(parsed.title) && parsed.title === homeTitle && pathname !== "/";

  return isTinyShell || hasNoHeadings || reusesHomeTitle;
}
|
|
/**
 * Returns `urlStr` with a `www.` prefix added to its host name.
 *
 * Hosts that already start with `www.`, single-label hosts (e.g. `localhost`)
 * and IP literals are left as they are, because prefixing them cannot yield a
 * meaningful host. Unparseable input is returned unchanged.
 *
 * @param {string} urlStr - absolute URL
 * @returns {string} the serialized URL, or `urlStr` itself when it cannot be parsed
 */
function withWWW(urlStr) {
  let u;
  try {
    u = new URL(urlStr);
  } catch {
    return urlStr;
  }

  const host = u.hostname;
  const isIpLiteral = host.startsWith("[") || /^\d{1,3}(\.\d{1,3}){3}$/.test(host);
  if (!host.startsWith("www.") && host.includes(".") && !isIpLiteral) {
    u.hostname = `www.${host}`;
  }
  return u.toString();
}
|
|
|
|
/* ------------------------ per-page enrichers ---------------------------- */
|
|
/**
 * Estimates the rendered width of `text` in pixels.
 *
 * Uses `string-pixel-width` when it can measure the text; if that call fails
 * for any reason, falls back to a rough estimate of half the font size per
 * character.
 *
 * @param {*} text - text to measure; falsy values measure as 0, other values are stringified
 * @param {number} [size=16] - font size in pixels
 * @param {string} [font="arial"] - font name passed to the measuring library
 * @returns {number} estimated width in pixels
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) return 0;
  // Stringify once so that the fallback never computes with an undefined length.
  const str = String(text);
  try {
    return pixelWidth(str, { font, size });
  } catch {
    return Math.round(str.length * size * 0.5);
  }
}
|
|
/**
 * Computes readability scores for `text` (only the first 200,000 characters
 * are analysed).
 *
 * Each metric is computed independently; a metric whose computation throws is
 * simply left out of the result.
 *
 * @param {string} text - plain body text
 * @returns {object} any of flesch_reading_ease, flesch_kincaid_grade,
 *   gunning_fog, coleman_liau, ari, smog; `{}` for empty text
 */
function computeReadability(text) {
  if (!text) return {};
  const safe = text.slice(0, 200000); // cap

  // The library is imported as a namespace. When the package is a CommonJS
  // module whose methods live on its default export, they are not visible on
  // the namespace itself, and every metric would silently come out blank.
  const lib = typeof readability.fleschReadingEase === "function"
    ? readability
    : (readability.default ?? readability);

  const metrics = [
    ["flesch_reading_ease", "fleschReadingEase"],
    ["flesch_kincaid_grade", "fleschKincaidGrade"],
    ["gunning_fog", "gunningFog"],
    ["coleman_liau", "colemanLiauIndex"],
    ["ari", "automatedReadabilityIndex"],
    ["smog", "smogIndex"]
  ];

  const out = {};
  for (const [key, method] of metrics) {
    try {
      out[key] = lib[method](safe);
    } catch {
      // best effort: leave this metric out
    }
  }
  return out;
}
|
|
|
|
/* -------------------------------- main ---------------------------------- */
|
|
// async function crawl(startUrl, maxPages = 50) {
|
|
|
|
// Normalizes a URL for de-duplication; returns null when it cannot be normalized.
function normalizeOrNull(rawUrl) {
  try {
    return normalizeUrl(rawUrl, { stripHash: true });
  } catch {
    return null;
  }
}

// Counts whitespace-separated words in a text.
function countWords(text) {
  return text ? text.split(/\s+/).filter(Boolean).length : 0;
}

// Parsed-page record used when the response is not HTML (or before parsing).
function emptyParsedPage() {
  return {
    title: "", metaDesc: "", h1_1: "", h1_2: "", h2_1: "", h2_2: "",
    totalHeadings: 0, canonical: "", robotsMeta: "", noindex: false, nofollow: false,
    internalLinks: new Set(), rawLinks: [],
    lang: "", wordCount: 0, bodyText: "", schemaTypes: []
  };
}

// Overlays the fields extracted from the rendered DOM on top of the HTTP-parsed ones.
function mergeRenderedExtract(parsed, dom) {
  return {
    ...parsed,
    title: dom.title,
    metaDesc: dom.metaDesc,
    h1_1: dom.h1_1,
    h1_2: dom.h1_2,
    h2_1: dom.h2_1,
    h2_2: dom.h2_2,
    totalHeadings: dom.totalHeadings,
    canonical: dom.canonical,
    robotsMeta: dom.robotsMeta,
    noindex: dom.noindex,
    nofollow: dom.nofollow,
    internalLinks: dom.links,
    rawLinks: dom.rawLinks,
    lang: dom.lang || parsed.lang,
    bodyText: dom.bodyText || parsed.bodyText,
    wordCount: countWords(dom.bodyText || ""),
    schemaTypes: dom.schemaTypes
  };
}

// A page "looks empty" when neither the heading count nor the first H1 found anything.
function looksEmpty(parsed) {
  return parsed.totalHeadings === 0 && !parsed.h1_1;
}

// Queues the page's same-site links and records one provenance edge per link.
function recordOutgoingLinks(start, currentUrl, pageRes, parsed) {
  // First raw href seen for each absolute URL (one pass instead of a search per link).
  const rawByAbs = new Map();
  for (const { raw, abs } of parsed.rawLinks || []) {
    if (!rawByAbs.has(abs)) rawByAbs.set(abs, raw);
  }

  for (const link of parsed.internalLinks) {
    if (!isInternal(start, link)) continue;
    const target = normalizeOrNull(link);
    if (!target) continue; // a single odd link must not fail the whole page
    addEdge(currentUrl, rawByAbs.get(link) ?? "", target, pageRes.render_mode);
    if (!visited.has(target)) queue.push(target);
  }
}

// Builds the report row for a successfully fetched page.
function buildPageRow(currentUrl, pageRes, parsed) {
  const title = parsed.title || "";
  const metaDesc = parsed.metaDesc || "";
  const h1_1 = parsed.h1_1 || "";
  const h1_2 = parsed.h1_2 || "";
  const bodyText = parsed.bodyText || "";
  const read = computeReadability(bodyText);

  const headers = pageRes.headers || {};
  const xRobots = headers["x-robots-tag"] ?? "";

  return {
    url: currentUrl,
    status: pageRes.status,
    status_text: pageRes.status_text ?? "",
    time_ms: pageRes.time_ms,
    bytes: pageRes.bytes,
    content_type: pageRes.contentType,
    http_version: pageRes.httpVersion ?? "",
    title,
    title_length: title.length,
    title_pixel_width: measurePixelWidth(title, 16, "arial"),
    meta_description: metaDesc,
    meta_description_length: metaDesc.length,
    meta_description_pixel_width: measurePixelWidth(metaDesc, 14, "arial"),
    h1_1,
    h1_1_length: h1_1.length,
    h1_1_pixel_width: measurePixelWidth(h1_1, 24, "arial"),
    h1_2,
    h1_2_length: h1_2.length,
    h1_2_pixel_width: measurePixelWidth(h1_2, 24, "arial"),
    h2_1: parsed.h2_1 || "",
    h2_2: parsed.h2_2 || "",
    canonical: parsed.canonical,
    robots_meta: parsed.robotsMeta,
    x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
    noindex: parsed.noindex,
    nofollow: parsed.nofollow,
    lang: parsed.lang || "",
    word_count: parsed.wordCount || countWords(bodyText),
    flesch_reading_ease: read.flesch_reading_ease ?? "",
    flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
    gunning_fog: read.gunning_fog ?? "",
    coleman_liau: read.coleman_liau ?? "",
    ari: read.ari ?? "",
    smog: read.smog ?? "",
    schema_types: parsed.schemaTypes || [],
    inlinks: 0, // filled in once the whole link graph is known
    outlinks: parsed.internalLinks.size,
    render_mode: pageRes.render_mode,
    last_modified: headers["last-modified"] ?? headers["Last-Modified"] ?? "",
    set_cookie: !!headers["set-cookie"],
    crawl_timestamp: new Date().toISOString()
  };
}

// Builds the placeholder report row for a page whose fetch failed outright.
function buildErrorRow(currentUrl) {
  return {
    url: currentUrl,
    status: null, status_text: "", time_ms: null, bytes: null, content_type: "",
    http_version: "", title: "", title_length: 0, title_pixel_width: "",
    meta_description: "", meta_description_length: 0, meta_description_pixel_width: "",
    h1_1: "", h1_1_length: 0, h1_1_pixel_width: "", h1_2: "", h1_2_length: 0, h1_2_pixel_width: "",
    h2_1: "", h2_2: "",
    canonical: "", robots_meta: "", x_robots_tag: "", noindex: false, nofollow: false,
    lang: "", word_count: "", flesch_reading_ease: "", flesch_kincaid_grade: "",
    gunning_fog: "", coleman_liau: "", ari: "", smog: "",
    schema_types: [], inlinks: 0, outlinks: 0, render_mode: "error",
    last_modified: "", set_cookie: "", crawl_timestamp: new Date().toISOString()
  };
}

// Marks rows whose non-empty `field` text is shared with another row ("yes"/"no").
function flagExactDuplicates(rows, field, flagKey) {
  const groups = new Map();
  for (const row of rows) {
    const key = (row[field] || "").trim();
    if (!key) continue; // rows without text get no flag
    const group = groups.get(key);
    if (group) group.push(row);
    else groups.set(key, [row]);
  }
  for (const group of groups.values()) {
    const flag = group.length > 1 ? "yes" : "no";
    for (const row of group) row[flagKey] = flag;
  }
}

// For each row, stores the most similar `field` text on another URL and its score.
function attachNearestNeighbour(rows, field, similarityKey, urlKey) {
  const candidates = rows
    .map((row) => ({ url: row.url, text: (row[field] || "").trim() }))
    .filter((candidate) => candidate.text);

  for (const row of rows) {
    let rating = 0;
    let target = "";
    const others = candidates.filter((candidate) => candidate.url !== row.url);
    if (row[field] && others.length) {
      const match = stringSimilarity.findBestMatch(row[field], others.map((c) => c.text));
      rating = match.bestMatch.rating;
      const index = match.bestMatchIndex
        ?? match.ratings.findIndex((entry) => entry.rating === rating);
      // A score of 0 means nothing is similar, so no neighbour is reported.
      target = rating ? (others[index]?.url || "") : "";
    }
    row[similarityKey] = rating ? rating.toFixed(3) : "";
    row[urlKey] = target;
  }
}

/**
 * Crawls a site breadth-first starting at `startUrl` (seeded additionally from
 * its sitemap), collects per-page SEO data, and writes the page, link and
 * error reports into `reports/`.
 *
 * Each page is fetched over HTTP first and re-fetched in a headless browser
 * when the HTTP result looks like an unrendered shell. Module-level crawl
 * state is reset on entry, so the function can be called more than once per
 * process (but not concurrently).
 *
 * @param {string} startUrl - URL to start from; also defines what counts as internal
 * @param {number} [maxPages=50] - maximum number of URLs to visit
 * @returns {Promise<void>}
 * @throws if `startUrl` cannot be normalized
 */
export async function crawl(startUrl, maxPages = 50) {
  // Start from a clean slate: leftovers from a previous crawl would otherwise
  // count against maxPages and leak into this run's reports.
  visited.clear();
  referrers.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      // Key the edge by the normalized URL, which is what pages are stored under.
      addEdge("sitemap.xml", u, normalizeOrNull(u) ?? u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is only launched when the first page needs rendering.
  let shared = null;
  const getShared = async () => {
    if (!shared) shared = await createBrowserContext();
    return shared;
  };

  let homeTitle = null;

  // Fetches one URL over HTTP, parses it, and renders it when it looks like a shell.
  const fetchPage = async (currentUrl) => {
    // 1) HTTP fetch
    let pageRes = await fetchWithGot(currentUrl);
    const isHtml = pageRes.contentType.includes("text/html");
    // A clearly non-HTML resource (PDF, image, ...) never has headings, so
    // retrying it on another host would be pointless.
    const retryable = isHtml || pageRes.contentType === "";

    let parsed = emptyParsedPage();
    if (isHtml) parsed = { ...parsed, ...parseHtml(pageRes.body || "", currentUrl) };

    if (!homeTitle && new URL(currentUrl).pathname === "/") {
      homeTitle = parsed.title || "";
    }

    // 2) Render if needed
    if (isHtml && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
      try {
        const rendered = await fetchWithPlaywrightAndExtract(currentUrl, await getShared());
        if (rendered.domExtract) {
          pageRes = { ...rendered, body: null };
          parsed = mergeRenderedExtract(parsed, rendered.domExtract);
        }
      } catch (err) {
        // Keep the HTTP result instead of losing the page to a render failure.
        console.error(`[WARN] render failed for ${currentUrl} -> ${err.message}`);
      }
    }

    return { url: currentUrl, pageRes, parsed, retryable };
  };

  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;

      const normUrl = normalizeOrNull(url);
      if (!normUrl) {
        console.error(`[ERROR] ${url} -> cannot be normalized, skipped`);
        continue;
      }
      if (visited.has(normUrl)) continue;
      visited.add(normUrl);

      try {
        let page = await fetchPage(normUrl);

        // If still looks empty, try www once. The first result is kept unless
        // the www variant actually yields content.
        if (page.retryable && looksEmpty(page.parsed)) {
          const wwwUrl = withWWW(normUrl);
          if (new URL(wwwUrl).hostname !== new URL(normUrl).hostname) {
            try {
              const retried = await fetchPage(wwwUrl);
              if (!looksEmpty(retried.parsed)) page = retried;
            } catch (err) {
              console.error(`[WARN] www retry failed for ${normUrl} -> ${err.message}`);
            }
          }
        }

        const { url: currentUrl, pageRes, parsed } = page;

        // Enqueue internal links + record provenance
        recordOutgoingLinks(start, currentUrl, pageRes, parsed);

        // Save page row
        const row = buildPageRow(currentUrl, pageRes, parsed);
        results.push(row);

        console.log(
          `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${row.title || row.h1_1}`
        );
      } catch (err) {
        console.error(`[ERROR] ${normUrl} -> ${err.message}`);
        results.push(buildErrorRow(normUrl));
      }
    }
  } finally {
    // Never leave a browser process behind, even on an unexpected failure.
    if (shared) await shared.browser.close();
  }

  // -------------------- Post-process: inlinks, duplicates & similarity -----
  // Inlinks are only complete once every page has contributed its edges.
  for (const row of results) {
    row.inlinks = (referrers.get(row.url) || []).length;
  }

  flagExactDuplicates(results, "title", "duplicate_title_exact");
  flagExactDuplicates(results, "meta_description", "duplicate_description_exact");

  // Nearest neighbor similarities (within site, lightweight)
  attachNearestNeighbour(results, "title", "nearest_title_similarity", "nearest_title_url");
  attachNearestNeighbour(
    results, "meta_description", "nearest_description_similarity", "nearest_description_url"
  );

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);
  writePageReports(results);
  writeLinkEdges(edges);
  writeErrors(results);
}
|
|
|
|
// // CLI: node crawler.js https://site.com 200
|
|
// const START_URL = process.argv[2] || "https://example.com";
|
|
// const MAX_PAGES = Number(process.argv[3] || 100);
|
|
// crawl(START_URL, MAX_PAGES);
|