import got from "got";
import * as cheerio from "cheerio";
import normalizeUrl from "normalize-url";
import { isInternal } from "./utils/urlHelpers.js";
import { getSitemapUrls } from "./utils/sitemap.js";
import fs from "node:fs";
import path from "node:path";
import { chromium } from "playwright";
// NEW libs
import pixelWidth from "string-pixel-width";
import * as readability from "text-readability";
import stringSimilarity from "string-similarity";

/* ------------------------------ globals --------------------------------- */

// Crawl state. It lives at module level and is reset at the start of every
// crawl() call so repeated runs in one process do not leak into each other.
const visited = new Set();
const queue = [];
const results = [];

// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }

// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>

const REAL_UA =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

const REAL_HEADERS = {
  "user-agent": REAL_UA,
  "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "upgrade-insecure-requests": "1",
};

// Column order of the per-page CSV report (a Screaming-Frog-ish shape with our extras).
const PAGE_REPORT_COLUMNS = [
  "url", "status", "status_text", "time_ms", "bytes", "content_type", "http_version",
  "title", "title_length", "title_pixel_width",
  "meta_description", "meta_description_length", "meta_description_pixel_width",
  "h1_1", "h1_1_length", "h1_1_pixel_width",
  "h1_2", "h1_2_length", "h1_2_pixel_width",
  "h2_1", "h2_2",
  "canonical", "robots_meta", "x_robots_tag", "noindex", "nofollow", "lang",
  "word_count", "flesch_reading_ease", "flesch_kincaid_grade", "gunning_fog",
  "coleman_liau", "ari", "smog",
  "schema_types", "inlinks", "outlinks", "render_mode", "last_modified", "set_cookie",
  "crawl_timestamp",
  "duplicate_title_exact", "nearest_title_similarity", "nearest_title_url",
  "duplicate_description_exact", "nearest_description_similarity", "nearest_description_url",
];

// Result key -> method name on the text-readability API object.
const READABILITY_METRICS = {
  flesch_reading_ease: "fleschReadingEase",
  flesch_kincaid_grade: "fleschKincaidGrade",
  gunning_fog: "gunningFog",
  coleman_liau: "colemanLiauIndex",
  ari: "automatedReadabilityIndex",
  smog: "smogIndex",
};

/* ------------------------------ utils ----------------------------------- */

/**
 * Escape a value for use as a single CSV field.
 *
 * @param {*} v - Any value; null/undefined become an empty field.
 * @returns {string} The field, quoted (with inner quotes doubled) when it
 *   contains a quote, comma, carriage return or line feed.
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // "\r" must be quoted too, otherwise a bare carriage return splits the row.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}

/**
 * Create a directory (and any missing parents) if it does not exist yet.
 *
 * @param {string} dir - Directory path.
 */
function ensureDir(dir) {
  // `recursive: true` is a no-op for an existing directory, so no exists-check is needed.
  fs.mkdirSync(dir, { recursive: true });
}

/**
 * Build the filesystem-safe timestamp used in report file names
 * (ISO time with ":" and "T" replaced by "-", truncated to seconds).
 *
 * @returns {string} e.g. "2024-05-01-12-30-45"
 */
function reportStamp() {
  return new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
}

/**
 * Normalize a URL (hash stripped) without throwing.
 *
 * @param {string} url - URL to normalize.
 * @returns {string|null} The normalized URL, or null when normalize-url rejects it.
 */
function safeNormalize(url) {
  try {
    return normalizeUrl(url, { stripHash: true });
  } catch {
    return null;
  }
}

/**
 * Look up the recorded referrers of a URL, trying the URL as given first and
 * its normalized form second (edges are keyed by normalized target URL, while
 * a result row may carry a "www." retry URL).
 *
 * @param {string} url - Page URL.
 * @returns {Array<{from: string, raw_href: string, discovered_by: string}>}
 */
function referrersFor(url) {
  const direct = referrers.get(url);
  if (direct) return direct;
  const norm = safeNormalize(url);
  return (norm && referrers.get(norm)) || [];
}

/**
 * Turn one result row into the ordered cell list matching PAGE_REPORT_COLUMNS.
 *
 * @param {object} r - A row from the crawl results.
 * @returns {Array<*>} Raw (un-escaped) cell values.
 */
function pageReportRow(r) {
  return [
    r.url, r.status, r.status_text ?? "", r.time_ms, r.bytes, r.content_type, r.http_version ?? "",
    r.title, r.title_length, r.title_pixel_width,
    r.meta_description, r.meta_description_length, r.meta_description_pixel_width,
    r.h1_1 ?? "", r.h1_1_length ?? 0, r.h1_1_pixel_width ?? "",
    r.h1_2 ?? "", r.h1_2_length ?? 0, r.h1_2_pixel_width ?? "",
    r.h2_1 ?? "", r.h2_2 ?? "",
    r.canonical, r.robots_meta, r.x_robots_tag ?? "", r.noindex, r.nofollow, r.lang ?? "",
    r.word_count ?? "", r.flesch_reading_ease ?? "", r.flesch_kincaid_grade ?? "", r.gunning_fog ?? "",
    r.coleman_liau ?? "", r.ari ?? "", r.smog ?? "",
    Array.isArray(r.schema_types) ? r.schema_types.join("|") : "",
    r.inlinks ?? 0, r.outlinks ?? 0, r.render_mode, r.last_modified ?? "",
    r.set_cookie ? "yes" : "no",
    r.crawl_timestamp ?? "",
    r.duplicate_title_exact ?? "", r.nearest_title_similarity ?? "", r.nearest_title_url ?? "",
    r.duplicate_description_exact ?? "", r.nearest_description_similarity ?? "", r.nearest_description_url ?? "",
  ];
}

/**
 * Write the per-page report to `reports/crawl-<stamp>.json`, and optionally
 * the same rows as `reports/crawl-<stamp>.csv`.
 *
 * @param {object[]} results - Result rows produced by the crawl.
 * @param {object} [options]
 * @param {string} [options.stamp] - Timestamp for the file name; defaults to now.
 * @param {boolean} [options.csv=false] - Also write the CSV variant (off by
 *   default, matching the previous JSON-only output).
 */
function writePageReports(results, { stamp = reportStamp(), csv = false } = {}) {
  ensureDir("reports");
  const base = path.join("reports", `crawl-${stamp}`);

  fs.writeFileSync(`${base}.json`, JSON.stringify(results, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);

  if (!csv) return;
  const lines = [PAGE_REPORT_COLUMNS.join(",")];
  for (const r of results) {
    lines.push(pageReportRow(r).map(csvEscape).join(","));
  }
  fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  console.log(`📝 Page CSV report saved: ${base}.csv`);
}

/**
 * Write every discovered link edge to `reports/links-<stamp>.csv`.
 *
 * @param {Array<{from: string, raw_href: string, to: string, discovered_by: string}>} edges
 * @param {string} [stamp] - Timestamp for the file name; defaults to now.
 */
function writeLinkEdges(edges, stamp = reportStamp()) {
  ensureDir("reports");
  const file = path.join("reports", `links-${stamp}.csv`);
  const headers = ["from", "raw_href", "to", "discovered_by"];
  const lines = [headers.join(",")];
  for (const e of edges) {
    lines.push([e.from, e.raw_href, e.to, e.discovered_by].map(csvEscape).join(","));
  }
  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
}

/**
 * Write all rows with an HTTP status >= 400 to `reports/errors-<stamp>.csv`,
 * one line per known referrer (or a single line with empty referrer columns).
 *
 * @param {object[]} results - Result rows produced by the crawl.
 * @param {string} [stamp] - Timestamp for the file name; defaults to now.
 */
function writeErrors(results, stamp = reportStamp()) {
  ensureDir("reports");
  const file = path.join("reports", `errors-${stamp}.csv`);
  const headers = ["url", "status", "title", "from_page", "raw_href", "discovered_by"];
  const lines = [headers.join(",")];

  for (const r of results) {
    // Rows for failed fetches have status === null and are not HTTP errors.
    if (!r || r.status === null || !(r.status >= 400)) continue;
    const refs = referrersFor(r.url);
    if (refs.length === 0) {
      lines.push([r.url, r.status, r.title, "", "", ""].map(csvEscape).join(","));
      continue;
    }
    for (const ref of refs) {
      lines.push(
        [r.url, r.status, r.title, ref.from, ref.raw_href, ref.discovered_by].map(csvEscape).join(",")
      );
    }
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
}

/**
 * Record a link edge in both the flat edge list and the per-target referrer map.
 *
 * @param {string} from - Source page (or "sitemap.xml").
 * @param {string} rawHref - The href exactly as found; falsy values are stored as "".
 * @param {string} to - Target URL (used as the referrer-map key).
 * @param {string} discovered_by - How the link was found ("http", "rendered", "sitemap").
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });
  if (!referrers.has(to)) referrers.set(to, []);
  referrers.get(to).push({ from, raw_href, discovered_by });
}

/* ---------------------- parse HTML without JS --------------------------- */

/**
 * Parse JSON, returning null instead of throwing on invalid input.
 *
 * @param {string} txt - JSON text.
 * @returns {*} The parsed value, or null when parsing fails.
 */
function safeJsonParse(txt) {
  try {
    return JSON.parse(txt);
  } catch {
    return null;
  }
}

/**
 * Recursively collect every string "@type" found in a parsed JSON-LD value.
 *
 * @param {*} node - Parsed JSON-LD (object, array or primitive).
 * @param {Set<string>} types - Accumulator; insertion order is preserved.
 */
function collectSchemaTypes(node, types) {
  if (!node) return;
  if (Array.isArray(node)) {
    for (const item of node) collectSchemaTypes(item, types);
    return;
  }
  if (typeof node !== "object") return;

  const t = node["@type"];
  if (typeof t === "string") {
    types.add(t);
  } else if (Array.isArray(t)) {
    for (const x of t) {
      if (typeof x === "string") types.add(x);
    }
  }
  // Nested entities (e.g. @graph, author, publisher) carry their own @type.
  for (const value of Object.values(node)) collectSchemaTypes(value, types);
}

/**
 * Extract the distinct schema.org types declared in a page's JSON-LD scripts.
 *
 * @param {object} $ - Loaded cheerio document.
 * @returns {string[]} Distinct @type values in first-seen order.
 */
function parseSchemaTypes($) {
  const types = new Set();
  $('script[type="application/ld+json"]').each((_, el) => {
    const parsed = safeJsonParse($(el).contents().text());
    collectSchemaTypes(parsed, types);
  });
  return [...types];
}

/**
 * Detect the noindex / nofollow directives in a robots meta value.
 *
 * @param {string} robotsMeta - Content of the robots meta tag (may be empty).
 * @returns {{noindex: boolean, nofollow: boolean}}
 */
function parseRobotsDirectives(robotsMeta) {
  const lower = (robotsMeta || "").toLowerCase();
  return {
    noindex: /(^|[,;\s])noindex([,;\s]|$)/.test(lower),
    nofollow: /(^|[,;\s])nofollow([,;\s]|$)/.test(lower),
  };
}

/**
 * Extract SEO-relevant fields from raw (un-rendered) HTML.
 *
 * @param {string} html - HTML source.
 * @param {string} url - Page URL, used as the base for resolving hrefs.
 * @returns {object} title, metaDesc, first two H1/H2 texts, totalHeadings,
 *   canonical, robotsMeta, noindex, nofollow, internalLinks (Set of absolute
 *   URLs of ALL anchors — internal filtering happens in the caller), rawLinks
 *   ({raw, abs} pairs), lang, wordCount, schemaTypes and bodyText.
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);

  let title = ($("title").first().text() || "").trim();
  const ogTitle = $('meta[property="og:title"]').attr("content") || "";
  const twTitle = $('meta[name="twitter:title"]').attr("content") || "";

  // Headings (capture top two H1s and H2s)
  const h1s = $("h1").map((_, el) => $(el).text().trim()).get();
  const h2s = $("h2").map((_, el) => $(el).text().trim()).get();
  const h1_1 = h1s[0] || "";
  const h1_2 = h1s[1] || "";
  const h2_1 = h2s[0] || "";
  const h2_2 = h2s[1] || "";
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // No <title>: fall back to social titles, then the first H1.
  if (!title) title = (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = ($('meta[name="description"]').attr("content") || "").trim();
  const canonical = ($('link[rel="canonical"]').attr("href") || "").trim();
  const robotsMeta = ($('meta[name="robots"]').attr("content") || "").trim();
  const { noindex, nofollow } = parseRobotsDirectives(robotsMeta);
  const lang = ($("html").attr("lang") || "").trim();

  // Basic text body for word count / readability
  const bodyText = ($("main").text() || $("body").text() || "").replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Links + raw href
  const internalLinks = new Set();
  const rawLinks = [];
  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;
    try {
      const abs = new URL(href, url).toString();
      rawLinks.push({ raw: href, abs });
      internalLinks.add(abs);
    } catch {
      // Unparseable href: deliberately skipped.
    }
  });

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title, metaDesc, h1_1, h1_2, h2_1, h2_2, totalHeadings,
    canonical, robotsMeta, noindex, nofollow,
    internalLinks, rawLinks, lang, wordCount, schemaTypes, bodyText,
  };
}

/* ------------------------------ fetchers -------------------------------- */

/**
 * Fetch a URL over plain HTTP (no JavaScript execution).
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`); network-level
 * failures and timeouts still reject.
 *
 * @param {string} url - URL to fetch.
 * @returns {Promise<object>} status, status_text, time_ms, contentType
 *   (lower-cased), body, bytes, render_mode ("http"), httpVersion, headers.
 */
async function fetchWithGot(url) {
  const t0 = Date.now();
  const res = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false,
  });
  const dt = Date.now() - t0;

  const contentType = (res.headers["content-type"] || "").toLowerCase();
  // Prefer the declared Content-Length; fall back to the body size when the
  // header is missing or not a number.
  const declared = Number(res.headers["content-length"]);
  const bytes = res.headers["content-length"] && Number.isFinite(declared)
    ? declared
    : Buffer.byteLength(res.body || "", "utf8");

  return {
    status: res.statusCode ?? null,
    status_text: res.statusMessage ?? "",
    time_ms: dt,
    contentType,
    body: res.body,
    bytes,
    render_mode: "http",
    httpVersion: res.httpVersion ?? "",
    headers: res.headers,
  };
}

/**
 * Launch headless Chromium and create a browser context that presents itself
 * like a regular desktop Chrome.
 *
 * @returns {Promise<{browser: object, context: object}>} The caller owns the
 *   browser and must close it.
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });
  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });
    return { browser, context };
  } catch (err) {
    // Do not leave a browser process behind when context setup fails.
    await browser.close();
    throw err;
  }
}

/**
 * Load a URL in the shared browser context, wait for it to settle, and
 * extract the same fields parseHtml() produces, but from the rendered DOM.
 *
 * @param {string} url - URL to render.
 * @param {{context: object}} shared - Result of createBrowserContext().
 * @returns {Promise<object>} status, status_text, time_ms, contentType, bytes
 *   (length of the serialized DOM in characters), render_mode ("rendered"),
 *   headers, and `domExtract` with the page fields.
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const t0 = Date.now();
  let status = null;
  let statusText = "";
  let mainHeaders = {};

  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    status = resp?.status() ?? null;
    statusText = resp?.statusText() ?? "";
    try {
      mainHeaders = resp ? await resp.headers() : {};
    } catch {
      // Headers are optional extras; keep the empty default.
    }

    // Both waits are best-effort: a page that never goes idle or never shows
    // content is still extracted in whatever state it reached.
    try {
      await page.waitForLoadState("networkidle", { timeout: 12000 });
    } catch {
      // Timed out waiting for network idle; continue.
    }
    try {
      await page.waitForFunction(() => {
        const main = document.querySelector("main") || document.body;
        const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
        const hasHeading = !!document.querySelector(
          "h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']"
        );
        return textLen > 160 || hasHeading;
      }, { timeout: 8000 });
    } catch {
      // Timed out waiting for visible content; continue.
    }

    // Runs inside the browser page; only serializable data comes back.
    const dom = await page.evaluate(() => {
      const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
      const getTextList = (sel) =>
        Array.from(document.querySelectorAll(sel)).map((el) => clean(el.textContent)).filter(Boolean);

      const links = Array.from(document.querySelectorAll("a[href]"))
        .map((a) => {
          const raw = a.getAttribute("href");
          try {
            return { raw, abs: new URL(raw, location.href).toString() };
          } catch {
            return null;
          }
        })
        .filter(Boolean);

      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title: document.title || "",
        ogTitle: document.querySelector('meta[property="og:title"]')?.content || "",
        twTitle: document.querySelector('meta[name="twitter:title"]')?.content || "",
        metaDesc: document.querySelector('meta[name="description"]')?.content || "",
        canonical: document.querySelector('link[rel="canonical"]')?.href || "",
        robotsMeta: document.querySelector('meta[name="robots"]')?.content || "",
        lang: document.documentElement.getAttribute("lang") || "",
        h1: getTextList("h1"),
        h2: getTextList("h2"),
        totalHeadings: document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length,
        links,
        bodyText: clean((document.querySelector("main") || document.body).innerText || ""),
        schemaScripts: Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
          .map((s) => s.textContent || ""),
      };
    });

    // Parse schema types from strings (outside of page)
    const schemaTypes = new Set();
    for (const raw of dom.schemaScripts || []) {
      collectSchemaTypes(safeJsonParse(raw), schemaTypes);
    }

    const dt = Date.now() - t0;
    const robotsMeta = (dom.robotsMeta || "").trim();
    const { noindex, nofollow } = parseRobotsDirectives(robotsMeta);
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();

    return {
      status,
      status_text: statusText,
      time_ms: dt,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract: {
        title: finalTitle,
        metaDesc: (dom.metaDesc || "").trim(),
        canonical: dom.canonical || "",
        robotsMeta,
        lang: dom.lang || "",
        noindex,
        nofollow,
        h1_1: dom.h1?.[0] || "",
        h1_2: dom.h1?.[1] || "",
        h2_1: dom.h2?.[0] || "",
        h2_2: dom.h2?.[1] || "",
        totalHeadings: dom.totalHeadings || 0,
        links: new Set((dom.links || []).map((l) => l.abs)),
        rawLinks: dom.links || [],
        bodyText: dom.bodyText || "",
        schemaTypes: [...schemaTypes],
      },
    };
  } finally {
    await page.close();
  }
}

/* ------------------------- render decision ------------------------------ */

/**
 * Decide whether a page fetched over HTTP should be re-fetched in a browser.
 *
 * @param {string} currentUrl - Page URL.
 * @param {{bytes?: number}} httpRes - HTTP fetch result.
 * @param {{totalHeadings: number, title: string}} parsed - Fields parsed from the raw HTML.
 * @param {string|null} homeTitle - Title of the home page, if known.
 * @returns {boolean} True for a tiny HTML shell, a page with no headings, or a
 *   non-root page whose title equals the home page title.
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const { pathname } = new URL(currentUrl);
  if ((httpRes.bytes ?? 0) < 4000) return true; // tiny HTML shell
  if (parsed.totalHeadings === 0) return true;
  if (homeTitle && parsed.title && parsed.title === homeTitle && pathname !== "/") return true;
  return false;
}

/**
 * Return the URL with a "www." host prefix added when it is missing.
 *
 * @param {string} urlStr - URL to rewrite.
 * @returns {string} The rewritten URL, or the input unchanged if it cannot be parsed.
 */
function withWWW(urlStr) {
  try {
    const u = new URL(urlStr);
    if (!u.hostname.startsWith("www.")) u.hostname = `www.${u.hostname}`;
    return u.toString();
  } catch {
    return urlStr;
  }
}

/* ------------------------ per-page enrichers ---------------------------- */

/**
 * Estimate the rendered width of a text in pixels.
 *
 * @param {string} text - Text to measure.
 * @param {number} [size=16] - Font size in px.
 * @param {string} [font="arial"] - Font name understood by string-pixel-width.
 * @returns {number} Width in px; 0 for empty text. Falls back to a rough
 *   half-em-per-character estimate when the library throws.
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) return 0;
  try {
    return pixelWidth(text, { font, size });
  } catch {
    return Math.round(text.length * size * 0.5);
  }
}

/**
 * Compute readability scores for a text.
 *
 * @param {string} text - Body text; only the first 200000 characters are scored.
 * @returns {object} Any of flesch_reading_ease, flesch_kincaid_grade,
 *   gunning_fog, coleman_liau, ari, smog. A metric that fails is omitted.
 */
function computeReadability(text) {
  if (!text) return {};
  const safe = text.slice(0, 200000); // cap
  // NOTE(review): text-readability appears to export a single instance; under
  // `import * as` that instance would sit on `.default`. Accept either shape —
  // confirm against the installed package version.
  const api = readability.default ?? readability;

  const out = {};
  for (const [key, method] of Object.entries(READABILITY_METRICS)) {
    try {
      out[key] = api[method](safe);
    } catch {
      // Best-effort: leave this metric out of the result.
    }
  }
  return out;
}

/* ----------------------------- crawl steps ------------------------------ */

/**
 * Build the default (all-empty) parsed-page record.
 *
 * @returns {object} Same shape as the parseHtml() result.
 */
function emptyParsed() {
  return {
    title: "", metaDesc: "", h1_1: "", h1_2: "", h2_1: "", h2_2: "", totalHeadings: 0,
    canonical: "", robotsMeta: "", noindex: false, nofollow: false,
    internalLinks: new Set(), rawLinks: [], lang: "", wordCount: 0, bodyText: "", schemaTypes: [],
  };
}

/**
 * Fetch one URL over HTTP, parse it, and re-fetch it in a browser when the
 * raw HTML looks like an unrendered shell.
 *
 * @param {string} currentUrl - URL to load.
 * @param {{homeTitle: string|null, getShared: Function}} session - Crawl
 *   session; `homeTitle` is filled in here the first time a "/" page is parsed.
 * @returns {Promise<{pageRes: object, parsed: object}>}
 * @throws When the HTTP fetch fails. A failed render is logged and the
 *   HTTP-derived data is returned instead.
 */
async function loadPage(currentUrl, session) {
  // 1) HTTP fetch
  let pageRes = await fetchWithGot(currentUrl);
  let parsed = emptyParsed();
  const isHtml = pageRes.contentType.includes("text/html");
  if (isHtml) {
    parsed = { ...parsed, ...parseHtml(pageRes.body || "", currentUrl) };
  }

  if (!session.homeTitle && new URL(currentUrl).pathname === "/") {
    session.homeTitle = parsed.title || "";
  }

  // 2) Render if needed
  if (isHtml && shouldRender(currentUrl, pageRes, parsed, session.homeTitle)) {
    let rendered = null;
    try {
      const shared = await session.getShared();
      rendered = await fetchWithPlaywrightAndExtract(currentUrl, shared);
    } catch (err) {
      // The HTTP fetch already succeeded; keep its data rather than dropping the page.
      console.error(`[WARN] render failed for ${currentUrl} -> ${err.message}`);
    }
    if (rendered?.domExtract) {
      const dom = rendered.domExtract;
      pageRes = { ...rendered, body: null };
      parsed = {
        ...parsed,
        title: dom.title,
        metaDesc: dom.metaDesc,
        h1_1: dom.h1_1,
        h1_2: dom.h1_2,
        h2_1: dom.h2_1,
        h2_2: dom.h2_2,
        totalHeadings: dom.totalHeadings,
        canonical: dom.canonical,
        robotsMeta: dom.robotsMeta,
        noindex: dom.noindex,
        nofollow: dom.nofollow,
        internalLinks: dom.links,
        rawLinks: dom.rawLinks,
        lang: dom.lang || parsed.lang,
        bodyText: dom.bodyText || parsed.bodyText,
        wordCount: (dom.bodyText || "").split(/\s+/).filter(Boolean).length,
        schemaTypes: dom.schemaTypes,
      };
    }
  }

  return { pageRes, parsed };
}

/**
 * Load a URL and, when the result has no headings at all, try the "www."
 * variant of the host once. The retry only replaces the first result when it
 * actually yields headings; a failed or equally empty retry keeps the first.
 *
 * @param {string} normUrl - Normalized URL to load.
 * @param {object} session - Crawl session (see loadPage).
 * @returns {Promise<{url: string, pageRes?: object, parsed?: object, error?: Error}>}
 *   `error` is set (and pageRes/parsed absent) when the first fetch failed.
 */
async function resolvePage(normUrl, session) {
  const looksEmpty = ({ parsed }) => parsed.totalHeadings === 0 && !parsed.h1_1;

  let first;
  try {
    first = await loadPage(normUrl, session);
  } catch (err) {
    console.error(`[ERROR] ${normUrl} -> ${err.message}`);
    return { url: normUrl, error: err };
  }
  if (!looksEmpty(first)) return { url: normUrl, ...first };

  const alt = withWWW(normUrl);
  const { status } = first.pageRes;
  const succeeded = status !== null && status >= 200 && status < 300;
  const isHtml = first.pageRes.contentType.includes("text/html");
  // A successfully fetched non-HTML resource (PDF, image, ...) never has
  // headings, so retrying it on another host is pointless.
  if (alt === normUrl || (succeeded && !isHtml)) return { url: normUrl, ...first };

  try {
    const second = await loadPage(alt, session);
    if (!looksEmpty(second)) return { url: alt, ...second };
  } catch (err) {
    console.error(`[ERROR] ${alt} -> ${err.message}`);
  }
  return { url: normUrl, ...first };
}

/**
 * Queue every internal link of a page and record its provenance edge.
 *
 * @param {string} start - Normalized start URL of the crawl (defines "internal").
 * @param {string} currentUrl - Page the links were found on.
 * @param {object} pageRes - Fetch result (its render_mode labels the edges).
 * @param {object} parsed - Parsed page fields (internalLinks, rawLinks).
 */
function enqueueInternalLinks(start, currentUrl, pageRes, parsed) {
  // First raw href seen for each absolute URL.
  const rawByAbs = new Map();
  for (const { raw, abs } of parsed.rawLinks || []) {
    if (!rawByAbs.has(abs)) rawByAbs.set(abs, raw);
  }

  for (const link of parsed.internalLinks) {
    if (!isInternal(start, link)) continue;
    const ln = safeNormalize(link);
    if (!ln) continue; // one malformed link must not fail the whole page
    addEdge(currentUrl, rawByAbs.get(link) ?? "", ln, pageRes.render_mode);
    if (!visited.has(ln)) queue.push(ln);
  }
}

/**
 * Build the result row for a successfully loaded page.
 *
 * @param {string} currentUrl - URL the data was loaded from.
 * @param {object} pageRes - Fetch result.
 * @param {object} parsed - Parsed page fields.
 * @returns {object} Result row. `inlinks` is a placeholder (0) that crawl()
 *   fills in once all edges are known.
 */
function buildPageRow(currentUrl, pageRes, parsed) {
  const title = parsed.title || "";
  const metaDesc = parsed.metaDesc || "";
  const h1_1 = parsed.h1_1 || "";
  const h1_2 = parsed.h1_2 || "";
  const bodyText = parsed.bodyText || "";
  const wordCount = parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
  const read = computeReadability(bodyText);

  // Both fetchers return lower-cased header names.
  const headers = pageRes.headers || {};
  const xRobots = headers["x-robots-tag"] || "";

  return {
    url: currentUrl,
    status: pageRes.status,
    status_text: pageRes.status_text ?? "",
    time_ms: pageRes.time_ms,
    bytes: pageRes.bytes,
    content_type: pageRes.contentType,
    http_version: pageRes.httpVersion ?? "",
    title,
    title_length: title.length,
    title_pixel_width: measurePixelWidth(title, 16, "arial"),
    meta_description: metaDesc,
    meta_description_length: metaDesc.length,
    meta_description_pixel_width: measurePixelWidth(metaDesc, 14, "arial"),
    h1_1,
    h1_1_length: h1_1.length,
    h1_1_pixel_width: measurePixelWidth(h1_1, 24, "arial"),
    h1_2,
    h1_2_length: h1_2.length,
    h1_2_pixel_width: measurePixelWidth(h1_2, 24, "arial"),
    h2_1: parsed.h2_1 || "",
    h2_2: parsed.h2_2 || "",
    canonical: parsed.canonical,
    robots_meta: parsed.robotsMeta,
    x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
    noindex: parsed.noindex,
    nofollow: parsed.nofollow,
    lang: parsed.lang || "",
    word_count: wordCount,
    flesch_reading_ease: read.flesch_reading_ease ?? "",
    flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
    gunning_fog: read.gunning_fog ?? "",
    coleman_liau: read.coleman_liau ?? "",
    ari: read.ari ?? "",
    smog: read.smog ?? "",
    schema_types: parsed.schemaTypes || [],
    inlinks: 0,
    outlinks: parsed.internalLinks.size,
    render_mode: pageRes.render_mode,
    last_modified: headers["last-modified"] ?? "",
    set_cookie: !!headers["set-cookie"],
    crawl_timestamp: new Date().toISOString(),
  };
}

/**
 * Build the placeholder result row for a URL that could not be processed.
 *
 * @param {string} url - The failing URL.
 * @returns {object} Row with status null and render_mode "error".
 */
function buildErrorRow(url) {
  return {
    url, status: null, status_text: "", time_ms: null, bytes: null,
    content_type: "", http_version: "",
    title: "", title_length: 0, title_pixel_width: "",
    meta_description: "", meta_description_length: 0, meta_description_pixel_width: "",
    h1_1: "", h1_1_length: 0, h1_1_pixel_width: "",
    h1_2: "", h1_2_length: 0, h1_2_pixel_width: "",
    h2_1: "", h2_2: "",
    canonical: "", robots_meta: "", x_robots_tag: "", noindex: false, nofollow: false, lang: "",
    word_count: "", flesch_reading_ease: "", flesch_kincaid_grade: "", gunning_fog: "",
    coleman_liau: "", ari: "", smog: "",
    schema_types: [], inlinks: 0, outlinks: 0, render_mode: "error",
    last_modified: "", set_cookie: "", crawl_timestamp: new Date().toISOString(),
  };
}

/**
 * Mark rows whose (trimmed, non-empty) field value occurs more than once.
 *
 * @param {object[]} rows - Result rows (mutated).
 * @param {string} field - Field to compare, e.g. "title".
 * @param {string} flagKey - Key that receives "yes"/"no". Rows with an empty
 *   field are left without the key.
 */
function flagExactDuplicates(rows, field, flagKey) {
  const groups = new Map();
  for (const row of rows) {
    const key = (row[field] || "").trim();
    if (!key) continue;
    if (!groups.has(key)) groups.set(key, []);
    groups.get(key).push(row);
  }
  for (const group of groups.values()) {
    const flag = group.length > 1 ? "yes" : "no";
    for (const row of group) row[flagKey] = flag;
  }
}

/**
 * For every row, find the most similar value of `field` among the other pages.
 *
 * @param {object[]} rows - Result rows (mutated).
 * @param {string} field - Field to compare, e.g. "title".
 * @param {string} similarityKey - Receives the best rating with 3 decimals, or "" when it is 0.
 * @param {string} urlKey - Receives the URL of the best match, or "".
 */
function attachNearestNeighbour(rows, field, similarityKey, urlKey) {
  const corpus = rows.map((row) => ({ url: row.url, text: (row[field] || "").trim() }));
  for (const row of rows) {
    let rating = 0;
    let target = "";
    const others = corpus.filter((c) => c.url !== row.url && c.text);
    if (row[field] && others.length) {
      const { bestMatch, bestMatchIndex } = stringSimilarity.findBestMatch(
        row[field],
        others.map((c) => c.text)
      );
      rating = bestMatch.rating;
      target = others[bestMatchIndex]?.url || "";
    }
    row[similarityKey] = rating ? rating.toFixed(3) : "";
    row[urlKey] = target;
  }
}

/* -------------------------------- main ---------------------------------- */

/**
 * Crawl a site breadth-first starting at `startUrl` (seeded with its
 * sitemap.xml), collect per-page SEO data, and write the reports into
 * `./reports` (page JSON, link-edge CSV, error CSV).
 *
 * Module-level crawl state is reset on entry, so the function can be called
 * more than once per process (sequentially, not concurrently).
 *
 * @param {string} startUrl - Entry URL of the site.
 * @param {number} [maxPages=50] - Maximum number of URLs to visit.
 * @param {object} [options]
 * @param {boolean} [options.pageCsv=false] - Also write the page report as CSV.
 * @returns {Promise<void>}
 * @throws When `startUrl` cannot be normalized or a report cannot be written.
 */
export async function crawl(startUrl, maxPages = 50, { pageCsv = false } = {}) {
  visited.clear();
  referrers.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      // Key the edge by the normalized URL, like page-discovered edges.
      addEdge("sitemap.xml", u, safeNormalize(u) ?? u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is launched lazily: many crawls never need rendering.
  let shared = null;
  const session = {
    homeTitle: null,
    async getShared() {
      if (!shared) shared = await createBrowserContext();
      return shared;
    },
  };

  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;
      const normUrl = safeNormalize(url);
      if (!normUrl || visited.has(normUrl)) continue;
      visited.add(normUrl);

      const page = await resolvePage(normUrl, session);
      if (page.error) {
        results.push(buildErrorRow(page.url));
        continue;
      }

      const { url: currentUrl, pageRes, parsed } = page;
      try {
        // Enqueue internal links + record provenance
        enqueueInternalLinks(start, currentUrl, pageRes, parsed);
        // Save page row
        results.push(buildPageRow(currentUrl, pageRes, parsed));
        console.log(
          `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${parsed.title || parsed.h1_1}`
        );
      } catch (err) {
        console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
        results.push(buildErrorRow(currentUrl));
      }
    }
  } finally {
    if (shared) await shared.browser.close();
  }

  // -------------------- Post-process ---------------------------------------
  // Inlinks are only known once every page has contributed its edges.
  // Sitemap entries count as referrers here, as they do in the error report.
  for (const row of results) {
    row.inlinks = referrersFor(row.url).length;
  }

  // Exact duplicates
  flagExactDuplicates(results, "title", "duplicate_title_exact");
  flagExactDuplicates(results, "meta_description", "duplicate_description_exact");

  // Nearest neighbor similarities (within site, lightweight)
  attachNearestNeighbour(results, "title", "nearest_title_similarity", "nearest_title_url");
  attachNearestNeighbour(
    results,
    "meta_description",
    "nearest_description_similarity",
    "nearest_description_url"
  );

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);

  // One stamp for all reports so the files of a run share a name suffix.
  const stamp = reportStamp();
  writePageReports(results, { stamp, csv: pageCsv });
  writeLinkEdges(edges, stamp);
  writeErrors(results, stamp);
}

// CLI usage (disabled; import this module and call crawl() instead):
//   node crawler.js https://site.com 200
// const START_URL = process.argv[2] || "https://example.com";
// const MAX_PAGES = Number(process.argv[3] || 100);
// crawl(START_URL, MAX_PAGES);