// NOTE(review): removed scraped page metadata ("258 lines / 10 KiB / TypeScript",
// duplicated) that was pasted in above the file — it is not part of the source.
'use client';
|
||
|
||
import { useMemo, useState } from "react";
|
||
|
||
// Path: app/(defaults)/crawl/page.tsx (App Router)
|
||
// If using Pages Router, place at pages/crawl.tsx
|
||
// TailwindCSS assumed.
|
||
|
||
export default function CrawlPage() {
|
||
const [siteUrl, setSiteUrl] = useState("");
|
||
const [maxUrls, setMaxUrls] = useState<number | "">("");
|
||
const [autoMaxLoading, setAutoMaxLoading] = useState(false);
|
||
const [crawlLoading, setCrawlLoading] = useState(false);
|
||
const [error, setError] = useState<string | null>(null);
|
||
const [report, setReport] = useState<any>(null);
|
||
|
||
const apiBase = "https://app.crawlerx.co/crawl";
|
||
|
||
const isValidUrl = useMemo(() => {
|
||
try {
|
||
if (!siteUrl) return false;
|
||
const normalized = siteUrl.match(/^https?:\/\//i) ? siteUrl : `https://${siteUrl}`;
|
||
const u = new URL(normalized);
|
||
return !!u.hostname;
|
||
} catch {
|
||
return false;
|
||
}
|
||
}, [siteUrl]);
|
||
|
||
const normalizedUrl = useMemo(() => {
|
||
if (!siteUrl) return "";
|
||
return siteUrl.match(/^https?:\/\//i) ? siteUrl : `https://${siteUrl}`;
|
||
}, [siteUrl]);
|
||
|
||
async function autoDetectMaxFromSitemap() {
|
||
setError(null);
|
||
setAutoMaxLoading(true);
|
||
try {
|
||
if (!isValidUrl) throw new Error("Enter a valid website URL first.");
|
||
// Server-side proxy avoids CORS
|
||
const res = await fetch(`/api/sitemap?u=${encodeURIComponent(normalizedUrl)}`);
|
||
if (!res.ok) throw new Error(`Sitemap probe failed (${res.status})`);
|
||
const json = await res.json();
|
||
if (typeof json.count !== "number" || json.count < 1) throw new Error("Sitemap found but contains no URLs.");
|
||
setMaxUrls(json.count);
|
||
} catch (e: any) {
|
||
setError(e?.message || "Failed to detect Max from sitemap.");
|
||
} finally {
|
||
setAutoMaxLoading(false);
|
||
}
|
||
}
|
||
|
||
async function handleCrawl() {
|
||
setError(null);
|
||
setCrawlLoading(true);
|
||
setReport(null);
|
||
|
||
try {
|
||
if (!isValidUrl) throw new Error("Please enter a valid website URL (with or without https://).");
|
||
const max = typeof maxUrls === "number" && maxUrls > 0 ? maxUrls : 50;
|
||
const apiUrl = `${apiBase}?url=${encodeURIComponent(normalizedUrl)}&max=${max}`;
|
||
const res = await fetch(apiUrl);
|
||
if (!res.ok) throw new Error(`Crawler API error: ${res.status} ${res.statusText}`);
|
||
const data = await res.json();
|
||
setReport(data);
|
||
} catch (e: any) {
|
||
setError(e?.message || "Failed to crawl the site.");
|
||
} finally {
|
||
setCrawlLoading(false);
|
||
}
|
||
}
|
||
|
||
function downloadJson() {
|
||
if (!report) return;
|
||
const blob = new Blob([JSON.stringify(report, null, 2)], { type: "application/json" });
|
||
const url = URL.createObjectURL(blob);
|
||
const a = document.createElement("a");
|
||
a.href = url;
|
||
const host = (() => {
|
||
try { return new URL(normalizedUrl).hostname; } catch { return "report"; }
|
||
})();
|
||
a.download = `crawlerx-report-${host}.json`;
|
||
document.body.appendChild(a);
|
||
a.click();
|
||
a.remove();
|
||
URL.revokeObjectURL(url);
|
||
}
|
||
|
||
const { rows, columns } = useMemo(() => {
|
||
if (!report) return { rows: [] as any[], columns: [] as string[] };
|
||
const data = Array.isArray(report) ? report : Array.isArray(report?.results) ? report.results : null;
|
||
if (!data || !Array.isArray(data) || data.length === 0) return { rows: [], columns: [] };
|
||
|
||
const preferred = ["url", "status", "title", "description", "h1", "issues", "links", "loadTime" ];
|
||
const colset = new Set<string>();
|
||
data.slice(0, 25).forEach((r: any) => Object.keys(r || {}).forEach((k) => colset.add(k)));
|
||
const cols = preferred.filter((k) => colset.has(k)).concat(Array.from(colset).filter((k) => !preferred.includes(k)).slice(0, 6));
|
||
return { rows: data, columns: cols };
|
||
}, [report]);
|
||
|
||
return (
|
||
<div className="min-h-screen bg-gradient-to-b from-gray-50 to-white">
|
||
<div className="mx-auto max-w-6xl px-4 py-10">
|
||
<header className="mb-8">
|
||
<h1 className="text-3xl sm:text-4xl font-semibold tracking-tight text-gray-900">CrawlerX — Crawl & Report</h1>
|
||
<p className="mt-2 text-gray-600">Enter a website, auto-detect the sitemap size for <span className="font-medium">Max</span>, then run a crawl via the CrawlerX API and download the JSON report.</p>
|
||
</header>
|
||
|
||
<div className="grid gap-4 sm:grid-cols-[1fr_auto_auto] items-end bg-white p-4 rounded-2xl shadow-sm border border-gray-200">
|
||
<div className="flex flex-col">
|
||
<label className="text-sm font-medium text-gray-700 mb-1">Website URL</label>
|
||
<input
|
||
type="url"
|
||
value={siteUrl}
|
||
onChange={(e) => setSiteUrl(e.target.value)}
|
||
placeholder="https://example.com"
|
||
className="w-full rounded-xl border border-gray-300 px-3 py-2 focus:outline-none focus:ring-4 focus:ring-blue-100 focus:border-blue-500"
|
||
/>
|
||
</div>
|
||
|
||
<div className="flex flex-col">
|
||
<label className="text-sm font-medium text-gray-700 mb-1 flex items-center gap-2">
|
||
Max URLs
|
||
<button
|
||
type="button"
|
||
onClick={autoDetectMaxFromSitemap}
|
||
disabled={!isValidUrl || autoMaxLoading}
|
||
className="text-xs rounded-lg px-2 py-1 border border-gray-300 hover:bg-gray-50 disabled:opacity-50"
|
||
title="Fetch and count URLs from the site's sitemap.xml"
|
||
>
|
||
{autoMaxLoading ? "Detecting…" : "Auto‑detect from sitemap"}
|
||
</button>
|
||
</label>
|
||
<input
|
||
type="number"
|
||
min={1}
|
||
value={maxUrls}
|
||
onChange={(e) => setMaxUrls(e.target.value ? Number(e.target.value) : "")}
|
||
placeholder="e.g. 50"
|
||
className="w-40 rounded-xl border border-gray-300 px-3 py-2 focus:outline-none focus:ring-4 focus:ring-blue-100 focus:border-blue-500"
|
||
/>
|
||
</div>
|
||
|
||
<div className="flex gap-3 sm:justify-end">
|
||
<button
|
||
type="button"
|
||
onClick={handleCrawl}
|
||
disabled={!isValidUrl || crawlLoading}
|
||
className="h-10 mt-6 inline-flex items-center justify-center rounded-xl bg-blue-600 px-4 text-white font-medium shadow-sm hover:bg-blue-700 disabled:opacity-50"
|
||
>
|
||
{crawlLoading ? "Crawling…" : "Run Crawl"}
|
||
</button>
|
||
</div>
|
||
</div>
|
||
|
||
{error && (
|
||
<div className="mt-4 rounded-xl border border-red-200 bg-red-50 px-4 py-3 text-sm text-red-700">{error}</div>
|
||
)}
|
||
|
||
{report && (
|
||
<section className="mt-8">
|
||
<div className="flex items-center justify-between mb-3">
|
||
<h2 className="text-2xl font-semibold text-gray-900">Crawler Report</h2>
|
||
<div className="flex gap-2">
|
||
<button
|
||
onClick={downloadJson}
|
||
className="inline-flex items-center rounded-xl border border-gray-300 bg-white px-3 py-2 text-sm font-medium hover:bg-gray-50"
|
||
>
|
||
Download JSON
|
||
</button>
|
||
</div>
|
||
</div>
|
||
|
||
<div className="flex flex-wrap gap-2 mb-4">
|
||
{summaryChips(report).map((c) => (
|
||
<span key={c.label} className="inline-flex items-center gap-2 rounded-full border border-gray-200 bg-gray-50 px-3 py-1 text-xs text-gray-700">
|
||
<span className="font-semibold">{c.value}</span>
|
||
<span className="text-gray-500">{c.label}</span>
|
||
</span>
|
||
))}
|
||
</div>
|
||
|
||
{rows.length > 0 ? (
|
||
<div className="overflow-auto rounded-2xl border border-gray-200">
|
||
<table className="min-w-full text-sm">
|
||
<thead className="bg-gray-50 text-left sticky top-0">
|
||
<tr>
|
||
{columns.map((c) => (
|
||
<th key={c} className="px-3 py-2 font-semibold text-gray-700 whitespace-nowrap">{c}</th>
|
||
))}
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
{rows.map((r: any, idx: number) => (
|
||
<tr key={idx} className="odd:bg-white even:bg-gray-50">
|
||
{columns.map((c) => (
|
||
<td key={c} className="px-3 py-2 align-top text-gray-800 max-w-[28rem]">
|
||
{renderCell(r[c])}
|
||
</td>
|
||
))}
|
||
</tr>
|
||
))}
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
) : (
|
||
<pre className="mt-3 whitespace-pre-wrap break-words rounded-2xl bg-gray-900 text-gray-100 p-4 text-xs overflow-auto">
|
||
{JSON.stringify(report, null, 2)}
|
||
</pre>
|
||
)}
|
||
</section>
|
||
)}
|
||
|
||
<p className="mt-6 text-xs text-gray-500">
|
||
Tip: If sitemap auto‑detection fails due to server restrictions, enter Max manually or use the /api/sitemap proxy.
|
||
</p>
|
||
</div>
|
||
</div>
|
||
);
|
||
}
|
||
|
||
function renderCell(value: any) {
|
||
if (value == null) return <span className="text-gray-400">—</span>;
|
||
if (typeof value === "string") {
|
||
if (/^https?:\/\//i.test(value)) {
|
||
return (
|
||
<a href={value} target="_blank" rel="noreferrer" className="text-blue-600 hover:underline break-all">
|
||
{value}
|
||
</a>
|
||
);
|
||
}
|
||
return <span className="break-words">{truncate(value, 220)}</span>;
|
||
}
|
||
if (typeof value === "number" || typeof value === "boolean") return <span>{String(value)}</span>;
|
||
if (Array.isArray(value)) return <span className="text-gray-700">[{value.length} items]</span>;
|
||
return (
|
||
<details>
|
||
<summary className="cursor-pointer text-gray-700">object</summary>
|
||
<pre className="mt-1 whitespace-pre-wrap break-words bg-gray-100 rounded-lg p-2 text-[11px]">{JSON.stringify(value, null, 2)}</pre>
|
||
</details>
|
||
);
|
||
}
|
||
|
||
function truncate(s: string, n: number) {
|
||
return s.length > n ? s.slice(0, n - 1) + "…" : s;
|
||
}
|
||
|
||
function summaryChips(report: any): { label: string; value: string | number }[] {
|
||
const chips: { label: string; value: string | number }[] = [];
|
||
const arr = Array.isArray(report) ? report : Array.isArray(report?.results) ? report.results : null;
|
||
if (arr) chips.push({ label: "Pages crawled", value: arr.length });
|
||
const totals = report?.totals || report?.summary || {};
|
||
for (const [k, v] of Object.entries(totals)) {
|
||
if (typeof v === "number") chips.push({ label: k, value: v });
|
||
}
|
||
return chips.slice(0, 8);
|
||
}
|