first commit

This commit is contained in:
Alaguraj0361 2025-10-09 10:10:50 +05:30
commit 6c345df1c2
49 changed files with 17097 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
# .gitignore
node_modules
.env
reports

11
config/db.js Normal file
View File

@ -0,0 +1,11 @@
import mongoose from 'mongoose';
/**
 * Connect Mongoose to the server named by the MONGODB_URI environment
 * variable, selecting the `crawlerX` database.
 *
 * Logs a confirmation on success. On failure the error is logged and the
 * process is terminated with exit code 1.
 *
 * @returns {Promise<void>}
 */
export async function connectDB() {
  const uri = process.env.MONGODB_URI;
  const options = { dbName: 'crawlerX' };
  try {
    await mongoose.connect(uri, options);
    console.log('✅ MongoDB connected');
  } catch (err) {
    console.error('❌ MongoDB connection error:', err);
    process.exit(1);
  }
}

View File

@ -0,0 +1,150 @@
import bcrypt from "bcrypt";
import jwt from "jsonwebtoken";
import User from "../models/user.model.js";
import { sendResetPasswordMail, sendSignupMail, } from "../utils/mailer.js";
import crypto from "crypto";
/**
 * Register a new account.
 * Body: { email, password }
 *
 * Responds 400 when either field is missing or not a string, 400 when the
 * email is already registered, 201 with the new user's id on success and
 * 500 on unexpected errors. The confirmation email is sent in the background
 * and its outcome does not affect the response.
 */
export async function signup(req, res) {
  try {
    const { email, password } = req.body ?? {};
    if (!email || !password)
      return res.status(400).json({ error: "Email and password required" });
    // Only plain strings are accepted: an object such as { "$ne": null } would
    // otherwise be passed to the database filter as a query operator, and a
    // non-string password would make bcrypt throw.
    if (typeof email !== "string" || typeof password !== "string")
      return res.status(400).json({ error: "Email and password required" });

    const exists = await User.findOne({ email });
    if (exists) return res.status(400).json({ error: "User already exists" });

    const passwordHash = await bcrypt.hash(password, 10);
    const user = await User.create({ email, passwordHash });

    // Confirmation email is fire-and-forget so a mail outage cannot fail the signup.
    sendSignupMail(email)
      .then(() => console.log("Signup email sent to", email))
      .catch((err) => console.error("Email send failed:", err));

    res.status(201).json({ message: "Signup success, email sent", id: user._id });
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: "Signup failed" });
  }
}
/**
 * Authenticate a user and issue a JWT.
 * Body: { email, password }
 *
 * Responds 400 when either field is missing or not a string, 401 when the
 * credentials do not match, 200 with a one-hour token on success and 500 on
 * unexpected errors. The token is signed with process.env.JWT_SECRET and
 * carries the user's id and email.
 */
export async function login(req, res) {
  try {
    const { email, password } = req.body ?? {};
    // A missing password would make bcrypt.compare throw (surfacing as a 500),
    // and an object email would be interpreted as a query operator.
    if (
      !email ||
      !password ||
      typeof email !== "string" ||
      typeof password !== "string"
    ) {
      return res.status(400).json({ error: "Email and password required" });
    }

    const user = await User.findOne({ email });
    if (!user) return res.status(401).json({ error: "Invalid credentials" });

    const match = await bcrypt.compare(password, user.passwordHash);
    if (!match) return res.status(401).json({ error: "Invalid credentials" });

    const token = jwt.sign(
      { id: user._id, email: user.email },
      process.env.JWT_SECRET,
      { expiresIn: "1h" }
    );
    res.json({ message: "Login success", token });
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: "Login failed" });
  }
}
/**
 * POST /api/auth/change-password
 * Body: { currentPassword, newPassword }
 * Header: Authorization: Bearer <token>
 *
 * Looks the caller up by `req.user.id`, checks the supplied current password
 * against the stored hash and, if it matches, stores a hash of the new one.
 * Responds 400 (missing field), 404 (unknown user), 401 (wrong current
 * password), 200 on success or 500 on unexpected errors.
 */
export async function changePassword(req, res) {
  try {
    // With FormData these fields are only present on req.body after a multipart parser has run.
    const { currentPassword: oldPlain, newPassword: nextPlain } = req.body;
    const missingField = !oldPlain || !nextPlain;
    if (missingField) {
      return res.status(400).json({ error: "Current password and new password are required" });
    }

    const account = await User.findById(req.user.id);
    if (!account) {
      return res.status(404).json({ error: "User not found" });
    }

    const passwordOk = await bcrypt.compare(oldPlain, account.passwordHash);
    if (!passwordOk) {
      return res.status(401).json({ error: "Current password is incorrect" });
    }

    account.passwordHash = await bcrypt.hash(nextPlain, 10);
    await account.save();
    res.json({ message: "Password updated successfully" });
  } catch (err) {
    // Log the underlying error; the client only sees a generic message.
    console.error("changePassword error:", err);
    res.status(500).json({ error: "Failed to change password" });
  }
}
/**
 * POST /api/auth/forgot-password
 * Body: { email }
 *
 * Generates a 4-digit verification code, stores it on the user with a
 * one-hour expiry and emails it to the address on file.
 *
 * The response is identical whether or not the email is registered, and it
 * never contains the code: the code must only travel through the mailbox,
 * otherwise anyone who knows an email address could reset that account.
 * The `verificationCode` key is kept (always null) so the response shape is
 * unchanged for existing clients.
 */
export async function forgotPassword(req, res) {
  try {
    const { email } = req.body ?? {};
    // Non-string values are rejected so they cannot act as query operators.
    if (!email || typeof email !== "string")
      return res.status(400).json({ error: "Email is required" });

    const genericResponse = {
      message: "If the email is registered, a reset link has been sent.",
      verificationCode: null,
    };

    const user = await User.findOne({ email });
    if (!user) return res.json(genericResponse);

    // 4-digit numeric code (1000-9999) from the CSPRNG; Math.random is predictable.
    const verificationCode = crypto.randomInt(1000, 10000).toString();

    // Save code and expiry in DB
    user.resetPasswordToken = verificationCode;
    user.resetPasswordExpires = Date.now() + 60 * 60 * 1000; // 1 hour
    await user.save();

    // Send code via email
    await sendResetPasswordMail(email, verificationCode);

    res.json(genericResponse);
  } catch (err) {
    console.error("forgotPassword error:", err);
    res.status(500).json({ error: "Failed to send reset link" });
  }
}
/**
 * POST /api/auth/reset-password
 * Body: { token, newPassword, email? }
 *
 * Looks up a user whose stored reset token equals `token` and has not
 * expired, replaces the password hash and clears the token fields.
 * When the optional `email` is supplied the lookup is additionally scoped to
 * that account, which callers should prefer because the token is short.
 *
 * Responds 400 for missing/non-string input or an invalid/expired token,
 * 200 on success and 500 on unexpected errors.
 */
export async function resetPassword(req, res) {
  try {
    const { token, newPassword, email } = req.body ?? {};
    if (!token || !newPassword)
      return res.status(400).json({ error: "Token and new password are required" });
    // A JSON object such as { "$ne": null } as token would match ANY pending
    // reset, so only plain strings are accepted.
    if (typeof token !== "string" || typeof newPassword !== "string")
      return res.status(400).json({ error: "Token and new password are required" });

    const filter = {
      resetPasswordToken: token,
      resetPasswordExpires: { $gt: Date.now() },
    };
    if (typeof email === "string" && email) filter.email = email;

    const user = await User.findOne(filter);
    if (!user) return res.status(400).json({ error: "Invalid or expired token" });

    user.passwordHash = await bcrypt.hash(newPassword, 10);
    // Single use: clear the token so it cannot be replayed.
    user.resetPasswordToken = undefined;
    user.resetPasswordExpires = undefined;
    await user.save();

    res.json({ message: "Password has been reset successfully" });
  } catch (err) {
    console.error("resetPassword error:", err);
    res.status(500).json({ error: "Failed to reset password" });
  }
}

View File

@ -0,0 +1,111 @@
import Blog from '../models/blog.model.js';
import Category from '../models/category.model.js';
import slugify from 'slugify';
/**
 * Create a blog post for one project.
 * Body: { projectId, title, description, categoryId, tags }
 * Optional uploads are read from req.files.imageUrl / req.files.bigImageUrl.
 *
 * The slug is derived from the title. Responds 400 when projectId or title is
 * missing, 201 with the created blog, or 500 with the error message.
 */
export const createBlog = async (req, res) => {
  try {
    const { projectId, title, description, categoryId, tags } = req.body;
    if (!projectId) return res.status(400).json({ message: 'projectId is required' });
    // slugify throws on a non-string argument, which used to surface as a 500.
    if (typeof title !== 'string' || !title.trim()) {
      return res.status(400).json({ message: 'title is required' });
    }

    const slug = slugify(title, { lower: true, strict: true });
    const blog = await Blog.create({
      projectId,
      title,
      description,
      slug,
      category: categoryId,
      tags,
      imageUrl: req.files?.imageUrl ? `/uploads/${req.files.imageUrl[0].filename}` : '',
      bigImageUrl: req.files?.bigImageUrl ? `/uploads/${req.files.bigImageUrl[0].filename}` : '',
    });
    res.status(201).json(blog);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * List a project's blogs, newest first, with paging and optional filters.
 * Query: { projectId (required), page = 1, limit = 10, search = '', category }
 *
 * `search` is matched case-insensitively as a literal substring of the title.
 * `category` is a category slug; an unknown slug is ignored.
 * Responds with { total, page, blogs }.
 */
export const getAllBlogs = async (req, res) => {
  try {
    const { page = 1, limit = 10, search = '', category, projectId } = req.query;
    if (!projectId) return res.status(400).json({ message: 'projectId is required' });

    // Query-string values are strings; fall back to the defaults on garbage and
    // never allow values below 1 (a negative skip is rejected by the database).
    const pageNumber = Math.max(Number.parseInt(page, 10) || 1, 1);
    const pageSize = Math.max(Number.parseInt(limit, 10) || 10, 1);

    // Escape regex metacharacters so user input is matched literally and cannot
    // inject an expensive or malformed pattern.
    const literalSearch = String(search).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    const query = {
      projectId,
      title: { $regex: literalSearch, $options: 'i' },
    };
    if (category) {
      const cat = await Category.findOne({ slug: category });
      if (cat) query.category = cat._id;
    }

    // The page and the total are independent reads, so run them together.
    const [blogs, total] = await Promise.all([
      Blog.find(query)
        .populate('category', 'name slug')
        .sort({ createdAt: -1 })
        .skip((pageNumber - 1) * pageSize)
        .limit(pageSize),
      Blog.countDocuments(query),
    ]);

    res.json({ total, page: pageNumber, blogs });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Fetch one blog by its slug (route param) within a project.
 * The projectId is supplied as a query parameter.
 *
 * Responds 400 without a projectId, 404 when no blog matches, otherwise the
 * blog with its category's name and slug populated.
 */
export const getBlogBySlug = async (req, res) => {
  try {
    // projectId arrives via the query string
    const projectId = req.query.projectId;
    if (!projectId) {
      return res.status(400).json({ message: 'projectId is required' });
    }

    const filter = { slug: req.params.slug, projectId };
    const blog = await Blog.findOne(filter).populate('category', 'name slug');
    if (!blog) {
      return res.status(404).json({ message: 'Blog not found' });
    }

    res.json(blog);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Append a comment to the blog identified by the `id` route param.
 * Body: { text, name }
 *
 * No projectId check is made; the blog is looked up by id alone. The comment
 * records `req.user._id` when a user is attached to the request and falls
 * back to the name 'Anonymous'. Responds with the blog's full comment list.
 */
export const addComment = async (req, res) => {
  try {
    const { text, name } = req.body;

    const blog = await Blog.findById(req.params.id);
    if (!blog) {
      return res.status(404).json({ message: 'Blog not found' });
    }

    const comment = {
      user: req.user?._id,
      name: name || 'Anonymous',
      text,
    };
    blog.comments.push(comment);
    await blog.save();

    res.json(blog.comments);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Toggle the current user's like on the blog identified by the `id` route
 * param: removes `req.user._id` from `blog.likes` when present, adds it
 * otherwise. Responds with the resulting like count.
 */
export const likeBlog = async (req, res) => {
  try {
    const blog = await Blog.findById(req.params.id);
    if (!blog) {
      return res.status(404).json({ message: 'Blog not found' });
    }

    const userId = req.user._id;
    const alreadyLiked = blog.likes.includes(userId);
    if (alreadyLiked) {
      blog.likes.pull(userId);
    } else {
      blog.likes.push(userId);
    }
    await blog.save();

    res.json({ likesCount: blog.likes.length });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};

View File

@ -0,0 +1,46 @@
import Category from '../models/category.model.js';
import slugify from "slugify";
/**
 * Create a new category for a project (admin only, per the route setup).
 * Body: { name, projectId }
 *
 * The slug is derived from the name. Responds 400 when projectId or name is
 * missing, 201 with the created category, or 500 with the error message.
 */
export const createCategory = async (req, res) => {
  try {
    const { name, projectId } = req.body;
    if (!projectId) return res.status(400).json({ message: "projectId is required" });
    // slugify throws on a non-string argument, which used to surface as a 500.
    if (typeof name !== "string" || !name.trim()) {
      return res.status(400).json({ message: "name is required" });
    }

    const slug = slugify(name, { lower: true, strict: true });
    const category = await Category.create({
      name,
      slug,
      projectId,
    });
    res.status(201).json(category);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * List every category of a project, ordered by name ascending.
 * Query: { projectId } (required, otherwise 400).
 */
export const getCategories = async (req, res) => {
  try {
    const projectId = req.query.projectId;
    if (!projectId) {
      return res.status(400).json({ message: "projectId is required" });
    }

    const byName = { name: 1 };
    const categories = await Category.find({ projectId }).sort(byName);
    res.json(categories);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Delete the category whose id is given in the `id` route param
 * (admin only, per the route setup).
 *
 * Responds with a confirmation message whether or not a document matched.
 */
export const deleteCategory = async (req, res) => {
  try {
    const categoryId = req.params.id;
    await Category.findByIdAndDelete(categoryId);
    res.json({ message: "Category deleted" });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};

View File

@ -0,0 +1,54 @@
import Blog from "../models/blog.model.js";
/**
 * Append a comment to the blog identified by the `blogId` route param.
 * Body: { text, name }
 *
 * The comment's user is `req.user._id` when available, otherwise null, and
 * the name falls back to "Anonymous". Responds 201 with the blog's full
 * comment list, or 404 when the blog does not exist.
 */
export const addComment = async (req, res) => {
  try {
    const blogId = req.params.blogId;
    const { text, name } = req.body;

    const blog = await Blog.findById(blogId);
    if (!blog) {
      return res.status(404).json({ message: "Blog not found" });
    }

    const comment = {
      user: req.user?._id || null,
      name: name || "Anonymous",
      text,
    };
    blog.comments.push(comment);
    await blog.save();

    res.status(201).json(blog.comments);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Return the comments of the blog identified by the `blogId` route param,
 * ordered by `createdAt` descending (newest first). Responds 404 when the
 * blog does not exist.
 */
export const getComments = async (req, res) => {
  try {
    const blog = await Blog.findById(req.params.blogId);
    if (!blog) {
      return res.status(404).json({ message: "Blog not found" });
    }

    const newestFirst = (left, right) => right.createdAt - left.createdAt;
    // Sorts the loaded array in place; nothing is saved back.
    const ordered = blog.comments.sort(newestFirst);
    res.json(ordered);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
/**
 * Delete one comment from a blog (admin only, per the route setup).
 * Route params: { blogId, commentId }
 *
 * Responds 404 when the blog does not exist; otherwise removes the comment if
 * it is present and confirms. A comment id that is not on the blog is treated
 * as already deleted.
 */
export const deleteComment = async (req, res) => {
  try {
    const { blogId, commentId } = req.params;
    const blog = await Blog.findById(blogId);
    if (!blog) return res.status(404).json({ message: "Blog not found" });

    // Subdocument `.remove()` no longer exists in Mongoose 7+, where calling
    // it throws and turns every delete into a 500. Pulling the subdocument
    // from its parent array by _id works across Mongoose versions.
    const comment = blog.comments.id(commentId);
    if (comment) blog.comments.pull({ _id: comment._id });

    await blog.save();
    res.json({ message: "Comment deleted" });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};

View File

@ -0,0 +1,115 @@
import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import { crawl } from "../crawler.js";
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Current time as an ISO-8601 string with every ":" and "." replaced by "-",
 * so it can be embedded in a file name.
 */
const ts = () => new Date().toISOString().replace(/[:.]/g, "-");
/**
 * Send `obj` to the client as a pretty-printed JSON file download.
 *
 * @param {object} res - Response object exposing setHeader() and send().
 * @param {string} filename - Name offered to the client for the download.
 * @param {*} obj - Value to serialise with two-space indentation.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  const headers = {
    "Content-Type": "application/json; charset=utf-8",
    "Content-Disposition": `attachment; filename="${filename}"`,
  };
  for (const [name, value] of Object.entries(headers)) {
    res.setHeader(name, value);
  }
  return res.send(JSON.stringify(obj, null, 2));
}
/**
 * Report whether `p` is an absolute filesystem path.
 * Values that `path.isAbsolute` rejects by throwing (e.g. non-strings) count
 * as "not absolute".
 *
 * @param {*} p - Candidate path.
 * @returns {boolean}
 */
function isAbs(p) {
  let absolute = false;
  try {
    absolute = path.isAbsolute(p);
  } catch {
    // Not a valid path argument: keep the default of false.
  }
  return absolute;
}
/**
 * Crawl a site and return the result as JSON, a JSON download or an SSE stream.
 *
 * Query parameters:
 *  - url      (required) absolute http(s) URL to start from
 *  - max      page limit, clamped to 1..500; defaults to 50 when absent or not a number
 *  - stream   "1" selects Server-Sent Events (also chosen by an
 *             `Accept: text/event-stream` header)
 *  - download "1" returns the results as a file attachment (not with stream)
 *  - nostore  "1" asks the crawler not to persist report files
 *
 * Responds 400 for a missing/invalid url or an unsupported combination and
 * 500 when the crawl fails. If the failure happens after the SSE stream has
 * started, it is reported as a `crawl-error` event and the stream is closed.
 */
export async function crawlHandler(req, res) {
  // Declared outside the try so the catch block can always stop the timer.
  let heartbeat = null;
  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    // A malformed URL is a client error, not a crawl failure.
    let target;
    try {
      target = new URL(String(url));
    } catch {
      return res.status(400).json({ error: "Invalid url param" });
    }
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return res.status(400).json({ error: "Only http(s) URLs can be crawled" });
    }

    // Math.max(NaN, 1) is NaN, so a non-numeric `max` must be replaced before clamping.
    const requested = Number.parseInt(max ?? "50", 10);
    const limit = Math.min(Math.max(Number.isNaN(requested) ? 50 : requested, 1), 500);

    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }
      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      const send = (obj, evt) => {
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };

      // Comment frames keep the connection alive during long crawls.
      heartbeat = setInterval(() => res.write(":\n\n"), 15000);
      let finished = false;
      req.on("close", () => {
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const onProgress = (tick) => send(tick, "tick");
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      const result = await crawl(target.toString(), limit, onProgress, {
        persistReports: false,
        collectPages: true,
      });

      finished = true;
      clearInterval(heartbeat);
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory
        ? { persistReports: false, collectPages: true }
        : { persistReports: true, collectPages: true }
    );

    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;
      if (Array.isArray(result?.results)) {
        return attachJson(res, filename, result.results);
      }
      // Fall back to a report file on disk when the crawler returned a path instead.
      const jsonPath = result?.files?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }
      return attachJson(res, filename, result ?? {});
    }

    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      ...result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    if (heartbeat) clearInterval(heartbeat);
    const details = String(err?.message ?? err);

    // Once headers are out (SSE mode) a status code can no longer be set;
    // report the failure in-band and close the stream instead.
    if (res.headersSent) {
      if (!res.writableEnded) {
        res.write("event: crawl-error\n");
        res.write(`data: ${JSON.stringify({ ok: false, error: "Crawl failed", details })}\n\n`);
        res.end();
      }
      return;
    }
    res.status(500).json({ error: "Crawl failed", details });
  }
}

View File

@ -0,0 +1,113 @@
import lighthouse from 'lighthouse';
import { launch } from 'chrome-launcher';
import PageSpeedTest from '../models/pageSpeedTest.model.js';
import path from 'path';
import fs from 'fs';
// Directory that receives the generated Lighthouse HTML reports:
// <current working directory>/public/lighthouse-treemap.
const reportsDir = path.join(process.cwd(), 'public', 'lighthouse-treemap');
// Ensure the folder exists. This runs once, when the module is first loaded,
// and creates any missing parent directories as well.
if (!fs.existsSync(reportsDir)) fs.mkdirSync(reportsDir, { recursive: true });
/**
 * Run a Lighthouse audit of `url` in headless Chrome and store the summary.
 *
 * For the 'mobile' device a second Lighthouse run produces an HTML report
 * that is written into `reportsDir`; its public path is stored as
 * `treemapPath`. Chrome is always shut down, even when an audit throws.
 *
 * @param {string} url - Page to audit.
 * @param {string} [device='mobile'] - 'mobile' or 'desktop'.
 * @returns {Promise<{report: object}>} The document created by PageSpeedTest.create.
 */
const launchChromeAndRunLighthouse = async (url, device = 'mobile') => {
  const chrome = await launch({ chromeFlags: ['--headless'] });

  let lhr;
  let treemapFile = null;
  try {
    const options = {
      port: chrome.port,
      emulatedFormFactor: device,
      throttlingMethod: device === 'mobile' ? 'simulate' : 'devtools',
      output: 'json', // JSON for metrics
    };
    const runnerResult = await lighthouse(url, options);
    lhr = runnerResult.lhr;

    // Create HTML treemap report (only once, for mobile)
    if (device === 'mobile') {
      const fileName = `treemap-${Date.now()}.html`;
      const htmlReport = await lighthouse(url, {
        port: chrome.port,
        emulatedFormFactor: device,
        throttlingMethod: 'simulate',
        output: 'html',
      });
      fs.writeFileSync(path.join(reportsDir, fileName), htmlReport.report);
      // Only expose the path once the file is actually on disk.
      treemapFile = `/lighthouse-treemap/${fileName}`;
    }
  } finally {
    // Without this, every failed audit would leave a Chrome process running.
    await chrome.kill();
  }

  // A category that is absent or could not be scored yields null rather than
  // NaN / a misleading 0.
  const toPercent = (category) =>
    typeof category?.score === 'number' ? Math.round(category.score * 100) : null;
  const displayValue = (auditId) => lhr.audits[auditId]?.displayValue || null;
  const audits = Object.values(lhr.audits);

  // Structured result
  const result = {
    url,
    device,
    scores: {
      performance: toPercent(lhr.categories.performance),
      accessibility: toPercent(lhr.categories.accessibility),
      bestPractices: toPercent(lhr.categories['best-practices']),
      seo: toPercent(lhr.categories.seo),
      pwa: lhr.categories.pwa?.score ? Math.round(lhr.categories.pwa.score * 100) : null,
    },
    metrics: {
      firstContentfulPaint: displayValue('first-contentful-paint'),
      largestContentfulPaint: displayValue('largest-contentful-paint'),
      totalBlockingTime: displayValue('total-blocking-time'),
      timeToInteractive: displayValue('interactive'),
      speedIndex: displayValue('speed-index'),
      cumulativeLayoutShift: displayValue('cumulative-layout-shift'),
    },
    opportunities: audits
      .filter((a) => a.details?.type === 'opportunity')
      .map((a) => ({
        title: a.title,
        description: a.description,
        estimatedSavings: a.details?.overallSavingsMs
          ? `${Math.round(a.details.overallSavingsMs)} ms`
          : null,
      })),
    diagnostics: {
      usesHTTPS: lhr.audits['is-on-https']?.score === 1,
      usesEfficientCachePolicy: lhr.audits['uses-long-cache-ttl']?.score === 1,
      imageCompression: lhr.audits['uses-optimized-images']?.score === 1,
    },
    failedAudits: audits
      .filter((a) => a.score !== null && a.score !== 1 && a.scoreDisplayMode !== 'notApplicable')
      .map((a) => ({ title: a.title, description: a.description })),
    passedAudits: audits
      .filter((a) => a.score === 1 && a.scoreDisplayMode !== 'notApplicable' && !a.details?.type)
      .map((a) => a.title),
    notApplicableAudits: audits
      .filter((a) => a.scoreDisplayMode === 'notApplicable')
      .map((a) => a.title),
    screenshot: lhr.audits['final-screenshot']?.details?.data || null,
    createdAt: new Date(),
    treemapPath: treemapFile,
  };

  const report = await PageSpeedTest.create(result);
  return { report };
};
/**
 * Audit a URL for mobile and then for desktop, and return both reports.
 * Body: { url }
 *
 * The two audits run one after the other. Responds 400 without a url, 200
 * with both reports plus the mobile run's HTML report path; any error is
 * forwarded to the next error-handling middleware.
 */
export const runAudit = async (req, res, next) => {
  try {
    const targetUrl = req.body.url;
    if (!targetUrl) {
      return res.status(400).json({ message: 'URL is required' });
    }

    const { report: mobile } = await launchChromeAndRunLighthouse(targetUrl, 'mobile');
    const { report: desktop } = await launchChromeAndRunLighthouse(targetUrl, 'desktop');

    const results = {
      mobile,
      desktop,
      treemap: mobile.treemapPath, // HTML report
    };
    res.status(200).json({ message: 'Audit completed successfully', results });
  } catch (err) {
    next(err);
  }
};

View File

@ -0,0 +1,39 @@
import { CakeOrder } from "../../models/maisondetreats/cakeOrder.model.js";
import { sendCakeOrderMail } from "../../utils/mailer.js";
/**
 * Store a cake order and, when an email address is given, send a
 * confirmation mail in the background.
 * Body: { order, email }
 *
 * Responds 400 unless `order` is a non-null object, 201 with the stored order,
 * or 500 with the error message.
 */
export const createCakeOrder = async (req, res) => {
  try {
    const { order, email } = req.body;

    const orderIsObject = Boolean(order) && typeof order === "object";
    if (!orderIsObject) {
      return res.status(400).json({ message: "Order data is required" });
    }

    const newOrder = await CakeOrder.create({ order, email });

    // Confirmation mail is fire-and-forget: its outcome never changes the response.
    if (email) {
      sendCakeOrderMail(email, order)
        .then(() => console.log("Cake order email sent to", email))
        .catch((err) => console.error("Email send failed:", err));
    }

    const payload = {
      message: "Cake order created successfully",
      data: newOrder,
    };
    res.status(201).json(payload);
  } catch (err) {
    console.error("Error creating cake order:", err);
    res.status(500).json({ message: "Server error", error: err.message });
  }
};
/**
 * GET /api/cake-orders
 * Return every cake order, most recently created first, as { data: [...] }.
 */
export const getAllCakeOrders = async (_req, res) => {
  try {
    const newestFirst = { createdAt: -1 };
    const data = await CakeOrder.find().sort(newestFirst);
    res.json({ data });
  } catch (err) {
    console.error("Error fetching cake orders:", err);
    res.status(500).json({ message: "Server error", error: err.message });
  }
};

View File

@ -0,0 +1,68 @@
// message.controller.js
import dotenv from "dotenv";
import axios from "axios";
import Message from "../models/message.model.js";
// Runs dotenv's loader when this module is first imported, so environment
// variables are populated before any handler below is called.
dotenv.config();
/**
 * Store a contact message and notify the site owner over WhatsApp.
 * Body: { project, name, email, message }
 *
 * Configuration (environment variables):
 *  - WHATSAPP_ACCESS_TOKEN    bearer token for the WhatsApp Cloud API (required
 *                             for notifications; never hard-code it in source)
 *  - WHATSAPP_PHONE_NUMBER_ID sender phone-number id (defaults to the id used so far)
 *  - WHATSAPP_NOTIFY_TO       recipient number (defaults to the number used so far)
 *
 * The notification is best effort: once the message is saved the request
 * succeeds with 201 even if WhatsApp is unreachable or not configured, so
 * clients do not retry and create duplicates. Responds 400 when project or
 * message is missing and 500 when the message cannot be saved.
 */
export const sendMessage = async (req, res) => {
  try {
    const { project, name, email, message } = req.body;
    if (!project) return res.status(400).json({ success: false, error: "Project is required" });
    if (!message) return res.status(400).json({ success: false, error: "Message is required" });

    // Save message to MongoDB
    const newMessage = await Message.create({ project, name, email, message });

    // Send WhatsApp Template Message
    const accessToken = process.env.WHATSAPP_ACCESS_TOKEN;
    if (!accessToken) {
      console.warn("⚠️ WHATSAPP_ACCESS_TOKEN is not set; skipping WhatsApp notification");
    } else {
      try {
        const phoneNumberId = process.env.WHATSAPP_PHONE_NUMBER_ID ?? "774121419125441";
        const recipient = process.env.WHATSAPP_NOTIFY_TO ?? "917871207631";
        const url = `https://graph.facebook.com/v22.0/${phoneNumberId}/messages`;
        const payload = {
          messaging_product: "whatsapp",
          to: String(recipient),
          type: "template",
          template: {
            name: "new_message_alert",
            language: { code: "en_US" },
            components: [
              {
                type: "body",
                parameters: [
                  { type: "text", text: project || "Project" },
                  { type: "text", text: name || "Guest" },
                  { type: "text", text: email || "N/A" },
                  { type: "text", text: message || "No message" },
                ],
              },
            ],
          },
        };
        const headers = {
          Authorization: `Bearer ${accessToken}`,
          "Content-Type": "application/json",
        };
        const response = await axios.post(url, payload, { headers });
        console.log("✅ WhatsApp API Response:", response.data);
      } catch (err) {
        // The message is already stored; a failed alert must not fail the request.
        console.error("❌ WhatsApp API Error:", err.response?.data || err.message);
      }
    }

    return res.status(201).json({ success: true, data: newMessage });
  } catch (err) {
    console.error("❌ sendMessage error:", err.message);
    return res.status(500).json({ success: false, error: "Server Error" });
  }
};
/**
 * List the stored messages of one project, most recent first.
 * Query: { project } (required, otherwise 400).
 * Responds with { success: true, data: [...] }.
 */
export const getMessages = async (req, res) => {
  try {
    const project = req.query.project;
    if (!project) {
      return res.status(400).json({ success: false, error: "Project is required" });
    }

    const newestFirst = { createdAt: -1 };
    const data = await Message.find({ project }).sort(newestFirst);
    return res.status(200).json({ success: true, data });
  } catch (err) {
    console.error(err);
    return res.status(500).json({ success: false, error: "Server Error" });
  }
};

View File

@ -0,0 +1,132 @@
// controllers/payment.controller.js
import Stripe from "stripe";
import { Payment } from "../models/payment.model.js";
// ✅ Load Stripe Secret Key from .env (STRIPE_SECRET_KEY).
// The key must never be committed to source control. The client is built
// lazily, on first use, so the variable only needs to be present by the time
// a request is handled rather than at import time.
let stripeClient = null;

/** Return the shared Stripe client, creating it on first use. */
function getStripeClient() {
  if (stripeClient === null) {
    const secretKey = process.env.STRIPE_SECRET_KEY;
    if (!secretKey) {
      throw new Error("STRIPE_SECRET_KEY is not set");
    }
    stripeClient = new Stripe(secretKey, {
      apiVersion: "2022-11-15",
    });
  }
  return stripeClient;
}

// `stripe.<resource>` keeps working as before; each property read is forwarded
// to the lazily created client.
const stripe = new Proxy(
  {},
  {
    get: (_target, property) => getStripeClient()[property],
  }
);
/**
 * 🔹 Option 1: PaymentIntent API (client uses clientSecret)
 * Body: { amount } — amount in dollars.
 *
 * Creates a USD PaymentIntent, records a pending Payment and returns the
 * client secret. Responds 400 when amount is missing or not a positive
 * number, and 500 when Stripe or the database fails.
 */
export async function createPaymentIntent(req, res) {
  try {
    const { amount } = req.body;
    if (!amount) return res.status(400).json({ error: "amount is required" });

    // Reject values Stripe would refuse anyway (NaN, negatives) with a clear
    // client error instead of a 500.
    const dollars = Number(amount);
    if (!Number.isFinite(dollars) || dollars <= 0) {
      return res.status(400).json({ error: "amount must be a positive number" });
    }
    const amountInCents = Math.round(dollars * 100); // dollars → cents

    const paymentIntent = await stripe.paymentIntents.create({
      amount: amountInCents,
      currency: "usd",
      automatic_payment_methods: { enabled: true },
    });

    await Payment.create({
      amount: amountInCents,
      stripePaymentIntentId: paymentIntent.id,
      status: "pending",
    });

    res.json({ clientSecret: paymentIntent.client_secret });
  } catch (err) {
    console.error("❌ Error creating PaymentIntent:", err);
    res.status(500).json({ error: "Internal Server Error" });
  }
}
/**
 * 🔹 Option 2: Stripe Checkout Session (redirect flow)
 * Body: { email, amount, planId } — amount in dollars.
 *
 * Creates a one-item USD Checkout Session, records a pending Payment keyed
 * by the session id and returns that id. Responds 400 when email/amount is
 * missing or amount is not a positive number, and 500 when Stripe or the
 * database fails.
 */
export async function createCheckoutSession(req, res) {
  try {
    const { email, amount, planId } = req.body;
    if (!email || !amount) {
      return res.status(400).json({ error: "email and amount are required" });
    }

    // Reject values Stripe would refuse anyway (NaN, negatives) with a clear
    // client error instead of a 500.
    const dollars = Number(amount);
    if (!Number.isFinite(dollars) || dollars <= 0) {
      return res.status(400).json({ error: "amount must be a positive number" });
    }
    const amountInCents = Math.round(dollars * 100); // dollars → cents

    const session = await stripe.checkout.sessions.create({
      payment_method_types: ["card"],
      mode: "payment",
      customer_email: email,
      line_items: [
        {
          price_data: {
            currency: "usd",
            product_data: { name: planId || "SEO Plan" },
            unit_amount: amountInCents,
          },
          quantity: 1,
        },
      ],
      success_url: "https://app.crawlerx.co/success",
      cancel_url: "https://app.crawlerx.co/cancel",
    });

    // Save to DB using stripeSessionId instead of stripePaymentIntentId
    await Payment.create({
      email,
      amount: amountInCents,
      stripeSessionId: session.id, // ✅ use session id
      status: "pending",
    });

    res.json({ sessionId: session.id });
  } catch (err) {
    console.error("❌ Error creating checkout session:", err);
    res.status(500).json({ error: "Internal Server Error" });
  }
}
/**
 * 🔹 Stripe Webhook
 * Stripe requires `express.raw({ type: "application/json" })` in route
 *
 * Verifies the Stripe signature against the raw request bytes, then marks
 * the matching Payment as succeeded:
 *  - payment_intent.succeeded   → matched by stripePaymentIntentId
 *  - checkout.session.completed → matched by stripeSessionId (the id stored
 *    when the session was created)
 *
 * Responds 400 on a bad signature, 500 when the update fails (so Stripe
 * retries the delivery) and { received: true } otherwise.
 */
export async function handleWebhook(req, res) {
  const sig = req.headers["stripe-signature"];
  let event;
  try {
    event = stripe.webhooks.constructEvent(
      // Must be the raw body. With express.raw() the bytes are on req.body;
      // a custom body-parser `verify` hook may expose them as req.rawBody.
      req.rawBody ?? req.body,
      sig,
      process.env.STRIPE_WEBHOOK_SECRET
    );
  } catch (err) {
    console.error("❌ Webhook signature verification failed:", err.message);
    return res.status(400).send(`Webhook Error: ${err.message}`);
  }

  try {
    switch (event.type) {
      case "payment_intent.succeeded": {
        const paymentIntent = event.data.object;
        console.log("✅ PaymentIntent succeeded:", paymentIntent.id);
        await Payment.findOneAndUpdate(
          { stripePaymentIntentId: paymentIntent.id },
          { status: "succeeded" }
        );
        break;
      }
      case "checkout.session.completed": {
        const session = event.data.object;
        console.log("✅ Checkout session completed:", session.id);
        // Match the exact record created for this session. Matching on
        // email + "pending" could update a different, unrelated payment of
        // the same customer.
        await Payment.findOneAndUpdate(
          { stripeSessionId: session.id },
          {
            stripePaymentIntentId: session.payment_intent,
            status: "succeeded",
          }
        );
        break;
      }
      default:
        console.log(`Unhandled event type ${event.type}`);
    }
  } catch (err) {
    // Without this, a database error would be an unhandled rejection and the
    // request would hang; a 5xx tells Stripe to redeliver the event.
    console.error("❌ Webhook handler failed:", err);
    return res.status(500).json({ error: "Webhook handler failed" });
  }

  res.json({ received: true });
}

View File

@ -0,0 +1,20 @@
import { getSitemapUrls } from "../utils/sitemap.js";
/**
 * Return the URLs found via the sitemap of the site given in `?u=`.
 *
 * Responds 400 when `u` is missing, otherwise
 * { ok, origin, count, urls }. Any failure — including a `u` that is not a
 * valid URL — is reported as 500 with the error text in `details`.
 */
export async function sitemapHandler(req, res) {
  try {
    const site = req.query.u;
    if (!site) {
      return res.status(400).json({ error: "Missing ?u=https://site.com" });
    }

    const origin = new URL(String(site));
    const urls = await getSitemapUrls(origin.toString());

    const payload = {
      ok: true,
      origin: origin.origin,
      count: urls.length,
      urls,
    };
    res.json(payload);
  } catch (err) {
    console.error("sitemap error:", err);
    const details = String(err?.message ?? err);
    res.status(500).json({ error: "Failed to fetch sitemap", details });
  }
}

709
crawler copy.js Normal file
View File

@ -0,0 +1,709 @@
import got from "got";
import * as cheerio from "cheerio";
import normalizeUrl from "normalize-url";
import { isInternal } from "./utils/urlHelpers.js";
import { getSitemapUrls } from "./utils/sitemap.js";
import fs from "node:fs";
import path from "node:path";
import { chromium } from "playwright";
// NEW libs
import pixelWidth from "string-pixel-width";
import * as readability from "text-readability";
import stringSimilarity from "string-similarity";
/* ------------------------------ globals --------------------------------- */
// Module-level crawl state. These collections are shared by every function in
// this file and live for the lifetime of the module.
// NOTE(review): presumably reset at the start of each crawl — confirm in crawl().
// Presumably the set of URLs already processed — confirm against the crawl loop.
const visited = new Set();
// Presumably the pending-URL work list — confirm against the crawl loop.
const queue = [];
// Presumably one record per crawled page — confirm against the crawl loop.
const results = [];
// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }
// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>
// Desktop Chrome user-agent string sent with requests.
const REAL_UA =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// Browser-like request headers used for plain HTTP fetches and as extra
// headers on the Playwright browser context.
const REAL_HEADERS = {
"user-agent": REAL_UA,
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"upgrade-insecure-requests": "1",
};
/* ------------------------------ utils ----------------------------------- */
/**
 * Format one value as a CSV field.
 *
 * null/undefined become an empty field. Any other value is stringified and,
 * if it contains a double quote, comma, line feed or carriage return, wrapped
 * in double quotes with embedded quotes doubled.
 *
 * @param {*} v - Value to format.
 * @returns {string} The CSV-safe field text.
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // A bare carriage return also ends a record for many CSV readers, so it
  // must trigger quoting just like "\n".
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
/**
 * Create `dir` (and any missing parent directories) unless something already
 * exists at that path.
 *
 * @param {string} dir - Directory path.
 */
function ensureDir(dir) {
  const alreadyThere = fs.existsSync(dir);
  if (alreadyThere) return;
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Write the per-page crawl results to `reports/crawl-<timestamp>.json`.
 *
 * The matching CSV rows are still assembled (one per result, in the column
 * order below), but writing the CSV file is currently switched off, so only
 * the JSON report reaches disk.
 *
 * @param {Array<object>} results - Page records produced by the crawler.
 */
function writePageReports(results) {
  ensureDir("reports");
  const isoNow = new Date().toISOString();
  // e.g. 2025-10-09-10-10-50 (second precision)
  const stamp = isoNow.replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);

  const jsonFile = `${base}.json`;
  fs.writeFileSync(jsonFile, JSON.stringify(results, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);

  // Columns (a Screaming-Frog-ish shape with our extras)
  const headers = [
    "url", "status", "status_text", "time_ms", "bytes", "content_type", "http_version",
    "title", "title_length", "title_pixel_width",
    "meta_description", "meta_description_length", "meta_description_pixel_width",
    "h1_1", "h1_1_length", "h1_1_pixel_width", "h1_2", "h1_2_length", "h1_2_pixel_width",
    "h2_1", "h2_2",
    "canonical", "robots_meta", "x_robots_tag", "noindex", "nofollow",
    "lang", "word_count", "flesch_reading_ease", "flesch_kincaid_grade",
    "gunning_fog", "coleman_liau", "ari", "smog",
    "schema_types", "inlinks", "outlinks", "render_mode",
    "last_modified", "set_cookie", "crawl_timestamp",
    "duplicate_title_exact", "nearest_title_similarity", "nearest_title_url",
    "duplicate_description_exact", "nearest_description_similarity", "nearest_description_url"
  ];

  // One CSV line per page record; cell order must mirror `headers`.
  const toCsvRow = (r) => {
    const cells = [
      r.url,
      r.status,
      r.status_text ?? "",
      r.time_ms,
      r.bytes,
      r.content_type,
      r.http_version ?? "",
      r.title,
      r.title_length,
      r.title_pixel_width,
      r.meta_description,
      r.meta_description_length,
      r.meta_description_pixel_width,
      r.h1_1 ?? "",
      r.h1_1_length ?? 0,
      r.h1_1_pixel_width ?? "",
      r.h1_2 ?? "",
      r.h1_2_length ?? 0,
      r.h1_2_pixel_width ?? "",
      r.h2_1 ?? "",
      r.h2_2 ?? "",
      r.canonical,
      r.robots_meta,
      r.x_robots_tag ?? "",
      r.noindex,
      r.nofollow,
      r.lang ?? "",
      r.word_count ?? "",
      r.flesch_reading_ease ?? "",
      r.flesch_kincaid_grade ?? "",
      r.gunning_fog ?? "",
      r.coleman_liau ?? "",
      r.ari ?? "",
      r.smog ?? "",
      Array.isArray(r.schema_types) ? r.schema_types.join("|") : "",
      r.inlinks ?? 0,
      r.outlinks ?? 0,
      r.render_mode,
      r.last_modified ?? "",
      r.set_cookie ? "yes" : "no",
      r.crawl_timestamp ?? "",
      r.duplicate_title_exact ?? "",
      r.nearest_title_similarity ?? "",
      r.nearest_title_url ?? "",
      r.duplicate_description_exact ?? "",
      r.nearest_description_similarity ?? "",
      r.nearest_description_url ?? ""
    ];
    return cells.map(csvEscape).join(",");
  };

  const lines = [headers.join(",")];
  for (const record of results) {
    lines.push(toCsvRow(record));
  }

  // CSV output is disabled for now; `lines` is what the file below would contain.
  //fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  //console.log(`\n📝 Page reports saved:\n - ${base}.csv\n - ${base}.json`);
}
/**
 * Write the link-provenance list to `reports/links-<timestamp>.csv`, one row
 * per edge with the columns from, raw_href, to, discovered_by.
 *
 * @param {Array<{from: *, raw_href: *, to: *, discovered_by: *}>} edges
 */
function writeLinkEdges(edges) {
  ensureDir("reports");
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const headers = ["from", "raw_href", "to", "discovered_by"];
  const toCsvRow = ({ from, raw_href, to, discovered_by }) =>
    [from, raw_href, to, discovered_by].map(csvEscape).join(",");

  const lines = [headers.join(",")];
  for (const edge of edges) {
    lines.push(toCsvRow(edge));
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
}
/**
 * Write every page whose status is 400 or higher to
 * `reports/errors-<timestamp>.csv`.
 *
 * Each failing page yields one row per known referrer (taken from the
 * module-level `referrers` map), or a single row with empty referrer columns
 * when none is recorded.
 *
 * @param {Array<object>} results - Page records produced by the crawler.
 */
function writeErrors(results) {
  ensureDir("reports");
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);

  const headers = ["url", "status", "title", "from_page", "raw_href", "discovered_by"];
  const toLine = (cells) => cells.map(csvEscape).join(",");
  const lines = [headers.join(",")];

  for (const page of results) {
    // Skip empty entries, pages without a status and everything below 400.
    const isError = Boolean(page) && page.status !== null && page.status >= 400;
    if (!isError) continue;

    const refs = referrers.get(page.url) || [];
    if (refs.length === 0) {
      lines.push(toLine([page.url, page.status, page.title, "", "", ""]));
      continue;
    }
    for (const ref of refs) {
      lines.push(toLine([page.url, page.status, page.title, ref.from, ref.raw_href, ref.discovered_by]));
    }
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
}
/**
 * Record a discovered link in both module-level structures: the flat `edges`
 * list and the per-target `referrers` map.
 *
 * @param {string} from - Page on which the link was found.
 * @param {string} [rawHref] - The href as written in the source; stored as "" when falsy.
 * @param {string} to - Resolved target URL.
 * @param {string} discovered_by - Label for how the link was discovered.
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  let inbound = referrers.get(to);
  if (inbound === undefined) {
    inbound = [];
    referrers.set(to, inbound);
  }
  inbound.push({ from, raw_href, discovered_by });
}
/* ---------------------- parse HTML without JS --------------------------- */
/**
 * Parse JSON text without throwing.
 *
 * @param {string} txt - Text to parse.
 * @returns {*} The parsed value, or null when the text is not valid JSON.
 */
function safeJsonParse(txt) {
  let parsed = null;
  try {
    parsed = JSON.parse(txt);
  } catch {
    // Invalid JSON: fall through with null.
  }
  return parsed;
}
/**
 * Collect the distinct `@type` values declared in a page's JSON-LD blocks.
 *
 * Every `<script type="application/ld+json">` element is parsed (invalid JSON
 * is skipped) and walked recursively through arrays and nested objects;
 * string `@type` values, and string members of array `@type` values, are
 * gathered in first-seen order.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @returns {string[]} Unique type names.
 */
function parseSchemaTypes($) {
  const found = new Set();

  const visit = (node) => {
    if (!node) return;
    if (Array.isArray(node)) {
      for (const item of node) visit(item);
      return;
    }
    if (typeof node !== "object") return;

    const declared = node["@type"];
    if (typeof declared === "string") {
      found.add(declared);
    } else if (Array.isArray(declared)) {
      for (const entry of declared) {
        if (typeof entry === "string") found.add(entry);
      }
    }
    // Descend into every property to pick up nested entities.
    for (const child of Object.values(node)) visit(child);
  };

  $('script[type="application/ld+json"]').each((_, el) => {
    const parsed = safeJsonParse($(el).contents().text());
    if (parsed) visit(parsed);
  });

  return Array.from(found);
}
/**
 * Extract SEO-relevant fields from static HTML (no JavaScript is executed).
 *
 * @param {string} html - Markup to parse.
 * @param {string} url - URL of the page; used as the base for resolving hrefs.
 * @returns {object} title (falling back to og:title, twitter:title, then the
 *   first H1), metaDesc, the first two H1/H2 texts, totalHeadings, canonical,
 *   robotsMeta with derived noindex/nofollow flags, lang, bodyText and its
 *   wordCount, schemaTypes, plus `rawLinks` ({ raw, abs } per resolvable
 *   anchor) and `internalLinks`, a Set of every resolved anchor URL.
 *   NOTE(review): despite its name, `internalLinks` is not filtered by host
 *   here — presumably the caller does that; confirm.
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);
  const attrOf = (selector, attr) => $(selector).attr(attr) || "";
  const textsOf = (selector) => $(selector).map((_, el) => $(el).text().trim()).get();

  // Headings (capture top two H1s and H2s)
  const h1s = textsOf("h1");
  const h2s = textsOf("h2");
  const h1_1 = h1s[0] || "";
  const h1_2 = h1s[1] || "";
  const h2_1 = h2s[0] || "";
  const h2_2 = h2s[1] || "";
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // Title, with social/heading fallbacks when <title> is empty.
  const ogTitle = attrOf('meta[property="og:title"]', "content");
  const twTitle = attrOf('meta[name="twitter:title"]', "content");
  const docTitle = ($("title").first().text() || "").trim();
  const title = docTitle || (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = attrOf('meta[name="description"]', "content").trim();
  const canonical = attrOf('link[rel="canonical"]', "href").trim();
  const robotsMeta = attrOf('meta[name="robots"]', "content").trim();
  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
  const lang = attrOf("html", "lang").trim();

  // Basic text body for word count / readability
  const rawBody = $("main").text() || $("body").text() || "";
  const bodyText = rawBody.replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Resolved links + raw href
  const internalLinks = new Set();
  const rawLinks = [];
  for (const el of $("a[href]").toArray()) {
    const href = $(el).attr("href");
    if (!href) continue;
    let abs;
    try {
      abs = new URL(href, url).toString();
    } catch {
      // Unresolvable href: leave it out of both collections.
      continue;
    }
    rawLinks.push({ raw: href, abs });
    internalLinks.add(abs);
  }

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1, h1_2, h2_1, h2_2,
    totalHeadings,
    canonical, robotsMeta, noindex, nofollow,
    internalLinks, rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText
  };
}
/* ------------------------------ fetchers -------------------------------- */
/**
 * Fetch `url` over plain HTTP with browser-like headers.
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`); they are
 * reported through `status`.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<object>} Status, status text, elapsed milliseconds,
 *   lower-cased content type, body, byte size, `render_mode: "http"`,
 *   HTTP version and the response headers.
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const response = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false,
  });
  const elapsedMs = Date.now() - startedAt;

  const { headers, body } = response;

  // Trust the declared Content-Length when present, otherwise measure the body.
  const declaredLength = headers["content-length"];
  let bytes;
  if (declaredLength) {
    bytes = Number(declaredLength);
  } else {
    bytes = Buffer.byteLength(body || "", "utf8");
  }

  return {
    status: response.statusCode ?? null,
    status_text: response.statusMessage ?? "",
    time_ms: elapsedMs,
    contentType: (headers["content-type"] || "").toLowerCase(),
    body,
    bytes,
    render_mode: "http",
    httpVersion: response.httpVersion ?? "",
    headers,
  };
}
/**
 * Launch headless Chromium and create one browser context configured to look
 * like a desktop Chrome session (user agent, viewport, locale, headers).
 *
 * If context creation or the init script fails, the launched browser is closed
 * before the error is rethrown so no browser process is left behind.
 *
 * @returns {Promise<{browser: object, context: object}>} The browser and its context.
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });
  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });
    // Override navigator properties that pages commonly inspect to detect automation.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });
    return { browser, context };
  } catch (err) {
    // Best-effort cleanup; the original failure is what the caller needs to see.
    await browser.close().catch(() => {});
    throw err;
  }
}
/**
 * Load `url` in a new page of the shared browser context and extract SEO
 * fields from the rendered DOM.
 *
 * The page is always closed, whether extraction succeeds or throws.
 *
 * @param {string} url - Page to render.
 * @param {{context: object}} shared - Holder of the browser context used to open the page.
 * @returns {Promise<object>} Response metadata (`status`, `status_text`,
 *   `time_ms`, `contentType`, `bytes`, `render_mode: "rendered"`, `headers`)
 *   plus `domExtract` with the extracted fields.
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const t0 = Date.now();
  let status = null;
  let statusText = "";
  let mainHeaders = {};
  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    status = resp?.status() ?? null;
    statusText = resp?.statusText() ?? "";
    // The steps below are best-effort: a failure or timeout just means we
    // extract whatever the page contains at that point.
    try { mainHeaders = resp ? await resp.headers() : {}; } catch { }
    try { await page.waitForLoadState("networkidle", { timeout: 12000 }); } catch { }
    try {
      // Wait until the page shows some real content: enough text or a heading-like element.
      await page.waitForFunction(() => {
        const main = document.querySelector("main") || document.body;
        const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
        const hasHeading = !!document.querySelector("h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']");
        return textLen > 160 || hasHeading;
      }, { timeout: 8000 });
    } catch { }

    // Runs inside the page; only serializable data comes back.
    const dom = await page.evaluate(() => {
      const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
      const getTextList = (sel) => Array.from(document.querySelectorAll(sel))
        .map((el) => clean(el.textContent)).filter(Boolean);
      const links = Array.from(document.querySelectorAll("a[href]"))
        .map((a) => {
          const raw = a.getAttribute("href");
          try { return { raw, abs: new URL(raw, location.href).toString() }; }
          catch { return null; }
        })
        .filter(Boolean);
      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title: document.title || "",
        ogTitle: document.querySelector('meta[property="og:title"]')?.content || "",
        twTitle: document.querySelector('meta[name="twitter:title"]')?.content || "",
        metaDesc: document.querySelector('meta[name="description"]')?.content || "",
        canonical: document.querySelector('link[rel="canonical"]')?.href || "",
        robotsMeta: document.querySelector('meta[name="robots"]')?.content || "",
        lang: document.documentElement.getAttribute("lang") || "",
        h1: getTextList("h1"),
        h2: getTextList("h2"),
        totalHeadings: document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length,
        links,
        bodyText: clean((document.querySelector("main") || document.body).innerText || ""),
        schemaScripts: Array.from(document.querySelectorAll('script[type="application/ld+json"]')).map((s) => s.textContent || ""),
      };
    });

    // Parse schema types from the script strings (outside of the page).
    const schemaTypes = new Set();
    const collect = (obj) => {
      if (!obj) return;
      if (Array.isArray(obj)) { obj.forEach((item) => collect(item)); return; }
      if (typeof obj === "object") {
        const t = obj["@type"];
        if (typeof t === "string") schemaTypes.add(t);
        else if (Array.isArray(t)) t.forEach((x) => typeof x === "string" && schemaTypes.add(x));
        Object.values(obj).forEach((value) => collect(value));
      }
    };
    for (const raw of dom.schemaScripts || []) {
      try { collect(JSON.parse(raw)); } catch { /* invalid JSON-LD is ignored */ }
    }

    const dt = Date.now() - t0;
    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    // Title fallback chain: document title, OpenGraph, Twitter card, first H1.
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();
    return {
      status,
      status_text: statusText,
      time_ms: dt,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract: {
        title: finalTitle,
        metaDesc: dom.metaDesc || "",
        canonical: dom.canonical || "",
        robotsMeta: dom.robotsMeta || "",
        lang: dom.lang || "",
        noindex, nofollow,
        h1_1: dom.h1?.[0] || "",
        h1_2: dom.h1?.[1] || "",
        h2_1: dom.h2?.[0] || "",
        h2_2: dom.h2?.[1] || "",
        totalHeadings: dom.totalHeadings || 0,
        links: new Set((dom.links || []).map((l) => l.abs)),
        rawLinks: dom.links || [],
        bodyText: dom.bodyText || "",
        schemaTypes: Array.from(schemaTypes),
      },
    };
  } finally {
    await page.close();
  }
}
/* ------------------------- render decision ------------------------------ */
/**
 * Decide whether a page needs a headless-browser render after the plain HTTP fetch.
 *
 * Rendering is requested when the response is under 4000 bytes, when no
 * headings were found, or when a non-root path carries the same title as the
 * home page.
 *
 * @param {string} currentUrl - URL that was fetched (must be parseable).
 * @param {{bytes?: number}} httpRes - Result of the HTTP fetch.
 * @param {{totalHeadings: number, title: string}} parsed - Fields parsed from the static HTML.
 * @param {string|null} homeTitle - Title recorded for the site root, if known.
 * @returns {boolean} `true` when the page should be rendered.
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const path = new URL(currentUrl).pathname;
  const size = httpRes.bytes ?? 0;
  // Tiny HTML shell, or nothing heading-like in the static markup.
  if (size < 4000 || parsed.totalHeadings === 0) return true;
  const repeatsHomeTitle = homeTitle && parsed.title && parsed.title === homeTitle;
  return Boolean(repeatsHomeTitle && path !== "/");
}
/**
 * Return the URL with a `www.` prefix on its hostname.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string} The serialized URL with `www.` added when the hostname did
 *   not already start with it; the input unchanged when it cannot be parsed.
 */
function withWWW(urlStr) {
  try {
    const parsed = new URL(urlStr);
    const alreadyPrefixed = parsed.hostname.startsWith("www.");
    if (!alreadyPrefixed) {
      parsed.hostname = "www." + parsed.hostname;
    }
    return parsed.href;
  } catch {
    return urlStr;
  }
}
/* ------------------------ per-page enrichers ---------------------------- */
/**
 * Estimate the rendered width of `text` in pixels.
 *
 * @param {string} text - Text to measure; falsy input measures 0.
 * @param {number} [size=16] - Font size passed to the measurer.
 * @param {string} [font="arial"] - Font name passed to the measurer.
 * @returns {number} Width from `pixelWidth`, or a rough estimate of half the
 *   font size per character when the measurer throws.
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) return 0;
  let width;
  try {
    width = pixelWidth(text, { font, size });
  } catch {
    width = Math.round(text.length * size * 0.5);
  }
  return width;
}
/**
 * Compute readability scores for a block of text.
 *
 * Each metric is attempted independently; one that throws is simply left out
 * of the result. Only the first 200000 characters are analysed.
 *
 * @param {string} text - Text to score; falsy input yields `{}`.
 * @returns {object} Scores keyed by report column name.
 */
function computeReadability(text) {
  if (!text) return {};
  const sample = text.slice(0, 200000); // cap
  const metrics = [
    ["flesch_reading_ease", "fleschReadingEase"],
    ["flesch_kincaid_grade", "fleschKincaidGrade"],
    ["gunning_fog", "gunningFog"],
    ["coleman_liau", "colemanLiauIndex"],
    ["ari", "automatedReadabilityIndex"],
    ["smog", "smogIndex"],
  ];
  const scores = {};
  for (const [column, method] of metrics) {
    try {
      scores[column] = readability[method](sample);
    } catch {
      // this metric could not be computed; leave the column out
    }
  }
  return scores;
}
/* -------------------------------- main ---------------------------------- */
// async function crawl(startUrl, maxPages = 50) {
/**
 * Crawl a site starting at `startUrl` and write the page, link and error reports.
 *
 * The queue is seeded with the start URL plus whatever `getSitemapUrls`
 * returns. Each URL is fetched over plain HTTP first and rendered in a
 * headless browser only when `shouldRender` says the static HTML looks
 * incomplete. Rows accumulate in the module-level `results` array and are
 * annotated with inlink counts and duplicate/similarity columns before the
 * report writers run.
 *
 * Module-level crawl state is cleared at the start of every call, and the
 * shared browser (if one was launched) is closed even when the crawl throws.
 *
 * @param {string} startUrl - Entry point of the crawl.
 * @param {number} [maxPages=50] - Stop once this many URLs have been visited.
 * @returns {Promise<void>}
 */
export async function crawl(startUrl, maxPages = 50) {
  // Start every run from a clean slate so repeated calls in one process do
  // not mix pages, edges or referrers from a previous crawl.
  visited.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;
  referrers.clear();

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      addEdge("sitemap.xml", u, u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is launched lazily, only if some page actually needs rendering.
  let shared = null;
  async function getShared() {
    if (!shared) shared = await createBrowserContext();
    return shared;
  }

  let homeTitle = null;
  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;

      // A single malformed queued URL must not abort the whole crawl.
      let normUrl;
      try {
        normUrl = normalizeUrl(url, { stripHash: true });
      } catch (err) {
        console.error(`[ERROR] ${url} -> ${err.message}`);
        continue;
      }
      if (visited.has(normUrl)) continue;
      visited.add(normUrl);

      const attemptUrls = [normUrl];
      let usedWWWRetry = false;
      for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
        const currentUrl = attemptUrls[attempt];
        try {
          // 1) HTTP fetch
          let pageRes = await fetchWithGot(currentUrl);
          const isHtml = pageRes.contentType.includes("text/html");
          let parsed = {
            title: "", metaDesc: "", h1_1: "", h1_2: "", h2_1: "", h2_2: "",
            totalHeadings: 0, canonical: "", robotsMeta: "", noindex: false, nofollow: false,
            internalLinks: new Set(), rawLinks: [],
            lang: "", wordCount: 0, bodyText: "", schemaTypes: [],
          };
          if (isHtml) {
            parsed = { ...parsed, ...parseHtml(pageRes.body || "", currentUrl) };
          }
          if (!homeTitle && new URL(currentUrl).pathname === "/") {
            homeTitle = parsed.title || "";
          }

          // 2) Render if needed
          if (isHtml && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
            const s = await getShared();
            const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
            if (rendered.domExtract) {
              pageRes = { ...rendered, body: null };
              parsed = {
                ...parsed,
                title: rendered.domExtract.title,
                metaDesc: rendered.domExtract.metaDesc,
                h1_1: rendered.domExtract.h1_1,
                h1_2: rendered.domExtract.h1_2,
                h2_1: rendered.domExtract.h2_1,
                h2_2: rendered.domExtract.h2_2,
                totalHeadings: rendered.domExtract.totalHeadings,
                canonical: rendered.domExtract.canonical,
                robotsMeta: rendered.domExtract.robotsMeta,
                noindex: rendered.domExtract.noindex,
                nofollow: rendered.domExtract.nofollow,
                internalLinks: rendered.domExtract.links,
                rawLinks: rendered.domExtract.rawLinks,
                lang: rendered.domExtract.lang || parsed.lang,
                bodyText: rendered.domExtract.bodyText || parsed.bodyText,
                wordCount: (rendered.domExtract.bodyText || "").split(/\s+/).filter(Boolean).length,
                schemaTypes: rendered.domExtract.schemaTypes,
              };
            }
          }

          // If an HTML page still looks empty, try the www host once. Non-HTML
          // responses never have headings, and a host already on www. would
          // only be re-fetched unchanged, so neither triggers the retry.
          if (!usedWWWRetry && isHtml && parsed.totalHeadings === 0 && !parsed.h1_1) {
            usedWWWRetry = true;
            if (!new URL(currentUrl).hostname.startsWith("www.")) {
              attemptUrls.push(withWWW(currentUrl));
              continue;
            }
          }

          // Enqueue internal links + record provenance
          // First raw href seen for each absolute target.
          const rawByAbs = new Map();
          for (const r of parsed.rawLinks || []) {
            if (!rawByAbs.has(r.abs)) rawByAbs.set(r.abs, r.raw);
          }
          for (const link of parsed.internalLinks) {
            if (!isInternal(start, link)) continue;
            const ln = normalizeUrl(link, { stripHash: true });
            addEdge(currentUrl, rawByAbs.get(link) ?? "", ln, pageRes.render_mode);
            if (!visited.has(ln)) queue.push(ln);
          }

          // ---- Per-page metrics & enrichers ----
          const title = parsed.title || "";
          const metaDesc = parsed.metaDesc || "";
          const h1_1 = parsed.h1_1 || "";
          const h1_2 = parsed.h1_2 || "";
          const lang = parsed.lang || "";
          const bodyText = parsed.bodyText || "";
          const wordCount = parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
          const titlePx = measurePixelWidth(title, 16, "arial");
          const descPx = measurePixelWidth(metaDesc, 14, "arial");
          const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
          const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
          const read = computeReadability(bodyText);
          const headers = pageRes.headers || {};
          const xRobots = headers["x-robots-tag"] ?? "";
          const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
          const setCookie = !!headers["set-cookie"];
          const outlinks = parsed.internalLinks.size;

          // Save page row
          results.push({
            url: currentUrl,
            status: pageRes.status,
            status_text: pageRes.status_text ?? "",
            time_ms: pageRes.time_ms,
            bytes: pageRes.bytes,
            content_type: pageRes.contentType,
            http_version: pageRes.httpVersion ?? "",
            title,
            title_length: title.length,
            title_pixel_width: titlePx,
            meta_description: metaDesc,
            meta_description_length: metaDesc.length,
            meta_description_pixel_width: descPx,
            h1_1,
            h1_1_length: h1_1.length,
            h1_1_pixel_width: h1_1_px,
            h1_2,
            h1_2_length: h1_2.length,
            h1_2_pixel_width: h1_2_px,
            h2_1: parsed.h2_1 || "",
            h2_2: parsed.h2_2 || "",
            canonical: parsed.canonical,
            robots_meta: parsed.robotsMeta,
            x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
            noindex: parsed.noindex,
            nofollow: parsed.nofollow,
            lang,
            word_count: wordCount,
            flesch_reading_ease: read.flesch_reading_ease ?? "",
            flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
            gunning_fog: read.gunning_fog ?? "",
            coleman_liau: read.coleman_liau ?? "",
            ari: read.ari ?? "",
            smog: read.smog ?? "",
            schema_types: parsed.schemaTypes || [],
            inlinks: 0, // filled in after the crawl, once every edge is known
            outlinks,
            render_mode: pageRes.render_mode,
            last_modified: lastModified,
            set_cookie: setCookie,
            crawl_timestamp: new Date().toISOString(),
          });
          console.log(
            `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${title || h1_1}`
          );
          break; // success for this URL; stop attempts
        } catch (err) {
          console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
          results.push({
            url: currentUrl,
            status: null, status_text: "", time_ms: null, bytes: null, content_type: "",
            http_version: "", title: "", title_length: 0, title_pixel_width: "",
            meta_description: "", meta_description_length: 0, meta_description_pixel_width: "",
            h1_1: "", h1_1_length: 0, h1_1_pixel_width: "", h1_2: "", h1_2_length: 0, h1_2_pixel_width: "",
            h2_1: "", h2_2: "",
            canonical: "", robots_meta: "", x_robots_tag: "", noindex: false, nofollow: false,
            lang: "", word_count: "", flesch_reading_ease: "", flesch_kincaid_grade: "",
            gunning_fog: "", coleman_liau: "", ari: "", smog: "",
            schema_types: [], inlinks: 0, outlinks: 0, render_mode: "error",
            last_modified: "", set_cookie: "", crawl_timestamp: new Date().toISOString(),
          });
        }
      }
    }
  } finally {
    // Release the browser even if the crawl loop threw.
    if (shared) await shared.browser.close();
  }

  // Inlink counts are only complete once every page has contributed its
  // edges; counting while crawling would miss links discovered later.
  for (const r of results) {
    r.inlinks = (referrers.get(r.url) || []).length;
  }

  // -------------------- Post-process: duplicates & similarity -------------
  // Titles
  const titleMap = new Map();
  for (const r of results) {
    const key = (r.title || "").trim();
    if (!titleMap.has(key)) titleMap.set(key, []);
    titleMap.get(key).push(r);
  }
  for (const [t, arr] of titleMap.entries()) {
    if (!t) continue; // rows without a title get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
  }

  // Meta descriptions
  const descMap = new Map();
  for (const r of results) {
    const key = (r.meta_description || "").trim();
    if (!descMap.has(key)) descMap.set(key, []);
    descMap.get(key).push(r);
  }
  for (const [d, arr] of descMap.entries()) {
    if (!d) continue; // rows without a description get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
  }

  // Nearest neighbor similarities (within site, lightweight)
  const titleList = results.map((r) => ({ url: r.url, text: (r.title || "").trim() }));
  const descList = results.map((r) => ({ url: r.url, text: (r.meta_description || "").trim() }));
  for (const r of results) {
    // titles
    const others = titleList.filter((x) => x.url !== r.url && x.text);
    const bestT = { rating: 0, target: "" };
    if (r.title && others.length) {
      const ratings = stringSimilarity.findBestMatch(r.title, others.map((x) => x.text));
      const best = ratings.bestMatch;
      bestT.rating = best.rating;
      const idx = ratings.ratings.findIndex((x) => x.rating === best.rating);
      bestT.target = others[idx]?.url || "";
    }
    r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
    r.nearest_title_url = bestT.target;

    // descriptions
    const othersD = descList.filter((x) => x.url !== r.url && x.text);
    const bestD = { rating: 0, target: "" };
    if (r.meta_description && othersD.length) {
      const ratingsD = stringSimilarity.findBestMatch(r.meta_description, othersD.map((x) => x.text));
      const best = ratingsD.bestMatch;
      bestD.rating = best.rating;
      const idx = ratingsD.ratings.findIndex((x) => x.rating === best.rating);
      bestD.target = othersD[idx]?.url || "";
    }
    r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
    r.nearest_description_url = bestD.target;
  }

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);
  writePageReports(results);
  writeLinkEdges(edges);
  writeErrors(results);
}
// // CLI: node crawler.js https://site.com 200
// const START_URL = process.argv[2] || "https://example.com";
// const MAX_PAGES = Number(process.argv[3] || 100);
// crawl(START_URL, MAX_PAGES);

921
crawler.js Normal file
View File

@ -0,0 +1,921 @@
// crawler.js
import got from "got";
import * as cheerio from "cheerio";
import normalizeUrl from "normalize-url";
import { isInternal } from "./utils/urlHelpers.js";
import { getSitemapUrls } from "./utils/sitemap.js";
import fs from "node:fs";
import path from "node:path";
import { chromium } from "playwright";
// NEW libs
import pixelWidth from "string-pixel-width";
import * as readability from "text-readability";
import stringSimilarity from "string-similarity";
/* ------------------------------ globals --------------------------------- */
// NOTE: We'll reset these at the start of crawl() so repeated runs don't share state.
// Normalized URLs that have already been taken off the queue for processing.
const visited = new Set();
// URLs waiting to be crawled, consumed first-in first-out.
const queue = [];
// One report row per crawled (or failed) URL.
const results = [];
// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }
// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>
// Desktop Chrome user agent sent on HTTP requests and used for the browser context.
const REAL_UA =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// Browser-like request headers shared by the HTTP fetcher and the browser context.
const REAL_HEADERS = {
"user-agent": REAL_UA,
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"accept-language": "en-US,en;q=0.9",
"upgrade-insecure-requests": "1",
};
/* ------------------------------ utils ----------------------------------- */
/**
 * Render one value as a CSV field.
 *
 * Fields containing a double quote, comma, carriage return or line feed are
 * wrapped in double quotes with embedded quotes doubled. A bare `\r` is
 * treated like `\n` because CSV readers take either as a record break.
 *
 * @param {*} v - Value to render; `null`/`undefined` become an empty field.
 * @returns {string} The escaped field text.
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
/**
 * Make sure `dir` exists as a directory, creating missing parents as needed.
 *
 * A recursive `mkdirSync` already succeeds when the directory exists, so no
 * separate existence check (and its check-then-create race) is needed. If the
 * path exists but is not a directory, this throws instead of silently
 * accepting it.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Persist the page rows under `reports/` as a timestamped JSON file and,
 * optionally, as CSV.
 *
 * CSV output is opt-in, so by default no CSV lines are built only to be
 * thrown away.
 *
 * @param {object[]} rows - Page rows produced by the crawl.
 * @param {{csv?: boolean}} [options] - Pass `{ csv: true }` to also write
 *   `crawl-<stamp>.csv` next to the JSON file.
 * @returns {{json: string, csv?: string}} Absolute paths of the files written.
 */
function writePageReports(rows, { csv = false } = {}) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);
  fs.writeFileSync(`${base}.json`, JSON.stringify(rows, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);

  const files = { json: path.resolve(`${base}.json`) };
  if (!csv) return files;

  // Columns (a Screaming-Frog-ish shape with our extras); each name is also
  // the row property it is read from.
  const headers = [
    "url",
    "status",
    "status_text",
    "time_ms",
    "bytes",
    "content_type",
    "http_version",
    "title",
    "title_length",
    "title_pixel_width",
    "meta_description",
    "meta_description_length",
    "meta_description_pixel_width",
    "h1_1",
    "h1_1_length",
    "h1_1_pixel_width",
    "h1_2",
    "h1_2_length",
    "h1_2_pixel_width",
    "h2_1",
    "h2_2",
    "canonical",
    "robots_meta",
    "x_robots_tag",
    "noindex",
    "nofollow",
    "lang",
    "word_count",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "coleman_liau",
    "ari",
    "smog",
    "schema_types",
    "inlinks",
    "outlinks",
    "render_mode",
    "last_modified",
    "set_cookie",
    "crawl_timestamp",
    "duplicate_title_exact",
    "nearest_title_similarity",
    "nearest_title_url",
    "duplicate_description_exact",
    "nearest_description_similarity",
    "nearest_description_url",
  ];
  // Columns that are not a plain property read.
  const special = {
    schema_types: (r) => (Array.isArray(r.schema_types) ? r.schema_types.join("|") : ""),
    set_cookie: (r) => (r.set_cookie ? "yes" : "no"),
  };
  // Numeric columns that show 0 rather than an empty cell when missing.
  const zeroWhenMissing = new Set(["h1_1_length", "h1_2_length", "inlinks", "outlinks"]);
  const cell = (r, column) => {
    if (special[column]) return special[column](r);
    const value = r[column];
    return value == null && zeroWhenMissing.has(column) ? 0 : value;
  };

  const lines = [headers.join(",")];
  for (const r of rows) {
    lines.push(headers.map((column) => csvEscape(cell(r, column))).join(","));
  }
  fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  console.log(`📝 CSV report saved: ${base}.csv`);
  files.csv = path.resolve(`${base}.csv`);
  return files;
}
/**
 * Write every discovered link edge to a timestamped CSV under `reports/`.
 *
 * @param {Array<{from: string, raw_href: string, to: string, discovered_by: string}>} edges
 *   Edges to persist.
 * @returns {{linksCsv: string}} Absolute path of the CSV written.
 */
function writeLinkEdges(edges) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const columns = ["from", "raw_href", "to", "discovered_by"];
  const toLine = (edge) =>
    [edge.from, edge.raw_href, edge.to, edge.discovered_by].map(csvEscape).join(",");
  const content = [columns.join(","), ...edges.map(toLine)].join("\n");

  fs.writeFileSync(file, content, "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
  return { linksCsv: path.resolve(file) };
}
/**
 * Write a CSV of pages whose HTTP status is 400 or above, one line per
 * referrer (or a single line with blank referrer columns when none is known).
 *
 * Rows whose status is `null` are not included.
 *
 * @param {object[]} rows - Page rows produced by the crawl.
 * @returns {{errorsCsv: string}} Absolute path of the CSV written.
 */
function writeErrors(rows) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);

  const columns = ["url", "status", "title", "from_page", "raw_href", "discovered_by"];
  const unknownSource = { from: "", raw_href: "", discovered_by: "" };

  const body = rows
    .filter((r) => r && r.status !== null && r.status >= 400)
    .flatMap((r) => {
      const refs = referrers.get(r.url) || [];
      const sources = refs.length === 0 ? [unknownSource] : refs;
      return sources.map((ref) =>
        [r.url, r.status, r.title, ref.from, ref.raw_href, ref.discovered_by]
          .map(csvEscape)
          .join(",")
      );
    });

  fs.writeFileSync(file, [columns.join(","), ...body].join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
  return { errorsCsv: path.resolve(file) };
}
/**
 * Record that `to` was discovered from `from`, both in the flat edge list and
 * in the per-target referrer map.
 *
 * @param {string} from - Source of the link.
 * @param {string} rawHref - Href as written in the source; falsy becomes "".
 * @param {string} to - Target URL.
 * @param {string} discovered_by - How the link was found.
 * @returns {void}
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  if (!referrers.has(to)) {
    referrers.set(to, []);
  }
  const incoming = referrers.get(to);
  incoming.push({ from, raw_href, discovered_by });
}
/* ---------------------- parse HTML without JS --------------------------- */
/**
 * Parse JSON text, reporting failure as `null` instead of throwing.
 *
 * @param {string} txt - Candidate JSON text.
 * @returns {*} Parsed value, or `null` if `txt` is not valid JSON.
 */
function safeJsonParse(txt) {
  try {
    const parsed = JSON.parse(txt);
    return parsed;
  } catch {
    // fall through: malformed input is reported as null
  }
  return null;
}
/**
 * Gather the unique `@type` names found in the document's JSON-LD blocks.
 *
 * Blocks that `safeJsonParse` cannot parse (or that parse to a falsy value)
 * contribute nothing. Nested objects and arrays are walked recursively.
 *
 * @param {Function} $ - Selector function for the loaded document (cheerio-style).
 * @returns {string[]} Type names, de-duplicated, in first-seen order.
 */
function parseSchemaTypes($) {
  const found = new Set();

  function walk(value) {
    if (!value || typeof value !== "object") return;
    if (Array.isArray(value)) {
      for (const entry of value) walk(entry);
      return;
    }
    // `@type` is either one name or a list of names.
    const declared = value["@type"];
    const candidates = Array.isArray(declared) ? declared : [declared];
    for (const candidate of candidates) {
      if (typeof candidate === "string") found.add(candidate);
    }
    // nested
    for (const inner of Object.values(value)) walk(inner);
  }

  $('script[type="application/ld+json"]').each((_, node) => {
    const json = safeJsonParse($(node).contents().text());
    if (json) walk(json);
  });

  return Array.from(found);
}
/**
 * Pull SEO fields out of static HTML (no JavaScript executed).
 *
 * @param {string} html - Raw markup.
 * @param {string} url - Page URL, used as the base when resolving hrefs.
 * @returns {object} Title, meta description, first two H1/H2 texts, heading
 *   count, canonical, robots directives, absolute link targets
 *   (`internalLinks`) and their raw hrefs (`rawLinks`), lang, word count,
 *   JSON-LD types and the whitespace-collapsed body text.
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);
  const readAttr = (selector, name) => $(selector).attr(name) || "";
  const readTexts = (selector) =>
    $(selector)
      .map((_, node) => $(node).text().trim())
      .get();

  // Headings (capture top two H1s and H2s)
  const [h1_1 = "", h1_2 = ""] = readTexts("h1");
  const [h2_1 = "", h2_2 = ""] = readTexts("h2");
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  // Title: <title>, else og:title, else twitter:title, else the first H1.
  const ogTitle = readAttr('meta[property="og:title"]', "content");
  const twTitle = readAttr('meta[name="twitter:title"]', "content");
  const ownTitle = ($("title").first().text() || "").trim();
  const title = ownTitle ? ownTitle : (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = readAttr('meta[name="description"]', "content").trim();
  const canonical = readAttr('link[rel="canonical"]', "href").trim();
  const robotsMeta = readAttr('meta[name="robots"]', "content").trim();
  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
  const lang = readAttr("html", "lang").trim();

  // Basic text body for word count / readability
  const bodyText = ($("main").text() || $("body").text() || "").replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // All resolvable link targets + their raw hrefs (internal filtering is not done here)
  const internalLinks = new Set();
  const rawLinks = [];
  for (const anchor of $("a[href]").get()) {
    const href = $(anchor).attr("href");
    if (!href) continue;
    let abs;
    try {
      abs = new URL(href, url).toString();
    } catch {
      continue; // skip hrefs that cannot be resolved
    }
    rawLinks.push({ raw: href, abs });
    internalLinks.add(abs);
  }

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1,
    h1_2,
    h2_1,
    h2_2,
    totalHeadings,
    canonical,
    robotsMeta,
    noindex,
    nofollow,
    internalLinks,
    rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText,
  };
}
/* ------------------------------ fetchers -------------------------------- */
/**
 * Request `url` over plain HTTP using browser-like headers.
 *
 * Error statuses are returned rather than thrown (`throwHttpErrors: false`).
 *
 * @param {string} url - Address to request.
 * @returns {Promise<object>} Status, status text, elapsed milliseconds,
 *   lower-cased content type, body, byte size, `render_mode: "http"`,
 *   HTTP version and the response headers.
 */
async function fetchWithGot(url) {
  const began = Date.now();
  const reply = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false,
  });
  const tookMs = Date.now() - began;

  const { headers, body } = reply;

  // Prefer the declared Content-Length; fall back to measuring the body.
  const declared = headers["content-length"];
  let bytes;
  if (declared) {
    bytes = Number(declared);
  } else {
    bytes = Buffer.byteLength(body || "", "utf8");
  }

  return {
    status: reply.statusCode ?? null,
    status_text: reply.statusMessage ?? "",
    time_ms: tookMs,
    contentType: (headers["content-type"] || "").toLowerCase(),
    body,
    bytes,
    render_mode: "http",
    httpVersion: reply.httpVersion ?? "",
    headers,
  };
}
/**
 * Launch headless Chromium and create one browser context configured to look
 * like a desktop Chrome session (user agent, viewport, locale, headers).
 *
 * If context creation or the init script fails, the launched browser is closed
 * before the error is rethrown so no browser process is left behind.
 *
 * @returns {Promise<{browser: object, context: object}>} The browser and its context.
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });
  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });
    // Override navigator properties that pages commonly inspect to detect automation.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });
    return { browser, context };
  } catch (err) {
    // Best-effort cleanup; the original failure is what the caller needs to see.
    await browser.close().catch(() => {});
    throw err;
  }
}
/**
 * Render `url` in a fresh page of the shared browser context and read SEO
 * fields from the resulting DOM.
 *
 * The page is closed in all cases, including when navigation or extraction throws.
 *
 * @param {string} url - Page to render.
 * @param {{context: object}} shared - Holder of the browser context used to open the page.
 * @returns {Promise<object>} Response metadata (`status`, `status_text`,
 *   `time_ms`, `contentType`, `bytes`, `render_mode: "rendered"`, `headers`)
 *   plus `domExtract` with the extracted fields.
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const startedAt = Date.now();
  let status = null;
  let statusText = "";
  let mainHeaders = {};

  // Runs a best-effort step: a failure or timeout is ignored so extraction
  // proceeds with whatever the page contains.
  const attempt = async (step) => {
    try {
      await step();
    } catch {}
  };

  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    status = resp?.status() ?? null;
    statusText = resp?.statusText() ?? "";

    await attempt(async () => {
      mainHeaders = resp ? await resp.headers() : {};
    });
    await attempt(() => page.waitForLoadState("networkidle", { timeout: 12000 }));
    // Wait until the page shows some real content: enough text or a heading-like element.
    await attempt(() =>
      page.waitForFunction(
        () => {
          const main = document.querySelector("main") || document.body;
          const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
          const hasHeading = !!document.querySelector(
            "h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']"
          );
          return textLen > 160 || hasHeading;
        },
        { timeout: 8000 }
      )
    );

    // Executed inside the page; only serializable data comes back.
    const dom = await page.evaluate(() => {
      const squash = (s) => (s || "").replace(/\s+/g, " ").trim();
      const textsOf = (selector) => {
        const found = [];
        for (const el of Array.from(document.querySelectorAll(selector))) {
          const text = squash(el.textContent);
          if (text) found.push(text);
        }
        return found;
      };
      const contentOf = (selector) => document.querySelector(selector)?.content || "";

      const links = [];
      for (const a of Array.from(document.querySelectorAll("a[href]"))) {
        const raw = a.getAttribute("href");
        try {
          links.push({ raw, abs: new URL(raw, location.href).toString() });
        } catch {
          // unresolvable href: skip it
        }
      }

      const root = document.querySelector("main") || document.body;
      const ldJson = document.querySelectorAll('script[type="application/ld+json"]');

      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title: document.title || "",
        ogTitle: contentOf('meta[property="og:title"]'),
        twTitle: contentOf('meta[name="twitter:title"]'),
        metaDesc: contentOf('meta[name="description"]'),
        canonical: document.querySelector('link[rel="canonical"]')?.href || "",
        robotsMeta: contentOf('meta[name="robots"]'),
        lang: document.documentElement.getAttribute("lang") || "",
        h1: textsOf("h1"),
        h2: textsOf("h2"),
        totalHeadings: document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length,
        links,
        bodyText: squash(root.innerText || ""),
        schemaScripts: Array.from(ldJson).map((s) => s.textContent || ""),
      };
    });

    // Parse schema types from strings (outside of page)
    const typeNames = [];
    const walk = (node) => {
      if (!node || typeof node !== "object") return;
      if (Array.isArray(node)) {
        for (const item of node) walk(item);
        return;
      }
      const declared = node["@type"];
      const names = Array.isArray(declared) ? declared : [declared];
      for (const name of names) {
        if (typeof name === "string") typeNames.push(name);
      }
      for (const child of Object.values(node)) walk(child);
    };
    for (const raw of dom.schemaScripts || []) {
      try {
        walk(JSON.parse(raw));
      } catch {
        // invalid JSON-LD: ignore this script
      }
    }

    const elapsedMs = Date.now() - startedAt;
    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    const h1List = dom.h1;
    const h2List = dom.h2;
    const linkList = dom.links || [];
    // Title fallback chain: document title, OpenGraph, Twitter card, first H1.
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || h1List?.[0] || "").trim();

    const domExtract = {
      title: finalTitle,
      metaDesc: dom.metaDesc || "",
      canonical: dom.canonical || "",
      robotsMeta: dom.robotsMeta || "",
      lang: dom.lang || "",
      noindex,
      nofollow,
      h1_1: h1List?.[0] || "",
      h1_2: h1List?.[1] || "",
      h2_1: h2List?.[0] || "",
      h2_2: h2List?.[1] || "",
      totalHeadings: dom.totalHeadings || 0,
      links: new Set(linkList.map((l) => l.abs)),
      rawLinks: linkList,
      bodyText: dom.bodyText || "",
      schemaTypes: [...new Set(typeNames)],
    };

    return {
      status,
      status_text: statusText,
      time_ms: elapsedMs,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract,
    };
  } finally {
    await page.close();
  }
}
/* ------------------------- render decision ------------------------------ */
/**
 * Tell whether the static HTML looks incomplete enough to justify a browser render.
 *
 * A render is requested for responses under 4000 bytes, for pages with no
 * headings, and for non-root paths whose title equals the home page title.
 *
 * @param {string} currentUrl - URL that was fetched (must be parseable).
 * @param {{bytes?: number}} httpRes - Result of the HTTP fetch.
 * @param {{totalHeadings: number, title: string}} parsed - Fields parsed from the static HTML.
 * @param {string|null} homeTitle - Title recorded for the site root, if known.
 * @returns {boolean} `true` when the page should be rendered.
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const location = new URL(currentUrl);
  const looksLikeShell = (httpRes.bytes ?? 0) < 4000; // tiny HTML shell
  if (looksLikeShell || parsed.totalHeadings === 0) {
    return true;
  }
  if (!homeTitle || !parsed.title) {
    return false;
  }
  return parsed.title === homeTitle && location.pathname !== "/";
}
/**
 * Produce the `www.` variant of a URL.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string} Serialized URL whose hostname starts with `www.`; the
 *   input string itself when it cannot be parsed as a URL.
 */
function withWWW(urlStr) {
  try {
    const target = new URL(urlStr);
    const host = target.hostname;
    if (host.startsWith("www.")) {
      return target.href;
    }
    target.hostname = "www." + host;
    return target.href;
  } catch {
    return urlStr;
  }
}
/* ------------------------ per-page enrichers ---------------------------- */
/**
 * Measure how wide `text` renders, in pixels.
 *
 * @param {string} text - Text to measure; falsy input measures 0.
 * @param {number} [size=16] - Font size handed to the measurer.
 * @param {string} [font="arial"] - Font name handed to the measurer.
 * @returns {number} Width reported by `pixelWidth`; if that call throws, a
 *   rough estimate of half the font size per character, rounded.
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) {
    return 0;
  }
  const options = { font, size };
  let measured;
  try {
    measured = pixelWidth(text, options);
  } catch {
    measured = Math.round(text.length * size * 0.5);
  }
  return measured;
}
/**
 * Score text with several readability formulas.
 *
 * Metrics are computed one by one; any that throws is omitted from the result.
 * Input is capped at 200000 characters.
 *
 * @param {string} text - Text to score; falsy input yields `{}`.
 * @returns {object} Scores keyed by report column name.
 */
function computeReadability(text) {
  if (!text) return {};
  const capped = text.slice(0, 200000); // cap

  // report column -> method on the readability library
  const formulas = {
    flesch_reading_ease: "fleschReadingEase",
    flesch_kincaid_grade: "fleschKincaidGrade",
    gunning_fog: "gunningFog",
    coleman_liau: "colemanLiauIndex",
    ari: "automatedReadabilityIndex",
    smog: "smogIndex",
  };

  const out = {};
  for (const [column, method] of Object.entries(formulas)) {
    try {
      out[column] = readability[method](capped);
    } catch {
      // leave this column unset when the formula cannot be computed
    }
  }
  return out;
}
/* -------------------------------- main ---------------------------------- */
/**
 * Crawl a site and return a structured report.
 *
 * Every URL is fetched over plain HTTP first and re-fetched through a lazily
 * created shared browser context only when `shouldRender` asks for it. HTML
 * pages that still show no headings are retried once on the "www." host; if
 * that retry fails, the data from the first attempt is kept.
 *
 * The crawl works on the module-level `visited`, `queue`, `results`, `edges`
 * and `referrers` collections, which are reset at the start of every call, so
 * overlapping calls would interfere with each other.
 *
 * @param {string} startUrl
 * @param {number} maxPages
 * @param {(tick:any)=>void} [onProgress] optional callback for progress events
 * @param {{persistReports?: boolean, collectPages?: boolean}} [options]
 *   `persistReports` (default true) controls whether report files are written;
 *   `collectPages` is accepted but not read by this function.
 * @returns {Promise<{ results: any[], files: Record<string,string>, total: number }>}
 */
export async function crawl(startUrl, maxPages = 50, onProgress, options = {}) {
  const persistReports = options.persistReports !== false; // default true

  // Reset global state per run
  visited.clear();
  queue.length = 0;
  results.length = 0;
  edges.length = 0;
  referrers.clear();

  const start = normalizeUrl(startUrl, { stripHash: true });
  queue.push(start);

  // Seed from sitemap.xml + record provenance
  try {
    const sitemapUrls = await getSitemapUrls(start);
    for (const u of sitemapUrls) {
      queue.push(u);
      addEdge("sitemap.xml", u, u, "sitemap");
    }
    console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
  } catch (e) {
    console.log("⚠️ Sitemap step skipped:", e.message);
  }

  // The browser is only launched if some page actually needs rendering.
  let shared = null;
  async function getShared() {
    if (!shared) shared = await createBrowserContext();
    return shared;
  }

  let homeTitle = null;

  // Progress reporting is optional and non-fatal: a throwing callback must
  // never abort the crawl.
  function notify(tick) {
    try {
      onProgress?.(tick);
    } catch {
      // deliberately ignored
    }
  }

  // Enqueue the page's internal links, compute its metrics and append its row.
  function recordPage(currentUrl, pageRes, parsed) {
    // Enqueue internal links + record provenance
    for (const link of parsed.internalLinks) {
      if (isInternal(start, link)) {
        const ln = normalizeUrl(link, { stripHash: true });
        const rawMatch = (parsed.rawLinks || []).find((r) => r.abs === link)?.raw ?? "";
        addEdge(currentUrl, rawMatch, ln, pageRes.render_mode);
        if (!visited.has(ln)) queue.push(ln);
      }
    }

    // ---- Per-page metrics & enrichers ----
    const title = parsed.title || "";
    const metaDesc = parsed.metaDesc || "";
    const h1_1 = parsed.h1_1 || "";
    const h1_2 = parsed.h1_2 || "";
    const lang = parsed.lang || "";
    const bodyText = parsed.bodyText || "";
    const wordCount =
      parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
    const titlePx = measurePixelWidth(title, 16, "arial");
    const descPx = measurePixelWidth(metaDesc, 14, "arial");
    const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
    const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
    const read = computeReadability(bodyText);

    const headers = pageRes.headers || {};
    const xRobots = headers["x-robots-tag"] ?? "";
    const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
    const setCookie = !!headers["set-cookie"];
    const outlinks = parsed.internalLinks.size;
    // Only referrers discovered so far are counted.
    const inlinks = (referrers.get(currentUrl) || []).length;

    // Save page row
    results.push({
      url: currentUrl,
      status: pageRes.status,
      status_text: pageRes.status_text ?? "",
      time_ms: pageRes.time_ms,
      bytes: pageRes.bytes,
      content_type: pageRes.contentType,
      http_version: pageRes.httpVersion ?? "",
      title,
      title_length: title.length,
      title_pixel_width: titlePx,
      meta_description: metaDesc,
      meta_description_length: metaDesc.length,
      meta_description_pixel_width: descPx,
      h1_1,
      h1_1_length: h1_1.length,
      h1_1_pixel_width: h1_1_px,
      h1_2,
      h1_2_length: h1_2.length,
      h1_2_pixel_width: h1_2_px,
      h2_1: parsed.h2_1 || "",
      h2_2: parsed.h2_2 || "",
      canonical: parsed.canonical,
      robots_meta: parsed.robotsMeta,
      x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
      noindex: parsed.noindex,
      nofollow: parsed.nofollow,
      lang,
      word_count: wordCount,
      flesch_reading_ease: read.flesch_reading_ease ?? "",
      flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
      gunning_fog: read.gunning_fog ?? "",
      coleman_liau: read.coleman_liau ?? "",
      ari: read.ari ?? "",
      smog: read.smog ?? "",
      schema_types: parsed.schemaTypes || [],
      inlinks,
      outlinks,
      render_mode: pageRes.render_mode,
      last_modified: lastModified,
      set_cookie: setCookie,
      crawl_timestamp: new Date().toISOString(),
    });

    console.log(
      `[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${title || h1_1}`
    );

    notify({
      url: currentUrl,
      status: pageRes.status,
      title,
      inlinks,
      outlinks,
      visited: visited.size,
      queued: queue.length,
    });
  }

  // Append a placeholder row for a URL that could not be processed.
  function recordError(currentUrl, err) {
    console.error(`[ERROR] ${currentUrl} -> ${err?.message}`);
    results.push({
      url: currentUrl,
      status: null,
      status_text: "",
      time_ms: null,
      bytes: null,
      content_type: "",
      http_version: "",
      title: "",
      title_length: 0,
      title_pixel_width: "",
      meta_description: "",
      meta_description_length: 0,
      meta_description_pixel_width: "",
      h1_1: "",
      h1_1_length: 0,
      h1_1_pixel_width: "",
      h1_2: "",
      h1_2_length: 0,
      h1_2_pixel_width: "",
      h2_1: "",
      h2_2: "",
      canonical: "",
      robots_meta: "",
      x_robots_tag: "",
      noindex: false,
      nofollow: false,
      lang: "",
      word_count: "",
      flesch_reading_ease: "",
      flesch_kincaid_grade: "",
      gunning_fog: "",
      coleman_liau: "",
      ari: "",
      smog: "",
      schema_types: [],
      inlinks: 0,
      outlinks: 0,
      render_mode: "error",
      last_modified: "",
      set_cookie: "",
      crawl_timestamp: new Date().toISOString(),
    });
    notify({
      url: currentUrl,
      error: String(err?.message || err),
      visited: visited.size,
      queued: queue.length,
    });
  }

  try {
    while (queue.length > 0 && visited.size < maxPages) {
      const url = queue.shift();
      if (!url) continue;
      const normUrl = normalizeUrl(url, { stripHash: true });
      if (visited.has(normUrl)) continue;
      visited.add(normUrl);

      const attemptUrls = [normUrl];
      let usedWWWRetry = false;
      // Data of the first attempt, kept while the www retry is in flight.
      let firstAttempt = null;

      for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
        const currentUrl = attemptUrls[attempt];
        try {
          // 1) HTTP fetch
          let pageRes = await fetchWithGot(currentUrl);
          let parsed = {
            title: "",
            metaDesc: "",
            h1_1: "",
            h1_2: "",
            h2_1: "",
            h2_2: "",
            totalHeadings: 0,
            canonical: "",
            robotsMeta: "",
            noindex: false,
            nofollow: false,
            internalLinks: new Set(),
            rawLinks: [],
            lang: "",
            wordCount: 0,
            bodyText: "",
            schemaTypes: [],
          };
          const isHtml = pageRes.contentType.includes("text/html");
          if (isHtml) {
            const p = parseHtml(pageRes.body || "", currentUrl);
            parsed = { ...parsed, ...p };
          }

          const { hostname, pathname } = new URL(currentUrl);
          if (!homeTitle && pathname === "/") {
            homeTitle = parsed.title || "";
          }

          // 2) Render if needed
          if (isHtml && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
            const s = await getShared();
            const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
            if (rendered.domExtract) {
              pageRes = { ...rendered, body: null };
              parsed = {
                ...parsed,
                title: rendered.domExtract.title,
                metaDesc: rendered.domExtract.metaDesc,
                h1_1: rendered.domExtract.h1_1,
                h1_2: rendered.domExtract.h1_2,
                h2_1: rendered.domExtract.h2_1,
                h2_2: rendered.domExtract.h2_2,
                totalHeadings: rendered.domExtract.totalHeadings,
                canonical: rendered.domExtract.canonical,
                robotsMeta: rendered.domExtract.robotsMeta,
                noindex: rendered.domExtract.noindex,
                nofollow: rendered.domExtract.nofollow,
                internalLinks: rendered.domExtract.links,
                rawLinks: rendered.domExtract.rawLinks,
                lang: rendered.domExtract.lang || parsed.lang,
                bodyText: rendered.domExtract.bodyText || parsed.bodyText,
                wordCount: (rendered.domExtract.bodyText || "")
                  .split(/\s+/)
                  .filter(Boolean).length,
                schemaTypes: rendered.domExtract.schemaTypes,
              };
            }
          }

          // If an HTML page still looks empty, try the www host once. Non-HTML
          // resources never have headings, and a host that already starts with
          // "www." would only be fetched a second time, so both are excluded.
          const looksEmpty = parsed.totalHeadings === 0 && !parsed.h1_1;
          if (!usedWWWRetry && isHtml && looksEmpty && !hostname.startsWith("www.")) {
            firstAttempt = { url: currentUrl, pageRes, parsed };
            attemptUrls.push(withWWW(currentUrl));
            usedWWWRetry = true;
            continue;
          }

          recordPage(currentUrl, pageRes, parsed);
          break; // success for this URL; stop attempts
        } catch (err) {
          if (firstAttempt) {
            // The www retry failed: keep what the original URL returned rather
            // than replacing a fetched page with an error row for a URL that
            // nobody linked to.
            console.error(`[ERROR] ${currentUrl} -> ${err?.message} (keeping ${firstAttempt.url})`);
            const kept = firstAttempt;
            firstAttempt = null;
            try {
              recordPage(kept.url, kept.pageRes, kept.parsed);
            } catch (keptErr) {
              recordError(kept.url, keptErr);
            }
          } else {
            recordError(currentUrl, err);
          }
        }
      }
    }
  } finally {
    // Always release the browser, even if an unexpected error escapes the loop
    // (for example normalizeUrl rejecting a queued URL).
    if (shared) await shared.browser.close();
  }

  // -------------------- Post-process: duplicates & similarity -------------
  // Titles
  const titleMap = new Map();
  for (const r of results) {
    const key = (r.title || "").trim();
    if (!titleMap.has(key)) titleMap.set(key, []);
    titleMap.get(key).push(r);
  }
  for (const [t, arr] of titleMap.entries()) {
    if (!t) continue; // rows without a title get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
  }

  // Meta descriptions
  const descMap = new Map();
  for (const r of results) {
    const key = (r.meta_description || "").trim();
    if (!descMap.has(key)) descMap.set(key, []);
    descMap.get(key).push(r);
  }
  for (const [d, arr] of descMap.entries()) {
    if (!d) continue; // rows without a description get no duplicate flag
    const isDup = arr.length > 1;
    for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
  }

  // Nearest neighbor similarities (within site, lightweight)
  const titleList = results.map((r) => ({ url: r.url, text: (r.title || "").trim() }));
  const descList = results.map((r) => ({ url: r.url, text: (r.meta_description || "").trim() }));
  for (const r of results) {
    // titles
    const others = titleList.filter((x) => x.url !== r.url && x.text);
    let bestT = { rating: 0, target: "" };
    if (r.title && others.length) {
      const ratings = stringSimilarity.findBestMatch(r.title, others.map((x) => x.text));
      const best = ratings.bestMatch;
      bestT.rating = best.rating;
      const idx = ratings.ratings.findIndex((x) => x.rating === best.rating);
      bestT.target = others[idx]?.url || "";
    }
    r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
    r.nearest_title_url = bestT.target;

    // descriptions
    const othersD = descList.filter((x) => x.url !== r.url && x.text);
    let bestD = { rating: 0, target: "" };
    if (r.meta_description && othersD.length) {
      const ratingsD = stringSimilarity.findBestMatch(r.meta_description, othersD.map((x) => x.text));
      const best = ratingsD.bestMatch;
      bestD.rating = best.rating;
      const idx = ratingsD.ratings.findIndex((x) => x.rating === best.rating);
      bestD.target = othersD[idx]?.url || "";
    }
    r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
    r.nearest_description_url = bestD.target;
  }

  console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);

  let files = {};
  if (persistReports) {
    const a = writePageReports(results);
    const b = writeLinkEdges(edges);
    const c = writeErrors(results);
    files = { ...a, ...b, ...c };
  }

  // Hand out a copy: the module-level array is truncated by the next run.
  return { results: [...results], files, total: results.length };
}
// // CLI: node crawler.js https://site.com 200
// const START_URL = process.argv[2] || "https://example.com";
// const MAX_PAGES = Number(process.argv[3] || 100);
// crawl(START_URL, MAX_PAGES);

View File

@ -0,0 +1,19 @@
import jwt from "jsonwebtoken";
/**
 * Express middleware guarding routes with a JWT bearer token.
 *
 * Expects `Authorization: Bearer <token>`. On success the verified payload is
 * stored on `req.user` and the chain continues; otherwise a 401 JSON response
 * is sent.
 */
export function authMiddleware(req, res, next) {
  const authorization = req.headers.authorization;
  const hasBearerScheme = authorization?.startsWith("Bearer ");
  if (!hasBearerScheme) {
    return res.status(401).json({ error: "Missing token" });
  }

  const [, token] = authorization.split(" ");
  try {
    const payload = jwt.verify(token, process.env.JWT_SECRET);
    req.user = payload;
    next();
  } catch (err) {
    console.error("JWT verification failed:", err.message);
    return res.status(401).json({ error: "Invalid or expired token" });
  }
}

View File

@ -0,0 +1,4 @@
/**
 * Express error-handling middleware: logs the error and answers with a 500
 * JSON body carrying the error message.
 *
 * If the response has already started (headers sent), nothing more can be
 * written safely, so the error is handed to Express's default handler via
 * `next(err)`.
 */
export const errorHandler = (err, req, res, next) => {
  console.error(err);
  if (res.headersSent) {
    return next(err);
  }
  res.status(500).json({ message: err?.message || 'Internal Server Error' });
};

41
models/blog.model.js Normal file
View File

@ -0,0 +1,41 @@
import mongoose from 'mongoose';
const { Schema } = mongoose;
const { ObjectId } = Schema.Types;

// Comment sub-document embedded in a blog post.
const commentSchema = new Schema({
  user: { type: ObjectId, ref: 'User' },
  name: String,
  text: { type: String, required: true },
  createdAt: { type: Date, default: Date.now },
});

// Image fields that may hold a server-relative path.
const IMAGE_FIELDS = ['imageUrl', 'bigImageUrl'];

// Blog post, scoped to a project through `projectId`.
const blogSchema = new Schema(
  {
    projectId: { type: String, required: true, index: true },
    title: { type: String, required: true },
    slug: { type: String, required: true, unique: false },
    description: { type: String, required: true },
    imageUrl: String,
    bigImageUrl: String,
    category: { type: ObjectId, ref: 'Category' },
    tags: [String],
    comments: [commentSchema],
    likes: [{ type: ObjectId, ref: 'User' }],
    author: { type: ObjectId, ref: 'User' },
  },
  { timestamps: true }
);

// A slug only has to be unique within its project.
blogSchema.index({ projectId: 1, slug: 1 }, { unique: true });

// When serialising to JSON, prefix image paths that do not already start with
// "http" with the backend base URL.
blogSchema.set('toJSON', {
  transform: (doc, ret) => {
    const baseUrl = process.env.BACKEND_URL || 'http://localhost:3010';
    for (const field of IMAGE_FIELDS) {
      const value = ret[field];
      if (value && !value.startsWith('http')) {
        ret[field] = `${baseUrl}${value}`;
      }
    }
    return ret;
  },
});

export default mongoose.model('Blog', blogSchema);

21
models/category.model.js Normal file
View File

@ -0,0 +1,21 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// Blog category belonging to a project.
// NOTE(review): `name` is unique across ALL projects, while the blog schema
// scopes slug uniqueness per project — confirm whether a compound
// { projectId, name } index was intended here.
const categoryFields = {
  name: {
    type: String,
    required: true,
    unique: true,
    trim: true,
  },
  // For multi-project support
  projectId: {
    type: String,
    required: true,
  },
};

const categorySchema = new Schema(categoryFields, { timestamps: true });

export default mongoose.model("Category", categorySchema);

10
models/comments.model.js Normal file
View File

@ -0,0 +1,10 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// Stand-alone comment document that points at the blog it belongs to.
const commentSchema = new Schema({
  blog: { type: Schema.Types.ObjectId, ref: "Blog" },
  name: String,
  text: String,
  createdAt: { type: Date, default: Date.now },
});

const Comment = mongoose.model("Comment", commentSchema);
export default Comment;

View File

@ -0,0 +1,18 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// A cake order is stored as one free-form object: category -> item -> quantity.
// Example:
// {
//   "Mini Cakes": { "Thandai Cake": 1, "Mango Cardamom": 1 },
//   "Mithai-Inspired Macarons": { "Mango macarons (pack of 6)": 1, "Pista (pack of 6)": 10 }
// }
const cakeOrderFields = {
  order: { type: Object, required: true },
};

const CakeOrderSchema = new Schema(cakeOrderFields, { timestamps: true });

export const CakeOrder = mongoose.model("CakeOrder", CakeOrderSchema);

12
models/message.model.js Normal file
View File

@ -0,0 +1,12 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// Contact message, tagged with the project it was submitted for.
const MessageSchema = new Schema(
  {
    project: { type: String, required: true },
    name: { type: String, default: "Guest" },
    email: { type: String },
    message: { type: String, required: true },
  },
  { timestamps: true }
);

// Reuse the model if it is already registered on this mongoose instance.
const Message = mongoose.models.Message ?? mongoose.model("Message", MessageSchema);

export default Message;

View File

@ -0,0 +1,42 @@
import mongoose from 'mongoose';
const { Schema } = mongoose;

// One stored page-speed audit for a URL on a given device profile.
const pageSpeedTestFields = {
  url: { type: String, required: true },
  device: { type: String, enum: ['mobile', 'desktop'], required: true },

  // Category scores
  scores: {
    performance: Number,
    accessibility: Number,
    bestPractices: Number,
    seo: Number,
    pwa: Number,
  },

  // Timing metrics, stored as strings
  metrics: {
    firstContentfulPaint: String,
    largestContentfulPaint: String,
    totalBlockingTime: String,
    timeToInteractive: String,
    speedIndex: String,
    cumulativeLayoutShift: String,
  },

  opportunities: [
    {
      title: String,
      description: String,
      estimatedSavings: String,
    },
  ],
  diagnostics: Object,
  failedAudits: [
    {
      title: String,
      description: String,
    },
  ],
  passedAudits: [String],
  notApplicableAudits: [String],

  screenshot: String,
  treemapPath: { type: String },
  createdAt: { type: Date, default: Date.now },
};

const pageSpeedTestSchema = new Schema(pageSpeedTestFields);

export default mongoose.model('PageSpeedTest', pageSpeedTestSchema);

12
models/payment.model.js Normal file
View File

@ -0,0 +1,12 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// Payment record mirroring a Stripe payment.
const paymentFields = {
  email: { type: String, required: true },
  // store in cents
  amount: { type: Number, required: true },
  currency: { type: String, default: "usd" },
  // Optional: not every record has a PaymentIntent id.
  stripePaymentIntentId: { type: String },
  // Checkout Session ID
  stripeSessionId: { type: String },
  // pending, succeeded, failed
  status: { type: String, default: "pending" },
};

const paymentSchema = new Schema(paymentFields, { timestamps: true });

export const Payment = mongoose.model("Payment", paymentSchema);

14
models/user.model.js Normal file
View File

@ -0,0 +1,14 @@
import mongoose from "mongoose";
const { Schema } = mongoose;

// Application user: credentials plus password-reset bookkeeping.
const userFields = {
  email: { type: String, required: true, unique: true, lowercase: true },
  passwordHash: { type: String, required: true },

  // Password-reset token and its expiry date.
  resetPasswordToken: { type: String },
  resetPasswordExpires: { type: Date },
};

const userSchema = new Schema(userFields, { timestamps: true });

export default mongoose.model("User", userSchema);

5093
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

36
package.json Normal file
View File

@ -0,0 +1,36 @@
{
  "name": "crawlerx",
  "version": "1.0.0",
  "type": "module",
  "main": "crawler.js",
  "scripts": {
    "start": "node server.js",
    "dev": "nodemon server.js"
  },
  "dependencies": {
    "axios": "^1.12.2",
    "bcrypt": "^6.0.0",
    "cheerio": "^1.1.0",
    "chrome-launcher": "^1.2.1",
    "cors": "^2.8.5",
    "dotenv": "^17.2.2",
    "express": "^5.1.0",
    "got": "^14.4.7",
    "jsonwebtoken": "^9.0.2",
    "lighthouse": "^12.8.2",
    "mongoose": "^8.18.1",
    "multer": "^2.0.2",
    "nodemailer": "^7.0.6",
    "normalize-url": "^8.0.2",
    "playwright": "^1.55.0",
    "sitemapper": "^3.2.7",
    "slugify": "^1.6.6",
    "string-pixel-width": "^1.11.0",
    "string-similarity": "^4.0.4",
    "stripe": "^18.5.0",
    "text-readability": "^1.1.1"
  },
  "devDependencies": {
    "nodemon": "^3.1.10"
  }
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

18
routes/auth.routes.js Normal file
View File

@ -0,0 +1,18 @@
import express from "express";
import { signup, login, changePassword, forgotPassword, resetPassword } from "../controllers/auth.controller.js";
import { authMiddleware } from "../middlewares/auth.middleware.js";
const router = express.Router();

// Example protected endpoint: echoes the token payload set by authMiddleware.
function sendProfile(req, res) {
  res.json({ user: req.user });
}

// Public endpoints
router.route("/signup").post(signup);
router.route("/login").post(login);
router.route("/forgot-password").post(forgotPassword);
router.route("/reset-password").post(resetPassword);

// Endpoints that require a valid bearer token
router.route("/change-password").post(authMiddleware, changePassword);
router.route("/profile").get(authMiddleware, sendProfile);

export default router;

54
routes/blog.routes.js Normal file
View File

@ -0,0 +1,54 @@
import express from "express";
import multer from "multer";
import {
createBlog,
getAllBlogs,
getBlogBySlug,
likeBlog,
} from "../controllers/blog.controller.js";
import {
createCategory,
getCategories,
deleteCategory,
} from "../controllers/category.controller.js";
import {
addComment as addCommentController,
getComments,
deleteComment,
} from "../controllers/comment.controller.js";
const router = express.Router();
const upload = multer({ dest: "uploads/" });

// =======================
// Category Routes
// =======================
// Registered BEFORE the blog routes: Express matches in registration order, so
// GET "/category" would otherwise be captured by GET "/:slug" and the category
// list would never be reached.
router.post("/category", createCategory); // Create Category (admin)
router.get("/category", getCategories); // List Categories
router.delete("/category/:id", deleteCategory); // Delete Category (admin)

// =======================
// Blog Routes
// =======================
// Create a blog (multipart upload, file field "image")
router.post("/", upload.single("image"), createBlog);
// Get all blogs
router.get("/", getAllBlogs);
// Get blog by slug
router.get("/:slug", getBlogBySlug);
// Like a blog
router.post("/:id/like", likeBlog);

// =======================
// Comment Routes
// =======================
router.post("/:blogId/comments", addCommentController); // Add Comment
router.get("/:blogId/comments", getComments); // Get Comments
router.delete("/:blogId/comments/:commentId", deleteComment); // Delete Comment (admin)

export default router;

6
routes/crawl.routes.js Normal file
View File

@ -0,0 +1,6 @@
import { Router } from "express";
import { crawlHandler } from "../controllers/crawl.controller.js";
// Router exposing the crawl handler at its mount root (GET only).
const router = Router();

router.route("/").get(crawlHandler);

export default router;

View File

@ -0,0 +1,8 @@
import express from 'express';
import { runAudit } from '../controllers/lighthouseController.js';
// Router exposing the audit runner: POST /audit relative to the mount point.
const router = express.Router();

router.route('/audit').post(runAudit);

export default router;

View File

@ -0,0 +1,12 @@
import express from "express";
import { createCakeOrder, getAllCakeOrders } from "../../controllers/maisondetreats/cakeOrder.controller.js";
// Cake-order endpoints, both on the router's mount root.
const router = express.Router();

router
  .route("/")
  .post(createCakeOrder) // create a new cake order
  .get(getAllCakeOrders); // list all cake orders

export default router;

12
routes/message.routes.js Normal file
View File

@ -0,0 +1,12 @@
import express from "express";
import { sendMessage, getMessages } from "../controllers/message.controller.js";
// Message endpoints, both on the router's mount root.
const router = express.Router();

router
  .route("/")
  .post(sendMessage) // save a message
  .get(getMessages); // list messages

export default router;

14
routes/payment.route.js Normal file
View File

@ -0,0 +1,14 @@
import express from "express";
import {
createPaymentIntent,
createCheckoutSession,
handleWebhook
} from "../controllers/payment.controller.js";
// Payment endpoints (all POST).
const router = express.Router();

// The webhook route installs a raw-body parser for "application/json" payloads
// ahead of its handler.
const rawJsonBody = express.raw({ type: "application/json" });

router.route("/create-intent").post(createPaymentIntent);
router.route("/create-checkout-session").post(createCheckoutSession);
router.route("/webhook").post(rawJsonBody, handleWebhook);

export default router;

6
routes/sitemap.routes.js Normal file
View File

@ -0,0 +1,6 @@
import { Router } from "express";
import { sitemapHandler } from "../controllers/sitemap.controller.js";
// Router exposing the sitemap handler at its mount root (GET only).
const router = Router();

router.route("/").get(sitemapHandler);

export default router;

237
server copy.js Normal file
View File

@ -0,0 +1,237 @@
// // server.js
// import express from "express";
// import { Queue } from "bullmq";
// import { connection } from "./redis.js";
// import crypto from "crypto";
// const app = express();
// app.use(express.json());
// const crawlQueue = new Queue("crawl", { connection });
// // Start a new crawl
// app.post("/crawl", async (req, res) => {
// const { startUrl } = req.body;
// if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });
// const crawlId = crypto.randomUUID();
// await crawlQueue.add("fetch", { crawlId, url: startUrl });
// res.json({ crawlId, message: "Crawl started" });
// });
// // (Optional) Check progress
// app.get("/status/:id", async (req, res) => {
// // For now just reply with "in progress"
// res.json({ crawlId: req.params.id, status: "in progress" });
// });
// app.listen(3000, () => {
// console.log("Crawler API running at http://localhost:3000");
// });
// // server.js
// import express from "express";
// import cors from "cors"; // ← optional but recommended
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";
// const app = express();
// const PORT = process.env.PORT || 3010;
// /* Parse JSON BEFORE any middleware that might read req.body */
// app.use(express.json());
// /* CORS (adjust origins as needed) */
// app.use(cors({
// origin: [
// "http://localhost:3000",
// "https://your-frontend.example" // ← replace or remove
// ],
// }));
// /* Safe request logger */
// app.use((req, res, next) => {
// console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
// if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
// if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
// next();
// });
// /* GET /crawl?url=https://site.com&max=50 */
// app.get("/crawl", async (req, res) => {
// try {
// const { url, max } = req.query;
// if (!url) return res.status(400).json({ error: "Missing url param" });
// // validate & normalize
// const target = new URL(String(url)); // throws if invalid
// const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);
// await crawl(target.toString(), limit);
// res.json({ ok: true, message: `Crawl started`, url: target.toString(), limit });
// } catch (err) {
// console.error("Crawl error:", err);
// res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
// }
// });
// /* Global safety nets so crashes dont become silent restart loops */
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));
// /* Bind to all interfaces so remote calls work */
// app.listen(PORT, "0.0.0.0", () => {
// console.log(`🚀 Server running at http://localhost:${PORT}`);
// });
// server.js
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { fileURLToPath } from "node:url";
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)
const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));

const PUBLIC_DIR = path.join(__dirname, "public");
const ALLOWED_ORIGINS = ["http://localhost:3000", "https://app.crawlerx.co"];

/* ------------ Middleware ------------ */
app.use(express.json());
app.use(cors({ origin: ALLOWED_ORIGINS }));
app.use(express.static(PUBLIC_DIR));

// Root: serve the bundled viewer page when it exists, otherwise a plain banner.
app.get("/", (_req, res) => {
  const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");
  if (fs.existsSync(viewer)) {
    return res.sendFile(viewer);
  }
  return res.type("text/plain").send("CrawlerX backend is running.");
});

// Liveness probe
app.get("/healthz", (_req, res) => {
  const payload = { ok: true, time: new Date().toISOString() };
  return res.json(payload);
});
/* ------------ Helpers ------------ */
// Current time as an ISO-8601 string with ":" and "." replaced by "-", so it
// can be embedded in a file name.
const ts = () => {
  const iso = new Date().toISOString();
  return iso.replace(/[:.]/g, "-");
};
/**
 * Send `obj` as a pretty-printed JSON download.
 *
 * @param {object} res response exposing setHeader() and send()
 * @param {string} filename name offered to the client for the attachment
 * @param {any} obj value to serialise
 * @returns whatever `res.send` returns
 */
function attachJson(res, filename, obj) {
  // Serialise first so a failure happens before any header is set.
  const payload = JSON.stringify(obj, null, 2);

  const headers = [
    ["Content-Type", "application/json; charset=utf-8"],
    ["Content-Disposition", `attachment; filename="${filename}"`],
  ];
  for (const [name, value] of headers) {
    res.setHeader(name, value);
  }
  return res.send(payload);
}
/**
 * Whether `p` is an absolute filesystem path. Values that `path.isAbsolute`
 * rejects (non-strings) count as "not absolute" instead of throwing.
 *
 * @param {any} p
 * @returns {boolean}
 */
function isAbs(p) {
  let absolute;
  try {
    absolute = path.isAbsolute(p);
  } catch {
    absolute = false;
  }
  return absolute;
}
/* ------------ Crawl endpoint ------------ */
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 * - stream=1 : SSE live progress (no download)
 * - download=1 : respond as a JSON download (attachment)
 * - nostore=1 : ask crawler not to write files (if supported by your crawler)
 *
 * Responds 400 for a missing, unparseable or non-http(s) `url`. `max` is
 * clamped to 1..500 and falls back to 50 when it is not a number.
 */
app.get("/crawl", async (req, res) => {
  const { url, max, stream, download, nostore } = req.query;
  if (!url) return res.status(400).json({ error: "Missing url param" });

  // A malformed URL is a client error, not a crawl failure.
  let target;
  try {
    target = new URL(String(url));
  } catch {
    return res.status(400).json({ error: "Invalid url param" });
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return res.status(400).json({ error: "Only http(s) URLs can be crawled" });
  }

  // A non-numeric `max` parses to NaN, which would survive the clamp and make
  // the crawl visit nothing; fall back to the default instead.
  const requested = Number.parseInt(max ?? "50", 10);
  const limit = Math.min(Math.max(Number.isNaN(requested) ? 50 : requested, 1), 500);

  const wantsStream =
    String(stream) === "1" ||
    (req.get("accept") || "").includes("text/event-stream");

  /* ---------- SSE mode ---------- */
  if (wantsStream) {
    if (String(download) === "1") {
      return res.status(400).json({ error: "download not supported with stream=1" });
    }
    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache, no-transform");
    res.setHeader("Connection", "keep-alive");
    res.flushHeaders?.();

    const heartbeat = setInterval(() => res.write(":\n\n"), 15000);
    const send = (obj, evt) => {
      if (evt) res.write(`event: ${evt}\n`);
      res.write(`data: ${JSON.stringify(obj)}\n\n`);
    };
    let finished = false;
    req.on("close", () => {
      clearInterval(heartbeat);
      if (!finished) console.warn("SSE client disconnected.");
    });

    // Headers are already on the wire, so a failure has to be reported as an
    // SSE event; a JSON error response is no longer possible here.
    try {
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");
      const result = await crawl(
        target.toString(),
        limit,
        (tick) => send(tick),
        // If your crawler supports it, this avoids writing files during SSE runs:
        { persistReports: false, collectPages: true }
      );
      send({ ok: true, done: true, result }, "done");
    } catch (err) {
      console.error("Crawl error:", err);
      send({ ok: false, error: "Crawl failed", details: String(err?.message ?? err) }, "crawl-error");
    } finally {
      finished = true;
      clearInterval(heartbeat);
      res.end();
    }
    return;
  }

  /* ---------- Non-streaming mode ---------- */
  try {
    // Ask crawler (if it supports options) to avoid writing files when nostore=1 or download requested.
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If caller wants a downloadable JSON file...
    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: crawler returned in-memory data (no disk IO).
      // Use whichever property your crawler exposes. We try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);
      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not attachment)
    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});
/* ------------ Safety nets ------------ */
// Log, rather than crash on, errors that nothing else handled.
for (const eventName of ["unhandledRejection", "uncaughtException"]) {
  process.on(eventName, (e) => console.error(`${eventName}:`, e));
}

/* ------------ Start server ------------ */
// Listen on all interfaces.
const announce = () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
};
app.listen(PORT, "0.0.0.0", announce);

80
server.js Normal file
View File

@ -0,0 +1,80 @@
import express from "express";
import cors from "cors";
import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
import dotenv from "dotenv";
import crawlRoutes from "./routes/crawl.routes.js";
import sitemapRoutes from "./routes/sitemap.routes.js";
import authRoutes from "./routes/auth.routes.js"; // Login & Signup endpoints
import paymentRoutes from "./routes/payment.route.js";
import lighthouseRoutes from "./routes/lighthouse.routes.js"; // <-- ADD THIS
import messageRoutes from "./routes/message.routes.js";
import cakeOrderRoutes from "./routes/maisondetreats/cakeOrder.routes.js";
import blogRoutes from "./routes/blog.routes.js";
import { connectDB } from "./config/db.js";
import { mailer } from "./utils/mailer.js";
// ------------------ Load environment ------------------
dotenv.config(); // Must be first
// ------------------ Connect database ------------------
await connectDB();
// ------------------ Express setup ------------------
const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));
app.use(express.json());
app.use(
cors({
origin: [
"http://localhost:3000",
"http://127.0.0.1:3000",
"https://api.crawlerx.co",
"https://app.crawlerx.co",
],
})
);
app.use(express.static(path.join(__dirname, "public")));
// ------------------ SMTP verification ------------------
console.log("SMTP Host:", process.env.SMTP_HOST);
console.log("SMTP Port:", process.env.SMTP_PORT);
// ------------------ Routes ------------------
app.get("/", (_req, res) => {
const viewer = path.join(__dirname, "public", "crawlerx_viewer.html");
if (fs.existsSync(viewer)) {
return res.sendFile(viewer);
} else {
return res
.type("text/plain")
.send("CrawlerX backend is running.");
}
});
app.get("/healthz", (_req, res) => res.json({ ok: true, time: new Date().toISOString() }));
app.use("/crawl", crawlRoutes);
app.use("/sitemap", sitemapRoutes);
app.use("/api/auth", authRoutes); // Login & Signup endpoints
app.use("/api/payment", paymentRoutes);
app.use("/api/lighthouse", lighthouseRoutes);
app.use("/api/blog", blogRoutes); // All blog/category/comment routes now prefixed with /api/blog
app.use("/api/messages", messageRoutes);
app.use("/api/cake-orders", cakeOrderRoutes);
// Serve uploaded files
app.use('/uploads', express.static(path.join(process.cwd(), 'uploads')));
// ------------------ Safety nets ------------------
process.on("unhandledRejection", (err) => console.error("Unhandled Rejection:", err));
process.on("uncaughtException", (err) => console.error("Uncaught Exception:", err));
// ------------------ Start server ------------------
app.listen(PORT, "0.0.0.0", () => {
console.log(`🚀 Server running at http://localhost:${PORT}`);
});

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

105
utils/mailer.js Normal file
View File

@ -0,0 +1,105 @@
import nodemailer from "nodemailer";
//
// Shared SMTP transport, created once at module load and used by
// sendSignupMail and sendResetPasswordMail below.
//
// SECURITY NOTE(review): the mailbox password is committed in source. It
// should be rotated and supplied through configuration instead of a literal.
// NOTE(review): the senders below build their "from" address from
// process.env.SMTP_USER, while the login user here is hard-coded — confirm
// that both refer to the same mailbox.
//
export const mailer = nodemailer.createTransport({
host: "mail.crawlerx.co", // your Hestia mail host
port: 587, // STARTTLS
secure: false, // must be false for 587
auth: {
user: "info@crawlerx.co", // e.g. info@crawlerx.co
pass: "CrawlerX@2025", // mailbox password
},
name: "mail.crawlerx.co", // explicitly set hostname
tls: {
// NOTE(review): this accepts any server certificate, not only the expected
// self-signed one — verify that this is acceptable outside development.
rejectUnauthorized: false, // allow self-signed certs
},
// NOTE(review): logger/debug are both switched on unconditionally.
logger: true, // optional: logs connection steps
debug: true, // optional: debug SMTP connection
});
//
// Send welcome / signup email
//
/**
 * Send the welcome e-mail that confirms a successful signup.
 *
 * Delivery is best-effort: a failure is logged and never thrown, so the
 * returned promise always resolves.
 *
 * @param {string} toEmail - Recipient address.
 * @returns {Promise<void>}
 */
export async function sendSignupMail(toEmail) {
  // Without this fallback the From header reads `"CrawlerX" <undefined>`
  // whenever SMTP_USER is not set in the environment.
  const sender = process.env.SMTP_USER || "info@crawlerx.co";
  try {
    await mailer.sendMail({
      from: `"CrawlerX" <${sender}>`,
      to: toEmail,
      subject: "Welcome to CrawlerX",
      html: `
<h2>Welcome!</h2>
<p>Your signup was successful. You can now log in and start using the app.</p>
`,
    });
    console.log(`✅ Signup email sent to ${toEmail}`);
  } catch (err) {
    console.error("❌ Error sending signup email:", err);
  }
}
//
// Send reset-password email containing a tokenised reset link
//
/**
 * E-mail a password-reset link of the form
 * `<FRONTEND_URL>/reset-password?email=…&token=…`.
 *
 * Delivery is best-effort: any failure (including a missing FRONTEND_URL) is
 * logged and never thrown, so the returned promise always resolves.
 *
 * @param {string} email - Recipient address; also embedded in the link.
 * @param {string} token - Reset token embedded in the link.
 * @returns {Promise<void>}
 */
export async function sendResetPasswordMail(email, token) {
  try {
    // Drop trailing slashes so the path below never starts with "//".
    const frontendBase = (process.env.FRONTEND_URL || "").replace(/\/+$/, "");
    if (!frontendBase) {
      // Refuse to send a link that would start with "undefined/…".
      throw new Error("FRONTEND_URL is not configured");
    }

    // URLSearchParams percent-encodes both values. Unencoded, a "+" in the
    // address is read back as a space and "&", "#" or quotes in either value
    // corrupt the link (and the surrounding href attribute).
    const query = new URLSearchParams({ email, token });
    const resetURL = `${frontendBase}/reset-password?${query}`;

    // Same fallback as the transporter's mailbox, so From is never "<undefined>".
    const sender = process.env.SMTP_USER || "info@crawlerx.co";

    await mailer.sendMail({
      from: `"CrawlerX" <${sender}>`,
      to: email,
      subject: "Reset your password",
      html: `
<p>You requested a password reset.</p>
<p>Click here to reset: <a href="${resetURL}">${resetURL}</a></p>
<p>This link is valid for 1 hour.</p>
`,
    });
    console.log(`✅ Reset password email sent to ${email}`);
  } catch (err) {
    console.error("❌ Error sending reset password email:", err);
  }
}
/**
 * E-mail an order confirmation listing every ordered flavour per category.
 *
 * Delivery is best-effort: any failure (including malformed `orderData`) is
 * logged and never thrown, so the returned promise always resolves.
 *
 * @param {string} toEmail - Recipient address.
 * @param {Object<string, Object<string, (number|string)>>} orderData -
 *   Map of category name → (flavour name → quantity).
 * @returns {Promise<void>}
 */
export const sendCakeOrderMail = async (toEmail, orderData) => {
  // Order contents originate from the customer, so they must not be able to
  // inject markup into the HTML part of the message.
  const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  try {
    // One paragraph per category: "Category:" followed by "flavour: qty" lines.
    const orderItems = Object.entries(orderData)
      .map(([category, flavours]) => {
        const items = Object.entries(flavours)
          .map(([flavour, qty]) => `${flavour}: ${qty}`)
          .join("\n");
        return `${category}:\n${items}`;
      })
      .join("\n\n");

    // Reuse the module's shared transporter instead of creating a new one
    // (with its own copy of the SMTP credentials) for every order.
    await mailer.sendMail({
      from: `"Maison de Treats" <info@crawlerx.co>`,
      to: toEmail,
      subject: "🎉 Your Cake Order Confirmation",
      text: `Thank you for your order! Here are the details:\n\n${orderItems}`,
      html: `<h2>Thank you for your order!</h2>
<p>Here are your cake order details:</p>
<pre>${escapeHtml(orderItems)}</pre>`,
    });
    console.log("Cake order email sent to", toEmail);
  } catch (err) {
    console.error("Failed to send cake order email:", err);
  }
};

20
utils/sitemap.js Normal file
View File

@ -0,0 +1,20 @@
import Sitemapper from "sitemapper";
import normalizeUrl from "normalize-url";
/**
 * Read `<origin>/sitemap.xml` for the site `startUrl` belongs to and return
 * its entries as normalized URLs (hash fragments stripped).
 *
 * Entries that cannot be normalized are left out. If the sitemap cannot be
 * fetched or read, the result is an empty array. An unparsable `startUrl`
 * still rejects, because the origin is derived before any error handling.
 *
 * @param {string} startUrl - Any absolute URL on the target site.
 * @returns {Promise<string[]>}
 */
export async function getSitemapUrls(startUrl) {
  const { origin } = new URL(startUrl);
  const fetcher = new Sitemapper({ url: `${origin}/sitemap.xml`, timeout: 15000 });

  try {
    const { sites } = await fetcher.fetch();
    return Array.from(sites || []).flatMap((entry) => {
      try {
        return [normalizeUrl(entry, { stripHash: true })];
      } catch {
        return []; // entry could not be normalized: leave it out
      }
    });
  } catch {
    return [];
  }
}

8
utils/stripe.js Normal file
View File

@ -0,0 +1,8 @@
import Stripe from "stripe";
import dotenv from "dotenv";
// Populate process.env through dotenv before the secret key is read below.
dotenv.config();

const { STRIPE_SECRET_KEY } = process.env;

/** Shared Stripe client, pinned to a fixed API version. */
export const stripe = new Stripe(STRIPE_SECRET_KEY, { apiVersion: "2024-06-20" });

11
utils/urlHelpers.js Normal file
View File

@ -0,0 +1,11 @@
/**
 * Tell whether `candidate` points at the same site as `base`.
 *
 * `candidate` may be relative; it is resolved against `base`. Two URLs count
 * as the same site when the resolved URL is http(s) and both hostnames match
 * after a leading "www." is ignored. Ports and paths are not compared.
 *
 * @param {string} base - Absolute URL of the site being crawled.
 * @param {string} candidate - Absolute or relative URL to classify.
 * @returns {boolean} `false` as well when either URL cannot be parsed.
 */
export function isInternal(base, candidate) {
  let siteUrl;
  let linkUrl;
  try {
    siteUrl = new URL(base);
    linkUrl = new URL(candidate, base);
  } catch {
    return false;
  }

  if (!["http:", "https:"].includes(linkUrl.protocol)) {
    return false;
  }

  const [siteHost, linkHost] = [siteUrl, linkUrl].map(({ hostname }) =>
    hostname.replace(/^www\./i, ""),
  );
  return siteHost === linkHost;
}