first commit
This commit is contained in:
commit
6c345df1c2
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# .gitignore
|
||||||
|
node_modules
|
||||||
|
.env
|
||||||
|
reports
|
||||||
11
config/db.js
Normal file
11
config/db.js
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
import mongoose from 'mongoose';
|
||||||
|
|
||||||
|
/**
 * Opens the application's Mongoose connection.
 *
 * Connects to the server named by the MONGODB_URI environment variable and
 * selects the `crawlerX` database. If the connection attempt fails the error
 * is logged and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
export async function connectDB() {
  const connectionOptions = { dbName: 'crawlerX' };

  try {
    await mongoose.connect(process.env.MONGODB_URI, connectionOptions);
    console.log('✅ MongoDB connected');
  } catch (connectionError) {
    console.error('❌ MongoDB connection error:', connectionError);
    process.exit(1);
  }
}
|
||||||
150
controllers/auth.controller.js
Normal file
150
controllers/auth.controller.js
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
import bcrypt from "bcrypt";
|
||||||
|
import jwt from "jsonwebtoken";
|
||||||
|
import User from "../models/user.model.js";
|
||||||
|
import { sendResetPasswordMail, sendSignupMail, } from "../utils/mailer.js";
|
||||||
|
import crypto from "crypto";
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Signup handler: creates an account from `req.body.email` and
 * `req.body.password`.
 *
 * Responses:
 *  - 400 `{ error }` when either field is missing or is not a string, or when
 *    the email is already registered
 *  - 201 `{ message, id }` once the user document has been created
 *  - 500 `{ error }` on any unexpected failure
 *
 * The confirmation email is sent in the background; a mail failure is only
 * logged and never changes the HTTP response.
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function signup(req, res) {
  try {
    const { email, password } = req.body ?? {};

    // Accept plain strings only. A JSON body such as
    // { "email": { "$ne": null } } would otherwise be passed to the query
    // below as a MongoDB operator instead of a literal value.
    const isValid =
      typeof email === "string" && email !== "" &&
      typeof password === "string" && password !== "";
    if (!isValid) {
      return res.status(400).json({ error: "Email and password required" });
    }

    const exists = await User.findOne({ email });
    if (exists) return res.status(400).json({ error: "User already exists" });

    const passwordHash = await bcrypt.hash(password, 10);
    const user = await User.create({ email, passwordHash });

    // Fire-and-forget: the response does not wait for the mail to be sent.
    sendSignupMail(email)
      .then(() => console.log("Signup email sent to", email))
      .catch((err) => console.error("Email send failed:", err));

    res.status(201).json({ message: "Signup success, email sent", id: user._id });
  } catch (err) {
    // Two concurrent signups can both pass the findOne check above. If the
    // collection has a unique index on email (assumed — confirm in the user
    // model), the loser fails with MongoDB duplicate-key error 11000.
    if (err?.code === 11000) {
      return res.status(400).json({ error: "User already exists" });
    }
    console.error(err);
    res.status(500).json({ error: "Signup failed" });
  }
}
|
||||||
|
|
||||||
|
/**
 * Login handler: checks `req.body.email` / `req.body.password` against the
 * stored bcrypt hash and, on success, issues a JWT that expires in one hour.
 *
 * Responses:
 *  - 400 `{ error }` when either field is missing or is not a string
 *  - 401 `{ error }` for an unknown email or a wrong password (same message
 *    for both, so the reply does not reveal which one failed)
 *  - 200 `{ message, token }` on success
 *  - 500 `{ error }` on any unexpected failure
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function login(req, res) {
  try {
    const { email, password } = req.body ?? {};

    // Reject anything that is not a non-empty string before it reaches the
    // database or bcrypt: objects would act as query operators, and an
    // undefined password would make bcrypt.compare throw.
    const isValid =
      typeof email === "string" && email !== "" &&
      typeof password === "string" && password !== "";
    if (!isValid) {
      return res.status(400).json({ error: "Email and password required" });
    }

    const user = await User.findOne({ email });
    if (!user) return res.status(401).json({ error: "Invalid credentials" });

    const match = await bcrypt.compare(password, user.passwordHash);
    if (!match) return res.status(401).json({ error: "Invalid credentials" });

    // Token payload carries the user id under `id` (not `_id`).
    const token = jwt.sign(
      { id: user._id, email: user.email },
      process.env.JWT_SECRET,
      { expiresIn: "1h" }
    );

    res.json({ message: "Login success", token });
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: "Login failed" });
  }
}
|
||||||
|
|
||||||
|
/**
 * POST /api/auth/change-password
 * Body: { currentPassword, newPassword }
 * Header: Authorization: Bearer <token>
 *
 * Replaces the authenticated user's password after verifying the current one.
 * The user is looked up by `req.user.id`, which is expected to be populated
 * by an authentication middleware that runs before this handler.
 *
 * Responses:
 *  - 400 `{ error }` when either password is missing or is not a string
 *  - 401 `{ error }` when no authenticated user is attached to the request,
 *    or when the current password does not match
 *  - 404 `{ error }` when the user record no longer exists
 *  - 200 `{ message }` on success
 *  - 500 `{ error }` on any unexpected failure
 *
 * @param {object} req - Express request; reads `req.body` and `req.user.id`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function changePassword(req, res) {
  try {
    // With FormData uploads these fields only exist once a multipart parser
    // has populated req.body.
    const { currentPassword, newPassword } = req.body ?? {};

    const isValid =
      typeof currentPassword === "string" && currentPassword !== "" &&
      typeof newPassword === "string" && newPassword !== "";
    if (!isValid) {
      return res.status(400).json({ error: "Current password and new password are required" });
    }

    // Answer 401 rather than crashing into the 500 branch when the auth
    // middleware did not attach a user.
    const userId = req.user?.id;
    if (!userId) return res.status(401).json({ error: "Unauthorized" });

    const user = await User.findById(userId);
    if (!user) return res.status(404).json({ error: "User not found" });

    const isMatch = await bcrypt.compare(currentPassword, user.passwordHash);
    if (!isMatch) {
      return res.status(401).json({ error: "Current password is incorrect" });
    }

    user.passwordHash = await bcrypt.hash(newPassword, 10);
    await user.save();

    res.json({ message: "Password updated successfully" });
  } catch (err) {
    console.error("changePassword error:", err);
    res.status(500).json({ error: "Failed to change password" });
  }
}
|
||||||
|
|
||||||
|
/**
 * POST /api/auth/forgot-password
 * Body: { email }
 *
 * Starts a password reset: stores a 4-digit verification code (valid for one
 * hour) on the user document and mails it to the account's address.
 *
 * The verification code is delivered by email ONLY. It is deliberately not
 * included in the HTTP response: whoever calls this endpoint is not yet
 * authenticated, so echoing the code would let anyone reset any account.
 * Known and unknown addresses receive the same response body, so the reply
 * does not reveal whether an email is registered.
 *
 * Responses:
 *  - 400 `{ error }` when `email` is missing or is not a string
 *  - 200 `{ message }` for both registered and unregistered addresses
 *  - 500 `{ error }` when saving the code or sending the mail fails
 *
 * @param {object} req - Express request; reads `req.body.email`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function forgotPassword(req, res) {
  const genericReply = {
    message: "If the email is registered, a reset link has been sent.",
  };

  try {
    const { email } = req.body ?? {};
    // Strings only: an object here would be interpreted as a query operator.
    if (!email || typeof email !== "string") {
      return res.status(400).json({ error: "Email is required" });
    }

    const user = await User.findOne({ email });
    if (!user) return res.json(genericReply);

    // 4-digit numeric code in [1000, 9999], drawn from the CSPRNG rather than
    // Math.random(). NOTE(review): a 4-digit code has only 9000 possible
    // values; it needs attempt limiting on the reset endpoint to be safe.
    const verificationCode = crypto.randomInt(1000, 10000).toString();

    user.resetPasswordToken = verificationCode;
    user.resetPasswordExpires = Date.now() + 60 * 60 * 1000; // 1 hour
    await user.save();

    await sendResetPasswordMail(email, verificationCode);

    res.json(genericReply);
  } catch (err) {
    console.error("forgotPassword error:", err);
    res.status(500).json({ error: "Failed to send reset link" });
  }
}
|
||||||
|
|
||||||
|
/**
 * POST /api/auth/reset-password
 * Body: { token, newPassword }
 *
 * Completes a password reset: finds the user whose stored reset token equals
 * `token` and has not expired, stores a bcrypt hash of `newPassword`, and
 * clears the token so it cannot be used again.
 *
 * Responses:
 *  - 400 `{ error }` when a field is missing or is not a string, or when no
 *    unexpired token matches
 *  - 200 `{ message }` on success
 *  - 500 `{ error }` on any unexpected failure
 *
 * NOTE(review): the lookup is by token alone (no email), so the token's
 * strength is the only thing protecting the account — confirm the token
 * length and add attempt limiting.
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function resetPassword(req, res) {
  try {
    const { token, newPassword } = req.body ?? {};

    // Strings only. Without this check a body like
    // { "token": { "$ne": null } } would match ANY user that currently has a
    // pending reset, allowing the password to be replaced without the code.
    const isValid =
      typeof token === "string" && token !== "" &&
      typeof newPassword === "string" && newPassword !== "";
    if (!isValid) {
      return res.status(400).json({ error: "Token and new password are required" });
    }

    const user = await User.findOne({
      resetPasswordToken: token,
      resetPasswordExpires: { $gt: Date.now() },
    });
    if (!user) return res.status(400).json({ error: "Invalid or expired token" });

    user.passwordHash = await bcrypt.hash(newPassword, 10);
    // Single use: drop the token and its expiry together with the update.
    user.resetPasswordToken = undefined;
    user.resetPasswordExpires = undefined;
    await user.save();

    res.json({ message: "Password has been reset successfully" });
  } catch (err) {
    console.error("resetPassword error:", err);
    res.status(500).json({ error: "Failed to reset password" });
  }
}
|
||||||
111
controllers/blog.controller.js
Normal file
111
controllers/blog.controller.js
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
import Blog from '../models/blog.model.js';
|
||||||
|
import Category from '../models/category.model.js';
|
||||||
|
import slugify from 'slugify';
|
||||||
|
|
||||||
|
/**
 * Creates a blog post inside a project.
 *
 * Reads `projectId`, `title`, `description`, `categoryId` and `tags` from
 * `req.body`. The slug is derived from the title. Optional uploads are read
 * from `req.files.imageUrl` / `req.files.bigImageUrl` (first file of each
 * field) and stored as `/uploads/<filename>`; a missing upload is stored as
 * an empty string.
 *
 * Responses:
 *  - 400 `{ message }` when `projectId` or a non-empty string `title` is missing
 *  - 201 with the created blog document
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.body` and `req.files`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const createBlog = async (req, res) => {
  try {
    const { projectId, title, description, categoryId, tags } = req.body ?? {};
    if (!projectId) return res.status(400).json({ message: 'projectId is required' });

    // slugify needs a string; validate here so a missing title is reported as
    // a client error instead of an exception in the 500 branch.
    if (typeof title !== 'string' || title.trim() === '') {
      return res.status(400).json({ message: 'title is required' });
    }

    const slug = slugify(title, { lower: true, strict: true });

    // Public path of the first uploaded file for a field, or '' when the
    // field is absent or empty.
    const uploadPath = (field) => {
      const file = req.files?.[field]?.[0];
      return file ? `/uploads/${file.filename}` : '';
    };

    const blog = await Blog.create({
      projectId,
      title,
      description,
      slug,
      category: categoryId,
      tags,
      imageUrl: uploadPath('imageUrl'),
      bigImageUrl: uploadPath('bigImageUrl'),
    });

    res.status(201).json(blog);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Lists the blogs of one project, newest first, with paging and filtering.
 *
 * Query parameters:
 *  - `projectId` (required)
 *  - `page` (default 1) and `limit` (default 10); values that are not
 *    positive integers fall back to the defaults
 *  - `search`: case-insensitive substring match on the title, treated as
 *    literal text
 *  - `category`: category slug; when no category has that slug the filter is
 *    not applied
 *
 * Responses:
 *  - 400 `{ message }` when `projectId` is missing
 *  - 200 `{ total, page, blogs }`
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.query`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const getAllBlogs = async (req, res) => {
  try {
    const { page = 1, limit = 10, search = '', category, projectId } = req.query;
    if (!projectId) return res.status(400).json({ message: 'projectId is required' });

    // Query-string values are strings; anything that is not a positive
    // integer would otherwise turn skip()/limit() into NaN.
    const toPositiveInt = (value, fallback) => {
      const parsed = Number.parseInt(value, 10);
      return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
    };
    const pageNumber = toPositiveInt(page, 1);
    const pageSize = toPositiveInt(limit, 10);

    // Escape regex metacharacters so user text is matched literally; raw
    // input could be an invalid or catastrophically slow pattern.
    const literalSearch = (typeof search === 'string' ? search : '')
      .replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    const query = {
      projectId,
      title: { $regex: literalSearch, $options: 'i' },
    };

    if (category) {
      const cat = await Category.findOne({ slug: category });
      if (cat) query.category = cat._id;
    }

    // The page of results and the total count do not depend on each other.
    const [blogs, total] = await Promise.all([
      Blog.find(query)
        .populate('category', 'name slug')
        .sort({ createdAt: -1 })
        .skip((pageNumber - 1) * pageSize)
        .limit(pageSize),
      Blog.countDocuments(query),
    ]);

    res.json({ total, page: pageNumber, blogs });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Returns a single blog identified by the `:slug` route parameter within the
 * project given by the `projectId` query parameter, with its category
 * populated (`name` and `slug` only).
 *
 * Responses:
 *  - 400 `{ message }` when `projectId` is missing
 *  - 404 `{ message }` when no blog matches
 *  - 200 with the blog document
 *  - 500 `{ message }` on any other failure
 */
export const getBlogBySlug = async (req, res) => {
  try {
    // projectId arrives as a query parameter
    const { projectId } = req.query;
    if (!projectId) {
      return res.status(400).json({ message: 'projectId is required' });
    }

    const filter = { slug: req.params.slug, projectId };
    const post = await Blog.findOne(filter).populate('category', 'name slug');

    if (!post) {
      return res.status(404).json({ message: 'Blog not found' });
    }
    res.json(post);
  } catch (failure) {
    res.status(500).json({ message: failure.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Appends a comment to the blog identified by the `:id` route parameter.
 * No project check is done here; the blog id alone identifies the document.
 *
 * Body: `{ text, name }`. `name` defaults to 'Anonymous'. When a user is
 * attached to the request its id is stored with the comment.
 *
 * Responses:
 *  - 400 `{ message }` when `text` is missing, not a string, or blank
 *  - 404 `{ message }` when the blog does not exist
 *  - 200 with the blog's full comment list after the insert
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.body`, `req.params.id`, `req.user`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const addComment = async (req, res) => {
  try {
    const { text, name } = req.body ?? {};

    // Validate before touching the database so an empty comment is a client
    // error rather than a stored blank entry or a schema failure.
    if (typeof text !== 'string' || text.trim() === '') {
      return res.status(400).json({ message: 'Comment text is required' });
    }

    const blog = await Blog.findById(req.params.id);
    if (!blog) return res.status(404).json({ message: 'Blog not found' });

    blog.comments.push({
      // Accept either shape of req.user: a document (`_id`) or a decoded
      // token payload (`id`, as signed by the login handler).
      // NOTE(review): confirm which one the auth middleware attaches.
      user: req.user?._id ?? req.user?.id,
      name: name || 'Anonymous',
      text,
    });

    await blog.save();
    res.json(blog.comments);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Toggles the current user's like on the blog identified by `:id`: adds the
 * user's id to `blog.likes` when absent, removes it when present.
 *
 * Responses:
 *  - 401 `{ message }` when no authenticated user is attached to the request
 *  - 404 `{ message }` when the blog does not exist
 *  - 200 `{ likesCount }` with the number of likes after the toggle
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.params.id` and `req.user`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const likeBlog = async (req, res) => {
  try {
    // Accept either shape of req.user: a document (`_id`) or a decoded token
    // payload (`id`, as signed by the login handler). Without an id there is
    // nothing to toggle, so answer 401 instead of storing `undefined`.
    // NOTE(review): confirm which shape the auth middleware attaches.
    const userId = req.user?._id ?? req.user?.id;
    if (!userId) return res.status(401).json({ message: 'Authentication required' });

    const blog = await Blog.findById(req.params.id);
    if (!blog) return res.status(404).json({ message: 'Blog not found' });

    if (blog.likes.includes(userId)) blog.likes.pull(userId);
    else blog.likes.push(userId);

    await blog.save();
    res.json({ likesCount: blog.likes.length });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
46
controllers/category.controller.js
Normal file
46
controllers/category.controller.js
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import Category from '../models/category.model.js';
|
||||||
|
import slugify from "slugify";
|
||||||
|
|
||||||
|
/**
 * Creates a category inside a project (intended for admin use; access control
 * is not enforced in this handler).
 *
 * Body: `{ name, projectId }`. The slug is derived from the name.
 *
 * Responses:
 *  - 400 `{ message }` when `projectId` or a non-empty string `name` is missing
 *  - 201 with the created category document
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const createCategory = async (req, res) => {
  try {
    const { name, projectId } = req.body ?? {};
    if (!projectId) return res.status(400).json({ message: "projectId is required" });

    // slugify needs a string; validate here so a missing name is reported as
    // a client error instead of an exception in the 500 branch.
    if (typeof name !== "string" || name.trim() === "") {
      return res.status(400).json({ message: "name is required" });
    }

    const slug = slugify(name, { lower: true, strict: true });
    const category = await Category.create({ name, slug, projectId });

    res.status(201).json(category);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Lists every category of the project named by the `projectId` query
 * parameter, sorted by name ascending.
 *
 * Responses:
 *  - 400 `{ message }` when `projectId` is missing
 *  - 200 with the array of categories
 *  - 500 `{ message }` on any other failure
 */
export const getCategories = async (req, res) => {
  try {
    const { projectId } = req.query;
    if (!projectId) {
      return res.status(400).json({ message: "projectId is required" });
    }

    const alphabetical = { name: 1 };
    const found = await Category.find({ projectId }).sort(alphabetical);
    res.json(found);
  } catch (failure) {
    res.status(500).json({ message: failure.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Deletes the category identified by the `:id` route parameter (intended for
 * admin use; access control is not enforced in this handler).
 *
 * Responses:
 *  - 404 `{ message }` when no category has that id
 *  - 200 `{ message }` when the category was deleted
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.params.id`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const deleteCategory = async (req, res) => {
  try {
    const { id } = req.params;

    // findByIdAndDelete resolves to the removed document, or null when
    // nothing matched — only report success when something was removed.
    const removed = await Category.findByIdAndDelete(id);
    if (!removed) return res.status(404).json({ message: "Category not found" });

    res.json({ message: "Category deleted" });
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
54
controllers/comment.controller.js
Normal file
54
controllers/comment.controller.js
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import Blog from "../models/blog.model.js";
|
||||||
|
|
||||||
|
/**
 * Appends a comment to the blog identified by the `:blogId` route parameter.
 *
 * Body: `{ text, name }`. `name` defaults to "Anonymous". The commenting
 * user's id is stored when a user is attached to the request, otherwise null.
 *
 * Responses:
 *  - 400 `{ message }` when `text` is missing, not a string, or blank
 *  - 404 `{ message }` when the blog does not exist
 *  - 201 with the blog's full comment list after the insert
 *  - 500 `{ message }` on any other failure
 *
 * @param {object} req - Express request; reads `req.params.blogId`, `req.body`, `req.user`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export const addComment = async (req, res) => {
  try {
    const { blogId } = req.params;
    const { text, name } = req.body ?? {};

    // Validate before touching the database so an empty comment is a client
    // error rather than a stored blank entry or a schema failure.
    if (typeof text !== "string" || text.trim() === "") {
      return res.status(400).json({ message: "Comment text is required" });
    }

    const blog = await Blog.findById(blogId);
    if (!blog) return res.status(404).json({ message: "Blog not found" });

    blog.comments.push({
      // Accept either shape of req.user: a document (`_id`) or a decoded
      // token payload (`id`, as signed by the login handler).
      // NOTE(review): confirm which one the auth middleware attaches.
      user: req.user?._id ?? req.user?.id ?? null,
      name: name || "Anonymous",
      text,
    });

    await blog.save();
    res.status(201).json(blog.comments);
  } catch (err) {
    res.status(500).json({ message: err.message });
  }
};
|
||||||
|
|
||||||
|
/**
 * Returns the comments of the blog identified by the `:blogId` route
 * parameter, ordered newest first by `createdAt`.
 *
 * Responses:
 *  - 404 `{ message }` when the blog does not exist
 *  - 200 with the sorted comment array
 *  - 500 `{ message }` on any other failure
 */
export const getComments = async (req, res) => {
  // Comparator placing the most recent createdAt first.
  const newestFirst = (left, right) => right.createdAt - left.createdAt;

  try {
    const { blogId } = req.params;
    const post = await Blog.findById(blogId);

    if (!post) {
      return res.status(404).json({ message: "Blog not found" });
    }

    const ordered = post.comments.sort(newestFirst);
    res.json(ordered);
  } catch (failure) {
    res.status(500).json({ message: failure.message });
  }
};
|
||||||
|
|
||||||
|
// Delete a comment (Admin only)
|
||||||
|
export const deleteComment = async (req, res) => {
|
||||||
|
try {
|
||||||
|
const { blogId, commentId } = req.params;
|
||||||
|
|
||||||
|
const blog = await Blog.findById(blogId);
|
||||||
|
if (!blog) return res.status(404).json({ message: "Blog not found" });
|
||||||
|
|
||||||
|
blog.comments.id(commentId)?.remove();
|
||||||
|
await blog.save();
|
||||||
|
|
||||||
|
res.json({ message: "Comment deleted" });
|
||||||
|
} catch (err) {
|
||||||
|
res.status(500).json({ message: err.message });
|
||||||
|
}
|
||||||
|
};
|
||||||
115
controllers/crawl.controller.js
Normal file
115
controllers/crawl.controller.js
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
import path from "node:path";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { crawl } from "../crawler.js";
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
// Current time as an ISO-8601 string with every ":" and "." replaced by "-",
// so the value can be embedded in a file name.
const ts = () => {
  const isoNow = new Date().toISOString();
  return isoNow.replace(/[:.]/g, "-");
};
|
||||||
|
|
||||||
|
/**
 * Sends `obj` to the client as a downloadable, pretty-printed (2-space) JSON
 * attachment.
 *
 * @param {object} res - Express response.
 * @param {string} filename - File name placed in the Content-Disposition header.
 * @param {*} obj - Value to serialise.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  // Serialise first so a failure happens before any header is set.
  const payload = JSON.stringify(obj, null, 2);

  const headers = [
    ["Content-Type", "application/json; charset=utf-8"],
    ["Content-Disposition", `attachment; filename="${filename}"`],
  ];
  for (const [header, value] of headers) {
    res.setHeader(header, value);
  }

  return res.send(payload);
}
|
||||||
|
/**
 * Tells whether `p` is an absolute filesystem path.
 * Values that are not strings yield `false` instead of raising.
 *
 * @param {*} p - Candidate path.
 * @returns {boolean}
 */
function isAbs(p) {
  // path.isAbsolute only raises for non-string input, so guarding on the
  // type gives the same answers as catching that exception.
  return typeof p === "string" ? path.isAbsolute(p) : false;
}
|
||||||
|
|
||||||
|
/**
 * Crawl endpoint. Runs `crawl()` against the `url` query parameter and
 * returns the outcome in one of three shapes.
 *
 * Query parameters:
 *  - `url` (required): absolute http(s) URL to start from
 *  - `max`: page limit, clamped to 1..500; defaults to 50 (also used when the
 *    value is not a number)
 *  - `stream=1` (or an `Accept: text/event-stream` header): Server-Sent
 *    Events mode — emits `started`, one `tick` per progress callback, then
 *    `done`; a failure after the stream has opened is emitted as an `error`
 *    event
 *  - `download=1`: reply with the results as a JSON file attachment (not
 *    allowed together with streaming)
 *  - `nostore=1`: ask the crawler not to persist report files
 *
 * Responses outside SSE mode:
 *  - 400 `{ error }` for a missing/invalid `url` or an unsupported combination
 *  - 200 JSON (or attachment) on success
 *  - 500 `{ error, details }` when the crawl fails
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
export async function crawlHandler(req, res) {
  // Declared outside the try so the finally block can always stop the timer,
  // including when the crawl throws in SSE mode.
  let heartbeat;

  try {
    const { url, max, stream, download, nostore } = req.query;
    if (!url) return res.status(400).json({ error: "Missing url param" });

    // A malformed URL is a client error, not a server failure.
    let target;
    try {
      target = new URL(String(url));
    } catch {
      return res.status(400).json({ error: "Invalid url param" });
    }
    // Only web URLs make sense for a crawler; refuse file:, data:, etc.
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return res.status(400).json({ error: "Only http(s) URLs can be crawled" });
    }

    // Math.min/Math.max propagate NaN, so fall back to the default first.
    const requested = Number.parseInt(max ?? "50", 10);
    const limit = Math.min(Math.max(Number.isNaN(requested) ? 50 : requested, 1), 500);

    const wantsStream =
      String(stream) === "1" ||
      (req.get("accept") || "").includes("text/event-stream");

    /* ---------- SSE mode ---------- */
    if (wantsStream) {
      if (String(download) === "1") {
        return res.status(400).json({ error: "download not supported with stream=1" });
      }

      res.setHeader("Content-Type", "text/event-stream");
      res.setHeader("Cache-Control", "no-cache, no-transform");
      res.setHeader("Connection", "keep-alive");
      res.flushHeaders?.();

      const send = (obj, evt) => {
        if (evt) res.write(`event: ${evt}\n`);
        res.write(`data: ${JSON.stringify(obj)}\n\n`);
      };

      // Comment-only frames keep idle proxies from closing the connection.
      heartbeat = setInterval(() => res.write(":\n\n"), 15000);
      let finished = false;

      req.on("close", () => {
        clearInterval(heartbeat);
        if (!finished) console.warn("SSE client disconnected.");
      });

      const onProgress = (tick) => send(tick, "tick");
      send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

      const result = await crawl(target.toString(), limit, onProgress, {
        persistReports: false,
        collectPages: true,
      });

      finished = true;
      send({ ok: true, done: true, result }, "done");
      return res.end();
    }

    /* ---------- Non-streaming mode ---------- */
    const preferMemory = String(nostore) === "1" || String(download) === "1";
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory
        ? { persistReports: false, collectPages: true }
        : { persistReports: true, collectPages: true }
    );

    if (String(download) === "1") {
      const filename = `crawl-${ts()}.json`;

      // Preferred source: the in-memory result list.
      if (Array.isArray(result?.results)) {
        return attachJson(res, filename, result.results);
      }

      // Otherwise stream the report file the crawler wrote, if there is one.
      const jsonPath = result?.files?.json;
      if (jsonPath) {
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          return fs.createReadStream(abs).pipe(res);
        }
      }
      return attachJson(res, filename, result ?? {});
    }

    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      ...result,
    });
  } catch (err) {
    console.error("Crawl error:", err);
    const details = String(err?.message ?? err);

    // Once the SSE headers are out, status and JSON body can no longer be
    // set; report the failure as an event and close the stream instead.
    if (res.headersSent) {
      if (!res.writableEnded) {
        res.write("event: error\n");
        res.write(`data: ${JSON.stringify({ ok: false, error: "Crawl failed", details })}\n\n`);
        res.end();
      }
      return;
    }

    res.status(500).json({ error: "Crawl failed", details });
  } finally {
    // No-op when the timer was never started.
    clearInterval(heartbeat);
  }
}
|
||||||
113
controllers/lighthouseController.js
Normal file
113
controllers/lighthouseController.js
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
import lighthouse from 'lighthouse';
|
||||||
|
import { launch } from 'chrome-launcher';
|
||||||
|
import PageSpeedTest from '../models/pageSpeedTest.model.js';
|
||||||
|
import path from 'path';
|
||||||
|
import fs from 'fs';
|
||||||
|
|
||||||
|
const reportsDir = path.join(process.cwd(), 'public', 'lighthouse-treemap');
|
||||||
|
// Ensure folder exists
|
||||||
|
if (!fs.existsSync(reportsDir)) fs.mkdirSync(reportsDir, { recursive: true });
|
||||||
|
|
||||||
|
/**
 * Runs a Lighthouse audit of `url` in a freshly launched headless Chrome,
 * condenses the report into a summary object, stores that summary through
 * the PageSpeedTest model, and returns the stored document.
 *
 * For `device === 'mobile'` a second Lighthouse run produces an HTML report,
 * which is written into the reports directory; its public path is saved as
 * `treemapPath`. For other devices `treemapPath` is null.
 *
 * Chrome is always shut down, including when Lighthouse or the file write
 * fails; the original error then propagates to the caller.
 *
 * @param {string} url - Page to audit.
 * @param {string} [device='mobile'] - 'mobile' or 'desktop'.
 * @returns {Promise<{report: object}>} The stored PageSpeedTest document.
 */
const launchChromeAndRunLighthouse = async (url, device = 'mobile') => {
  const chrome = await launch({ chromeFlags: ['--headless'] });

  let lhr;
  let treemapFile = null;

  try {
    // NOTE(review): `emulatedFormFactor` is the option name this code was
    // written against; newer Lighthouse releases appear to call it
    // `formFactor` — confirm against the installed version.
    const runnerResult = await lighthouse(url, {
      port: chrome.port,
      emulatedFormFactor: device,
      throttlingMethod: device === 'mobile' ? 'simulate' : 'devtools',
      output: 'json', // JSON for metrics
    });
    lhr = runnerResult.lhr;

    // HTML report is generated once per audit, on the mobile pass only.
    if (device === 'mobile') {
      const fileName = `treemap-${Date.now()}.html`;

      const htmlReport = await lighthouse(url, {
        port: chrome.port,
        emulatedFormFactor: device,
        throttlingMethod: 'simulate',
        output: 'html',
      });

      await fs.promises.writeFile(path.join(reportsDir, fileName), htmlReport.report);
      treemapFile = `/lighthouse-treemap/${fileName}`;
    }
  } finally {
    // Without this, a failed audit would leave a Chrome process running.
    await chrome.kill();
  }

  // Category score (0..1) as a rounded percentage; null when the category is
  // absent or has no numeric score, so NaN never reaches the database.
  const toPercent = (category) =>
    typeof category?.score === 'number' ? Math.round(category.score * 100) : null;

  const audits = Object.values(lhr.audits);
  const displayValue = (auditId) => lhr.audits[auditId]?.displayValue || null;
  const fullyPassed = (auditId) => lhr.audits[auditId]?.score === 1;

  // Structured result
  const result = {
    url,
    device,
    scores: {
      performance: toPercent(lhr.categories.performance),
      accessibility: toPercent(lhr.categories.accessibility),
      bestPractices: toPercent(lhr.categories['best-practices']),
      seo: toPercent(lhr.categories.seo),
      pwa: toPercent(lhr.categories.pwa),
    },
    metrics: {
      firstContentfulPaint: displayValue('first-contentful-paint'),
      largestContentfulPaint: displayValue('largest-contentful-paint'),
      totalBlockingTime: displayValue('total-blocking-time'),
      timeToInteractive: displayValue('interactive'),
      speedIndex: displayValue('speed-index'),
      cumulativeLayoutShift: displayValue('cumulative-layout-shift'),
    },
    opportunities: audits
      .filter((a) => a.details?.type === 'opportunity')
      .map((a) => ({
        title: a.title,
        description: a.description,
        estimatedSavings: a.details?.overallSavingsMs
          ? `${Math.round(a.details.overallSavingsMs)} ms`
          : null,
      })),
    diagnostics: {
      usesHTTPS: fullyPassed('is-on-https'),
      usesEfficientCachePolicy: fullyPassed('uses-long-cache-ttl'),
      imageCompression: fullyPassed('uses-optimized-images'),
    },
    failedAudits: audits
      .filter((a) => a.score !== null && a.score !== 1 && a.scoreDisplayMode !== 'notApplicable')
      .map((a) => ({ title: a.title, description: a.description })),
    passedAudits: audits
      .filter((a) => a.score === 1 && a.scoreDisplayMode !== 'notApplicable' && !a.details?.type)
      .map((a) => a.title),
    notApplicableAudits: audits
      .filter((a) => a.scoreDisplayMode === 'notApplicable')
      .map((a) => a.title),
    screenshot: lhr.audits['final-screenshot']?.details?.data || null,
    createdAt: new Date(),
    treemapPath: treemapFile,
  };

  const report = await PageSpeedTest.create(result);
  return { report };
};
|
||||||
|
|
||||||
|
/**
 * Audit endpoint: runs the Lighthouse audit for `req.body.url` twice — first
 * as mobile, then as desktop — and returns both stored reports together with
 * the path of the HTML report produced by the mobile run.
 *
 * Responses:
 *  - 400 `{ message }` when `url` is missing
 *  - 200 `{ message, results: { mobile, desktop, treemap } }`
 *  - any failure is forwarded to the Express error handler via `next`
 */
export const runAudit = async (req, res, next) => {
  try {
    const { url } = req.body;
    if (!url) {
      return res.status(400).json({ message: 'URL is required' });
    }

    // The two audits run one after the other, mobile first.
    const mobileRun = await launchChromeAndRunLighthouse(url, 'mobile');
    const desktopRun = await launchChromeAndRunLighthouse(url, 'desktop');

    const results = {
      mobile: mobileRun.report,
      desktop: desktopRun.report,
      treemap: mobileRun.report.treemapPath, // HTML report
    };

    res.status(200).json({ message: 'Audit completed successfully', results });
  } catch (failure) {
    next(failure);
  }
};
|
||||||
39
controllers/maisondetreats/cakeOrder.controller.js
Normal file
39
controllers/maisondetreats/cakeOrder.controller.js
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
import { CakeOrder } from "../../models/maisondetreats/cakeOrder.model.js";
|
||||||
|
import { sendCakeOrderMail } from "../../utils/mailer.js";
|
||||||
|
|
||||||
|
/**
 * Stores a cake order and, when an email address accompanies it, sends a
 * confirmation mail in the background.
 *
 * Body: `{ order, email }` — `order` must be an object.
 *
 * Responses:
 *  - 400 `{ message }` when `order` is missing or is not an object
 *  - 201 `{ message, data }` with the stored order
 *  - 500 `{ message, error }` on any other failure
 */
export const createCakeOrder = async (req, res) => {
  try {
    const { order, email } = req.body;

    const isOrderObject = Boolean(order) && typeof order === "object";
    if (!isOrderObject) {
      return res.status(400).json({ message: "Order data is required" });
    }

    const savedOrder = await CakeOrder.create({ order, email });

    // Confirmation mail is fire-and-forget: its outcome is only logged and
    // never delays or changes the response.
    if (email) {
      const onSent = () => console.log("Cake order email sent to", email);
      const onFailed = (mailError) => console.error("Email send failed:", mailError);
      sendCakeOrderMail(email, order).then(onSent).catch(onFailed);
    }

    const body = {
      message: "Cake order created successfully",
      data: savedOrder,
    };
    res.status(201).json(body);
  } catch (failure) {
    console.error("Error creating cake order:", failure);
    res.status(500).json({ message: "Server error", error: failure.message });
  }
};
|
||||||
|
/**
 * GET /api/cake-orders — lists every stored cake order, newest first.
 *
 * Responses:
 *  - 200 `{ data }` with the array of orders
 *  - 500 `{ message, error }` when the query fails
 */
export const getAllCakeOrders = async (_req, res) => {
  try {
    const newestFirst = { createdAt: -1 };
    const data = await CakeOrder.find().sort(newestFirst);
    res.json({ data });
  } catch (failure) {
    console.error("Error fetching cake orders:", failure);
    res.status(500).json({ message: "Server error", error: failure.message });
  }
};
|
||||||
68
controllers/message.controller.js
Normal file
68
controllers/message.controller.js
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
// message.controller.js
|
||||||
|
|
||||||
|
import dotenv from "dotenv";
|
||||||
|
import axios from "axios";
|
||||||
|
import Message from "../models/message.model.js";
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Stores a contact message and then notifies the site owner through the
 * WhatsApp Cloud API (template `new_message_alert`).
 *
 * Body: `{ project, name, email, message }` — `project` and `message` are
 * required.
 *
 * Configuration (environment variables):
 *  - WHATSAPP_ACCESS_TOKEN: bearer token for the Graph API. When it is not
 *    set, the notification is skipped with a warning. The token must never
 *    be committed to source control.
 *  - WHATSAPP_PHONE_NUMBER_ID: sending phone-number id (optional override)
 *  - WHATSAPP_NOTIFY_TO: recipient number (optional override)
 *
 * The notification is best-effort: once the message is stored, a WhatsApp
 * failure is logged but the request still succeeds, so clients do not retry
 * and create duplicates.
 *
 * Responses:
 *  - 400 `{ success: false, error }` when `project` or `message` is missing
 *  - 201 `{ success: true, data }` with the stored message
 *  - 500 `{ success: false, error }` when storing the message fails
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
export const sendMessage = async (req, res) => {
  try {
    const { project, name, email, message } = req.body ?? {};

    if (!project) return res.status(400).json({ success: false, error: "Project is required" });
    if (!message) return res.status(400).json({ success: false, error: "Message is required" });

    // Save message to MongoDB
    const newMessage = await Message.create({ project, name, email, message });

    // Send WhatsApp template message (best-effort)
    const accessToken = process.env.WHATSAPP_ACCESS_TOKEN;
    if (!accessToken) {
      console.warn("WHATSAPP_ACCESS_TOKEN is not set; skipping WhatsApp notification");
    } else {
      try {
        const phoneNumberId = process.env.WHATSAPP_PHONE_NUMBER_ID ?? "774121419125441";
        const url = `https://graph.facebook.com/v22.0/${phoneNumberId}/messages`;
        const payload = {
          messaging_product: "whatsapp",
          to: process.env.WHATSAPP_NOTIFY_TO ?? 917871207631,
          type: "template",
          template: {
            name: "new_message_alert",
            language: { code: "en_US" },
            components: [
              {
                type: "body",
                parameters: [
                  { type: "text", text: project || "Project" },
                  { type: "text", text: name || "Guest" },
                  { type: "text", text: email || "N/A" },
                  { type: "text", text: message || "No message" },
                ],
              },
            ],
          },
        };
        const headers = {
          Authorization: `Bearer ${accessToken}`,
          "Content-Type": "application/json",
        };

        const response = await axios.post(url, payload, { headers });
        console.log("✅ WhatsApp API Response:", response.data);
      } catch (err) {
        console.error("❌ WhatsApp API Error:", err.response?.data || err.message);
      }
    }

    return res.status(201).json({ success: true, data: newMessage });
  } catch (err) {
    console.error("❌ sendMessage error:", err.response?.data || err.message);
    return res.status(500).json({ success: false, error: "Server Error" });
  }
};
|
||||||
|
|
||||||
|
/**
 * Lists the stored messages of the project named by the `project` query
 * parameter, newest first.
 *
 * Responses:
 *  - 400 `{ success: false, error }` when `project` is missing
 *  - 200 `{ success: true, data }` with the array of messages
 *  - 500 `{ success: false, error }` when the query fails
 */
export const getMessages = async (req, res) => {
  try {
    const { project } = req.query;
    if (!project) {
      return res.status(400).json({ success: false, error: "Project is required" });
    }

    const newestFirst = { createdAt: -1 };
    const data = await Message.find({ project }).sort(newestFirst);
    return res.status(200).json({ success: true, data });
  } catch (failure) {
    console.error(failure);
    return res.status(500).json({ success: false, error: "Server Error" });
  }
};
|
||||||
132
controllers/payment.controller.js
Normal file
132
controllers/payment.controller.js
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
// controllers/payment.controller.js
|
||||||
|
import Stripe from "stripe";
|
||||||
|
import { Payment } from "../models/payment.model.js";
|
||||||
|
|
||||||
|
// Stripe client shared by the handlers in this file.
// The secret key is read from the STRIPE_SECRET_KEY environment variable
// (.env); it must never be hard-coded or committed to source control.
const stripe = new Stripe(process.env.STRIPE_SECRET_KEY, {
  apiVersion: "2022-11-15",
});
|
||||||
|
|
||||||
|
/**
 * 🔹 Option 1: PaymentIntent API (client uses clientSecret)
 *
 * Creates a USD PaymentIntent for `req.body.amount` (in dollars), records it
 * as a pending Payment, and returns the client secret the browser needs to
 * confirm the payment.
 *
 * Responses:
 *  - 400 `{ error }` when `amount` is missing or is not a positive number
 *  - 200 `{ clientSecret }`
 *  - 500 `{ error }` when Stripe or the database call fails
 *
 * @param {object} req - Express request; reads `req.body.amount`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function createPaymentIntent(req, res) {
  try {
    const { amount } = req.body ?? {};
    if (!amount) return res.status(400).json({ error: "amount is required" });

    // Stripe expects an integer number of cents. Validate first so bad input
    // is a 400 rather than NaN or a negative charge sent to Stripe.
    const dollars = Number(amount);
    const amountInCents = Math.round(dollars * 100);
    if (!Number.isFinite(dollars) || amountInCents < 1) {
      return res.status(400).json({ error: "amount must be a positive number" });
    }

    const paymentIntent = await stripe.paymentIntents.create({
      amount: amountInCents, // dollars → cents
      currency: "usd",
      automatic_payment_methods: { enabled: true },
    });

    await Payment.create({
      amount: amountInCents,
      stripePaymentIntentId: paymentIntent.id,
      status: "pending",
    });

    res.json({ clientSecret: paymentIntent.client_secret });
  } catch (err) {
    console.error("❌ Error creating PaymentIntent:", err);
    res.status(500).json({ error: "Internal Server Error" });
  }
}
|
||||||
|
|
||||||
|
/**
 * 🔹 Option 2: Stripe Checkout Session (redirect flow)
 *
 * Creates a one-item USD Checkout Session for `amount` (in dollars), records
 * it as a pending Payment keyed by the session id, and returns that id.
 *
 * Body: `{ email, amount, planId }` — `planId` is used as the product name
 * and defaults to "SEO Plan".
 *
 * Responses:
 *  - 400 `{ error }` when `email` (string) or `amount` is missing, or when
 *    `amount` is not a positive number
 *  - 200 `{ sessionId }`
 *  - 500 `{ error }` when Stripe or the database call fails
 *
 * @param {object} req - Express request; reads `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
export async function createCheckoutSession(req, res) {
  try {
    const { email, amount, planId } = req.body ?? {};
    if (!email || typeof email !== "string" || !amount) {
      return res.status(400).json({ error: "email and amount are required" });
    }

    // Stripe expects an integer number of cents. Validate first so bad input
    // is a 400 rather than NaN or a negative charge sent to Stripe.
    const dollars = Number(amount);
    const amountInCents = Math.round(dollars * 100);
    if (!Number.isFinite(dollars) || amountInCents < 1) {
      return res.status(400).json({ error: "amount must be a positive number" });
    }

    const session = await stripe.checkout.sessions.create({
      payment_method_types: ["card"],
      mode: "payment",
      customer_email: email,
      line_items: [
        {
          price_data: {
            currency: "usd",
            product_data: { name: planId || "SEO Plan" },
            unit_amount: amountInCents,
          },
          quantity: 1,
        },
      ],
      success_url: "https://app.crawlerx.co/success",
      cancel_url: "https://app.crawlerx.co/cancel",
    });

    // The record is keyed by the Checkout Session id (no PaymentIntent id is
    // stored at this point).
    await Payment.create({
      email,
      amount: amountInCents,
      stripeSessionId: session.id,
      status: "pending",
    });

    res.json({ sessionId: session.id });
  } catch (err) {
    console.error("❌ Error creating checkout session:", err);
    res.status(500).json({ error: "Internal Server Error" });
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * 🔹 Stripe Webhook
 * Stripe requires `express.raw({ type: "application/json" })` in route
 *
 * Verifies the `stripe-signature` header against `req.rawBody` using
 * `STRIPE_WEBHOOK_SECRET`, then marks the matching Payment as "succeeded" for
 * `payment_intent.succeeded` and `checkout.session.completed` events. Other
 * event types are only logged.
 *
 * Responses:
 *  - 400 text body when signature verification throws
 *  - 500 `{ error }` when the database update throws
 *  - 200 `{ received: true }` otherwise
 *
 * @param {object} req - request; reads `req.headers["stripe-signature"]` and `req.rawBody`
 * @param {object} res - response; uses `res.status()`, `res.send()`, `res.json()`
 * @returns {Promise<*>} resolves once a response has been sent
 */
export async function handleWebhook(req, res) {
  const sig = req.headers["stripe-signature"];
  let event;

  try {
    event = stripe.webhooks.constructEvent(
      req.rawBody, // Must be raw body
      sig,
      process.env.STRIPE_WEBHOOK_SECRET
    );
  } catch (err) {
    console.error("❌ Webhook signature verification failed:", err.message);
    return res.status(400).send(`Webhook Error: ${err.message}`);
  }

  try {
    switch (event.type) {
      case "payment_intent.succeeded": {
        const paymentIntent = event.data.object;
        console.log("✅ PaymentIntent succeeded:", paymentIntent.id);

        await Payment.findOneAndUpdate(
          { stripePaymentIntentId: paymentIntent.id },
          { status: "succeeded" }
        );
        break;
      }

      case "checkout.session.completed": {
        const session = event.data.object;
        console.log("✅ Checkout session completed:", session.id);

        // Match on the session id that createCheckoutSession stored. Matching
        // on { email, status: "pending" } could update an unrelated pending
        // payment belonging to the same customer.
        await Payment.findOneAndUpdate(
          { stripeSessionId: session.id },
          {
            stripePaymentIntentId: session.payment_intent,
            status: "succeeded",
          }
        );
        break;
      }

      default:
        console.log(`Unhandled event type ${event.type}`);
    }
  } catch (err) {
    // Answer with a non-2xx status instead of leaving the request without a
    // response, so the sender can see that the event was not processed.
    console.error("❌ Error handling webhook event:", err);
    return res.status(500).json({ error: "Internal Server Error" });
  }

  res.json({ received: true });
}
|
||||||
20
controllers/sitemap.controller.js
Normal file
20
controllers/sitemap.controller.js
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
import { getSitemapUrls } from "../utils/sitemap.js";
|
||||||
|
|
||||||
|
/**
 * GET handler that returns the URLs `getSitemapUrls` finds for a site.
 *
 * Query: `?u=<absolute http(s) URL>`.
 *
 * Responses:
 *  - 400 `{ error }` when `u` is missing, is not a parseable absolute URL, or
 *    does not use http/https
 *  - 200 `{ ok, origin, count, urls }` on success
 *  - 500 `{ error, details }` when the lookup throws
 *
 * @param {object} req - request; reads `req.query.u`
 * @param {object} res - response; uses `res.status()` and `res.json()`
 * @returns {Promise<*>} resolves once a response has been sent
 */
export async function sitemapHandler(req, res) {
  try {
    const { u } = req.query;
    if (!u) return res.status(400).json({ error: "Missing ?u=https://site.com" });

    // A malformed URL is a client error, not a failed sitemap fetch.
    let origin;
    try {
      origin = new URL(String(u));
    } catch {
      return res.status(400).json({ error: "Invalid URL in ?u=" });
    }
    // Only web URLs make sense here; refuse file:, data:, javascript:, etc.
    if (origin.protocol !== "http:" && origin.protocol !== "https:") {
      return res.status(400).json({ error: "Only http(s) URLs are supported in ?u=" });
    }

    const urls = await getSitemapUrls(origin.toString());
    res.json({
      ok: true,
      origin: origin.origin,
      count: urls.length,
      urls,
    });
  } catch (err) {
    console.error("sitemap error:", err);
    res.status(500).json({ error: "Failed to fetch sitemap", details: String(err?.message ?? err) });
  }
}
|
||||||
709
crawler copy.js
Normal file
709
crawler copy.js
Normal file
@ -0,0 +1,709 @@
|
|||||||
|
import got from "got";
|
||||||
|
import * as cheerio from "cheerio";
|
||||||
|
import normalizeUrl from "normalize-url";
|
||||||
|
import { isInternal } from "./utils/urlHelpers.js";
|
||||||
|
import { getSitemapUrls } from "./utils/sitemap.js";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
import { chromium } from "playwright";
|
||||||
|
|
||||||
|
// NEW libs
|
||||||
|
import pixelWidth from "string-pixel-width";
|
||||||
|
import * as readability from "text-readability";
|
||||||
|
import stringSimilarity from "string-similarity";
|
||||||
|
|
||||||
|
/* ------------------------------ globals --------------------------------- */
|
||||||
|
// Module-level crawl state, shared by every function below.
const visited = new Set(); // normalized URLs crawl() has already taken off the queue
const queue = []; // URLs still waiting to be fetched
const results = []; // one row object per attempted page

// Link provenance: every discovered edge (source -> target)
const edges = []; // { from, raw_href, to, discovered_by }

// Quick referrer map for error report
const referrers = new Map(); // url -> Array<{from, raw_href, discovered_by}>

// Desktop-Chrome-style user agent and request headers sent with page fetches.
const REAL_UA =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

const REAL_HEADERS = {
  "user-agent": REAL_UA,
  accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "upgrade-insecure-requests": "1",
};
|
||||||
|
|
||||||
|
/* ------------------------------ utils ----------------------------------- */
|
||||||
|
/**
 * Format a single value as a CSV field.
 *
 * `null`/`undefined` become an empty field. Any other value is stringified;
 * if the text contains a double quote, comma, carriage return or line feed it
 * is wrapped in double quotes with embedded quotes doubled.
 *
 * @param {*} v - value to write
 * @returns {string} CSV-safe field text
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // "\r" must trigger quoting too: an unquoted carriage return is read as a
  // record break by many CSV parsers and would split the row.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
|
||||||
|
/**
 * Create `dir` (and any missing parent directories) unless the path already exists.
 *
 * @param {string} dir - directory path to create
 */
function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
|
||||||
|
/**
 * Write the per-page crawl results to `reports/crawl-<timestamp>.json`.
 *
 * The matching CSV rows (a Screaming-Frog-ish column set plus our extras) are
 * still assembled, but the CSV file itself is currently not written.
 *
 * @param {Array<object>} results - page rows to serialize
 */
function writePageReports(results) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);

  const jsonFile = `${base}.json`;
  fs.writeFileSync(jsonFile, JSON.stringify(results, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${jsonFile}`);

  // Column table: [header, reader]. `field(key, fallback)` reads row[key] and
  // substitutes `fallback` when the value is null/undefined.
  const field = (key, fallback) => [key, (row) => row[key] ?? fallback];
  const columns = [
    field("url"),
    field("status"),
    field("status_text", ""),
    field("time_ms"),
    field("bytes"),
    field("content_type"),
    field("http_version", ""),
    field("title"),
    field("title_length"),
    field("title_pixel_width"),
    field("meta_description"),
    field("meta_description_length"),
    field("meta_description_pixel_width"),
    field("h1_1", ""),
    field("h1_1_length", 0),
    field("h1_1_pixel_width", ""),
    field("h1_2", ""),
    field("h1_2_length", 0),
    field("h1_2_pixel_width", ""),
    field("h2_1", ""),
    field("h2_2", ""),
    field("canonical"),
    field("robots_meta"),
    field("x_robots_tag", ""),
    field("noindex"),
    field("nofollow"),
    field("lang", ""),
    field("word_count", ""),
    field("flesch_reading_ease", ""),
    field("flesch_kincaid_grade", ""),
    field("gunning_fog", ""),
    field("coleman_liau", ""),
    field("ari", ""),
    field("smog", ""),
    ["schema_types", (row) => (Array.isArray(row.schema_types) ? row.schema_types.join("|") : "")],
    field("inlinks", 0),
    field("outlinks", 0),
    field("render_mode"),
    field("last_modified", ""),
    ["set_cookie", (row) => (row.set_cookie ? "yes" : "no")],
    field("crawl_timestamp", ""),
    field("duplicate_title_exact", ""),
    field("nearest_title_similarity", ""),
    field("nearest_title_url", ""),
    field("duplicate_description_exact", ""),
    field("nearest_description_similarity", ""),
    field("nearest_description_url", ""),
  ];

  const lines = [columns.map(([header]) => header).join(",")];
  for (const row of results) {
    const cells = columns.map(([, read]) => csvEscape(read(row)));
    lines.push(cells.join(","));
  }
  // CSV output is disabled for now; `lines` is intentionally not written:
  //fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  //console.log(`\n📝 Page reports saved:\n - ${base}.csv\n - ${base}.json`);
}
|
||||||
|
/**
 * Write every discovered link edge to `reports/links-<timestamp>.csv`
 * with the columns from, raw_href, to, discovered_by.
 *
 * @param {Array<{from: *, raw_href: *, to: *, discovered_by: *}>} edges - edges to dump
 */
function writeLinkEdges(edges) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const columns = ["from", "raw_href", "to", "discovered_by"];
  const rows = [columns.join(",")];
  for (const { from, raw_href, to, discovered_by } of edges) {
    const cells = [from, raw_href, to, discovered_by].map(csvEscape);
    rows.push(cells.join(","));
  }

  fs.writeFileSync(file, rows.join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
}
|
||||||
|
/**
 * Write `reports/errors-<timestamp>.csv` listing every result whose HTTP
 * status is 400 or above, together with the pages that linked to it.
 *
 * A failing URL yields one row per entry in the module-level `referrers` map,
 * or a single row with empty referrer columns when nothing is recorded for it.
 * Rows whose status is null are not included.
 *
 * @param {Array<object>} results - page rows; reads `url`, `status`, `title`
 */
function writeErrors(results) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);

  const toLine = (cells) => cells.map(csvEscape).join(",");
  const lines = [["url", "status", "title", "from_page", "raw_href", "discovered_by"].join(",")];

  for (const r of results) {
    const failed = r && r.status !== null && r.status >= 400;
    if (!failed) continue;

    const refs = referrers.get(r.url) || [];
    if (refs.length === 0) {
      lines.push(toLine([r.url, r.status, r.title, "", "", ""]));
      continue;
    }
    for (const ref of refs) {
      lines.push(toLine([r.url, r.status, r.title, ref.from, ref.raw_href, ref.discovered_by]));
    }
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
}
|
||||||
|
/**
 * Record that `to` was discovered from `from`.
 *
 * Appends the edge to the module-level `edges` list and adds the same
 * provenance (without `to`) to the `referrers` entry for `to`, creating that
 * entry on first use. A falsy `rawHref` is stored as "".
 *
 * @param {string} from - where the link was found
 * @param {string} rawHref - href text as it was found
 * @param {string} to - target URL
 * @param {string} discovered_by - how the link was found
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  const known = referrers.has(to);
  const incoming = known ? referrers.get(to) : [];
  if (!known) {
    referrers.set(to, incoming);
  }
  incoming.push({ from, raw_href, discovered_by });
}
|
||||||
|
|
||||||
|
/* ---------------------- parse HTML without JS --------------------------- */
|
||||||
|
/**
 * Parse JSON text, returning `null` instead of throwing when it is invalid.
 *
 * @param {string} txt - JSON text
 * @returns {*} the parsed value, or null if parsing failed
 */
function safeJsonParse(txt) {
  let value;
  try {
    value = JSON.parse(txt);
  } catch {
    value = null;
  }
  return value;
}
|
||||||
|
/**
 * Collect the distinct `@type` names declared in a page's JSON-LD scripts.
 *
 * Every `<script type="application/ld+json">` is parsed with `safeJsonParse`;
 * scripts that do not parse (or parse to a falsy value) are skipped. The
 * parsed data is walked depth-first, so nested objects and arrays contribute
 * their types too. String `@type` values and string members of array `@type`
 * values are collected.
 *
 * @param {Function} $ - loaded cheerio document
 * @returns {string[]} unique type names in first-seen order
 */
function parseSchemaTypes($) {
  const found = new Set();

  function walk(node) {
    if (!node) return;
    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    if (typeof node !== "object") return;

    const declared = node["@type"];
    if (typeof declared === "string") {
      found.add(declared);
    } else if (Array.isArray(declared)) {
      for (const name of declared) {
        if (typeof name === "string") found.add(name);
      }
    }
    // nested
    for (const child of Object.values(node)) walk(child);
  }

  $('script[type="application/ld+json"]').each((_, el) => {
    const data = safeJsonParse($(el).contents().text());
    if (data) walk(data);
  });

  return Array.from(found);
}
|
||||||
|
/**
 * Extract SEO-relevant fields from static HTML (no JavaScript executed).
 *
 * The title falls back to og:title, then twitter:title, then the first H1
 * when `<title>` is empty. Despite its name, `internalLinks` holds every
 * resolvable `<a href>` as an absolute URL; no same-site filtering is done
 * here.
 *
 * @param {string} html - page markup
 * @param {string} url - page URL, used as the base for relative hrefs
 * @returns {object} title, metaDesc, h1_1, h1_2, h2_1, h2_2, totalHeadings,
 *   canonical, robotsMeta, noindex, nofollow, internalLinks (Set),
 *   rawLinks ({raw, abs}[]), lang, wordCount, schemaTypes, bodyText
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);

  const attrOf = (selector, name) => $(selector).attr(name) || "";
  const textsOf = (selector) => $(selector).map((_, el) => $(el).text().trim()).get();

  // Headings (capture top two H1s and H2s)
  const h1Texts = textsOf("h1");
  const h2Texts = textsOf("h2");
  const h1_1 = h1Texts[0] || "";
  const h1_2 = h1Texts[1] || "";
  const h2_1 = h2Texts[0] || "";
  const h2_2 = h2Texts[1] || "";
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  const ogTitle = attrOf('meta[property="og:title"]', "content");
  const twTitle = attrOf('meta[name="twitter:title"]', "content");
  const documentTitle = ($("title").first().text() || "").trim();
  const title = documentTitle || (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = attrOf('meta[name="description"]', "content").trim();
  const canonical = attrOf('link[rel="canonical"]', "href").trim();
  const robotsMeta = attrOf('meta[name="robots"]', "content").trim();
  const directives = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(directives);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(directives);

  const lang = attrOf("html", "lang").trim();

  // Basic text body for word count / readability
  const rawBody = $("main").text() || $("body").text() || "";
  const bodyText = rawBody.replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Links: absolute form for crawling + raw href for provenance
  const rawLinks = [];
  const internalLinks = new Set();
  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;
    let abs;
    try {
      abs = new URL(href, url).toString();
    } catch {
      return; // href cannot be resolved against the page URL; skip it
    }
    rawLinks.push({ raw: href, abs });
    internalLinks.add(abs);
  });

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1,
    h1_2,
    h2_1,
    h2_2,
    totalHeadings,
    canonical,
    robotsMeta,
    noindex,
    nofollow,
    internalLinks,
    rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText,
  };
}
|
||||||
|
|
||||||
|
/* ------------------------------ fetchers -------------------------------- */
|
||||||
|
/**
 * Fetch a URL over plain HTTP with `got` (no rendering) and summarize the response.
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`); the request
 * timeout is 20 seconds. `bytes` is the Content-Length header when present,
 * otherwise the UTF-8 byte length of the body.
 *
 * @param {string} url - URL to fetch
 * @returns {Promise<object>} status, status_text, time_ms, contentType
 *   (lower-cased), body, bytes, render_mode ("http"), httpVersion, headers
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const response = await got(url, {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false,
  });
  const elapsedMs = Date.now() - startedAt;

  const { headers } = response;
  const contentType = (headers["content-type"] || "").toLowerCase();

  let bytes;
  if (headers["content-length"]) {
    bytes = Number(headers["content-length"]);
  } else {
    bytes = Buffer.byteLength(response.body || "", "utf8");
  }

  return {
    status: response.statusCode ?? null,
    status_text: response.statusMessage ?? "",
    time_ms: elapsedMs,
    contentType,
    body: response.body,
    bytes,
    render_mode: "http",
    httpVersion: response.httpVersion ?? "",
    headers,
  };
}
|
||||||
|
|
||||||
|
/**
 * Launch headless Chromium and open one browser context configured to look
 * like a regular desktop browser (real UA, 1366x768 viewport, en-US locale,
 * the same headers as plain HTTP fetches, and overridden
 * `navigator.webdriver`/`plugins`/`languages`).
 *
 * If anything fails after the browser has been launched, the browser is
 * closed before the error is rethrown so the Chromium process is not leaked.
 *
 * @returns {Promise<{browser: object, context: object}>} the launched browser
 *   and its context; the caller is responsible for `browser.close()`
 * @throws whatever `chromium.launch`, `newContext` or `addInitScript` throws
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });

  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });

    // Runs in every page before its own scripts.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });

    return { browser: context.browser(), context };
  } catch (err) {
    // Best-effort cleanup: a failure to close must not hide the original error.
    try {
      await browser.close();
    } catch (closeErr) {
      console.error("Failed to close browser after setup error:", closeErr);
    }
    throw err;
  }
}
|
||||||
|
|
||||||
|
/**
 * Load a URL in a new page of the shared browser context and extract the SEO
 * fields from the rendered DOM.
 *
 * Sequence: navigate (30s timeout, DOMContentLoaded) → best-effort wait for
 * network idle (12s) → best-effort wait until the page has some text or a
 * heading-like element (8s) → read fields inside the page → parse JSON-LD
 * `@type` names outside the page. The page is always closed afterwards; a
 * failure while closing is logged and does not replace the result or the
 * original error.
 *
 * @param {string} url - URL to render
 * @param {{context: object}} shared - object holding the browser context
 *   (`shared.context.newPage()` is used)
 * @returns {Promise<object>} status, status_text, time_ms, contentType
 *   (always "text/html"), bytes (length of the rendered outer HTML),
 *   render_mode ("rendered"), headers, and `domExtract` with the page fields
 * @throws whatever `newPage`, `page.goto` or `page.evaluate` throws
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const t0 = Date.now();

  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    const status = resp?.status() ?? null;
    const statusText = resp?.statusText() ?? "";

    let mainHeaders = {};
    try {
      mainHeaders = resp ? await resp.headers() : {};
    } catch {
      // headers are optional extras; keep the empty default
    }

    // Both waits are best-effort: on timeout we extract whatever has rendered.
    try {
      await page.waitForLoadState("networkidle", { timeout: 12000 });
    } catch {
      // network never went idle within the timeout
    }
    try {
      await page.waitForFunction(() => {
        const main = document.querySelector("main") || document.body;
        const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
        const hasHeading = !!document.querySelector("h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']");
        return textLen > 160 || hasHeading;
      }, { timeout: 8000 });
    } catch {
      // content condition not met within the timeout
    }

    // Everything inside this callback runs in the page, so it must be self-contained.
    const dom = await page.evaluate(() => {
      const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
      const getTextList = (sel) => Array.from(document.querySelectorAll(sel))
        .map((el) => clean(el.textContent))
        .filter(Boolean);

      const title = document.title || "";
      const ogTitle = document.querySelector('meta[property="og:title"]')?.content || "";
      const twTitle = document.querySelector('meta[name="twitter:title"]')?.content || "";
      const metaDesc = document.querySelector('meta[name="description"]')?.content || "";
      const canonical = document.querySelector('link[rel="canonical"]')?.href || "";
      const robotsMeta = document.querySelector('meta[name="robots"]')?.content || "";
      const lang = document.documentElement.getAttribute("lang") || "";

      const h1 = getTextList("h1");
      const h2 = getTextList("h2");
      const totalHeadings = document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length;

      const links = Array.from(document.querySelectorAll("a[href]"))
        .map((a) => {
          const raw = a.getAttribute("href");
          try { return { raw, abs: new URL(raw, location.href).toString() }; }
          catch { return null; }
        })
        .filter(Boolean);

      const bodyText = clean((document.querySelector("main") || document.body).innerText || "");
      const schemaScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
        .map((s) => s.textContent || "");

      return {
        htmlLen: (document.documentElement.outerHTML || "").length,
        title, ogTitle, twTitle, metaDesc, canonical, robotsMeta, lang,
        h1, h2, totalHeadings,
        links,
        bodyText,
        schemaScripts,
      };
    });

    // Parse schema types from strings (outside of page)
    const schemaTypes = new Set();
    const collect = (obj) => {
      if (!obj) return;
      if (Array.isArray(obj)) { obj.forEach((item) => collect(item)); return; }
      if (typeof obj !== "object") return;
      const t = obj["@type"];
      if (typeof t === "string") schemaTypes.add(t);
      else if (Array.isArray(t)) t.forEach((x) => typeof x === "string" && schemaTypes.add(x));
      Object.values(obj).forEach((value) => collect(value));
    };
    for (const raw of dom.schemaScripts || []) {
      try {
        collect(JSON.parse(raw));
      } catch {
        // invalid JSON-LD block; ignore it
      }
    }

    const dt = Date.now() - t0;
    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();

    return {
      status,
      status_text: statusText,
      time_ms: dt,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract: {
        title: finalTitle,
        metaDesc: dom.metaDesc || "",
        canonical: dom.canonical || "",
        robotsMeta: dom.robotsMeta || "",
        lang: dom.lang || "",
        noindex, nofollow,
        h1_1: dom.h1?.[0] || "",
        h1_2: dom.h1?.[1] || "",
        h2_1: dom.h2?.[0] || "",
        h2_2: dom.h2?.[1] || "",
        totalHeadings: dom.totalHeadings || 0,
        links: new Set((dom.links || []).map((l) => l.abs)),
        rawLinks: dom.links || [],
        bodyText: dom.bodyText || "",
        schemaTypes: Array.from(schemaTypes),
      },
    };
  } finally {
    // A throw from close() here would override the return value above (or mask
    // the real error), so it is logged instead of propagated.
    try {
      await page.close();
    } catch (closeErr) {
      console.error(`[WARN] page.close() failed for ${url}: ${closeErr?.message ?? closeErr}`);
    }
  }
}
|
||||||
|
|
||||||
|
/* ------------------------- render decision ------------------------------ */
|
||||||
|
/**
 * Decide whether a page fetched over plain HTTP should be re-fetched in a browser.
 *
 * Rendering is requested when any of these holds:
 *  - the response is under 4000 bytes (tiny HTML shell),
 *  - no headings were found in the static HTML,
 *  - a non-root page carries exactly the home page's title.
 *
 * @param {string} currentUrl - absolute URL of the page (must be parseable)
 * @param {{bytes?: number}} httpRes - HTTP fetch summary
 * @param {{totalHeadings: number, title: string}} parsed - static parse result
 * @param {string|null} homeTitle - title recorded for the site root, if known
 * @returns {boolean} true when the page should be rendered
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const route = new URL(currentUrl).pathname;
  return (
    (httpRes.bytes ?? 0) < 4000 || // tiny HTML shell
    parsed.totalHeadings === 0 ||
    Boolean(homeTitle && parsed.title && parsed.title === homeTitle && route !== "/")
  );
}
|
||||||
|
/**
 * Return the URL with a `www.` prefix on its hostname (added only when it is
 * not already there). Input that cannot be parsed as a URL is returned unchanged.
 *
 * @param {string} urlStr - absolute URL
 * @returns {string} the serialized URL, or `urlStr` itself when parsing fails
 */
function withWWW(urlStr) {
  try {
    const parsed = new URL(urlStr);
    const alreadyPrefixed = parsed.hostname.startsWith("www.");
    if (!alreadyPrefixed) {
      parsed.hostname = "www." + parsed.hostname;
    }
    return parsed.toString();
  } catch {
    return urlStr;
  }
}
|
||||||
|
|
||||||
|
/* ------------------------ per-page enrichers ---------------------------- */
|
||||||
|
/**
 * Estimate the rendered width of `text` in pixels.
 *
 * Uses `pixelWidth` (string-pixel-width) for the given font and size. If that
 * call throws, falls back to a rough estimate of half the font size per
 * character. Empty text measures 0.
 *
 * @param {string} text - text to measure
 * @param {number} [size=16] - font size in pixels
 * @param {string} [font="arial"] - font name passed to `pixelWidth`
 * @returns {number} estimated width in pixels
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) {
    return 0;
  }
  let width;
  try {
    width = pixelWidth(text, { font, size });
  } catch {
    width = Math.round(text.length * size * 0.5);
  }
  return width;
}
|
||||||
|
/**
 * Compute readability scores for a block of text.
 *
 * Only the first 200000 characters are scored. Each metric is computed
 * independently: a metric whose scorer throws is simply left out of the result.
 *
 * @param {string} text - text to score
 * @returns {object} any of flesch_reading_ease, flesch_kincaid_grade,
 *   gunning_fog, coleman_liau, ari, smog; `{}` for empty text
 */
function computeReadability(text) {
  if (!text) return {};

  const sample = text.slice(0, 200000); // cap
  const scorers = [
    ["flesch_reading_ease", "fleschReadingEase"],
    ["flesch_kincaid_grade", "fleschKincaidGrade"],
    ["gunning_fog", "gunningFog"],
    ["coleman_liau", "colemanLiauIndex"],
    ["ari", "automatedReadabilityIndex"],
    ["smog", "smogIndex"],
  ];

  const scores = {};
  for (const [field, method] of scorers) {
    try {
      scores[field] = readability[method](sample);
    } catch {
      // this metric is unavailable for the text; omit it
    }
  }
  return scores;
}
|
||||||
|
|
||||||
|
/* -------------------------------- main ---------------------------------- */
|
||||||
|
// async function crawl(startUrl, maxPages = 50) {
|
||||||
|
|
||||||
|
export async function crawl(startUrl, maxPages = 50) {
|
||||||
|
const start = normalizeUrl(startUrl, { stripHash: true });
|
||||||
|
queue.push(start);
|
||||||
|
|
||||||
|
// Seed from sitemap.xml + record provenance
|
||||||
|
try {
|
||||||
|
const sitemapUrls = await getSitemapUrls(start);
|
||||||
|
for (const u of sitemapUrls) {
|
||||||
|
queue.push(u);
|
||||||
|
addEdge("sitemap.xml", u, u, "sitemap");
|
||||||
|
}
|
||||||
|
console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
|
||||||
|
} catch (e) {
|
||||||
|
console.log("⚠️ Sitemap step skipped:", e.message);
|
||||||
|
}
|
||||||
|
|
||||||
|
let shared = null;
|
||||||
|
async function getShared() { if (!shared) shared = await createBrowserContext(); return shared; }
|
||||||
|
|
||||||
|
let homeTitle = null;
|
||||||
|
|
||||||
|
while (queue.length > 0 && visited.size < maxPages) {
|
||||||
|
const url = queue.shift();
|
||||||
|
if (!url) continue;
|
||||||
|
|
||||||
|
const normUrl = normalizeUrl(url, { stripHash: true });
|
||||||
|
if (visited.has(normUrl)) continue;
|
||||||
|
visited.add(normUrl);
|
||||||
|
|
||||||
|
let attemptUrls = [normUrl];
|
||||||
|
let usedWWWRetry = false;
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
|
||||||
|
const currentUrl = attemptUrls[attempt];
|
||||||
|
try {
|
||||||
|
// 1) HTTP fetch
|
||||||
|
let pageRes = await fetchWithGot(currentUrl);
|
||||||
|
|
||||||
|
let parsed = {
|
||||||
|
title: "", metaDesc: "", h1_1: "", h1_2: "", h2_1: "", h2_2: "",
|
||||||
|
totalHeadings: 0, canonical: "", robotsMeta: "", noindex: false, nofollow: false,
|
||||||
|
internalLinks: new Set(), rawLinks: [],
|
||||||
|
lang: "", wordCount: 0, bodyText: "", schemaTypes: []
|
||||||
|
};
|
||||||
|
if (pageRes.contentType.includes("text/html")) {
|
||||||
|
const p = parseHtml(pageRes.body || "", currentUrl);
|
||||||
|
parsed = { ...parsed, ...p };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!homeTitle && new URL(currentUrl).pathname === "/") {
|
||||||
|
homeTitle = parsed.title || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2) Render if needed
|
||||||
|
if (pageRes.contentType.includes("text/html") && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
|
||||||
|
const s = await getShared();
|
||||||
|
const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
|
||||||
|
if (rendered.domExtract) {
|
||||||
|
pageRes = { ...rendered, body: null };
|
||||||
|
parsed = {
|
||||||
|
...parsed,
|
||||||
|
title: rendered.domExtract.title,
|
||||||
|
metaDesc: rendered.domExtract.metaDesc,
|
||||||
|
h1_1: rendered.domExtract.h1_1,
|
||||||
|
h1_2: rendered.domExtract.h1_2,
|
||||||
|
h2_1: rendered.domExtract.h2_1,
|
||||||
|
h2_2: rendered.domExtract.h2_2,
|
||||||
|
totalHeadings: rendered.domExtract.totalHeadings,
|
||||||
|
canonical: rendered.domExtract.canonical,
|
||||||
|
robotsMeta: rendered.domExtract.robotsMeta,
|
||||||
|
noindex: rendered.domExtract.noindex,
|
||||||
|
nofollow: rendered.domExtract.nofollow,
|
||||||
|
internalLinks: rendered.domExtract.links,
|
||||||
|
rawLinks: rendered.domExtract.rawLinks,
|
||||||
|
lang: rendered.domExtract.lang || parsed.lang,
|
||||||
|
bodyText: rendered.domExtract.bodyText || parsed.bodyText,
|
||||||
|
wordCount: (rendered.domExtract.bodyText || "").split(/\s+/).filter(Boolean).length,
|
||||||
|
schemaTypes: rendered.domExtract.schemaTypes
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If still looks empty, try www once
|
||||||
|
if (!usedWWWRetry && parsed.totalHeadings === 0 && !parsed.h1_1) {
|
||||||
|
attemptUrls.push(withWWW(currentUrl));
|
||||||
|
usedWWWRetry = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enqueue internal links + record provenance
|
||||||
|
for (const link of parsed.internalLinks) {
|
||||||
|
if (isInternal(start, link)) {
|
||||||
|
const ln = normalizeUrl(link, { stripHash: true });
|
||||||
|
const rawMatch = (parsed.rawLinks || []).find(r => r.abs === link)?.raw ?? "";
|
||||||
|
addEdge(currentUrl, rawMatch, ln, pageRes.render_mode);
|
||||||
|
if (!visited.has(ln)) queue.push(ln);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-page metrics & enrichers ----
|
||||||
|
const title = parsed.title || "";
|
||||||
|
const metaDesc = parsed.metaDesc || "";
|
||||||
|
const h1_1 = parsed.h1_1 || "";
|
||||||
|
const h1_2 = parsed.h1_2 || "";
|
||||||
|
const lang = parsed.lang || "";
|
||||||
|
const bodyText = parsed.bodyText || "";
|
||||||
|
const wordCount = parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
|
||||||
|
|
||||||
|
const titlePx = measurePixelWidth(title, 16, "arial");
|
||||||
|
const descPx = measurePixelWidth(metaDesc, 14, "arial");
|
||||||
|
const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
|
||||||
|
const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
|
||||||
|
|
||||||
|
const read = computeReadability(bodyText);
|
||||||
|
|
||||||
|
const headers = pageRes.headers || {};
|
||||||
|
const xRobots = (headers["x-robots-tag"] || headers["x-robots-tag".toLowerCase()]) ?? "";
|
||||||
|
const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
|
||||||
|
const setCookie = !!headers["set-cookie"];
|
||||||
|
|
||||||
|
const outlinks = parsed.internalLinks.size;
|
||||||
|
const inlinks = (referrers.get(currentUrl) || []).length;
|
||||||
|
|
||||||
|
// Save page row
|
||||||
|
results.push({
|
||||||
|
url: currentUrl,
|
||||||
|
status: pageRes.status,
|
||||||
|
status_text: pageRes.status_text ?? "",
|
||||||
|
time_ms: pageRes.time_ms,
|
||||||
|
bytes: pageRes.bytes,
|
||||||
|
content_type: pageRes.contentType,
|
||||||
|
http_version: pageRes.httpVersion ?? "",
|
||||||
|
title,
|
||||||
|
title_length: title.length,
|
||||||
|
title_pixel_width: titlePx,
|
||||||
|
meta_description: metaDesc,
|
||||||
|
meta_description_length: metaDesc.length,
|
||||||
|
meta_description_pixel_width: descPx,
|
||||||
|
h1_1,
|
||||||
|
h1_1_length: h1_1.length,
|
||||||
|
h1_1_pixel_width: h1_1_px,
|
||||||
|
h1_2,
|
||||||
|
h1_2_length: h1_2.length,
|
||||||
|
h1_2_pixel_width: h1_2_px,
|
||||||
|
h2_1: parsed.h2_1 || "",
|
||||||
|
h2_2: parsed.h2_2 || "",
|
||||||
|
canonical: parsed.canonical,
|
||||||
|
robots_meta: parsed.robotsMeta,
|
||||||
|
x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
|
||||||
|
noindex: parsed.noindex,
|
||||||
|
nofollow: parsed.nofollow,
|
||||||
|
lang,
|
||||||
|
word_count: wordCount,
|
||||||
|
flesch_reading_ease: read.flesch_reading_ease ?? "",
|
||||||
|
flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
|
||||||
|
gunning_fog: read.gunning_fog ?? "",
|
||||||
|
coleman_liau: read.coleman_liau ?? "",
|
||||||
|
ari: read.ari ?? "",
|
||||||
|
smog: read.smog ?? "",
|
||||||
|
schema_types: parsed.schemaTypes || [],
|
||||||
|
inlinks,
|
||||||
|
outlinks,
|
||||||
|
render_mode: pageRes.render_mode,
|
||||||
|
last_modified: lastModified,
|
||||||
|
set_cookie: setCookie,
|
||||||
|
crawl_timestamp: new Date().toISOString()
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${title || h1_1}`
|
||||||
|
);
|
||||||
|
break; // success for this URL; stop attempts
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
|
||||||
|
results.push({
|
||||||
|
url: currentUrl,
|
||||||
|
status: null, status_text: "", time_ms: null, bytes: null, content_type: "",
|
||||||
|
http_version: "", title: "", title_length: 0, title_pixel_width: "",
|
||||||
|
meta_description: "", meta_description_length: 0, meta_description_pixel_width: "",
|
||||||
|
h1_1: "", h1_1_length: 0, h1_1_pixel_width: "", h1_2: "", h1_2_length: 0, h1_2_pixel_width: "",
|
||||||
|
h2_1: "", h2_2: "",
|
||||||
|
canonical: "", robots_meta: "", x_robots_tag: "", noindex: false, nofollow: false,
|
||||||
|
lang: "", word_count: "", flesch_reading_ease: "", flesch_kincaid_grade: "",
|
||||||
|
gunning_fog: "", coleman_liau: "", ari: "", smog: "",
|
||||||
|
schema_types: [], inlinks: 0, outlinks: 0, render_mode: "error",
|
||||||
|
last_modified: "", set_cookie: "", crawl_timestamp: new Date().toISOString()
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (shared) await shared.browser.close();
|
||||||
|
|
||||||
|
// -------------------- Post-process: duplicates & similarity -------------
|
||||||
|
// Titles
|
||||||
|
const titleMap = new Map();
|
||||||
|
for (const r of results) {
|
||||||
|
const key = (r.title || "").trim();
|
||||||
|
if (!titleMap.has(key)) titleMap.set(key, []);
|
||||||
|
titleMap.get(key).push(r);
|
||||||
|
}
|
||||||
|
for (const [t, arr] of titleMap.entries()) {
|
||||||
|
if (!t) continue;
|
||||||
|
const isDup = arr.length > 1;
|
||||||
|
for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Meta descriptions
|
||||||
|
const descMap = new Map();
|
||||||
|
for (const r of results) {
|
||||||
|
const key = (r.meta_description || "").trim();
|
||||||
|
if (!descMap.has(key)) descMap.set(key, []);
|
||||||
|
descMap.get(key).push(r);
|
||||||
|
}
|
||||||
|
for (const [d, arr] of descMap.entries()) {
|
||||||
|
if (!d) continue;
|
||||||
|
const isDup = arr.length > 1;
|
||||||
|
for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nearest neighbor similarities (within site, lightweight)
|
||||||
|
const titleList = results.map(r => ({ url: r.url, text: (r.title || "").trim() }));
|
||||||
|
const descList = results.map(r => ({ url: r.url, text: (r.meta_description || "").trim() }));
|
||||||
|
for (const r of results) {
|
||||||
|
// titles
|
||||||
|
const others = titleList.filter(x => x.url !== r.url && x.text);
|
||||||
|
let bestT = { rating: 0, target: "" };
|
||||||
|
if (r.title && others.length) {
|
||||||
|
const ratings = stringSimilarity.findBestMatch(r.title, others.map(x => x.text));
|
||||||
|
const best = ratings.bestMatch;
|
||||||
|
bestT.rating = best.rating;
|
||||||
|
const idx = ratings.ratings.findIndex(x => x.rating === best.rating);
|
||||||
|
bestT.target = others[idx]?.url || "";
|
||||||
|
}
|
||||||
|
r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
|
||||||
|
r.nearest_title_url = bestT.target;
|
||||||
|
|
||||||
|
// descriptions
|
||||||
|
const othersD = descList.filter(x => x.url !== r.url && x.text);
|
||||||
|
let bestD = { rating: 0, target: "" };
|
||||||
|
if (r.meta_description && othersD.length) {
|
||||||
|
const ratingsD = stringSimilarity.findBestMatch(r.meta_description, othersD.map(x => x.text));
|
||||||
|
const best = ratingsD.bestMatch;
|
||||||
|
bestD.rating = best.rating;
|
||||||
|
const idx = ratingsD.ratings.findIndex(x => x.rating === best.rating);
|
||||||
|
bestD.target = othersD[idx]?.url || "";
|
||||||
|
}
|
||||||
|
r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
|
||||||
|
r.nearest_description_url = bestD.target;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);
|
||||||
|
writePageReports(results);
|
||||||
|
writeLinkEdges(edges);
|
||||||
|
writeErrors(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
// // CLI: node crawler.js https://site.com 200
|
||||||
|
// const START_URL = process.argv[2] || "https://example.com";
|
||||||
|
// const MAX_PAGES = Number(process.argv[3] || 100);
|
||||||
|
// crawl(START_URL, MAX_PAGES);
|
||||||
921
crawler.js
Normal file
921
crawler.js
Normal file
@ -0,0 +1,921 @@
|
|||||||
|
// crawler.js
|
||||||
|
import got from "got";
|
||||||
|
import * as cheerio from "cheerio";
|
||||||
|
import normalizeUrl from "normalize-url";
|
||||||
|
import { isInternal } from "./utils/urlHelpers.js";
|
||||||
|
import { getSitemapUrls } from "./utils/sitemap.js";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
import { chromium } from "playwright";
|
||||||
|
|
||||||
|
// NEW libs
|
||||||
|
import pixelWidth from "string-pixel-width";
|
||||||
|
import * as readability from "text-readability";
|
||||||
|
import stringSimilarity from "string-similarity";
|
||||||
|
|
||||||
|
/* ------------------------------ globals --------------------------------- */
|
||||||
|
// NOTE: We'll reset these at the start of crawl() so repeated runs don't share state.
|
||||||
|
// URLs (normalized) that have already been taken off the queue.
const visited = new Set();

// Pending URLs, consumed front-to-back.
const queue = [];

// One report row per processed URL.
const results = [];

// Link provenance: each discovered edge, shaped { from, raw_href, to, discovered_by }.
const edges = [];

// Reverse lookup used by the error report: url -> Array<{ from, raw_href, discovered_by }>.
const referrers = new Map();

// Desktop Chrome identity sent with plain HTTP requests and used for the browser context.
const REAL_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

// Request headers that accompany REAL_UA.
const REAL_HEADERS = {
  "user-agent": REAL_UA,
  "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "upgrade-insecure-requests": "1",
};
|
||||||
|
|
||||||
|
/* ------------------------------ utils ----------------------------------- */
|
||||||
|
/**
 * Serialize one value as a CSV field.
 *
 * `null` and `undefined` become an empty field; every other value is
 * stringified. The field is wrapped in double quotes (with embedded quotes
 * doubled) when it contains a quote, a comma, or a line break.
 *
 * @param {unknown} v - Cell value.
 * @returns {string} Field text, ready to be joined with commas.
 */
function csvEscape(v) {
  if (v === undefined || v === null) return "";
  const s = String(v);
  // "\r" has to trigger quoting as well: CSV readers treat a bare carriage
  // return as a row break, so an unquoted one would split the record.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
}
|
||||||
|
/**
 * Create `dir`, including any missing parent directories, unless something
 * already exists at that path.
 *
 * @param {string} dir - Directory path to create.
 */
function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
|
||||||
|
/**
 * Persist the per-page crawl rows as pretty-printed JSON under `reports/`.
 *
 * The CSV rendering of the same rows is assembled in memory but is only
 * written to disk if the commented-out lines near the end are re-enabled.
 *
 * @param {object[]} rows - Page rows as produced by the crawl.
 * @returns {{ json: string }} Absolute path of the JSON report.
 */
function writePageReports(rows) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const base = path.join("reports", `crawl-${stamp}`);

  fs.writeFileSync(`${base}.json`, JSON.stringify(rows, null, 2), "utf8");
  console.log(`\n📝 Full JSON report saved: ${base}.json`);

  // Accessor factories: value as stored, value or "", value or 0.
  const raw = (key) => (r) => r[key];
  const orBlank = (key) => (r) => r[key] ?? "";
  const orZero = (key) => (r) => r[key] ?? 0;

  // Columns (a Screaming-Frog-ish shape with our extras): [header, accessor].
  const columns = [
    ["url", raw("url")],
    ["status", raw("status")],
    ["status_text", orBlank("status_text")],
    ["time_ms", raw("time_ms")],
    ["bytes", raw("bytes")],
    ["content_type", raw("content_type")],
    ["http_version", orBlank("http_version")],
    ["title", raw("title")],
    ["title_length", raw("title_length")],
    ["title_pixel_width", raw("title_pixel_width")],
    ["meta_description", raw("meta_description")],
    ["meta_description_length", raw("meta_description_length")],
    ["meta_description_pixel_width", raw("meta_description_pixel_width")],
    ["h1_1", orBlank("h1_1")],
    ["h1_1_length", orZero("h1_1_length")],
    ["h1_1_pixel_width", orBlank("h1_1_pixel_width")],
    ["h1_2", orBlank("h1_2")],
    ["h1_2_length", orZero("h1_2_length")],
    ["h1_2_pixel_width", orBlank("h1_2_pixel_width")],
    ["h2_1", orBlank("h2_1")],
    ["h2_2", orBlank("h2_2")],
    ["canonical", raw("canonical")],
    ["robots_meta", raw("robots_meta")],
    ["x_robots_tag", orBlank("x_robots_tag")],
    ["noindex", raw("noindex")],
    ["nofollow", raw("nofollow")],
    ["lang", orBlank("lang")],
    ["word_count", orBlank("word_count")],
    ["flesch_reading_ease", orBlank("flesch_reading_ease")],
    ["flesch_kincaid_grade", orBlank("flesch_kincaid_grade")],
    ["gunning_fog", orBlank("gunning_fog")],
    ["coleman_liau", orBlank("coleman_liau")],
    ["ari", orBlank("ari")],
    ["smog", orBlank("smog")],
    ["schema_types", (r) => (Array.isArray(r.schema_types) ? r.schema_types.join("|") : "")],
    ["inlinks", orZero("inlinks")],
    ["outlinks", orZero("outlinks")],
    ["render_mode", raw("render_mode")],
    ["last_modified", orBlank("last_modified")],
    ["set_cookie", (r) => (r.set_cookie ? "yes" : "no")],
    ["crawl_timestamp", orBlank("crawl_timestamp")],
    ["duplicate_title_exact", orBlank("duplicate_title_exact")],
    ["nearest_title_similarity", orBlank("nearest_title_similarity")],
    ["nearest_title_url", orBlank("nearest_title_url")],
    ["duplicate_description_exact", orBlank("duplicate_description_exact")],
    ["nearest_description_similarity", orBlank("nearest_description_similarity")],
    ["nearest_description_url", orBlank("nearest_description_url")],
  ];

  const headers = columns.map(([name]) => name);
  const lines = [headers.join(",")];
  for (const r of rows) {
    const cells = columns.map(([, pick]) => pick(r));
    lines.push(cells.map(csvEscape).join(","));
  }

  // If you also want CSV persisted, uncomment:
  // fs.writeFileSync(`${base}.csv`, lines.join("\n"), "utf8");
  // console.log(`📝 CSV report saved: ${base}.csv`);

  return { json: path.resolve(`${base}.json`) /*, csv: path.resolve(`${base}.csv`)*/ };
}
|
||||||
|
/**
 * Write the link-provenance edges to `reports/links-<timestamp>.csv`.
 *
 * @param {Iterable<{from: string, raw_href: string, to: string, discovered_by: string}>} edges
 *   Edges to dump (this parameter shadows the module-level `edges`).
 * @returns {{ linksCsv: string }} Absolute path of the CSV that was written.
 */
function writeLinkEdges(edges) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `links-${stamp}.csv`);

  const headerLine = ["from", "raw_href", "to", "discovered_by"].join(",");
  const bodyLines = Array.from(edges, (edge) => {
    const cells = [edge.from, edge.raw_href, edge.to, edge.discovered_by];
    return cells.map(csvEscape).join(",");
  });

  fs.writeFileSync(file, [headerLine, ...bodyLines].join("\n"), "utf8");
  console.log(`🔗 Link provenance saved: ${file}`);
  return { linksCsv: path.resolve(file) };
}
|
||||||
|
/**
 * Write `reports/errors-<timestamp>.csv`, listing every row whose HTTP status
 * is 400 or above, once per known referrer (or once with blank referrer
 * columns when none was recorded). Rows with a `null` status are not included.
 *
 * @param {object[]} rows - Page rows; `url`, `status` and `title` are read.
 * @returns {{ errorsCsv: string }} Absolute path of the CSV that was written.
 */
function writeErrors(rows) {
  ensureDir("reports");
  const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
  const file = path.join("reports", `errors-${stamp}.csv`);

  const toLine = (cells) => cells.map(csvEscape).join(",");
  const lines = [["url", "status", "title", "from_page", "raw_href", "discovered_by"].join(",")];

  for (const r of rows) {
    const isHttpError = Boolean(r) && r.status !== null && r.status >= 400;
    if (!isHttpError) continue;

    const refs = referrers.get(r.url) || [];
    if (refs.length === 0) {
      lines.push(toLine([r.url, r.status, r.title, "", "", ""]));
      continue;
    }
    for (const ref of refs) {
      lines.push(toLine([r.url, r.status, r.title, ref.from, ref.raw_href, ref.discovered_by]));
    }
  }

  fs.writeFileSync(file, lines.join("\n"), "utf8");
  console.log(`❗ Error report saved: ${file}`);
  return { errorsCsv: path.resolve(file) };
}
|
||||||
|
/**
 * Record that `to` was discovered from `from`: append the edge to the
 * module-level `edges` list and add a referrer entry under `to` in `referrers`.
 *
 * @param {string} from - Source page (or a pseudo-source such as "sitemap.xml").
 * @param {string} [rawHref] - Href as written in the source; falsy values are stored as "".
 * @param {string} to - Target URL.
 * @param {string} discovered_by - How the link was found.
 */
function addEdge(from, rawHref, to, discovered_by) {
  const raw_href = rawHref || "";
  edges.push({ from, raw_href, to, discovered_by });

  let incoming;
  if (referrers.has(to)) {
    incoming = referrers.get(to);
  } else {
    incoming = [];
    referrers.set(to, incoming);
  }
  incoming.push({ from, raw_href, discovered_by });
}
|
||||||
|
|
||||||
|
/* ---------------------- parse HTML without JS --------------------------- */
|
||||||
|
/**
 * Parse JSON without throwing.
 *
 * @param {string} txt - JSON text.
 * @returns {*} The parsed value, or `null` when `txt` is not valid JSON.
 */
function safeJsonParse(txt) {
  let value = null;
  try {
    value = JSON.parse(txt);
  } catch {
    // Malformed input: keep the null fallback.
  }
  return value;
}
|
||||||
|
function parseSchemaTypes($) {
|
||||||
|
const types = new Set();
|
||||||
|
$('script[type="application/ld+json"]').each((_, el) => {
|
||||||
|
const raw = $(el).contents().text();
|
||||||
|
const parsed = safeJsonParse(raw);
|
||||||
|
if (!parsed) return;
|
||||||
|
const collect = (obj) => {
|
||||||
|
if (!obj) return;
|
||||||
|
if (Array.isArray(obj)) {
|
||||||
|
obj.forEach(collect);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (typeof obj === "object") {
|
||||||
|
const t = obj["@type"];
|
||||||
|
if (typeof t === "string") types.add(t);
|
||||||
|
else if (Array.isArray(t)) t.forEach((x) => typeof x === "string" && types.add(x));
|
||||||
|
// nested
|
||||||
|
Object.values(obj).forEach(collect);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
collect(parsed);
|
||||||
|
});
|
||||||
|
return [...types];
|
||||||
|
}
|
||||||
|
/**
 * Extract page metadata from raw (un-rendered) HTML with cheerio.
 *
 * The title falls back to og:title, then twitter:title, then the first H1
 * when `<title>` is empty. `internalLinks` holds every href that resolves
 * against `url`, regardless of host.
 *
 * @param {string} html - Markup to parse.
 * @param {string} url - Page URL, used as the base for resolving hrefs.
 * @returns {object} title, metaDesc, h1_1, h1_2, h2_1, h2_2, totalHeadings,
 *   canonical, robotsMeta, noindex, nofollow, internalLinks (Set of absolute
 *   URLs), rawLinks (Array of { raw, abs }), lang, wordCount, schemaTypes, bodyText.
 */
function parseHtml(html, url) {
  const $ = cheerio.load(html);

  const attrOf = (selector, name) => $(selector).attr(name) || "";
  const textsOf = (selector) =>
    $(selector)
      .toArray()
      .map((el) => $(el).text().trim());

  // Headings (capture top two H1s and H2s)
  const [h1_1 = "", h1_2 = ""] = textsOf("h1");
  const [h2_1 = "", h2_2 = ""] = textsOf("h2");
  const totalHeadings = $("h1,h2,h3,h4,h5,h6,[role='heading']").length;

  const documentTitle = ($("title").first().text() || "").trim();
  const ogTitle = attrOf('meta[property="og:title"]', "content");
  const twTitle = attrOf('meta[name="twitter:title"]', "content");
  const title = documentTitle || (ogTitle || twTitle || h1_1 || "").trim();

  const metaDesc = attrOf('meta[name="description"]', "content").trim();
  const canonical = attrOf('link[rel="canonical"]', "href").trim();
  const robotsMeta = attrOf('meta[name="robots"]', "content").trim();
  const robotsLower = robotsMeta.toLowerCase();
  const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
  const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);

  const lang = attrOf("html", "lang").trim();

  // Basic text body for word count / readability
  const rawText = $("main").text() || $("body").text() || "";
  const bodyText = rawText.replace(/\s+/g, " ").trim();
  const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;

  // Resolved links + raw href; hrefs that cannot be resolved are skipped.
  const internalLinks = new Set();
  const rawLinks = [];
  $("a[href]").each((_, anchor) => {
    const href = $(anchor).attr("href");
    if (!href) return;
    try {
      const abs = new URL(href, url).toString();
      rawLinks.push({ raw: href, abs });
      internalLinks.add(abs);
    } catch {}
  });

  // Schema.org JSON-LD types
  const schemaTypes = parseSchemaTypes($);

  return {
    title,
    metaDesc,
    h1_1,
    h1_2,
    h2_1,
    h2_2,
    totalHeadings,
    canonical,
    robotsMeta,
    noindex,
    nofollow,
    internalLinks,
    rawLinks,
    lang,
    wordCount,
    schemaTypes,
    bodyText,
  };
}
|
||||||
|
|
||||||
|
/* ------------------------------ fetchers -------------------------------- */
|
||||||
|
/**
 * Fetch `url` over plain HTTP with got (no JavaScript rendering).
 *
 * HTTP error statuses do not throw (`throwHttpErrors: false`). `bytes` is the
 * Content-Length header when present, otherwise the UTF-8 byte length of the body.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<object>} status, status_text, time_ms, contentType
 *   (lower-cased), body, bytes, render_mode ("http"), httpVersion, headers.
 */
async function fetchWithGot(url) {
  const startedAt = Date.now();
  const requestOptions = {
    timeout: { request: 20000 },
    throwHttpErrors: false,
    headers: REAL_HEADERS,
    http2: false,
  };
  const res = await got(url, requestOptions);
  const elapsedMs = Date.now() - startedAt;

  const contentType = (res.headers["content-type"] || "").toLowerCase();

  let bytes;
  if (res.headers["content-length"]) {
    bytes = Number(res.headers["content-length"]);
  } else {
    bytes = Buffer.byteLength(res.body || "", "utf8");
  }

  return {
    status: res.statusCode ?? null,
    status_text: res.statusMessage ?? "",
    time_ms: elapsedMs,
    contentType,
    body: res.body,
    bytes,
    render_mode: "http",
    httpVersion: res.httpVersion ?? "",
    headers: res.headers,
  };
}
|
||||||
|
|
||||||
|
/**
 * Launch headless Chromium and open one browser context configured to look
 * like a regular desktop Chrome session (REAL_UA / REAL_HEADERS, 1366x768,
 * en-US), with an init script that overrides a few `navigator` properties.
 *
 * If the context cannot be created or configured, the freshly launched
 * browser is closed before the error is rethrown, so a failed setup does not
 * leave a Chromium process behind.
 *
 * @returns {Promise<{browser: object, context: object}>} The launched browser
 *   and its context. The caller is responsible for closing the browser.
 * @throws Whatever Playwright raised while launching or configuring.
 */
async function createBrowserContext() {
  const browser = await chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"],
  });

  try {
    const context = await browser.newContext({
      ignoreHTTPSErrors: true, // Ignore SSL certificate errors
      userAgent: REAL_UA,
      viewport: { width: 1366, height: 768 },
      deviceScaleFactor: 1,
      isMobile: false,
      locale: "en-US",
      extraHTTPHeaders: REAL_HEADERS,
    });

    // Runs in every page before its own scripts.
    await context.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
      Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3] });
      Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
    });

    return { browser, context };
  } catch (err) {
    // Best-effort cleanup: a failure while closing must not hide the original error.
    await browser.close().catch(() => {});
    throw err;
  }
}
|
||||||
|
|
||||||
|
/**
 * Load `url` in a new page of the shared browser context, wait (best effort)
 * for the page to settle, and extract metadata from the live DOM.
 *
 * Navigation errors propagate to the caller; the page is closed either way.
 * Header lookup, the network-idle wait and the content-ready wait are each
 * allowed to fail silently. `contentType` is always reported as "text/html"
 * and `bytes` is the length of the serialized document.
 *
 * @param {string} url - URL to render.
 * @param {{context: object}} shared - Holder of the browser context to open the page in.
 * @returns {Promise<object>} status, status_text, time_ms, contentType, bytes,
 *   render_mode ("rendered"), headers, and `domExtract` with the page fields.
 */
async function fetchWithPlaywrightAndExtract(url, shared) {
  const page = await shared.context.newPage();
  const startedAt = Date.now();

  // Run a step whose failure (typically a timeout) should not abort the fetch.
  const bestEffort = async (step) => {
    try {
      await step();
    } catch {}
  };

  // Evaluated inside the page: true once there is enough text or any heading-like element.
  const contentLooksReady = () => {
    const main = document.querySelector("main") || document.body;
    const textLen = (main?.innerText || "").replace(/\s+/g, " ").trim().length;
    const hasHeading = !!document.querySelector(
      "h1, h2, [role='heading'], [class*='title'], [class*='heading'], [class*='hero'], [class*='banner']"
    );
    return textLen > 160 || hasHeading;
  };

  // Evaluated inside the page: must stay self-contained (no outer-scope references).
  const snapshotDom = () => {
    const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
    const contentOf = (selector) => document.querySelector(selector)?.content || "";
    const getTextList = (selector) => {
      const texts = [];
      for (const el of document.querySelectorAll(selector)) {
        const text = clean(el.textContent);
        if (text) texts.push(text);
      }
      return texts;
    };

    const links = [];
    for (const a of document.querySelectorAll("a[href]")) {
      const raw = a.getAttribute("href");
      try {
        links.push({ raw, abs: new URL(raw, location.href).toString() });
      } catch {
        // unresolvable href: skip it
      }
    }

    const schemaScripts = [];
    for (const s of document.querySelectorAll('script[type="application/ld+json"]')) {
      schemaScripts.push(s.textContent || "");
    }

    return {
      htmlLen: (document.documentElement.outerHTML || "").length,
      title: document.title || "",
      ogTitle: contentOf('meta[property="og:title"]'),
      twTitle: contentOf('meta[name="twitter:title"]'),
      metaDesc: contentOf('meta[name="description"]'),
      canonical: document.querySelector('link[rel="canonical"]')?.href || "",
      robotsMeta: contentOf('meta[name="robots"]'),
      lang: document.documentElement.getAttribute("lang") || "",
      h1: getTextList("h1"),
      h2: getTextList("h2"),
      totalHeadings: document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']").length,
      links,
      bodyText: clean((document.querySelector("main") || document.body).innerText || ""),
      schemaScripts,
    };
  };

  // Depth-first walk over parsed JSON-LD, appending every string "@type" to `sink`.
  const gatherTypes = (node, sink) => {
    if (!node) return;
    if (Array.isArray(node)) {
      for (const item of node) gatherTypes(item, sink);
      return;
    }
    if (typeof node !== "object") return;
    const declared = node["@type"];
    if (typeof declared === "string") {
      sink.push(declared);
    } else if (Array.isArray(declared)) {
      for (const entry of declared) {
        if (typeof entry === "string") sink.push(entry);
      }
    }
    for (const child of Object.values(node)) gatherTypes(child, sink);
  };

  try {
    const resp = await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    const status = resp?.status() ?? null;
    const statusText = resp?.statusText() ?? "";

    let mainHeaders = {};
    await bestEffort(async () => {
      mainHeaders = resp ? await resp.headers() : {};
    });
    await bestEffort(() => page.waitForLoadState("networkidle", { timeout: 12000 }));
    await bestEffort(() => page.waitForFunction(contentLooksReady, { timeout: 8000 }));

    const dom = await page.evaluate(snapshotDom);

    // Parse schema types from strings (outside of page); invalid JSON is skipped.
    const schemaTypes = [];
    for (const raw of dom.schemaScripts || []) {
      try {
        gatherTypes(JSON.parse(raw), schemaTypes);
      } catch {}
    }

    const elapsedMs = Date.now() - startedAt;

    const robotsLower = (dom.robotsMeta || "").toLowerCase();
    const noindex = /(^|[,;\s])noindex([,;\s]|$)/.test(robotsLower);
    const nofollow = /(^|[,;\s])nofollow([,;\s]|$)/.test(robotsLower);
    const finalTitle = (dom.title || dom.ogTitle || dom.twTitle || dom.h1?.[0] || "").trim();
    const linkList = dom.links || [];

    const domExtract = {
      title: finalTitle,
      metaDesc: dom.metaDesc || "",
      canonical: dom.canonical || "",
      robotsMeta: dom.robotsMeta || "",
      lang: dom.lang || "",
      noindex,
      nofollow,
      h1_1: dom.h1?.[0] || "",
      h1_2: dom.h1?.[1] || "",
      h2_1: dom.h2?.[0] || "",
      h2_2: dom.h2?.[1] || "",
      totalHeadings: dom.totalHeadings || 0,
      links: new Set(linkList.map((l) => l.abs)),
      rawLinks: linkList,
      bodyText: dom.bodyText || "",
      schemaTypes: [...new Set(schemaTypes)],
    };

    return {
      status,
      status_text: statusText,
      time_ms: elapsedMs,
      contentType: "text/html",
      bytes: dom.htmlLen || 0,
      render_mode: "rendered",
      headers: mainHeaders,
      domExtract,
    };
  } finally {
    await page.close();
  }
}
|
||||||
|
|
||||||
|
/* ------------------------- render decision ------------------------------ */
|
||||||
|
/**
 * Decide whether a page fetched over plain HTTP should be re-fetched with the
 * headless browser.
 *
 * Rendering is requested when the response is under 4000 bytes, when no
 * headings were found, or when a non-root page carries the same title as the
 * home page.
 *
 * @param {string} currentUrl - Absolute URL of the page (must be parseable).
 * @param {{bytes?: number|null}} httpRes - HTTP fetch result.
 * @param {{totalHeadings: number, title?: string}} parsed - Fields parsed from the HTML.
 * @param {string|null} homeTitle - Title recorded for the site root, if any.
 * @returns {boolean} `true` when the page should be rendered.
 */
function shouldRender(currentUrl, httpRes, parsed, homeTitle) {
  const { pathname } = new URL(currentUrl);
  return (
    (httpRes.bytes ?? 0) < 4000 || // tiny HTML shell
    parsed.totalHeadings === 0 ||
    Boolean(homeTitle && parsed.title && parsed.title === homeTitle && pathname !== "/")
  );
}
|
||||||
|
/**
 * Return `urlStr` with a "www." prefix on the hostname when it does not
 * already start with one.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string} The serialized URL, or `urlStr` untouched when it cannot
 *   be parsed.
 */
function withWWW(urlStr) {
  try {
    const target = new URL(urlStr);
    const host = target.hostname;
    if (!host.startsWith("www.")) {
      target.hostname = `www.${host}`;
    }
    return target.toString();
  } catch {
    return urlStr;
  }
}
|
||||||
|
|
||||||
|
/* ------------------------ per-page enrichers ---------------------------- */
|
||||||
|
/**
 * Estimate the rendered width of `text` in pixels.
 *
 * Delegates to `pixelWidth`; if that call throws, falls back to a rough
 * estimate of half the font size per character.
 *
 * @param {string} text - Text to measure; falsy input measures as 0.
 * @param {number} [size=16] - Font size passed to `pixelWidth`.
 * @param {string} [font="arial"] - Font name passed to `pixelWidth`.
 * @returns {number} Width reported by `pixelWidth`, or the rounded estimate.
 */
function measurePixelWidth(text, size = 16, font = "arial") {
  if (!text) {
    return 0;
  }

  let width;
  try {
    width = pixelWidth(text, { font, size });
  } catch {
    width = Math.round(text.length * size * 0.5);
  }
  return width;
}
|
||||||
|
/**
 * Compute readability scores for a block of text.
 *
 * Each metric is best-effort: a metric whose calculation throws is simply
 * left out of the result.
 *
 * @param {string} text - Body text; only the first 200000 characters are scored.
 * @returns {{flesch_reading_ease?: number, flesch_kincaid_grade?: number,
 *   gunning_fog?: number, coleman_liau?: number, ari?: number, smog?: number}}
 *   Scores that could be computed; `{}` for empty input.
 */
function computeReadability(text) {
  if (!text) return {};
  const safe = text.slice(0, 200000); // cap

  // `readability` is a namespace import. When the package is a CommonJS
  // module that exports a single object, its methods are only reachable
  // through `.default`; calling them on the namespace itself throws a
  // TypeError that the per-metric catch below would hide, leaving every score
  // blank. Accept either shape.
  const api = readability.default ?? readability;

  // [result key, method name on the readability API]
  const metrics = [
    ["flesch_reading_ease", "fleschReadingEase"],
    ["flesch_kincaid_grade", "fleschKincaidGrade"],
    ["gunning_fog", "gunningFog"],
    ["coleman_liau", "colemanLiauIndex"],
    ["ari", "automatedReadabilityIndex"],
    ["smog", "smogIndex"],
  ];

  const out = {};
  for (const [key, method] of metrics) {
    try {
      // Called as a method so implementations that rely on `this` keep working.
      out[key] = api[method](safe);
    } catch {
      // Leave this metric unset; the remaining ones are still attempted.
    }
  }
  return out;
}
|
||||||
|
|
||||||
|
/* -------------------------------- main ---------------------------------- */
|
||||||
|
/**
|
||||||
|
* Crawl a site and return a structured report.
|
||||||
|
* @param {string} startUrl
|
||||||
|
* @param {number} maxPages
|
||||||
|
* @param {(tick:any)=>void} [onProgress] optional callback for progress events
|
||||||
|
* @param {{persistReports?: boolean, collectPages?: boolean}} [options]
|
||||||
|
* @returns {{ results: any[], files: Record<string,string>, total: number }}
|
||||||
|
*/
|
||||||
|
export async function crawl(startUrl, maxPages = 50, onProgress, options = {}) {
|
||||||
|
const persistReports = options.persistReports !== false; // default true
|
||||||
|
|
||||||
|
// Reset global state per run
|
||||||
|
visited.clear();
|
||||||
|
queue.length = 0;
|
||||||
|
results.length = 0;
|
||||||
|
edges.length = 0;
|
||||||
|
referrers.clear();
|
||||||
|
|
||||||
|
const start = normalizeUrl(startUrl, { stripHash: true });
|
||||||
|
queue.push(start);
|
||||||
|
|
||||||
|
// Seed from sitemap.xml + record provenance
|
||||||
|
try {
|
||||||
|
const sitemapUrls = await getSitemapUrls(start);
|
||||||
|
for (const u of sitemapUrls) {
|
||||||
|
queue.push(u);
|
||||||
|
addEdge("sitemap.xml", u, u, "sitemap");
|
||||||
|
}
|
||||||
|
console.log(`📌 Seeded ${sitemapUrls.length} URL(s) from sitemap.xml`);
|
||||||
|
} catch (e) {
|
||||||
|
console.log("⚠️ Sitemap step skipped:", e.message);
|
||||||
|
}
|
||||||
|
|
||||||
|
let shared = null;
|
||||||
|
async function getShared() {
|
||||||
|
if (!shared) shared = await createBrowserContext();
|
||||||
|
return shared;
|
||||||
|
}
|
||||||
|
|
||||||
|
let homeTitle = null;
|
||||||
|
|
||||||
|
while (queue.length > 0 && visited.size < maxPages) {
|
||||||
|
const url = queue.shift();
|
||||||
|
if (!url) continue;
|
||||||
|
|
||||||
|
const normUrl = normalizeUrl(url, { stripHash: true });
|
||||||
|
if (visited.has(normUrl)) continue;
|
||||||
|
visited.add(normUrl);
|
||||||
|
|
||||||
|
const attemptUrls = [normUrl];
|
||||||
|
let usedWWWRetry = false;
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt < attemptUrls.length; attempt++) {
|
||||||
|
const currentUrl = attemptUrls[attempt];
|
||||||
|
try {
|
||||||
|
// 1) HTTP fetch
|
||||||
|
let pageRes = await fetchWithGot(currentUrl);
|
||||||
|
|
||||||
|
let parsed = {
|
||||||
|
title: "",
|
||||||
|
metaDesc: "",
|
||||||
|
h1_1: "",
|
||||||
|
h1_2: "",
|
||||||
|
h2_1: "",
|
||||||
|
h2_2: "",
|
||||||
|
totalHeadings: 0,
|
||||||
|
canonical: "",
|
||||||
|
robotsMeta: "",
|
||||||
|
noindex: false,
|
||||||
|
nofollow: false,
|
||||||
|
internalLinks: new Set(),
|
||||||
|
rawLinks: [],
|
||||||
|
lang: "",
|
||||||
|
wordCount: 0,
|
||||||
|
bodyText: "",
|
||||||
|
schemaTypes: [],
|
||||||
|
};
|
||||||
|
if (pageRes.contentType.includes("text/html")) {
|
||||||
|
const p = parseHtml(pageRes.body || "", currentUrl);
|
||||||
|
parsed = { ...parsed, ...p };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!homeTitle && new URL(currentUrl).pathname === "/") {
|
||||||
|
homeTitle = parsed.title || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2) Render if needed
|
||||||
|
if (pageRes.contentType.includes("text/html") && shouldRender(currentUrl, pageRes, parsed, homeTitle)) {
|
||||||
|
const s = await getShared();
|
||||||
|
const rendered = await fetchWithPlaywrightAndExtract(currentUrl, s);
|
||||||
|
if (rendered.domExtract) {
|
||||||
|
pageRes = { ...rendered, body: null };
|
||||||
|
parsed = {
|
||||||
|
...parsed,
|
||||||
|
title: rendered.domExtract.title,
|
||||||
|
metaDesc: rendered.domExtract.metaDesc,
|
||||||
|
h1_1: rendered.domExtract.h1_1,
|
||||||
|
h1_2: rendered.domExtract.h1_2,
|
||||||
|
h2_1: rendered.domExtract.h2_1,
|
||||||
|
h2_2: rendered.domExtract.h2_2,
|
||||||
|
totalHeadings: rendered.domExtract.totalHeadings,
|
||||||
|
canonical: rendered.domExtract.canonical,
|
||||||
|
robotsMeta: rendered.domExtract.robotsMeta,
|
||||||
|
noindex: rendered.domExtract.noindex,
|
||||||
|
nofollow: rendered.domExtract.nofollow,
|
||||||
|
internalLinks: rendered.domExtract.links,
|
||||||
|
rawLinks: rendered.domExtract.rawLinks,
|
||||||
|
lang: rendered.domExtract.lang || parsed.lang,
|
||||||
|
bodyText: rendered.domExtract.bodyText || parsed.bodyText,
|
||||||
|
wordCount: (rendered.domExtract.bodyText || "")
|
||||||
|
.split(/\s+/)
|
||||||
|
.filter(Boolean).length,
|
||||||
|
schemaTypes: rendered.domExtract.schemaTypes,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If still looks empty, try www once
|
||||||
|
if (!usedWWWRetry && parsed.totalHeadings === 0 && !parsed.h1_1) {
|
||||||
|
attemptUrls.push(withWWW(currentUrl));
|
||||||
|
usedWWWRetry = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enqueue internal links + record provenance
|
||||||
|
for (const link of parsed.internalLinks) {
|
||||||
|
if (isInternal(start, link)) {
|
||||||
|
const ln = normalizeUrl(link, { stripHash: true });
|
||||||
|
const rawMatch = (parsed.rawLinks || []).find((r) => r.abs === link)?.raw ?? "";
|
||||||
|
addEdge(currentUrl, rawMatch, ln, pageRes.render_mode);
|
||||||
|
if (!visited.has(ln)) queue.push(ln);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-page metrics & enrichers ----
|
||||||
|
const title = parsed.title || "";
|
||||||
|
const metaDesc = parsed.metaDesc || "";
|
||||||
|
const h1_1 = parsed.h1_1 || "";
|
||||||
|
const h1_2 = parsed.h1_2 || "";
|
||||||
|
const lang = parsed.lang || "";
|
||||||
|
const bodyText = parsed.bodyText || "";
|
||||||
|
const wordCount = parsed.wordCount || (bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0);
|
||||||
|
|
||||||
|
const titlePx = measurePixelWidth(title, 16, "arial");
|
||||||
|
const descPx = measurePixelWidth(metaDesc, 14, "arial");
|
||||||
|
const h1_1_px = measurePixelWidth(h1_1, 24, "arial");
|
||||||
|
const h1_2_px = measurePixelWidth(h1_2, 24, "arial");
|
||||||
|
|
||||||
|
const read = computeReadability(bodyText);
|
||||||
|
|
||||||
|
const headers = pageRes.headers || {};
|
||||||
|
const xRobots = (headers["x-robots-tag"] || headers["x-robots-tag".toLowerCase()]) ?? "";
|
||||||
|
const lastModified = headers["last-modified"] ?? headers["Last-Modified"] ?? "";
|
||||||
|
const setCookie = !!headers["set-cookie"];
|
||||||
|
|
||||||
|
const outlinks = parsed.internalLinks.size;
|
||||||
|
const inlinks = (referrers.get(currentUrl) || []).length;
|
||||||
|
|
||||||
|
// Save page row
|
||||||
|
results.push({
|
||||||
|
url: currentUrl,
|
||||||
|
status: pageRes.status,
|
||||||
|
status_text: pageRes.status_text ?? "",
|
||||||
|
time_ms: pageRes.time_ms,
|
||||||
|
bytes: pageRes.bytes,
|
||||||
|
content_type: pageRes.contentType,
|
||||||
|
http_version: pageRes.httpVersion ?? "",
|
||||||
|
title,
|
||||||
|
title_length: title.length,
|
||||||
|
title_pixel_width: titlePx,
|
||||||
|
meta_description: metaDesc,
|
||||||
|
meta_description_length: metaDesc.length,
|
||||||
|
meta_description_pixel_width: descPx,
|
||||||
|
h1_1,
|
||||||
|
h1_1_length: h1_1.length,
|
||||||
|
h1_1_pixel_width: h1_1_px,
|
||||||
|
h1_2,
|
||||||
|
h1_2_length: h1_2.length,
|
||||||
|
h1_2_pixel_width: h1_2_px,
|
||||||
|
h2_1: parsed.h2_1 || "",
|
||||||
|
h2_2: parsed.h2_2 || "",
|
||||||
|
canonical: parsed.canonical,
|
||||||
|
robots_meta: parsed.robotsMeta,
|
||||||
|
x_robots_tag: Array.isArray(xRobots) ? xRobots.join("; ") : xRobots,
|
||||||
|
noindex: parsed.noindex,
|
||||||
|
nofollow: parsed.nofollow,
|
||||||
|
lang,
|
||||||
|
word_count: wordCount,
|
||||||
|
flesch_reading_ease: read.flesch_reading_ease ?? "",
|
||||||
|
flesch_kincaid_grade: read.flesch_kincaid_grade ?? "",
|
||||||
|
gunning_fog: read.gunning_fog ?? "",
|
||||||
|
coleman_liau: read.coleman_liau ?? "",
|
||||||
|
ari: read.ari ?? "",
|
||||||
|
smog: read.smog ?? "",
|
||||||
|
schema_types: parsed.schemaTypes || [],
|
||||||
|
inlinks,
|
||||||
|
outlinks,
|
||||||
|
render_mode: pageRes.render_mode,
|
||||||
|
last_modified: lastModified,
|
||||||
|
set_cookie: setCookie,
|
||||||
|
crawl_timestamp: new Date().toISOString(),
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`[${pageRes.status ?? "ERR"}] ${pageRes.time_ms}ms ${String(pageRes.render_mode).padEnd(8)} H:${parsed.totalHeadings} ${currentUrl} ${
|
||||||
|
title || h1_1
|
||||||
|
}`
|
||||||
|
);
|
||||||
|
|
||||||
|
// optional progress callback (non-fatal)
|
||||||
|
try {
|
||||||
|
onProgress?.({
|
||||||
|
url: currentUrl,
|
||||||
|
status: pageRes.status,
|
||||||
|
title,
|
||||||
|
inlinks,
|
||||||
|
outlinks,
|
||||||
|
visited: visited.size,
|
||||||
|
queued: queue.length,
|
||||||
|
});
|
||||||
|
} catch {}
|
||||||
|
|
||||||
|
break; // success for this URL; stop attempts
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[ERROR] ${currentUrl} -> ${err.message}`);
|
||||||
|
results.push({
|
||||||
|
url: currentUrl,
|
||||||
|
status: null,
|
||||||
|
status_text: "",
|
||||||
|
time_ms: null,
|
||||||
|
bytes: null,
|
||||||
|
content_type: "",
|
||||||
|
http_version: "",
|
||||||
|
title: "",
|
||||||
|
title_length: 0,
|
||||||
|
title_pixel_width: "",
|
||||||
|
meta_description: "",
|
||||||
|
meta_description_length: 0,
|
||||||
|
meta_description_pixel_width: "",
|
||||||
|
h1_1: "",
|
||||||
|
h1_1_length: 0,
|
||||||
|
h1_1_pixel_width: "",
|
||||||
|
h1_2: "",
|
||||||
|
h1_2_length: 0,
|
||||||
|
h1_2_pixel_width: "",
|
||||||
|
h2_1: "",
|
||||||
|
h2_2: "",
|
||||||
|
canonical: "",
|
||||||
|
robots_meta: "",
|
||||||
|
x_robots_tag: "",
|
||||||
|
noindex: false,
|
||||||
|
nofollow: false,
|
||||||
|
lang: "",
|
||||||
|
word_count: "",
|
||||||
|
flesch_reading_ease: "",
|
||||||
|
flesch_kincaid_grade: "",
|
||||||
|
gunning_fog: "",
|
||||||
|
coleman_liau: "",
|
||||||
|
ari: "",
|
||||||
|
smog: "",
|
||||||
|
schema_types: [],
|
||||||
|
inlinks: 0,
|
||||||
|
outlinks: 0,
|
||||||
|
render_mode: "error",
|
||||||
|
last_modified: "",
|
||||||
|
set_cookie: "",
|
||||||
|
crawl_timestamp: new Date().toISOString(),
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
onProgress?.({
|
||||||
|
url: currentUrl,
|
||||||
|
error: String(err?.message || err),
|
||||||
|
visited: visited.size,
|
||||||
|
queued: queue.length,
|
||||||
|
});
|
||||||
|
} catch {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (shared) await shared.browser.close();
|
||||||
|
|
||||||
|
// -------------------- Post-process: duplicates & similarity -------------
|
||||||
|
// Titles
|
||||||
|
const titleMap = new Map();
|
||||||
|
for (const r of results) {
|
||||||
|
const key = (r.title || "").trim();
|
||||||
|
if (!titleMap.has(key)) titleMap.set(key, []);
|
||||||
|
titleMap.get(key).push(r);
|
||||||
|
}
|
||||||
|
for (const [t, arr] of titleMap.entries()) {
|
||||||
|
if (!t) continue;
|
||||||
|
const isDup = arr.length > 1;
|
||||||
|
for (const row of arr) row.duplicate_title_exact = isDup ? "yes" : "no";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Meta descriptions
|
||||||
|
const descMap = new Map();
|
||||||
|
for (const r of results) {
|
||||||
|
const key = (r.meta_description || "").trim();
|
||||||
|
if (!descMap.has(key)) descMap.set(key, []);
|
||||||
|
descMap.get(key).push(r);
|
||||||
|
}
|
||||||
|
for (const [d, arr] of descMap.entries()) {
|
||||||
|
if (!d) continue;
|
||||||
|
const isDup = arr.length > 1;
|
||||||
|
for (const row of arr) row.duplicate_description_exact = isDup ? "yes" : "no";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nearest neighbor similarities (within site, lightweight)
|
||||||
|
const titleList = results.map((r) => ({ url: r.url, text: (r.title || "").trim() }));
|
||||||
|
const descList = results.map((r) => ({ url: r.url, text: (r.meta_description || "").trim() }));
|
||||||
|
for (const r of results) {
|
||||||
|
// titles
|
||||||
|
const others = titleList.filter((x) => x.url !== r.url && x.text);
|
||||||
|
let bestT = { rating: 0, target: "" };
|
||||||
|
if (r.title && others.length) {
|
||||||
|
const ratings = stringSimilarity.findBestMatch(r.title, others.map((x) => x.text));
|
||||||
|
const best = ratings.bestMatch;
|
||||||
|
bestT.rating = best.rating;
|
||||||
|
const idx = ratings.ratings.findIndex((x) => x.rating === best.rating);
|
||||||
|
bestT.target = others[idx]?.url || "";
|
||||||
|
}
|
||||||
|
r.nearest_title_similarity = bestT.rating ? bestT.rating.toFixed(3) : "";
|
||||||
|
r.nearest_title_url = bestT.target;
|
||||||
|
|
||||||
|
// descriptions
|
||||||
|
const othersD = descList.filter((x) => x.url !== r.url && x.text);
|
||||||
|
let bestD = { rating: 0, target: "" };
|
||||||
|
if (r.meta_description && othersD.length) {
|
||||||
|
const ratingsD = stringSimilarity.findBestMatch(r.meta_description, othersD.map((x) => x.text));
|
||||||
|
const best = ratingsD.bestMatch;
|
||||||
|
bestD.rating = best.rating;
|
||||||
|
const idx = ratingsD.ratings.findIndex((x) => x.rating === best.rating);
|
||||||
|
bestD.target = othersD[idx]?.url || "";
|
||||||
|
}
|
||||||
|
r.nearest_description_similarity = bestD.rating ? bestD.rating.toFixed(3) : "";
|
||||||
|
r.nearest_description_url = bestD.target;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\n✅ Crawl finished. Total pages: ${visited.size}`);
|
||||||
|
|
||||||
|
let files = {};
|
||||||
|
if (persistReports) {
|
||||||
|
const a = writePageReports(results);
|
||||||
|
const b = writeLinkEdges(edges);
|
||||||
|
const c = writeErrors(results);
|
||||||
|
files = { ...a, ...b, ...c };
|
||||||
|
}
|
||||||
|
|
||||||
|
return { results, files, total: results.length };
|
||||||
|
}
|
||||||
|
|
||||||
|
// // CLI: node crawler.js https://site.com 200
|
||||||
|
// const START_URL = process.argv[2] || "https://example.com";
|
||||||
|
// const MAX_PAGES = Number(process.argv[3] || 100);
|
||||||
|
// crawl(START_URL, MAX_PAGES);
|
||||||
19
middlewares/auth.middleware.js
Normal file
19
middlewares/auth.middleware.js
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import jwt from "jsonwebtoken";
|
||||||
|
|
||||||
|
/**
 * Express middleware that authenticates a request from its
 * `Authorization: Bearer <token>` header.
 *
 * On success the decoded JWT payload is stored on `req.user` and `next()` is
 * called. On failure a 401 JSON response is sent and the chain stops.
 *
 * @param {object} req - Express request; `req.headers.authorization` is read.
 * @param {object} res - Express response, used only for the 401 replies.
 * @param {Function} next - Called (without arguments) once the token verifies.
 * @returns {*} The result of `res.json(...)` on failure, otherwise undefined.
 */
export function authMiddleware(req, res, next) {
  const header = req.headers.authorization;

  // The auth scheme is matched case-insensitively and may be followed by more
  // than one space; the token is the first run of non-whitespace after it.
  const match = typeof header === "string" ? /^Bearer\s+(\S+)/i.exec(header) : null;
  if (!match) {
    return res.status(401).json({ error: "Missing token" });
  }

  try {
    req.user = jwt.verify(match[1], process.env.JWT_SECRET);
  } catch (err) {
    console.error("JWT verification failed:", err.message);
    return res.status(401).json({ error: "Invalid or expired token" });
  }

  // Kept outside the try so that only verification failures are reported as
  // "Invalid or expired token".
  next();
}
|
||||||
4
middlewares/pageSpeedErrorHandler.js
Normal file
4
middlewares/pageSpeedErrorHandler.js
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
/**
 * Express error-handling middleware: logs the error and answers with a
 * 500 JSON body of the form `{ message }`.
 *
 * @param {*} err - The error passed down the middleware chain.
 * @param {object} req - Express request (unused).
 * @param {object} res - Express response.
 * @param {Function} next - Used to delegate when the response already started.
 * @returns {void}
 */
export const errorHandler = (err, req, res, next) => {
  console.error(err);

  // Once headers are on the wire the status/body can no longer be changed;
  // hand the error to the next error handler instead of throwing here.
  if (res.headersSent) {
    next(err);
    return;
  }

  // `err` may be any thrown value, including null/undefined.
  res.status(500).json({ message: err?.message || 'Internal Server Error' });
};
|
||||||
41
models/blog.model.js
Normal file
41
models/blog.model.js
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
import mongoose from 'mongoose';
|
||||||
|
|
||||||
|
// Shape of one comment embedded in a blog's `comments` array.
const commentSchema = new mongoose.Schema({
  // Reference to the commenting user (not required)
  user: {
    type: mongoose.Schema.Types.ObjectId,
    ref: 'User',
  },
  // Display name stored with the comment
  name: { type: String },
  // Comment body; the only required field
  text: {
    type: String,
    required: true,
  },
  // Defaults to the time the comment is created
  createdAt: {
    type: Date,
    default: Date.now,
  },
});
|
||||||
|
|
||||||
|
const { ObjectId } = mongoose.Schema.Types;

// Field definitions for a blog post; every post belongs to a `projectId`.
const blogFields = {
  projectId: { type: String, required: true, index: true },
  title: { type: String, required: true },
  // Not unique on its own; uniqueness is enforced together with projectId below
  slug: { type: String, required: true, unique: false },
  description: { type: String, required: true },
  imageUrl: { type: String },
  bigImageUrl: { type: String },
  category: { type: ObjectId, ref: 'Category' },
  tags: [String],
  comments: [commentSchema],
  likes: [{ type: ObjectId, ref: 'User' }],
  author: { type: ObjectId, ref: 'User' },
};

const blogSchema = new mongoose.Schema(blogFields, { timestamps: true });

// A slug may repeat across projects but must be unique inside one project.
blogSchema.index({ projectId: 1, slug: 1 }, { unique: true });
|
||||||
|
|
||||||
|
// 👇 Add base URL when converting to JSON
|
||||||
|
blogSchema.set('toJSON', {
  /**
   * Prefixes stored image paths with the backend base URL when a blog is
   * serialized to JSON. Values that already start with "http" are left as is.
   */
  transform(doc, ret) {
    const baseUrl = process.env.BACKEND_URL || 'http://localhost:3010';

    for (const field of ['imageUrl', 'bigImageUrl']) {
      const stored = ret[field];
      if (!stored || stored.startsWith('http')) continue;
      ret[field] = `${baseUrl}${stored}`;
    }

    return ret;
  },
});

export default mongoose.model('Blog', blogSchema);
|
||||||
21
models/category.model.js
Normal file
21
models/category.model.js
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// Fields of a blog category.
const categoryFields = {
  // NOTE(review): `unique` applies to the name alone, across all projects,
  // even though categories carry a projectId — confirm this is intended.
  name: { type: String, required: true, unique: true, trim: true },
  // Project the category belongs to (multi-project support)
  projectId: { type: String, required: true },
};

const categorySchema = new mongoose.Schema(categoryFields, { timestamps: true });

export default mongoose.model("Category", categorySchema);
|
||||||
10
models/comments.model.js
Normal file
10
models/comments.model.js
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// Stand-alone comment document that points at the blog it belongs to.
const commentSchema = new mongoose.Schema({
  blog: {
    type: mongoose.Schema.Types.ObjectId,
    ref: "Blog",
  },
  name: { type: String },
  text: { type: String },
  // Defaults to the creation time
  createdAt: {
    type: Date,
    default: Date.now,
  },
});

const Comment = mongoose.model("Comment", commentSchema);

export default Comment;
|
||||||
18
models/maisondetreats/cakeOrder.model.js
Normal file
18
models/maisondetreats/cakeOrder.model.js
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// The whole order is stored as one free-form object, e.g.
// {
//   "Mini Cakes": { "Thandai Cake": 1, "Mango Cardamom": 1 },
//   "Mithai-Inspired Macarons": { "Mango macarons (pack of 6)": 1, "Pista (pack of 6)": 10 }
// }
const orderField = { type: Object, required: true };

const CakeOrderSchema = new mongoose.Schema({ order: orderField }, { timestamps: true });

export const CakeOrder = mongoose.model("CakeOrder", CakeOrderSchema);
|
||||||
12
models/message.model.js
Normal file
12
models/message.model.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// A contact message submitted for one of the hosted projects.
const MessageSchema = new mongoose.Schema(
  {
    // Identifies which project the message was sent to
    project: {
      type: String,
      required: true,
    },
    name: {
      type: String,
      default: "Guest",
    },
    email: { type: String },
    message: {
      type: String,
      required: true,
    },
  },
  { timestamps: true }
);

// Reuse the model if it was already registered, otherwise register it now.
const Message = mongoose.models.Message ?? mongoose.model("Message", MessageSchema);

export default Message;
|
||||||
42
models/pageSpeedTest.model.js
Normal file
42
models/pageSpeedTest.model.js
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
import mongoose from 'mongoose';
|
||||||
|
|
||||||
|
// Category scores stored as numbers.
const scoreFields = {
  performance: Number,
  accessibility: Number,
  bestPractices: Number,
  seo: Number,
  pwa: Number,
};

// Timing metrics stored as strings.
const metricFields = {
  firstContentfulPaint: String,
  largestContentfulPaint: String,
  totalBlockingTime: String,
  timeToInteractive: String,
  speedIndex: String,
  cumulativeLayoutShift: String,
};

// One stored page-speed audit run for a URL on a given device type.
const pageSpeedTestSchema = new mongoose.Schema({
  url: { type: String, required: true },
  device: { type: String, enum: ['mobile', 'desktop'], required: true },
  scores: scoreFields,
  metrics: metricFields,
  opportunities: [
    { title: String, description: String, estimatedSavings: String },
  ],
  diagnostics: Object,
  failedAudits: [
    { title: String, description: String },
  ],
  passedAudits: [String],
  notApplicableAudits: [String],
  screenshot: String,
  treemapPath: { type: String },
  // Defaults to the time the record is created
  createdAt: { type: Date, default: Date.now },
});

const PageSpeedTest = mongoose.model('PageSpeedTest', pageSpeedTestSchema);

export default PageSpeedTest;
|
||||||
12
models/payment.model.js
Normal file
12
models/payment.model.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// One payment attempt, linked to Stripe by intent and/or checkout session id.
const paymentSchema = new mongoose.Schema(
  {
    email: { type: String, required: true },
    // Amount in cents
    amount: { type: Number, required: true },
    currency: { type: String, default: "usd" },
    // Deliberately optional
    stripePaymentIntentId: { type: String },
    // Checkout Session ID
    stripeSessionId: { type: String },
    // One of: pending, succeeded, failed
    status: { type: String, default: "pending" },
  },
  { timestamps: true }
);

export const Payment = mongoose.model("Payment", paymentSchema);
|
||||||
14
models/user.model.js
Normal file
14
models/user.model.js
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import mongoose from "mongoose";
|
||||||
|
|
||||||
|
// Account record: login e-mail, password hash and password-reset bookkeeping.
const userFields = {
  email: {
    type: String,
    required: true,
    unique: true,
    lowercase: true,
  },
  passwordHash: {
    type: String,
    required: true,
  },
  // Password-reset token and its expiry date
  resetPasswordToken: { type: String },
  resetPasswordExpires: { type: Date },
};

const userSchema = new mongoose.Schema(userFields, { timestamps: true });

const User = mongoose.model("User", userSchema);

export default User;
|
||||||
5093
package-lock.json
generated
Normal file
5093
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
36
package.json
Normal file
36
package.json
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
"name": "crawlerx",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"type": "module",
|
||||||
|
"main": "crawler.js",
|
||||||
|
"scripts": {
|
||||||
|
"start": "node crawler.js https://example.com 200",
|
||||||
|
"dev": "nodemon crawler.js https://example.com 200"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^1.12.2",
|
||||||
|
"bcrypt": "^6.0.0",
|
||||||
|
"cheerio": "^1.1.0",
|
||||||
|
"chrome-launcher": "^1.2.1",
|
||||||
|
"cors": "^2.8.5",
|
||||||
|
"dotenv": "^17.2.2",
|
||||||
|
"express": "^5.1.0",
|
||||||
|
"got": "^14.4.7",
|
||||||
|
"jsonwebtoken": "^9.0.2",
|
||||||
|
"lighthouse": "^12.8.2",
|
||||||
|
"mongoose": "^8.18.1",
|
||||||
|
"multer": "^2.0.2",
|
||||||
|
"nodemailer": "^7.0.6",
|
||||||
|
"normalize-url": "^8.0.2",
|
||||||
|
"sitemapper": "^3.2.7",
|
||||||
|
"slugify": "^1.6.6",
|
||||||
|
"string-pixel-width": "^1.11.0",
|
||||||
|
"string-similarity": "^4.0.4",
|
||||||
|
"stripe": "^18.5.0",
|
||||||
|
"text-readability": "^1.1.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"nodemon": "^3.1.10",
|
||||||
|
"playwright": "^1.55.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
2897
public/lighthouse-treemap/treemap-1758885492002.html
Normal file
2897
public/lighthouse-treemap/treemap-1758885492002.html
Normal file
File diff suppressed because one or more lines are too long
2897
public/lighthouse-treemap/treemap-1758885890915.html
Normal file
2897
public/lighthouse-treemap/treemap-1758885890915.html
Normal file
File diff suppressed because one or more lines are too long
2897
public/lighthouse-treemap/treemap-1758958684569.html
Normal file
2897
public/lighthouse-treemap/treemap-1758958684569.html
Normal file
File diff suppressed because one or more lines are too long
18
routes/auth.routes.js
Normal file
18
routes/auth.routes.js
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import express from "express";
|
||||||
|
import { signup, login, changePassword, forgotPassword, resetPassword } from "../controllers/auth.controller.js";
|
||||||
|
import { authMiddleware } from "../middlewares/auth.middleware.js";
|
||||||
|
|
||||||
|
const router = express.Router();

// Replies with the payload that authMiddleware stored on the request.
const sendProfile = (req, res) => {
  res.json({ user: req.user });
};

// Routes registered without authMiddleware
router.post("/signup", signup);
router.post("/login", login);

// Requires a valid bearer token
router.post("/change-password", authMiddleware, changePassword);

// Password-reset flow, registered without authMiddleware
router.post("/forgot-password", forgotPassword);
router.post("/reset-password", resetPassword);

// Example protected route
router.get("/profile", authMiddleware, sendProfile);

export default router;
|
||||||
54
routes/blog.routes.js
Normal file
54
routes/blog.routes.js
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import express from "express";
|
||||||
|
import multer from "multer";
|
||||||
|
import {
|
||||||
|
createBlog,
|
||||||
|
getAllBlogs,
|
||||||
|
getBlogBySlug,
|
||||||
|
likeBlog,
|
||||||
|
} from "../controllers/blog.controller.js";
|
||||||
|
|
||||||
|
import {
|
||||||
|
createCategory,
|
||||||
|
getCategories,
|
||||||
|
deleteCategory,
|
||||||
|
} from "../controllers/category.controller.js";
|
||||||
|
|
||||||
|
import {
|
||||||
|
addComment as addCommentController,
|
||||||
|
getComments,
|
||||||
|
deleteComment,
|
||||||
|
} from "../controllers/comment.controller.js";
|
||||||
|
|
||||||
|
const router = express.Router();
const upload = multer({ dest: "uploads/" });

// =======================
// Category Routes
// =======================
// These MUST be registered before "/:slug": routes match in registration
// order, so with the slug route first GET /category was answered by
// getBlogBySlug (slug = "category") and the category list was unreachable.
// NOTE(review): create/delete are labelled "admin" but no auth middleware is
// attached in this file — confirm protection is applied elsewhere.
router.post("/category", createCategory); // Create Category (admin)
router.get("/category", getCategories); // List Categories
router.delete("/category/:id", deleteCategory); // Delete Category (admin)

// =======================
// Blog Routes
// =======================
// Create a blog (multipart upload, file field "image")
router.post("/", upload.single("image"), createBlog);

// Get all blogs
router.get("/", getAllBlogs);

// Get blog by slug (single path segment; "category" is handled above)
router.get("/:slug", getBlogBySlug);

// Like a blog
router.post("/:id/like", likeBlog);

// =======================
// Comment Routes
// =======================
router.post("/:blogId/comments", addCommentController); // Add Comment
router.get("/:blogId/comments", getComments); // Get Comments
router.delete("/:blogId/comments/:commentId", deleteComment); // Delete Comment (admin)

export default router;
|
||||||
6
routes/crawl.routes.js
Normal file
6
routes/crawl.routes.js
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import { Router } from "express";
|
||||||
|
import { crawlHandler } from "../controllers/crawl.controller.js";
|
||||||
|
|
||||||
|
// Router exposing the crawl endpoint: GET on the mount path runs crawlHandler.
const crawlRouter = Router();

crawlRouter.get("/", crawlHandler);

export default crawlRouter;
|
||||||
8
routes/lighthouse.routes.js
Normal file
8
routes/lighthouse.routes.js
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import express from 'express';
|
||||||
|
import { runAudit } from '../controllers/lighthouseController.js';
|
||||||
|
|
||||||
|
// Router for audits: POST /audit runs runAudit.
const lighthouseRouter = express.Router();

lighthouseRouter.post('/audit', runAudit);

export default lighthouseRouter;
|
||||||
12
routes/maisondetreats/cakeOrder.routes.js
Normal file
12
routes/maisondetreats/cakeOrder.routes.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import express from "express";
|
||||||
|
import { createCakeOrder, getAllCakeOrders } from "../../controllers/maisondetreats/cakeOrder.controller.js";
|
||||||
|
|
||||||
|
const router = express.Router();

// POST creates a new cake order, GET lists all cake orders.
router
  .route("/")
  .post(createCakeOrder)
  .get(getAllCakeOrders);

export default router;
|
||||||
12
routes/message.routes.js
Normal file
12
routes/message.routes.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import express from "express";
|
||||||
|
import { sendMessage, getMessages } from "../controllers/message.controller.js";
|
||||||
|
|
||||||
|
const router = express.Router();

// POST /api/messages → save a message
// GET  /api/messages → get all messages (optional)
router
  .route("/")
  .post(sendMessage)
  .get(getMessages);

export default router;
|
||||||
14
routes/payment.route.js
Normal file
14
routes/payment.route.js
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import express from "express";
|
||||||
|
import {
|
||||||
|
createPaymentIntent,
|
||||||
|
createCheckoutSession,
|
||||||
|
handleWebhook
|
||||||
|
} from "../controllers/payment.controller.js";
|
||||||
|
|
||||||
|
const router = express.Router();

// Body parser that leaves "application/json" payloads unparsed for the webhook.
// NOTE(review): presumably needed for webhook signature verification; confirm
// that no JSON body parser runs before this route.
const rawJsonBody = express.raw({ type: "application/json" });

router.post("/create-intent", createPaymentIntent);
router.post("/create-checkout-session", createCheckoutSession);
router.post("/webhook", rawJsonBody, handleWebhook);

export default router;
|
||||||
6
routes/sitemap.routes.js
Normal file
6
routes/sitemap.routes.js
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import { Router } from "express";
|
||||||
|
import { sitemapHandler } from "../controllers/sitemap.controller.js";
|
||||||
|
|
||||||
|
// Router exposing the sitemap endpoint: GET on the mount path runs sitemapHandler.
const sitemapRouter = Router();

sitemapRouter.get("/", sitemapHandler);

export default sitemapRouter;
|
||||||
237
server copy.js
Normal file
237
server copy.js
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
// // server.js
|
||||||
|
// import express from "express";
|
||||||
|
// import { Queue } from "bullmq";
|
||||||
|
// import { connection } from "./redis.js";
|
||||||
|
// import crypto from "crypto";
|
||||||
|
|
||||||
|
// const app = express();
|
||||||
|
// app.use(express.json());
|
||||||
|
|
||||||
|
// const crawlQueue = new Queue("crawl", { connection });
|
||||||
|
|
||||||
|
// // Start a new crawl
|
||||||
|
// app.post("/crawl", async (req, res) => {
|
||||||
|
// const { startUrl } = req.body;
|
||||||
|
// if (!startUrl) return res.status(400).json({ error: "Missing startUrl" });
|
||||||
|
|
||||||
|
// const crawlId = crypto.randomUUID();
|
||||||
|
// await crawlQueue.add("fetch", { crawlId, url: startUrl });
|
||||||
|
|
||||||
|
// res.json({ crawlId, message: "Crawl started" });
|
||||||
|
// });
|
||||||
|
|
||||||
|
// // (Optional) Check progress
|
||||||
|
// app.get("/status/:id", async (req, res) => {
|
||||||
|
// // For now just reply with "in progress"
|
||||||
|
// res.json({ crawlId: req.params.id, status: "in progress" });
|
||||||
|
// });
|
||||||
|
|
||||||
|
// app.listen(3000, () => {
|
||||||
|
// console.log("Crawler API running at http://localhost:3000");
|
||||||
|
// });
|
||||||
|
|
||||||
|
|
||||||
|
// // server.js
|
||||||
|
// import express from "express";
|
||||||
|
// import cors from "cors"; // ← optional but recommended
|
||||||
|
// import { crawl } from "./crawler.js"; // ensure crawl is a NAMED export; if default, use: import crawl from "./crawler.js";
|
||||||
|
|
||||||
|
// const app = express();
|
||||||
|
// const PORT = process.env.PORT || 3010;
|
||||||
|
|
||||||
|
// /* Parse JSON BEFORE any middleware that might read req.body */
|
||||||
|
// app.use(express.json());
|
||||||
|
|
||||||
|
// /* CORS (adjust origins as needed) */
|
||||||
|
// app.use(cors({
|
||||||
|
// origin: [
|
||||||
|
// "http://localhost:3000",
|
||||||
|
// "https://your-frontend.example" // ← replace or remove
|
||||||
|
// ],
|
||||||
|
// }));
|
||||||
|
|
||||||
|
// /* Safe request logger */
|
||||||
|
// app.use((req, res, next) => {
|
||||||
|
// console.log(`[${new Date().toISOString()}] ${req.method} ${req.originalUrl}`);
|
||||||
|
// if (req.query && Object.keys(req.query).length) console.log("Query:", req.query);
|
||||||
|
// if (req.body && typeof req.body === "object" && Object.keys(req.body).length) console.log("Body:", req.body);
|
||||||
|
// next();
|
||||||
|
// });
|
||||||
|
|
||||||
|
// /* GET /crawl?url=https://site.com&max=50 */
|
||||||
|
// app.get("/crawl", async (req, res) => {
|
||||||
|
// try {
|
||||||
|
// const { url, max } = req.query;
|
||||||
|
// if (!url) return res.status(400).json({ error: "Missing url param" });
|
||||||
|
|
||||||
|
// // validate & normalize
|
||||||
|
// const target = new URL(String(url)); // throws if invalid
|
||||||
|
// const limit = Math.min(Math.max(parseInt(max ?? "50", 10), 1), 500);
|
||||||
|
|
||||||
|
// await crawl(target.toString(), limit);
|
||||||
|
// res.json({ ok: true, message: `Crawl started`, url: target.toString(), limit });
|
||||||
|
// } catch (err) {
|
||||||
|
// console.error("Crawl error:", err);
|
||||||
|
// res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
|
||||||
|
// /* Global safety nets so crashes don’t become silent restart loops */
|
||||||
|
// process.on("unhandledRejection", (err) => console.error("unhandledRejection:", err));
|
||||||
|
// process.on("uncaughtException", (err) => console.error("uncaughtException:", err));
|
||||||
|
|
||||||
|
// /* Bind to all interfaces so remote calls work */
|
||||||
|
// app.listen(PORT, "0.0.0.0", () => {
|
||||||
|
// console.log(`🚀 Server running at http://localhost:${PORT}`);
|
||||||
|
// });
|
||||||
|
|
||||||
|
// server.js
|
||||||
|
import express from "express";
|
||||||
|
import cors from "cors";
|
||||||
|
import path from "node:path";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import fsp from "node:fs/promises";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { crawl } from "./crawler.js"; // crawl(target, limit, onProgress?, options?)
|
||||||
|
|
||||||
|
// Express app, listening port and the directory of this module.
const app = express();
const PORT = process.env.PORT || 3010;
const __dirname = path.dirname(fileURLToPath(import.meta.url));

const publicDir = path.join(__dirname, "public");
const allowedOrigins = ["http://localhost:3000", "https://app.crawlerx.co"];

/* ------------ Middleware ------------ */
app.use(express.json());
app.use(cors({ origin: allowedOrigins }));
app.use(express.static(publicDir));

// Root: serve the viewer page when it exists, otherwise a plain-text banner.
app.get("/", (_req, res) => {
  const viewer = path.join(publicDir, "crawlerx_viewer.html");
  if (fs.existsSync(viewer)) {
    return res.sendFile(viewer);
  }
  return res.type("text/plain").send("CrawlerX backend is running.");
});

// Liveness probe: reports ok plus the current server time.
app.get("/healthz", (_req, res) => {
  const time = new Date().toISOString();
  return res.json({ ok: true, time });
});
|
||||||
|
|
||||||
|
/* ------------ Helpers ------------ */
|
||||||
|
/**
 * Current time as an ISO-8601 string with ":" and "." replaced by "-",
 * so it can be used inside a file name.
 */
const ts = () => {
  const iso = new Date().toISOString();
  return iso.replace(/[:.]/g, "-");
};
|
||||||
|
/**
 * Sends `obj` as pretty-printed JSON (2-space indent) marked as a file
 * download named `filename`.
 *
 * @param {object} res - Response object providing `setHeader` and `send`.
 * @param {string} filename - Name placed in the Content-Disposition header.
 * @param {*} obj - Value to serialize.
 * @returns {*} Whatever `res.send` returns.
 */
function attachJson(res, filename, obj) {
  // Serialize first so a stringify failure happens before any header is set.
  const payload = JSON.stringify(obj, null, 2);

  const headers = [
    ["Content-Type", "application/json; charset=utf-8"],
    ["Content-Disposition", `attachment; filename="${filename}"`],
  ];
  for (const [header, value] of headers) {
    res.setHeader(header, value);
  }

  return res.send(payload);
}
|
||||||
|
/**
 * Tells whether `p` is an absolute path.
 * Non-string input yields `false` rather than an exception.
 *
 * @param {*} p - Candidate path.
 * @returns {boolean}
 */
function isAbs(p) {
  // path.isAbsolute rejects non-strings by throwing; answer false instead.
  if (typeof p !== "string") {
    return false;
  }
  return path.isAbsolute(p);
}
|
||||||
|
|
||||||
|
/* ------------ Crawl endpoint ------------ */
|
||||||
|
/**
 * GET /crawl?url=https://site.com&max=50[&stream=1][&download=1][&nostore=1]
 *  - url        : required; must be an absolute http(s) URL (400 otherwise)
 *  - max        : page limit, clamped to 1..500; missing or non-numeric → 50
 *  - stream=1   : SSE live progress (no download); also selected by an
 *                 `Accept: text/event-stream` request header
 *  - download=1 : respond as a JSON download (attachment)
 *  - nostore=1  : ask crawler not to write files (if supported by your crawler)
 *
 * Failures before the response has started are answered with a 500 JSON body;
 * in SSE mode they are reported as an `error` event and the stream is closed.
 */
app.get("/crawl", async (req, res) => {
  const { url, max, stream, download, nostore } = req.query;
  if (!url) return res.status(400).json({ error: "Missing url param" });

  // A malformed or non-HTTP(S) URL is a client error, not a server failure.
  let target;
  try {
    target = new URL(String(url));
  } catch {
    return res.status(400).json({ error: "Invalid url param" });
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return res.status(400).json({ error: "Invalid url param" });
  }

  // parseInt yields NaN for input such as "abc", and NaN survives
  // Math.min/Math.max, so fall back to the default explicitly.
  const requested = Number.parseInt(max ?? "50", 10);
  const limit = Number.isNaN(requested) ? 50 : Math.min(Math.max(requested, 1), 500);

  const wantsDownload = String(download) === "1";
  const wantsStream =
    String(stream) === "1" ||
    (req.get("accept") || "").includes("text/event-stream");

  /* ---------- SSE mode ---------- */
  if (wantsStream) {
    if (wantsDownload) {
      return res.status(400).json({ error: "download not supported with stream=1" });
    }

    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache, no-transform");
    res.setHeader("Connection", "keep-alive");
    res.flushHeaders?.();

    let finished = false;
    let closed = false;

    // Never write once the response has ended or the connection is gone.
    const canWrite = () => !closed && !res.writableEnded;
    const send = (obj, evt) => {
      if (!canWrite()) return;
      if (evt) res.write(`event: ${evt}\n`);
      res.write(`data: ${JSON.stringify(obj)}\n\n`);
    };
    const heartbeat = setInterval(() => {
      if (canWrite()) res.write(":\n\n");
    }, 15000);

    // The response's "close" fires both on normal completion and when the
    // connection is terminated early; `finished` tells the two apart.
    res.on("close", () => {
      closed = true;
      clearInterval(heartbeat);
      if (!finished) console.warn("SSE client disconnected.");
    });

    send({ ok: true, message: "Crawl started", url: target.toString(), limit }, "started");

    try {
      const result = await crawl(
        target.toString(),
        limit,
        (tick) => send(tick),
        // If your crawler supports it, this avoids writing files during SSE runs:
        { persistReports: false, collectPages: true }
      );
      finished = true;
      send({ ok: true, done: true, result }, "done");
    } catch (err) {
      // Headers are already sent, so the failure is reported in-band.
      finished = true;
      console.error("Crawl error:", err);
      send({ ok: false, error: "Crawl failed", details: String(err?.message ?? err) }, "error");
    } finally {
      clearInterval(heartbeat);
    }
    return res.end();
  }

  /* ---------- Non-streaming mode ---------- */
  try {
    // Ask crawler (if it supports options) to avoid writing files when nostore=1 or download requested.
    const preferMemory = String(nostore) === "1" || wantsDownload;
    const result = await crawl(
      target.toString(),
      limit,
      undefined,
      preferMemory ? { persistReports: false, collectPages: true } : undefined
    );

    // If caller wants a downloadable JSON file...
    if (wantsDownload) {
      const filename = `crawl-${ts()}.json`;

      // 1) Best case: crawler returned in-memory data (no disk IO).
      //    Use whichever property your crawler exposes. We try common shapes.
      const inMemory =
        result?.jsonData ??
        result?.pages ??
        result?.report ??
        (Array.isArray(result) ? result : null);
      if (inMemory) {
        return attachJson(res, filename, inMemory);
      }

      // 2) Fallback: crawler saved a JSON report path that we can stream.
      const jsonPath = result?.reports?.json;
      if (jsonPath) {
        // Relative report paths are resolved against this module's directory.
        const abs = isAbs(jsonPath) ? jsonPath : path.join(__dirname, jsonPath);
        if (fs.existsSync(abs)) {
          res.setHeader("Content-Type", "application/json; charset=utf-8");
          res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
          const fileStream = fs.createReadStream(abs);
          // A read failure mid-stream cannot become a JSON error response any
          // more; tear the connection down instead of leaving it hanging.
          fileStream.on("error", (streamErr) => {
            console.error("Crawl error:", streamErr);
            res.destroy(streamErr);
          });
          return fileStream.pipe(res);
        }
      }

      // 3) Last resort: send the entire result itself as JSON.
      return attachJson(res, filename, result);
    }

    // Default JSON (inline, not attachment)
    return res.json({
      ok: true,
      message: "Crawl completed",
      url: target.toString(),
      limit,
      result
    });
  } catch (err) {
    console.error("Crawl error:", err);
    // If part of the response already went out, all that is left is to end it.
    if (res.headersSent) return res.end();
    return res.status(500).json({ error: "Crawl failed", details: String(err?.message ?? err) });
  }
});
|
||||||
|
|
||||||
|
/* ------------ Safety nets ------------ */
// Log process-level failures, prefixed with the name of the event that fired.
// Note: having an "uncaughtException" listener means Node no longer exits on
// an uncaught error; the failure is only logged.
for (const eventName of ["unhandledRejection", "uncaughtException"]) {
  process.on(eventName, (e) => console.error(`${eventName}:`, e));
}
|
||||||
|
|
||||||
|
/* ------------ Start server ------------ */
// Runs once the listening socket is bound; prints the local URL.
const announceListening = () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
};

// Bind on all IPv4 interfaces (0.0.0.0).
app.listen(PORT, "0.0.0.0", announceListening);
|
||||||
80
server.js
Normal file
80
server.js
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import express from "express";
|
||||||
|
import cors from "cors";
|
||||||
|
import path from "node:path";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import dotenv from "dotenv";
|
||||||
|
|
||||||
|
import crawlRoutes from "./routes/crawl.routes.js";
|
||||||
|
import sitemapRoutes from "./routes/sitemap.routes.js";
|
||||||
|
import authRoutes from "./routes/auth.routes.js"; // Login & Signup endpoints
|
||||||
|
import paymentRoutes from "./routes/payment.route.js";
|
||||||
|
import lighthouseRoutes from "./routes/lighthouse.routes.js"; // <-- ADD THIS
|
||||||
|
import messageRoutes from "./routes/message.routes.js";
|
||||||
|
import cakeOrderRoutes from "./routes/maisondetreats/cakeOrder.routes.js";
|
||||||
|
import blogRoutes from "./routes/blog.routes.js";
|
||||||
|
import { connectDB } from "./config/db.js";
|
||||||
|
import { mailer } from "./utils/mailer.js";
|
||||||
|
|
||||||
|
// ------------------ Load environment ------------------
|
||||||
|
dotenv.config(); // Must be first
|
||||||
|
|
||||||
|
// ------------------ Connect database ------------------
|
||||||
|
await connectDB();
|
||||||
|
|
||||||
|
// ------------------ Express setup ------------------
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PORT = process.env.PORT || 3010;
const app = express();

// Origins that are allowed to call this API cross-origin.
const allowedOrigins = [
  "http://localhost:3000",
  "http://127.0.0.1:3000",
  "https://api.crawlerx.co",
  "https://app.crawlerx.co",
];

// Middleware order: JSON body parsing, CORS, then static assets from ./public.
app.use(express.json());
app.use(cors({ origin: allowedOrigins }));
app.use(express.static(path.join(__dirname, "public")));

// ------------------ SMTP verification ------------------
// Only prints the configured SMTP host/port; no connection check is made here.
console.log("SMTP Host:", process.env.SMTP_HOST);
console.log("SMTP Port:", process.env.SMTP_PORT);
|
// ------------------ Routes ------------------
// GET / serves the bundled viewer page when it exists under ./public,
// otherwise answers with a plain-text liveness message.
app.get("/", (_req, res) => {
  const viewerPath = path.join(__dirname, "public", "crawlerx_viewer.html");

  if (!fs.existsSync(viewerPath)) {
    return res.type("text/plain").send("CrawlerX backend is running.");
  }

  return res.sendFile(viewerPath);
});
|
||||||
|
|
||||||
|
// Liveness probe: always answers { ok: true } with the current ISO timestamp.
app.get("/healthz", (_req, res) => {
  const time = new Date().toISOString();
  return res.json({ ok: true, time });
});

// Routers, mounted in this exact order.
const routeTable = [
  ["/crawl", crawlRoutes],
  ["/sitemap", sitemapRoutes],
  ["/api/auth", authRoutes], // Login & Signup endpoints
  ["/api/payment", paymentRoutes],
  ["/api/lighthouse", lighthouseRoutes],
  ["/api/blog", blogRoutes], // All blog/category/comment routes are prefixed with /api/blog
  ["/api/messages", messageRoutes],
  ["/api/cake-orders", cakeOrderRoutes],
];

for (const [prefix, router] of routeTable) {
  app.use(prefix, router);
}

// Serve uploaded files from ./uploads, resolved against the process working
// directory (not this module's directory).
const uploadsDir = path.join(process.cwd(), 'uploads');
app.use('/uploads', express.static(uploadsDir));
|
||||||
|
|
||||||
|
// ------------------ Safety nets ------------------
// Process event name -> log prefix. Failures are only logged; registering an
// "uncaughtException" listener means Node no longer exits on an uncaught error.
const crashLogLabels = {
  unhandledRejection: "Unhandled Rejection:",
  uncaughtException: "Uncaught Exception:",
};

for (const [eventName, label] of Object.entries(crashLogLabels)) {
  process.on(eventName, (err) => console.error(label, err));
}
|
||||||
|
|
||||||
|
// ------------------ Start server ------------------
// Runs once the listening socket is bound; prints the local URL.
const logServerReady = () => {
  console.log(`🚀 Server running at http://localhost:${PORT}`);
};

// Bind on all IPv4 interfaces (0.0.0.0).
app.listen(PORT, "0.0.0.0", logServerReady);
|
||||||
BIN
uploads/2176fe75251a06cf07a08e32a845b537
Normal file
BIN
uploads/2176fe75251a06cf07a08e32a845b537
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
BIN
uploads/240f5f275adf05f94ee3cb81d3a4c829
Normal file
BIN
uploads/240f5f275adf05f94ee3cb81d3a4c829
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
BIN
uploads/3d2a70806f726d28bd1814bce39b0906
Normal file
BIN
uploads/3d2a70806f726d28bd1814bce39b0906
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
BIN
uploads/866ff6699ca2b462d73b20c4d150b2e0
Normal file
BIN
uploads/866ff6699ca2b462d73b20c4d150b2e0
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
BIN
uploads/8930208f2d9cc158282fc31953abc6a3
Normal file
BIN
uploads/8930208f2d9cc158282fc31953abc6a3
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
BIN
uploads/d6d36b979d636c9f38a712bd3c298563
Normal file
BIN
uploads/d6d36b979d636c9f38a712bd3c298563
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
105
utils/mailer.js
Normal file
105
utils/mailer.js
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
import nodemailer from "nodemailer";
|
||||||
|
|
||||||
|
//
// Create reusable transporter object
//
// Connection settings may be supplied through the environment
// (SMTP_HOST, SMTP_PORT, SMTP_USER, SMTP_PASS); the previous hard-coded values
// remain as fallbacks so existing deployments keep working unchanged.
// NOTE(review): SMTP_PASS is an assumed variable name — confirm against the
// deployment's .env before relying on it.
//
const smtpHost = process.env.SMTP_HOST || "mail.crawlerx.co"; // your Hestia mail host
const smtpPort = Number(process.env.SMTP_PORT) || 587; // 587 = STARTTLS

// Use the environment credentials only when BOTH parts are present, so a
// user taken from the environment is never paired with the fallback password.
const smtpAuth =
  process.env.SMTP_USER && process.env.SMTP_PASS
    ? { user: process.env.SMTP_USER, pass: process.env.SMTP_PASS }
    : {
        user: "info@crawlerx.co",
        // FIXME(security): this mailbox password is committed to source
        // control. Rotate it and provide it via the environment instead.
        pass: "CrawlerX@2025",
      };

export const mailer = nodemailer.createTransport({
  host: smtpHost,
  port: smtpPort,
  secure: smtpPort === 465, // implicit TLS only on 465; must be false for 587
  auth: smtpAuth,
  name: "mail.crawlerx.co", // explicitly set hostname
  tls: {
    // NOTE(security): disables certificate verification so self-signed certs
    // are accepted; this also removes protection against man-in-the-middle.
    rejectUnauthorized: false,
  },
  logger: true, // optional: logs connection steps
  debug: true, // optional: debug SMTP connection
});
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Send the welcome email that confirms a successful signup.
 *
 * Best-effort: a delivery failure is logged and swallowed, so the returned
 * promise always resolves (a caller's `.catch()` will not fire on SMTP errors).
 *
 * @param {string} toEmail - Recipient address.
 * @returns {Promise<void>}
 */
export async function sendSignupMail(toEmail) {
  // Without this fallback the header becomes `"CrawlerX" <undefined>` whenever
  // SMTP_USER is unset; default to the mailbox this module authenticates as.
  const sender = process.env.SMTP_USER || "info@crawlerx.co";

  try {
    await mailer.sendMail({
      from: `"CrawlerX" <${sender}>`,
      to: toEmail,
      subject: "Welcome to CrawlerX",
      html: `
        <h2>Welcome!</h2>
        <p>Your signup was successful. You can now log in and start using the app.</p>
      `,
    });
    console.log(`✅ Signup email sent to ${toEmail}`);
  } catch (err) {
    console.error("❌ Error sending signup email:", err);
  }
}
|
||||||
|
|
||||||
|
/**
 * Email a password-reset link of the form
 * `<FRONTEND_URL>/reset-password?email=…&token=…`.
 *
 * Both query values are URL-encoded so characters such as `+`, `&`, `=` or `"`
 * in the address or token cannot corrupt the link or break out of the
 * surrounding HTML attribute.
 *
 * Best-effort: a delivery failure is logged and swallowed, so the returned
 * promise always resolves.
 *
 * @param {string} email - Recipient address; also embedded in the link.
 * @param {string} token - Reset token embedded in the link.
 * @returns {Promise<void>}
 */
export async function sendResetPasswordMail(email, token) {
  // Avoid a `<undefined>` sender when SMTP_USER is unset; default to the
  // mailbox this module authenticates as.
  const sender = process.env.SMTP_USER || "info@crawlerx.co";

  try {
    // URLSearchParams applies application/x-www-form-urlencoded escaping.
    const query = new URLSearchParams({ email, token });
    const resetURL = `${process.env.FRONTEND_URL}/reset-password?${query}`;

    await mailer.sendMail({
      from: `"CrawlerX" <${sender}>`,
      to: email,
      subject: "Reset your password",
      html: `
        <p>You requested a password reset.</p>
        <p>Click here to reset: <a href="${resetURL}">${resetURL}</a></p>
        <p>This link is valid for 1 hour.</p>
      `,
    });
    console.log(`✅ Reset password email sent to ${email}`);
  } catch (err) {
    console.error("❌ Error sending reset password email:", err);
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Characters that must be replaced before text is placed inside HTML markup.
const CAKE_MAIL_HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

/** Escape a value for safe inclusion in HTML text content. */
function escapeCakeMailHtml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => CAKE_MAIL_HTML_ESCAPES[ch]);
}

/**
 * Send a cake order confirmation email.
 *
 * `orderData` is read as `{ [category]: { [flavour]: quantity } }` and is
 * rendered as one section per category with one bullet line per flavour.
 *
 * Uses the module's shared `mailer` transporter rather than building a new
 * transporter (with a second copy of the credentials) on every call. The
 * order summary is HTML-escaped in the HTML body because its keys and values
 * come from the request; the plain-text body carries it verbatim.
 *
 * Best-effort: a failure is logged and swallowed, so the returned promise
 * always resolves.
 *
 * @param {string} toEmail - Recipient address.
 * @param {Object<string, Object<string, number|string>>} orderData - Order contents.
 * @returns {Promise<void>}
 */
export const sendCakeOrderMail = async (toEmail, orderData) => {
  try {
    const orderItems = Object.entries(orderData)
      .map(([category, flavours]) => {
        const items = Object.entries(flavours)
          .map(([flavour, qty]) => `• ${flavour}: ${qty}`)
          .join("\n");
        return `${category}:\n${items}`;
      })
      .join("\n\n");

    const mailOptions = {
      from: `"Maison de Treats" <info@crawlerx.co>`,
      to: toEmail,
      subject: "🎉 Your Cake Order Confirmation",
      text: `Thank you for your order! Here are the details:\n\n${orderItems}`,
      html: [
        "<h2>Thank you for your order!</h2>",
        "<p>Here are your cake order details:</p>",
        `<pre>${escapeCakeMailHtml(orderItems)}</pre>`,
      ].join("\n"),
    };

    await mailer.sendMail(mailOptions);
    console.log("Cake order email sent to", toEmail);
  } catch (err) {
    console.error("Failed to send cake order email:", err);
  }
};
|
||||||
20
utils/sitemap.js
Normal file
20
utils/sitemap.js
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
import Sitemapper from "sitemapper";
|
||||||
|
import normalizeUrl from "normalize-url";
|
||||||
|
|
||||||
|
/**
 * Collect the URLs listed in `<origin>/sitemap.xml` for the site hosting `startUrl`.
 *
 * Each listed URL is passed through normalize-url with `stripHash: true`;
 * entries that fail to normalize are dropped. If fetching or reading the
 * sitemap fails for any reason, an empty array is returned.
 *
 * @param {string} startUrl - Any absolute URL on the target site; only its origin is used.
 * @returns {Promise<string[]>} Normalized sitemap URLs, or [] when none could be read.
 * @throws {TypeError} If `startUrl` is not a valid absolute URL (parsed outside the try).
 */
export async function getSitemapUrls(startUrl) {
  const { origin } = new URL(startUrl);
  const fetcher = new Sitemapper({ url: `${origin}/sitemap.xml`, timeout: 15000 });

  // Yields a one-element array on success, or an empty array when
  // normalize-url rejects the entry.
  const normalizeOrSkip = (raw) => {
    try {
      return [normalizeUrl(raw, { stripHash: true })];
    } catch {
      return [];
    }
  };

  try {
    const response = await fetcher.fetch();
    const normalized = [];
    for (const raw of response.sites || []) {
      normalized.push(...normalizeOrSkip(raw));
    }
    return normalized;
  } catch {
    return [];
  }
}
|
||||||
8
utils/stripe.js
Normal file
8
utils/stripe.js
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import Stripe from "stripe";
|
||||||
|
import dotenv from "dotenv";
|
||||||
|
|
||||||
|
// Load .env here as well, so STRIPE_SECRET_KEY is available no matter which
// module imports this file first.
dotenv.config();

// API version passed to the Stripe client.
const stripeClientOptions = { apiVersion: "2024-06-20" };
const stripeSecretKey = process.env.STRIPE_SECRET_KEY;

/** Shared Stripe client built from STRIPE_SECRET_KEY. */
export const stripe = new Stripe(stripeSecretKey, stripeClientOptions);
|
||||||
11
utils/urlHelpers.js
Normal file
11
utils/urlHelpers.js
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
/**
 * Decide whether `candidate` points at the same site as `base`.
 *
 * `candidate` is resolved against `base`, so relative links are accepted.
 * A link is internal when it uses http(s) and its hostname equals the base
 * hostname once a leading "www." is ignored on both sides. Ports and paths
 * are not compared.
 *
 * @param {string} base - Absolute URL of the site being crawled.
 * @param {string} candidate - Absolute or relative link to test.
 * @returns {boolean} true for same-site http(s) links; false otherwise,
 *   including when either value cannot be parsed as a URL.
 */
export function isInternal(base, candidate) {
  let site;
  let link;
  try {
    site = new URL(base);
    link = new URL(candidate, base);
  } catch {
    return false;
  }

  // Reject mailto:, javascript:, ftp:, etc.
  if (link.protocol !== "http:" && link.protocol !== "https:") {
    return false;
  }

  const withoutWww = (host) => host.replace(/^www\./i, "");
  return withoutWww(site.hostname) === withoutWww(link.hostname);
}
|
||||||
Loading…
x
Reference in New Issue
Block a user