Files
mintel.me/apps/web/scripts/clone-recursive.ts
Marc Mintel ecea90dc91
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 1m27s
Build & Deploy / 🏗️ Build (push) Failing after 1m31s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
chore: stabilize apps/web (lint, build, typecheck fixes)
2026-02-11 11:56:13 +01:00

240 lines
8.5 KiB
TypeScript

// @ts-ignore
import scrape from "website-scraper";
// @ts-ignore
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
// Recreate CommonJS-style `__filename` / `__dirname` values from this module's
// `import.meta.url`; `__dirname` is used below to locate the default output folder.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * CLI entry point: recursively clones a website into a local directory and
 * then post-processes the downloaded HTML with `sanitizeHtmlFiles`.
 *
 * Command-line arguments (read from `process.argv`):
 *  - `argv[2]` — URL to clone (required; must parse as an http/https URL).
 *  - `argv[3]` — optional output directory, resolved against the current
 *    working directory. Defaults to `../cloned-websites/<hostname>` relative to
 *    this script, with non `[a-z0-9-]` characters of the hostname replaced by `_`.
 *
 * Side effects: deletes the output directory if it already exists, writes the
 * cloned files, logs progress to the console, and terminates the process with
 * exit code 1 on invalid arguments or when the scrape fails.
 */
async function run(): Promise<void> {
  const usage = "Usage: npm run clone-website <URL> [output-dir]";

  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error(usage);
    process.exit(1);
  }

  // Validate the URL up front so a typo produces a usage hint instead of a
  // stack trace, and so nothing is deleted before we know the input is usable.
  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`Invalid URL: ${targetUrl}`);
    console.error(usage);
    process.exit(1);
  }
  if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
    console.error(`Unsupported URL protocol "${urlObj.protocol}" in ${targetUrl}`);
    console.error(usage);
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_");
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, "../cloned-websites", safeDomain);

  // Start from a clean slate: any previous clone in the same location is removed.
  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
          ],
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false,
      }),
      // Logs every saved resource and every resource-level error.
      new (class LoggerPlugin {
        apply(registerAction: any) {
          registerAction("onResourceSaved", ({ resource }: any) => {
            console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
          });
          registerAction("onResourceError", ({ resource, error }: any) => {
            console.error(` ❌ Error: ${resource.url} - ${error.message}`);
          });
        }
      })(),
      // Custom filename generation to avoid "https:/" folders
      new (class FilenamePlugin {
        apply(registerAction: any) {
          registerAction("generateFilename", ({ resource }: any) => {
            const u = new URL(resource.url);
            // Compare hostnames exactly: a substring test on the full URL would
            // also match e.g. "cdn.<domain>" or a query string containing it.
            const isInternal = u.hostname === domain;
            let filename = u.pathname;

            // normalize
            if (filename.endsWith("/")) filename += "index.html";
            else if (!path.extname(filename) && isInternal)
              filename += "/index.html"; // Assume folder if internal link without ext

            // If it's an external asset, put it in a separate folder
            if (!isInternal) {
              filename = `_external/${u.hostname}${filename}`;
            }

            // Sanitize each path segment individually so "/" separators survive
            filename = filename
              .split("/")
              .map((part) => part.replace(/[^a-z0-9._-]/gi, "_"))
              .join("/");

            // Remove leading slash
            if (filename.startsWith("/")) filename = filename.substring(1);

            // Handle "Unnamed page" by checking if empty
            if (!filename || filename === "index.html")
              return { filename: "index.html" };
            return { filename };
          });
        }
      })(),
    ],
    urlFilter: (url: string) => {
      let u: URL;
      try {
        u = new URL(url);
      } catch {
        // An unparseable link should be skipped, not abort the whole crawl.
        return false;
      }
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts =
        u.hostname.includes("fonts.googleapis.com") ||
        u.hostname.includes("fonts.gstatic.com");
      // Allow assets from anywhere
      const isAsset =
        /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(
          u.pathname,
        );
      // Allow fonts/css from common CDNs if standard extension check fails
      const isCommonAsset =
        u.pathname.includes("/css/") ||
        u.pathname.includes("/js/") ||
        u.pathname.includes("/static/") ||
        u.pathname.includes("/assets/") ||
        u.pathname.includes("/uploads/");
      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
    sources: [
      { selector: "img", attr: "src" },
      { selector: "img", attr: "srcset" },
      { selector: "source", attr: "src" },
      { selector: "source", attr: "srcset" },
      { selector: 'link[rel="stylesheet"]', attr: "href" },
      { selector: 'link[rel="preload"]', attr: "href" },
      { selector: 'link[rel="prefetch"]', attr: "href" },
      { selector: "script", attr: "src" },
      { selector: "video", attr: "src" },
      { selector: "video", attr: "poster" },
      { selector: "iframe", attr: "src" },
      { selector: 'link[rel*="icon"]', attr: "href" },
      { selector: 'link[rel="manifest"]', attr: "href" },
      { selector: 'meta[property="og:image"]', attr: "content" },
    ],
    request: {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
      },
    },
  };

  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(
      `\n✅ Successfully cloned ${result.length} resources to ${outputDir}`,
    );
    // Post-processing: Sanitize HTML to remove Next.js hydration scripts
    // This prevents the static site from trying to "hydrate" and breaking images/links
    console.log("🧹 Sanitizing HTML files...");
    sanitizeHtmlFiles(outputDir);
    console.log(`open "${path.join(outputDir, "index.html")}"`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
// Markup injected before </head> when a page has no Google Fonts stylesheet.
// We inject Inter and Montserrat as safe defaults for industrial/modern sites.
const FALLBACK_FONT_MARKUP = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap"><style>
:root { --main-font: 'Inter', sans-serif; --heading-font: 'Montserrat', sans-serif; }
body, .body-font, p, span, li, a { font-family: var(--main-font) !important; }
h1, h2, h3, h4, h5, h6, .title-font, .heading-font { font-family: var(--heading-font) !important; }
</style>`;

// Script injected before </body> on pages containing a `class="products…` element;
// it strips every `columns-*` class from the first `.products` node and forces one column.
const COLUMN_LAYOUT_SCRIPT = `
<script>
document.addEventListener('DOMContentLoaded', function() {
const products = document.querySelector('.products');
if (products) {
products.classList.remove(...Array.from(products.classList).filter(c => c.startsWith('columns-')));
products.classList.add('columns-1');
products.setAttribute('data-n-desktop-columns', '1');
}
});
</script>`;

/** Applies all static-clone clean-ups to one HTML document and returns the result. */
function sanitizeHtmlContent(html: string): string {
  let content = html
    // Remove Next.js data script
    .replace(/<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi, "")
    // Remove Next.js chunk scripts (hydration)
    // match <script src="..._next/static/chunks..." ...
    .replace(
      /<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi,
      "",
    )
    .replace(
      /<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi,
      "",
    );

  // Convert Breeze dynamic script/styles into actual tags if possible
  // match <div class="breeze-scripts-load" ...>URL</div>
  content = content.replace(
    /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
    (_match: string, rawUrl: string) => {
      const url = rawUrl.trim();
      // Decide by the path only, so "style.css?ver=1.2" or "style.css#x" is
      // still recognised as a stylesheet instead of being emitted as a script.
      const pathOnly = url.split(/[?#]/, 1)[0] ?? url;
      if (pathOnly.toLowerCase().endsWith(".css"))
        return `<link rel="stylesheet" href="${url}">`;
      return `<script src="${url}"></script>`;
    },
  );

  // Inject Fonts (Fix for missing dynamic fonts)
  // Check specifically for a stylesheet link to google fonts
  const hasGoogleFontStylesheet =
    /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test(
      content,
    );
  if (!hasGoogleFontStylesheet) {
    // Replacer functions are used so the inserted markup is never interpreted
    // as a replacement pattern (e.g. "$&").
    content = content.replace("</head>", () => `${FALLBACK_FONT_MARKUP}</head>`);
  }

  // Force column layout on product pages
  if (content.includes('class="products')) {
    content = content.replace("</body>", () => `${COLUMN_LAYOUT_SCRIPT}</body>`);
  }

  return content;
}

/**
 * Recursively rewrites every `.html` file below `dir` in place so the cloned
 * site works as a static copy:
 *  - removes the Next.js `__NEXT_DATA__` script and `_next/static` chunk /
 *    manifest script tags;
 *  - turns `breeze-scripts-load` placeholder divs into real `<link>` /
 *    `<script>` tags;
 *  - injects fallback Google Fonts markup when no Google Fonts stylesheet link
 *    is present;
 *  - injects a single-column layout script on pages with a `products` element.
 *
 * Only regular files and real directories are processed; symbolic links are
 * not followed. Non-HTML files are left untouched.
 *
 * @param dir Directory to process (must exist).
 */
function sanitizeHtmlFiles(dir: string) {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (entry.isFile() && entry.name.endsWith(".html")) {
      const original = fs.readFileSync(fullPath, "utf8");
      fs.writeFileSync(fullPath, sanitizeHtmlContent(original));
    }
  }
}
// Start the CLI. `run` is async, so attach an explicit rejection handler:
// anything thrown outside its own try/catch is logged and reported via exit code 1
// instead of surfacing as an unhandled promise rejection.
run().catch((error: unknown) => {
  console.error("❌ Unexpected error:", error);
  process.exit(1);
});