Files
mintel.me/apps/web/scripts/clone-recursive.ts
Marc Mintel 103d71851c
Some checks failed
🧪 CI (QA) / 🧪 Quality Assurance (push) Failing after 1m3s
chore: overhaul infrastructure and integrate @mintel packages
- Restructure to pnpm monorepo (site moved to apps/web)
- Integrate @mintel/tsconfig, @mintel/eslint-config, @mintel/husky-config
- Implement Docker service architecture (Varnish, Directus, Gatekeeper)
- Setup environment-aware Gitea Actions deployment
2026-02-05 14:18:51 +01:00

245 lines
11 KiB
TypeScript

// @ts-ignore
import scrape from 'website-scraper';
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// ES modules do not provide `__filename` / `__dirname`, so rebuild both from
// `import.meta.url`. `__dirname` is the directory containing this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Scraper plugin that maps a resource URL to a local, relative filename.
 *
 * Only the URL's pathname is used; the hostname and the query string are
 * dropped, so every resource lands under a single output root.
 *
 * - A pathname ending in `/` gets `index.html` appended.
 * - An extension-less pathname gets `.html` appended when the resource reports
 *   itself as HTML; other extension-less resources keep their pathname as is.
 * - One leading slash is removed and the characters `: * ? " < > |` are
 *   replaced with `_`.
 */
class CustomFilenameGeneratorPlugin {
  apply(registerAction: any) {
    registerAction('generateFilename', ({ resource }: any) => {
      const { pathname } = new URL(resource.url);
      const hasExtension = path.extname(pathname) !== '';

      // Decide what (if anything) must be appended to obtain a file name.
      let suffix = '';
      if (pathname.endsWith('/')) {
        suffix = 'index.html';
      } else if (!hasExtension && resource.isHtml()) {
        suffix = '.html';
      }

      // Make the path relative, then swap out characters that are unsafe on disk.
      const relativePath = (pathname + suffix).replace(/^\//, '');
      return { filename: relativePath.replace(/[:*?"<>|]/g, '_') };
    });
  }
}
/**
 * CLI entry point: recursively clones the site given as `process.argv[2]` into a
 * local directory and then post-processes the saved HTML with `sanitizeHtmlFiles`.
 *
 * The output directory is `process.argv[3]` (resolved against the current working
 * directory) or, by default, `../cloned-websites/<sanitized-hostname>` relative to
 * this script. An existing output directory is deleted before cloning.
 *
 * The process exits with code 1 when the URL is missing or invalid, when the
 * output directory is (or contains) the current working directory, or when
 * scraping fails.
 */
async function run(): Promise<void> {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Usage: npm run clone-website <URL> [output-dir]');
    process.exit(1);
  }

  // `new URL` throws on malformed input; report that as a usage error instead of
  // letting it escape as an unhandled rejection.
  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`Invalid URL: ${targetUrl}`);
    console.error('Usage: npm run clone-website <URL> [output-dir]');
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, '../cloned-websites', safeDomain);

  if (fs.existsSync(outputDir)) {
    // Safety net: never recursively delete the directory we are running from or
    // one of its ancestors (e.g. when "." or "/" is passed as output-dir).
    const cwdFromOutput = path.relative(outputDir, process.cwd());
    const cwdIsOutside =
      cwdFromOutput === '..' ||
      cwdFromOutput.startsWith(`..${path.sep}`) ||
      path.isAbsolute(cwdFromOutput);
    if (!cwdIsOutside) {
      console.error(`Refusing to delete ${outputDir}: it is or contains the current working directory.`);
      process.exit(1);
    }
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  // Hosts that serve Google Fonts stylesheets and font files; matched exactly so
  // look-alike hostnames that merely contain these strings are not let through.
  const googleFontHosts = ['fonts.googleapis.com', 'fonts.gstatic.com'];

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false
      }),
      // Logs every saved resource and every resource error.
      new class LoggerPlugin {
        apply(registerAction: any) {
          registerAction('onResourceSaved', ({ resource }: any) => {
            console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
          });
          registerAction('onResourceError', ({ resource, error }: any) => {
            console.error(` ❌ Error: ${resource.url} - ${error.message}`);
          });
        }
      },
      // Custom filename generation to avoid "https:/" folders: filenames are
      // derived from the URL pathname only.
      new class FilenamePlugin {
        apply(registerAction: any) {
          registerAction('generateFilename', ({ resource }: any) => {
            const u = new URL(resource.url);
            const isInternal = u.hostname === domain;
            let filename = u.pathname;

            if (filename.endsWith('/')) {
              filename += 'index.html';
            } else if (!path.extname(filename) && isInternal) {
              // Internal link without an extension: treat it as a folder. The
              // hostname is compared exactly; a substring test on the whole URL
              // would also match external URLs that merely mention the domain.
              filename += '/index.html';
            }

            // Keep external assets apart so they cannot collide with site paths.
            if (!isInternal) {
              filename = `_external/${u.hostname}${filename}`;
            }

            // Sanitize each path segment separately so the folder structure survives.
            filename = filename
              .split('/')
              .map(part => part.replace(/[^a-z0-9._-]/gi, '_'))
              .join('/');

            // Filenames must be relative to the output directory.
            if (filename.startsWith('/')) filename = filename.substring(1);

            // An empty name falls back to the root index page.
            return { filename: filename || 'index.html' };
          });
        }
      }
    ],
    urlFilter: (url: string) => {
      let u: URL;
      try {
        u = new URL(url);
      } catch {
        // Unparseable URLs cannot be fetched; skip them instead of aborting the crawl.
        return false;
      }
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts = googleFontHosts.includes(u.hostname);
      // Allow assets from anywhere
      const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(u.pathname);
      // Allow fonts/css from common CDNs if standard extension check fails
      const isCommonAsset = ['/css/', '/js/', '/static/', '/assets/', '/uploads/'].some(segment =>
        u.pathname.includes(segment)
      );
      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'link[rel="preload"]', attr: 'href' },
      { selector: 'link[rel="prefetch"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'link[rel*="icon"]', attr: 'href' },
      { selector: 'link[rel="manifest"]', attr: 'href' },
      { selector: 'meta[property="og:image"]', attr: 'content' }
    ],
    request: {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
      }
    }
  };

  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`);

    // Post-processing: Sanitize HTML to remove Next.js hydration scripts
    // This prevents the static site from trying to "hydrate" and breaking images/links
    console.log('🧹 Sanitizing HTML files...');
    sanitizeHtmlFiles(outputDir);
    console.log(`open "${path.join(outputDir, 'index.html')}"`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}
/** Inline Next.js data payload: `<script id="__NEXT_DATA__" ...>...</script>`. */
const NEXT_DATA_SCRIPT_RE = /<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi;
/** External Next.js chunk scripts (hydration). */
const NEXT_CHUNK_SCRIPT_RE = /<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi;
/** External Next.js `*Manifest.js` scripts. */
const NEXT_MANIFEST_SCRIPT_RE = /<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi;
/** Breeze placeholder: `<div class="breeze-scripts-load" ...>URL</div>`. */
const BREEZE_PLACEHOLDER_RE = /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi;
/** A stylesheet `<link>` that already points at Google Fonts. */
const GOOGLE_FONT_STYLESHEET_RE = /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i;

/** Fallback fonts (Inter + Montserrat) injected when a page links no Google Fonts stylesheet. */
const FALLBACK_FONT_MARKUP = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap"><style>
:root { --main-font: 'Inter', sans-serif; --heading-font: 'Montserrat', sans-serif; }
body, .body-font, p, span, li, a { font-family: var(--main-font) !important; }
h1, h2, h3, h4, h5, h6, .title-font, .heading-font { font-family: var(--heading-font) !important; }
</style>`;

/** Script that forces the first `.products` element into a single-column layout on load. */
const SINGLE_COLUMN_PRODUCTS_SCRIPT = `
<script>
document.addEventListener('DOMContentLoaded', function() {
const products = document.querySelector('.products');
if (products) {
products.classList.remove(...Array.from(products.classList).filter(c => c.startsWith('columns-')));
products.classList.add('columns-1');
products.setAttribute('data-n-desktop-columns', '1');
}
});
</script>`;

/**
 * Turns the URL found inside a Breeze placeholder into a real tag: a stylesheet
 * `<link>` for `.css` URLs, a `<script>` otherwise.
 */
function breezePlaceholderToTag(rawUrl: string): string {
  // The placeholder text may be padded with whitespace.
  const url = rawUrl.trim();
  // Decide on the path only, so "style.css?ver=1.2" is still recognised as CSS.
  const pathOnly = url.replace(/[?#][\s\S]*$/, '').toLowerCase();
  // The URL moves from element text into an attribute value: escape double quotes.
  const attrValue = url.replace(/"/g, '&quot;');
  return pathOnly.endsWith('.css')
    ? `<link rel="stylesheet" href="${attrValue}">`
    : `<script src="${attrValue}"></script>`;
}

/** Applies every clean-up step to one HTML document and returns the result. */
function sanitizeHtmlContent(html: string): string {
  let content = html
    // Remove Next.js data script and hydration scripts so the static copy does
    // not try to "hydrate".
    .replace(NEXT_DATA_SCRIPT_RE, '')
    .replace(NEXT_CHUNK_SCRIPT_RE, '')
    .replace(NEXT_MANIFEST_SCRIPT_RE, '')
    // Convert Breeze dynamic script/styles into actual tags.
    .replace(BREEZE_PLACEHOLDER_RE, (_match: string, url: string) => breezePlaceholderToTag(url));

  // Inject fallback fonts before the first `</head>` unless a Google Fonts
  // stylesheet is already linked. Replacer functions are used so the inserted
  // markup is never interpreted as a `$`-pattern.
  if (!GOOGLE_FONT_STYLESHEET_RE.test(content)) {
    content = content.replace('</head>', () => `${FALLBACK_FONT_MARKUP}</head>`);
  }

  // Force column layout on product pages.
  if (content.includes('class="products')) {
    content = content.replace('</body>', () => `${SINGLE_COLUMN_PRODUCTS_SCRIPT}</body>`);
  }

  return content;
}

/**
 * Recursively sanitizes every `*.html` file below `dir` in place.
 *
 * For each file it removes Next.js data/hydration scripts, converts Breeze
 * placeholders into real `<link>`/`<script>` tags, injects fallback fonts when
 * no Google Fonts stylesheet is linked, and adds a single-column layout script
 * to pages containing `class="products`. Files whose content does not change
 * are not rewritten. Symbolic links are neither followed nor processed.
 *
 * @param dir Directory to process; it must exist and be readable.
 */
function sanitizeHtmlFiles(dir: string) {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      sanitizeHtmlFiles(fullPath);
      continue;
    }
    if (!entry.isFile() || !entry.name.endsWith('.html')) continue;

    const original = fs.readFileSync(fullPath, 'utf8');
    const sanitized = sanitizeHtmlContent(original);
    if (sanitized !== original) {
      fs.writeFileSync(fullPath, sanitized);
    }
  }
}
// Start the CLI. `run` is async, so handle a rejection that escapes it explicitly
// instead of leaving a floating promise.
run().catch((error: unknown) => {
  console.error('❌ Error cloning website:', error);
  process.exit(1);
});