Files
mintel.me/scripts/clone-page.ts
Marc Mintel 8a7110e9ef
Some checks failed
Build & Deploy Mintel Blog / build-and-deploy (push) Failing after 2m14s
klz case study
2026-02-02 12:01:48 +01:00

201 lines
9.1 KiB
TypeScript

import { chromium } from 'playwright';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// Path of this module file, derived from its module URL via `fileURLToPath`.
const __filename = fileURLToPath(import.meta.url);
// Directory containing this script; `run()` resolves the `../cloned-websites` output folder against it.
const __dirname = path.dirname(__filename);
/**
 * File extensions treated as assets. Shared by the capture handler and the URL
 * rewriter so that every file that is saved is also linked locally (and vice versa).
 */
const ASSET_EXTENSION_PATTERN =
  /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf|xml)$/i;

/** URL substrings that mark a resource as an asset even without a known extension. */
const ASSET_URL_MARKERS = ['wp-content', 'wp-includes', 'fonts.googleapis.com'];

/** Markup injected right before `</head>`: font preconnects, stylesheet and font-family overrides. */
const FONT_OVERRIDES = `
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700;800;900&display=swap" rel="stylesheet">
<style>
:root { --main-font: 'Inter'; --heading-font: 'Montserrat'; }
body, p, li, a, span { font-family: 'Inter', sans-serif !important; }
h1, h2, h3, h4, h5, h6, .title-font { font-family: 'Montserrat', sans-serif !important; font-weight: 700 !important; }
</style>`;

/**
 * Decides whether a URL is mirrored locally.
 *
 * @param parsed - The parsed URL; its pathname is tested against the extension list.
 * @param rawUrl - The full URL string; searched for the marker substrings.
 * @returns `true` when the URL has an asset extension or contains a marker.
 */
function isAssetUrl(parsed: URL, rawUrl: string): boolean {
  return (
    ASSET_EXTENSION_PATTERN.test(parsed.pathname) ||
    ASSET_URL_MARKERS.some((marker) => rawUrl.includes(marker))
  );
}

/**
 * Turns a URL pathname into a relative file path: drops the leading slash and
 * replaces every character outside `[a-z0-9._-]` (case-insensitive) with `_`
 * in each segment.
 */
function sanitizeUrlPath(pathname: string): string {
  return pathname
    .replace(/^\//, '')
    .split('/')
    .map((segment) => segment.replace(/[^a-z0-9._-]/gi, '_'))
    .join('/');
}

/**
 * Maps an absolute (`http…`) or protocol-relative (`//…`) asset URL to its
 * location below `./assets/<hostname>/`. Anything else (relative URLs,
 * non-asset URLs, unparsable input) is returned unchanged.
 */
function rewriteUrl(fullUrl: string): string {
  if (!fullUrl.startsWith('http') && !fullUrl.startsWith('//')) return fullUrl;
  const actualUrl = fullUrl.startsWith('//') ? `https:${fullUrl}` : fullUrl;
  let parsed: URL;
  try {
    parsed = new URL(actualUrl);
  } catch {
    return fullUrl;
  }
  if (!isAssetUrl(parsed, actualUrl)) return fullUrl;
  return `./assets/${parsed.hostname}/${sanitizeUrlPath(parsed.pathname)}`;
}

/**
 * Rewrites asset URLs inside an HTML document so they point at the local mirror.
 *
 * Handles `src`/`href`/`content`/`poster` attributes (also when prefixed, e.g.
 * `data-src`), `srcset` candidate lists and CSS `url(...)` references.
 * The original quoting is always preserved: attribute values are matched with
 * their own delimiter, and `url(...)` keeps whatever quote style it had
 * (none, `'`, `"` or an HTML-escaped quote), so rewriting can never terminate
 * the surrounding attribute early.
 *
 * @param html - Serialized page markup.
 * @returns The markup with asset URLs replaced by `./assets/...` paths.
 */
function rewriteHtmlUrls(html: string): string {
  // 1. Plain URL attributes. Values are left byte-identical unless the URL actually changes.
  let out = html.replace(
    /(src|href|content|poster)=(?:"([^"]+)"|'([^']+)')/gi,
    (match: string, attr: string, dq?: string, sq?: string) => {
      const quote = dq !== undefined ? '"' : "'";
      const value = dq ?? sq ?? '';
      // Keep anchor links and extension-less relative paths untouched.
      if (attr === 'href' && !value.includes('.') && !value.includes('http')) return match;
      const rewritten = rewriteUrl(value);
      return rewritten === value ? match : `${attr}=${quote}${rewritten}${quote}`;
    },
  );

  // 2. srcset: comma-separated "<url> <descriptor>" candidates.
  out = out.replace(
    /srcset=(?:"([^"]+)"|'([^']+)')/gi,
    (_match: string, dq?: string, sq?: string) => {
      const quote = dq !== undefined ? '"' : "'";
      const candidates = (dq ?? sq ?? '').split(',').map((candidate) => {
        const trimmed = candidate.trim();
        const lastSpace = trimmed.lastIndexOf(' ');
        if (lastSpace === -1) return rewriteUrl(trimmed);
        // The descriptor keeps its leading space, so it can be appended directly.
        return `${rewriteUrl(trimmed.substring(0, lastSpace))}${trimmed.substring(lastSpace)}`;
      });
      return `srcset=${quote}${candidates.join(', ')}${quote}`;
    },
  );

  // 3. CSS url(...) in <style> blocks and style attributes.
  out = out.replace(
    /url\(\s*(&quot;|&#39;|&apos;|["']?)([^"')]+?)\1\s*\)/gi,
    (match: string, quote: string, value: string) => {
      const rewritten = rewriteUrl(value);
      return rewritten === value ? match : `url(${quote}${rewritten}${quote})`;
    },
  );

  return out;
}

/**
 * Applies the non-URL post-processing steps to the cloned markup: injects the
 * font overrides before `</head>`, points navigational `<a>` links at `#`
 * (links to css/js/image/pdf files are kept) and converts
 * `breeze-scripts-load` placeholder `<div>`s into real `<link>`/`<script>` tags.
 */
function applyHtmlOverrides(html: string): string {
  let out = html.replace('</head>', `${FONT_OVERRIDES}</head>`);

  // Link Nuker: only if it looks like an internal/external link, not assets
  out = out.replace(
    /<a\b([^>]*)\bhref=["'](https?:\/\/[^"']+|(?![^"']*\.(css|js|png|jpg|jpeg|gif|svg|pdf))[./][^"']*)["']/gi,
    '<a$1href="#"',
  );

  // Fix Breeze dynamic scripts
  out = out.replace(
    /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
    (_match: string, rawUrl: string) => {
      const resourceUrl = rawUrl.trim();
      const withoutQuery = resourceUrl.split('?')[0];
      if (withoutQuery.endsWith('.css')) return `<link rel="stylesheet" href="${resourceUrl}">`;
      return `<script src="${resourceUrl}"></script>`;
    },
  );

  return out;
}

/** Recursively lists every `*.css` file below `dir`. */
function collectCssFiles(dir: string): string[] {
  const found: string[] = [];
  for (const name of fs.readdirSync(dir)) {
    const fullPath = path.join(dir, name);
    if (fs.statSync(fullPath).isDirectory()) {
      found.push(...collectCssFiles(fullPath));
    } else if (name.endsWith('.css')) {
      found.push(fullPath);
    }
  }
  return found;
}

/**
 * Replaces absolute `http(s)://` references in a captured stylesheet with paths
 * into the local mirror.
 *
 * The emitted path is relative to the stylesheet's own directory, which is how
 * CSS resolves `url()`; the clone therefore works from `file://` as well as
 * from any web-server root. Host and path are derived with `URL` exactly like
 * the capture step does (hostname without port, query dropped), and a
 * `#fragment` is carried over unchanged.
 *
 * @param cssContent - Stylesheet source text.
 * @param cssFile - Location of the stylesheet inside the mirror.
 * @param assetsDir - Root of the mirror (`<clone>/assets`).
 * @returns The stylesheet text with absolute URLs localized. Whether the
 *   referenced file was actually captured is not checked.
 */
function rewriteCssUrls(cssContent: string, cssFile: string, assetsDir: string): string {
  const cssDir = path.dirname(cssFile);
  return cssContent.replace(
    /url\(["']?(https?:\/\/[^\/"']+\/[^"'\)]+)["']?\)/gi,
    (match: string, absoluteUrl: string) => {
      let parsed: URL;
      try {
        parsed = new URL(absoluteUrl);
      } catch {
        return match;
      }
      const target = path.join(assetsDir, parsed.hostname, sanitizeUrlPath(parsed.pathname));
      // Always use forward slashes in CSS, also on Windows.
      const relative = path.relative(cssDir, target).split(path.sep).join('/');
      return `url("${relative}${parsed.hash}")`;
    },
  );
}

/**
 * CLI entry point: clones the page given as first command-line argument into
 * `../cloned-websites/<domain>/` (relative to this script).
 *
 * Loads the page in headless Chromium, stores every asset response below
 * `assets/<hostname>/`, scrolls through the page to trigger lazy loading,
 * then writes the rewritten HTML and localizes absolute URLs in captured CSS.
 * Exits with status 1 when no URL is given; sets a non-zero exit code when the
 * URL is invalid or cloning fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error('Usage: npm run clone-page <URL>');
    process.exit(1);
  }

  // Strip stray quotes/semicolons that shells or copy-paste tend to append.
  const targetUrl = rawUrl.trim().replace(/[;'"]+$/, '');
  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`Invalid URL: ${targetUrl}`);
    process.exitCode = 1;
    return;
  }

  const safeDomain = urlObj.hostname.replace(/[^a-z0-9-]/gi, '_');
  const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
  const assetsDir = path.join(domainDir, 'assets');
  fs.mkdirSync(assetsDir, { recursive: true });

  const slug = urlObj.pathname.split('/').filter(Boolean).join('-') || 'index';
  const htmlFilename = `${slug}.html`;

  console.log(`🚀 CLONING: ${targetUrl}`);

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    });
    const page = await context.newPage();

    // Asset capture: persist every successful asset response (overwriting older copies).
    page.on('response', async (response) => {
      if (response.status() !== 200) return;
      const url = response.url();
      if (url === targetUrl) return;
      try {
        const parsed = new URL(url);
        if (!isAssetUrl(parsed, url)) return;
        const buffer = await response.body();
        const fileDest = path.join(assetsDir, parsed.hostname, sanitizeUrlPath(parsed.pathname));
        fs.mkdirSync(path.dirname(fileDest), { recursive: true });
        fs.writeFileSync(fileDest, buffer);
      } catch (e: unknown) {
        // Capturing is best-effort: report the miss but keep cloning.
        const reason = e instanceof Error ? e.message : String(e);
        console.warn(`⚠️ Could not save asset ${url}: ${reason}`);
      }
    });

    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });

    // Scroll down in 400px steps (capped at 30000px) to trigger lazy loading, then back to the top.
    // This callback runs inside the browser and must not reference anything from this module.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        const distance = 400;
        let totalHeight = 0;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight || totalHeight > 30000) {
            clearInterval(timer);
            window.scrollTo(0, 0);
            resolve(null);
          }
        }, 200);
      });
    });
    await page.waitForLoadState('networkidle');
    await page.waitForTimeout(5000); // 5 seconds extra for lazy scripts

    const content = applyHtmlOverrides(rewriteHtmlUrls(await page.content()));
    fs.writeFileSync(path.join(domainDir, htmlFilename), content);

    // Localize absolute URLs in all captured CSS files.
    for (const cssFile of collectCssFiles(assetsDir)) {
      const cssContent = fs.readFileSync(cssFile, 'utf8');
      fs.writeFileSync(cssFile, rewriteCssUrls(cssContent, cssFile, assetsDir));
    }

    console.log(`\n✅ CLONED: ${htmlFilename}`);
  } catch (err) {
    console.error('❌ FAILED:', err);
    // Let callers (npm scripts, CI) see that the clone did not succeed.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
// Script entry point. `run()` returns a promise; handle a rejection explicitly
// (e.g. the browser failing to launch) instead of leaving it floating, and
// signal the failure through the process exit code.
run().catch((err: unknown) => {
  console.error('❌ FAILED:', err);
  process.exitCode = 1;
});