import { chromium } from 'playwright';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * File extensions that are treated as downloadable assets.
 * Shared by the capture step and the URL rewriting step so that every file
 * written to disk is also the file the rewritten markup points to.
 */
const ASSET_EXTENSION_RE =
  /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf|xml)$/i;

/**
 * Decides whether a URL should be mirrored locally.
 *
 * @param rawUrl - The URL exactly as it appeared (used for substring checks).
 * @param parsed - The same URL, parsed.
 * @returns `true` for known asset extensions, WordPress asset folders and Google Fonts CSS.
 */
function isAssetUrl(rawUrl: string, parsed: URL): boolean {
  return (
    ASSET_EXTENSION_RE.test(parsed.pathname) ||
    rawUrl.includes('wp-content') ||
    rawUrl.includes('wp-includes') ||
    rawUrl.includes('fonts.googleapis.com')
  );
}

/**
 * Converts a URL pathname into a relative on-disk path: the leading slash is
 * dropped and, per segment, every character outside `[a-z0-9._-]` becomes `_`.
 */
function sanitizeAssetPath(pathname: string): string {
  return pathname
    .replace(/^\//, '')
    .split('/')
    .map((segment) => segment.replace(/[^a-z0-9._-]/gi, '_'))
    .join('/');
}

/**
 * Maps an absolute (`http…`) or protocol-relative (`//…`) asset URL to its
 * location below `./assets/`, relative to the saved HTML file.
 * Anything else (relative URLs, non-assets, unparseable input) is returned unchanged.
 */
function rewriteUrl(fullUrl: string): string {
  const isProtocolRelative = fullUrl.startsWith('//');
  if (!fullUrl.startsWith('http') && !isProtocolRelative) return fullUrl;

  const actualUrl = isProtocolRelative ? `https:${fullUrl}` : fullUrl;
  try {
    const parsed = new URL(actualUrl);
    if (isAssetUrl(actualUrl, parsed)) {
      return `./assets/${parsed.hostname}/${sanitizeAssetPath(parsed.pathname)}`;
    }
  } catch {
    // Not a parseable URL (e.g. text that merely starts with "http"): leave it untouched.
  }
  return fullUrl;
}

/** Recursively lists every `.css` file below `dir`. */
function collectCssFiles(dir: string): string[] {
  return fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) return collectCssFiles(fullPath);
    return entry.name.endsWith('.css') ? [fullPath] : [];
  });
}

/** Renders an unknown thrown value as a short log-friendly string. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Parses `raw` as a URL, returning `undefined` instead of throwing on bad input. */
function parseUrl(raw: string): URL | undefined {
  try {
    return new URL(raw);
  } catch {
    return undefined;
  }
}

/**
 * CLI entry point: clones the page given as the first command-line argument.
 *
 * Loads the page in headless Chromium, stores every successfully loaded asset
 * under `../cloned-websites/<domain>/assets/<host>/<path>`, rewrites asset
 * references in the HTML and in the captured CSS files to those local copies,
 * and writes the HTML to `../cloned-websites/<domain>/<slug>.html`.
 *
 * Exits with status 1 when the argument is missing or is not a valid URL, and
 * sets a non-zero exit code when cloning fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error('Usage: npm run clone-page <url>');
    process.exit(1);
  }

  // Tolerate stray quotes/semicolons left over from copy-pasting the URL.
  const targetUrl = rawUrl.trim().replace(/[;'"]+$/, '');
  const urlObj = parseUrl(targetUrl);
  if (!urlObj) {
    console.error(`Invalid URL: ${targetUrl}`);
    process.exit(1);
  }

  const safeDomain = urlObj.hostname.replace(/[^a-z0-9-]/gi, '_');
  const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
  const assetsDir = path.join(domainDir, 'assets');
  fs.mkdirSync(assetsDir, { recursive: true });

  const slug = urlObj.pathname.split('/').filter(Boolean).join('-') || 'index';
  const htmlFilename = `${slug}.html`;

  console.log(`🚀 CLONING: ${targetUrl}`);

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent:
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    // Asset capture: mirror every successfully loaded asset to disk.
    page.on('response', async (response) => {
      const url = response.url();
      if (response.status() !== 200 || url === targetUrl) return;
      try {
        const parsed = new URL(url);
        if (!isAssetUrl(url, parsed)) return;

        const buffer = await response.body();
        const fileDest = path.join(assetsDir, parsed.hostname, sanitizeAssetPath(parsed.pathname));
        fs.mkdirSync(path.dirname(fileDest), { recursive: true });
        // Existing files are overwritten so the clone always holds the latest copy.
        fs.writeFileSync(fileDest, buffer);
      } catch (error) {
        // Best effort: one asset that cannot be stored must not abort the clone.
        console.warn(`Skipped asset ${url}: ${describeError(error)}`);
      }
    });

    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });

    // Scroll through the page in steps so lazily loaded content is requested.
    await page.evaluate(async () => {
      await new Promise<void>((resolve) => {
        const distance = 400;
        let totalHeight = 0;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          // Stop at the bottom, or after 30000px to bound endlessly growing pages.
          if (totalHeight >= scrollHeight || totalHeight > 30000) {
            clearInterval(timer);
            window.scrollTo(0, 0);
            resolve();
          }
        }, 200);
      });
    });
    await page.waitForLoadState('networkidle');
    await page.waitForTimeout(5000); // 5 seconds extra for lazy scripts

    let content = await page.content();

    // 1. Rewrite src, href, content, poster
    content = content.replace(
      /(src|href|content|poster)=["']([^"']+)["']/gi,
      (match: string, attr: string, url: string) => {
        // Keep anchor links or extension-less paths as they are.
        if (attr === 'href' && !url.includes('.') && !url.includes('http')) return match;
        return `${attr}="${rewriteUrl(url)}"`;
      },
    );

    // 2. Rewrite srcset
    content = content.replace(/srcset=["']([^"']+)["']/gi, (_match: string, srcset: string) => {
      const parts = srcset.split(',').map((part) => {
        const trimmed = part.trim();
        const lastSpaceIndex = trimmed.lastIndexOf(' ');
        if (lastSpaceIndex === -1) return rewriteUrl(trimmed);
        const url = trimmed.substring(0, lastSpaceIndex);
        const descriptor = trimmed.substring(lastSpaceIndex);
        return `${rewriteUrl(url)}${descriptor}`;
      });
      return `srcset="${parts.join(', ')}"`;
    });

    // 3. Rewrite url(...) in inline styles and <style> blocks.
    // The original quoting is preserved and untouched URLs are left exactly as
    // they were: forcing double quotes would terminate a surrounding
    // style="..." attribute and corrupt the markup.
    content = content.replace(
      /url\((["']?)([^"')]+)(["']?)\)/gi,
      (match: string, openQuote: string, url: string, closeQuote: string) => {
        const rewritten = rewriteUrl(url);
        return rewritten === url ? match : `url(${openQuote}${rewritten}${closeQuote})`;
      },
    );

    // 4. Salient/Industrial Overrides
    // NOTE(review): the override markup and the anchor string it should be
    // inserted at are empty in the reviewed source, so this only prepends the
    // template's whitespace to the document. Restore the intended markup and
    // anchor — TODO confirm.
    const fixes = ` `;
    content = content.replace('', `${fixes}`);

    // Link Nuker: only if it looks like an internal/external link, not assets
    // NOTE(review): the start of this pattern and its replacement string are
    // garbled in the reviewed source; they are reconstructed here as "point
    // anchor links at #". Confirm against the intended behaviour.
    content = content.replace(
      /<a\b([^>]*)\bhref=["'](https?:\/\/[^"']+|(?![^"']*\.(css|js|png|jpg|jpeg|gif|svg|pdf))[./][^"']*)["']/gi,
      '<a$1href="#"',
    );

    // NOTE(review): the element name at the start of this pattern and the tags
    // returned below are garbled in the reviewed source; reconstructed from the
    // `.css` check as <link> for stylesheets and <script> otherwise — TODO confirm.
    content = content.replace(
      /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
      (_match: string, url: string) => {
        const u = url.trim();
        const cleanUrl = u.split('?')[0];
        if (cleanUrl.endsWith('.css')) return `<link rel="stylesheet" href="${u}">`;
        return `<script src="${u}"></script>`;
      },
    );

    fs.writeFileSync(path.join(domainDir, htmlFilename), content);

    // 5. CSS rewriting: point absolute URLs inside captured CSS at the local copies.
    // The target is resolved exactly like the capture step resolves it (URL
    // hostname + sanitized pathname) and expressed relative to the CSS file, so
    // it works when the clone is opened from disk or served from a sub-folder.
    for (const cssFile of collectCssFiles(assetsDir)) {
      const cssDir = path.dirname(cssFile);
      const original = fs.readFileSync(cssFile, 'utf8');
      const rewritten = original.replace(
        /url\(["']?(https?:\/\/[^\/"']+\/[^"'\)]+)["']?\)/gi,
        (match: string, absoluteUrl: string) => {
          const parsed = parseUrl(absoluteUrl);
          if (!parsed) return match;
          const target = path.join(assetsDir, parsed.hostname, sanitizeAssetPath(parsed.pathname));
          const relative = path.relative(cssDir, target).split(path.sep).join('/');
          // Keep the fragment (e.g. SVG font ids); the query string is dropped
          // because captured files are stored without it.
          return `url("${relative}${parsed.hash}")`;
        },
      );
      if (rewritten !== original) fs.writeFileSync(cssFile, rewritten);
    }

    console.log(`\n✅ CLONED: ${htmlFilename}`);
  } catch (err) {
    console.error('❌ FAILED:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

run().catch((err: unknown) => {
  console.error('❌ FAILED:', err);
  process.exitCode = 1;
});