import { chromium, type Page } from 'playwright';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
import axios from 'axios';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36';

/** Timeout (ms) applied to every direct HTTP download. */
const DOWNLOAD_TIMEOUT_MS = 15000;

/**
 * Upper bound on scroll-wave steps so pages that keep growing while being
 * scrolled (infinite feeds) cannot keep the clone running forever.
 */
const MAX_SCROLL_STEPS = 1000;

/**
 * Style block injected right before `</head>` of the saved page.
 * NOTE(review): the payload in the reviewed source was blank, so the intended
 * rules are unknown. This minimal rule only mirrors the inline body visibility
 * override applied during DOM hydration — extend it if more is required.
 */
const STABILITY_CSS =
  '<style data-clone-stability>body { opacity: 1 !important; visibility: visible !important; }</style>';

/** Escapes every regex metacharacter in `text` so it can be embedded in a `RegExp` source. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Makes a `/`-separated path safe for the local filesystem by replacing, in
 * each segment, every character outside `[a-z0-9._-]` (case-insensitive)
 * with `_`. Separators are preserved.
 */
function sanitizePath(rawPath: string): string {
  return rawPath
    .split('/')
    .map((segment) => segment.replace(/[^a-z0-9._-]/gi, '_'))
    .join('/');
}

/**
 * Downloads `url` into `assetsDir`, mirroring `<hostname><pathname>` as the
 * on-disk layout (the query string is not part of the local name).
 *
 * @param url Absolute or protocol-relative (`//host/...`) URL.
 * @param assetsDir Directory that receives the mirrored file tree.
 * @returns The page-relative reference (`./assets/...`) of the stored file, or
 *   `null` when the URL is not http(s), the response status is not 200, or
 *   any error occurs. Failures are deliberately silent so the caller keeps
 *   the original URL.
 */
async function downloadFile(url: string, assetsDir: string): Promise<string | null> {
  const target = url.startsWith('//') ? `https:${url}` : url;
  if (!target.startsWith('http')) return null;

  try {
    const parsed = new URL(target);
    // Host + path keeps same-named files from different origins apart.
    const relPath = sanitizePath(parsed.hostname + parsed.pathname);
    const dest = path.join(assetsDir, relPath);
    const localRef = `./assets/${relPath}`;

    // Already mirrored (this run or an earlier one): reuse it.
    if (fs.existsSync(dest)) return localRef;

    const res = await axios.get<ArrayBuffer>(target, {
      responseType: 'arraybuffer',
      headers: { 'User-Agent': USER_AGENT },
      timeout: DOWNLOAD_TIMEOUT_MS,
      // Never throw on HTTP status; non-200 is handled right below.
      validateStatus: () => true,
    });
    if (res.status !== 200) return null;

    fs.mkdirSync(path.dirname(dest), { recursive: true });
    fs.writeFileSync(dest, Buffer.from(res.data));
    return localRef;
  } catch {
    return null; // Fail silently, proceed with original URL
  }
}

/**
 * Downloads every asset referenced by a stylesheet through `url(...)` or a
 * quoted `@import "..."`, and rewrites each reference so it points at the
 * mirrored copy, relative to the mirrored location of the stylesheet itself.
 *
 * Despite its name this function does not descend into imported stylesheets;
 * `depth` only acts as a guard for callers that invoke it in a nested fashion.
 *
 * @param cssContent Stylesheet text.
 * @param cssUrl Absolute URL the stylesheet was served from; base for
 *   resolving relative references.
 * @param assetsDir Root directory of the mirrored assets.
 * @param urlMap Mutated in place: receives `absolute URL -> ./assets/...` for
 *   every successfully downloaded reference.
 * @param depth Nesting level; content is returned untouched when above 5.
 * @returns The stylesheet text with downloaded references rewritten.
 */
async function processCssRecursively(
  cssContent: string,
  cssUrl: string,
  assetsDir: string,
  urlMap: Record<string, string>,
  depth = 0,
): Promise<string> {
  if (depth > 5) return cssContent;

  // Groups: (1) opening `url(` / `@import "`, (2) the reference, (3) closing quote / paren.
  const urlRegex = /(url\(["']?|@import\s+["'])([^"'\)]+)(["']?\)?)/gi;

  let cssDir: string;
  try {
    const base = new URL(cssUrl);
    // Mirrored paths always use `/`, so compute with the POSIX flavour on every platform.
    cssDir = path.posix.dirname(sanitizePath(base.hostname + base.pathname));
  } catch {
    // Without a valid base URL no reference can be resolved.
    return cssContent;
  }

  const rewrites = new Map<string, string>();
  const attempted = new Set<string>();

  for (const match of cssContent.matchAll(urlRegex)) {
    const rawRef = match[2];
    if (rawRef === undefined || attempted.has(rawRef)) continue;
    attempted.add(rawRef);

    const ref = rawRef.trim();
    if (!ref || ref.startsWith('data:') || ref.startsWith('blob:')) continue;

    try {
      const abs = new URL(ref, cssUrl);
      const local = await downloadFile(abs.href, assetsDir);
      if (!local) continue;

      // Route from the folder containing the CSS file to the asset. The
      // fragment is kept because it selects a part inside the file
      // (SVG fonts / sprites, `#iefix`).
      const assetPath = sanitizePath(abs.hostname + abs.pathname);
      rewrites.set(rawRef, path.posix.relative(cssDir, assetPath) + abs.hash);
      urlMap[abs.href] = local;
    } catch {
      // Unparseable reference: leave it exactly as written.
    }
  }

  if (rewrites.size === 0) return cssContent;

  // Replace strictly the reference inside each match. A global substring
  // replacement would also rewrite unrelated text that merely contains it.
  return cssContent.replace(
    urlRegex,
    (full: string, opening: string, rawRef: string, closing: string) => {
      const replacement = rewrites.get(rawRef);
      return replacement === undefined ? full : `${opening}${replacement}${closing}`;
    },
  );
}

/** Parses the CLI argument as a URL, returning `null` instead of throwing. */
function parseTargetUrl(raw: string): URL | null {
  try {
    return new URL(raw);
  } catch {
    return null;
  }
}

/**
 * Scrolls the page top to bottom in 400px steps every 100ms so scroll-driven
 * lazy loaders fire, then returns to the top. Stops once the accumulated
 * distance reaches the current document height or after `MAX_SCROLL_STEPS`.
 */
async function scrollWave(page: Page): Promise<void> {
  await page.evaluate(
    ({ distance, intervalMs, maxSteps }) =>
      new Promise<void>((resolve) => {
        let totalHeight = 0;
        let steps = 0;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          steps += 1;
          if (totalHeight >= scrollHeight || steps >= maxSteps) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Reset to top
            resolve();
          }
        }, intervalMs);
      }),
    { distance: 400, intervalMs: 100, maxSteps: MAX_SCROLL_STEPS },
  );
}

/**
 * Runs inside the page: forces hidden elements visible and promotes
 * asset-looking attribute values (`data-src`, `data-image`, ...) into the
 * real `src` / `srcset` / `background-image` so the snapshot is self-contained.
 */
async function hydrateDom(page: Page): Promise<void> {
  await page.evaluate(() => {
    // A. Deterministic Attribute Hydration (Generic)
    const assetPattern = /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
    const skippedTags = ['META', 'LINK', 'HEAD', 'SCRIPT', 'STYLE', 'SVG', 'PATH'];

    document.querySelectorAll('*').forEach((node) => {
      // 0. Skip Meta/Head/Script/Style/SVG tags. `tagName` is only upper-cased
      // for HTML-namespace elements; inline SVG reports `svg` / `path`, so
      // compare case-insensitively.
      if (skippedTags.includes(node.tagName.toUpperCase())) return;

      // Every HTML/SVG element exposes an inline `style`; tell the compiler so.
      const el = node as Element & ElementCSSInlineStyle;

      // 1. Force Visibility (Anti-Flicker)
      const computed = window.getComputedStyle(el);
      if (computed.opacity === '0' || computed.visibility === 'hidden') {
        el.style.setProperty('opacity', '1', 'important');
        el.style.setProperty('visibility', 'visible', 'important');
      }

      // 2. Promote Data Attributes
      for (const attr of Array.from(el.attributes)) {
        const name = attr.name.toLowerCase();
        const val = attr.value;
        const looksLikeAsset =
          assetPattern.test(val) || name.includes('src') || name.includes('image');
        if (!looksLikeAsset) continue;

        // Standard Image/Video/Source promotion
        if (el instanceof HTMLImageElement) {
          if (name.includes('srcset')) el.srcset = val;
          else if (!el.src || el.src.includes('data:')) el.src = val;
        }
        if (el instanceof HTMLSourceElement) {
          if (name.includes('srcset')) el.srcset = val;
        }
        // HTMLMediaElement covers exactly <video> and <audio>.
        if (el instanceof HTMLMediaElement) {
          if (!el.src) el.src = val;
        }

        // Background Image Promotion
        if (/^(https?:\/\/|\/\/|\/)/.test(val) && !name.includes('href')) {
          const bg = el.style.backgroundImage;
          if (!bg || bg === 'none') {
            el.style.backgroundImage = `url('${val}')`;
          }
        }
      }
    });

    // B. Ensure basic structural elements are visible post-scroll
    const body = document.body;
    if (body) {
      body.style.setProperty('opacity', '1', 'important');
      body.style.setProperty('visibility', 'visible', 'important');
    }
  });
}

/**
 * Scans serialized markup for asset references that may never have produced a
 * network request and adds their absolute form to `found`.
 *
 * @param html Serialized page markup.
 * @param baseUrl URL used to resolve relative references.
 * @param found Mutated in place with every resolvable reference.
 */
function collectMarkupAssets(html: string, baseUrl: string, found: Set<string>): void {
  const patterns = [
    /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
    // Capture CSS url() inside style blocks
    /url\(["']?([^"'\)]+)["']?\)/gi,
  ];

  for (const pattern of patterns) {
    for (const match of html.matchAll(pattern)) {
      const ref = match[1];
      if (!ref) continue;
      try {
        found.add(new URL(ref, baseUrl).href);
      } catch {
        // Not a resolvable URL; nothing to download.
      }
    }
  }

  // Specific srcset parsing. `*` (not `+`) so the plain `srcset` attribute is
  // covered as well as prefixed variants such as `data-srcset`.
  const srcsetRegex = /[a-z0-9-]*srcset=["']([^"']+)["']/gi;
  for (const match of html.matchAll(srcsetRegex)) {
    const candidates = match[1];
    if (!candidates) continue;
    for (const rule of candidates.split(',')) {
      const candidateUrl = rule.trim().split(/\s+/)[0];
      if (!candidateUrl || candidateUrl.startsWith('data:')) continue;
      try {
        found.add(new URL(candidateUrl, baseUrl).href);
      } catch {
        // Not a resolvable URL; nothing to download.
      }
    }
  }
}

/**
 * Downloads every discovered asset and records `remote URL -> local reference`
 * in `urlMap` (with and without query string). Stylesheets are fetched again
 * as text so their own references can be mirrored and rewritten.
 */
async function mirrorAssets(
  foundAssets: Set<string>,
  assetsDir: string,
  urlMap: Record<string, string>,
): Promise<void> {
  for (const url of foundAssets) {
    const local = await downloadFile(url, assetsDir);
    if (!local) continue;

    urlMap[url] = local;
    const clean = url.split('?')[0] ?? url;
    urlMap[clean] = local;

    // Handle CSS recursively
    if (!clean.endsWith('.css')) continue;
    try {
      const { data } = await axios.get<string>(url, {
        headers: { 'User-Agent': USER_AGENT },
        responseType: 'text',
        timeout: DOWNLOAD_TIMEOUT_MS,
      });
      // Process CSS and save it over the raw download.
      const processedCss = await processCssRecursively(data, url, assetsDir, urlMap);
      const parsed = new URL(url);
      const relPath = sanitizePath(parsed.hostname + parsed.pathname);
      fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
    } catch (err: unknown) {
      // Best effort: the unprocessed stylesheet written by downloadFile stays in place.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`⚠️ Could not post-process stylesheet ${url}: ${reason}`);
    }
  }
}

/**
 * Turns the live snapshot into a static mirror: swaps remote URLs for local
 * ones, neutralises remaining links to the source domain, strips
 * tracking / lazy-load inline scripts and injects the stability style block.
 *
 * @param html Serialized page markup.
 * @param urlMap `remote URL -> local reference` pairs.
 * @param urlObj Parsed URL of the cloned page.
 * @returns The rewritten markup.
 */
function rewriteMarkup(html: string, urlMap: Record<string, string>, urlObj: URL): string {
  let result = html;

  // A. Apply URL Map Replacements
  // Longer paths first to prevent partial replacement errors
  const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
  if (sortedUrls.length > 0) {
    // One alternation so the document is rewritten in a single pass.
    const masterRegex = new RegExp(`(${sortedUrls.map(escapeRegExp).join('|')})`, 'g');
    result = result.replace(masterRegex, (match: string) => urlMap[match] ?? match);
  }

  // B. Global Root-Relative Path Cleanup
  // Catches things like /wp-content/ that weren't distinct assets or were missed
  const commonDirs = ['/wp-content/', '/wp-includes/', '/assets/', '/static/', '/images/'];
  for (const dir of commonDirs) {
    const localDir = `./assets/${urlObj.hostname}${dir}`;
    result = result.split(`"${dir}`).join(`"${localDir}`);
    result = result.split(`'${dir}`).join(`'${localDir}`);
    result = result.split(`(${dir}`).join(`(${localDir}`);
  }

  // C. Domain Nuke
  // Anything still pointing at the source domain (navigation links, uncaptured
  // assets) becomes `./` so the mirror never requests the live site.
  // - Built from the bare host so both `www.` and non-`www.` forms match.
  // - The lookahead rejects hosts that merely start with the domain.
  // - The tail stops at quotes, whitespace, `<`, `>`, `)` and `&quot;` so
  //   srcset lists and CSS `url(...)` wrappers survive.
  const bareHost = escapeRegExp(urlObj.hostname.replace(/^www\./, ''));
  const domainPattern = new RegExp(
    `https?://(?:www\\.)?${bareHost}(?![a-z0-9.-])(?:(?!&quot;)[^"'\\s<>)])*`,
    'gi',
  );
  result = result.replace(domainPattern, './');

  // D. Static Stability & Cleanup
  // Remove tracking/analytics/lazy-load inline scripts that ruin stability.
  result = result.replace(
    /<script\b[^>]*>([\s\S]*?)<\/script>/gi,
    (match: string, body: string) => {
      const lower = body.toLowerCase();
      const unwanted =
        lower.includes('google-analytics') ||
        lower.includes('gtag') ||
        lower.includes('fbq') ||
        lower.includes('lazy') ||
        lower.includes('tracker');
      return unwanted ? '' : match;
    },
  );

  // E. CSS Injections for Stability
  const headEnd = result.indexOf('</head>');
  if (headEnd > -1) {
    result = result.slice(0, headEnd) + STABILITY_CSS + result.slice(headEnd);
  }

  return result;
}

/**
 * CLI entry point: clones the page given as the first argument into
 * `../public/showcase/<domain>/<page-slug>.html`, with assets mirrored under
 * `../public/showcase/<domain>/assets`.
 *
 * Exits with code 1 when the argument is missing or not a valid URL, and sets
 * a non-zero exit code when cloning fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error('Usage: npm run clone-page <url>');
    process.exit(1);
  }

  const targetUrl = rawUrl.trim();
  const urlObj = parseTargetUrl(targetUrl);
  if (!urlObj) {
    console.error(`Invalid URL: ${targetUrl}`);
    process.exit(1);
  }

  // Setup Output Directories
  const domainSlug = urlObj.hostname.replace(/^www\./, '');
  const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
  const assetsDir = path.join(domainDir, 'assets');
  fs.mkdirSync(assetsDir, { recursive: true });

  const pageSlug = urlObj.pathname.split('/').filter(Boolean).join('-') || 'index';
  const htmlFilename = `${pageSlug}.html`;

  console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

  const browser = await chromium.launch({ headless: true });
  try {
    // Start with a standard viewport, it is enlarged after the scroll wave.
    const context = await browser.newContext({
      userAgent: USER_AGENT,
      viewport: { width: 1920, height: 1080 },
    });
    const page = await context.newPage();

    const urlMap: Record<string, string> = {};
    const foundAssets = new Set<string>();

    // 1. Live Network Interception
    page.on('response', (response) => {
      if (response.status() !== 200) return;
      const url = response.url();
      // Capture anything that looks like a static asset
      if (/\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i.test(url)) {
        foundAssets.add(url);
      }
    });

    console.log('🌐 Loading page (Waiting for Network Idle)...');
    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });

    console.log('🌊 Executing "Scroll Wave" to trigger all lazy loaders naturally...');
    await scrollWave(page);

    console.log('📐 Expanding Viewport to "Giant Mode" for final asset capture...');
    const fullHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });

    // Final settlement wait
    await page.waitForTimeout(3000);

    console.log('💧 Final DOM Hydration & Sanitization...');
    await hydrateDom(page);

    console.log('⏳ Waiting for network idle...');
    await page.waitForLoadState('networkidle');

    // 1.5 FINAL SETTLEMENT: Let any scroll-triggered JS finish
    await page.waitForTimeout(1000);

    // 2. Static Snapshot
    const content = await page.content();

    // 3. Post-Snapshot Asset Discovery (Regex)
    // Catches assets that never triggered a network request but exist in the markup
    collectMarkupAssets(content, targetUrl, foundAssets);

    console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);

    // 4. Download & Map
    await mirrorAssets(foundAssets, assetsDir, urlMap);

    console.log('🛠️ Finalizing Static Mirror...');
    const finalContent = rewriteMarkup(content, urlMap, urlObj);

    // Save
    const finalPath = path.join(domainDir, htmlFilename);
    fs.writeFileSync(finalPath, finalContent);

    console.log(`✅ SUCCESS: Cloned to ${finalPath}`);
  } catch (err: unknown) {
    console.error('❌ FATAL ERROR:', err);
    // Surface the failure to the calling shell / npm script.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

run().catch((err: unknown) => {
  // Reached when setup outside the guarded section fails (e.g. browser launch).
  console.error('❌ FATAL ERROR:', err);
  process.exitCode = 1;
});