import { chromium } from "playwright";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
import axios from "axios";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";

/**
 * Replaces every character outside `[a-z0-9._-]` (case-insensitive) with `_`
 * in each `/`-separated segment of `rawPath`, keeping the separators.
 */
function sanitizePath(rawPath: string): string {
  return rawPath
    .split("/")
    .map((segment) => segment.replace(/[^a-z0-9._-]/gi, "_"))
    .join("/");
}

/** Escapes regular-expression metacharacters so `text` matches literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Downloads `url` into `assetsDir` under `<hostname>/<pathname>` (sanitized).
 *
 * Protocol-relative URLs (`//host/...`) are fetched over https. The query
 * string is not part of the local path, so URLs that differ only by query
 * share one file.
 *
 * @returns The reference `./assets/<relPath>` when the file already exists or
 *   was downloaded with HTTP 200; `null` for non-http URLs, non-200 responses
 *   and any network or filesystem error (deliberate best-effort).
 */
async function downloadFile(
  url: string,
  assetsDir: string,
): Promise<string | null> {
  const absolute = url.startsWith("//") ? `https:${url}` : url;
  if (!absolute.startsWith("http")) return null;
  try {
    const u = new URL(absolute);
    // hostname + pathname keeps assets from different hosts apart.
    const relPath = sanitizePath(u.hostname + u.pathname);
    const dest = path.join(assetsDir, relPath);
    const localRef = `./assets/${relPath}`;
    // Only a regular file counts as cached; a URL ending in "/" maps to a
    // directory, which must not be handed out as a downloaded asset.
    if (fs.existsSync(dest) && fs.statSync(dest).isFile()) return localRef;
    const res = await axios.get(absolute, {
      responseType: "arraybuffer",
      headers: { "User-Agent": USER_AGENT },
      timeout: 15000,
      validateStatus: () => true,
    });
    if (res.status !== 200) return null;
    fs.mkdirSync(path.dirname(dest), { recursive: true });
    fs.writeFileSync(dest, Buffer.from(res.data));
    return localRef;
  } catch {
    // Fail silently; the caller keeps the original URL.
    return null;
  }
}

/**
 * Downloads every asset referenced by `url(...)` or a quoted `@import` in
 * `cssContent` and rewrites each reference to a path relative to the
 * stylesheet's own location inside the assets directory.
 *
 * Each distinct reference is resolved once, and the rewrite is a single
 * positional pass, so repeated references and references that are substrings
 * of one another are never rewritten twice.
 *
 * @param cssContent Stylesheet text.
 * @param cssUrl Absolute URL the stylesheet was loaded from; relative
 *   references are resolved against it.
 * @param assetsDir Directory that `downloadFile` writes into.
 * @param urlMap Mutated: absolute asset URL -> local `./assets/...` reference.
 * @param depth Guard value; the content is returned untouched when it exceeds
 *   5. This function does not call itself, so imported stylesheets are
 *   downloaded but not rewritten here.
 * @returns The rewritten stylesheet text.
 */
async function processCssRecursively(
  cssContent: string,
  cssUrl: string,
  assetsDir: string,
  urlMap: Record<string, string>,
  depth = 0,
): Promise<string> {
  if (depth > 5) return cssContent;

  // Groups: 1 = opener (`url(` / `@import "`), 2 = reference, 3 = closer.
  const urlRegex = /(url\(["']?|@import\s+["'])([^"'\)]+)(["']?\)?)/gi;

  const rewrites = new Map<string, string>();
  const attempted = new Set<string>();

  for (const match of cssContent.matchAll(urlRegex)) {
    const originalUrl = match[2];
    if (originalUrl === undefined || attempted.has(originalUrl)) continue;
    attempted.add(originalUrl);
    if (originalUrl.startsWith("data:") || originalUrl.startsWith("blob:")) {
      continue;
    }
    try {
      const assetUrl = new URL(originalUrl, cssUrl);
      const local = await downloadFile(assetUrl.href, assetsDir);
      if (!local) continue;

      const sheetUrl = new URL(cssUrl);
      const sheetPath = sanitizePath(sheetUrl.hostname + sheetUrl.pathname);
      const assetPath = sanitizePath(assetUrl.hostname + assetUrl.pathname);
      // Route from the folder containing the CSS file to the asset. Rooted
      // POSIX paths keep the result "/"-separated and independent of the cwd.
      const rel = path.posix.relative(
        path.posix.dirname(`/${sheetPath}`),
        `/${assetPath}`,
      );
      rewrites.set(originalUrl, rel);
      urlMap[assetUrl.href] = local;
    } catch {
      // Ignore URL resolution errors and leave the reference untouched.
    }
  }

  return cssContent.replace(
    urlRegex,
    (whole: string, opener: string, ref: string, closer: string) => {
      const rel = rewrites.get(ref);
      return rel === undefined ? whole : `${opener}${rel}${closer}`;
    },
  );
}

/**
 * Clones the page given as the first CLI argument into
 * `../public/showcase/<domain>/<page>.html` (relative to this file), with
 * downloaded assets under `assets/` in the same directory.
 *
 * Steps: load the page in headless Chromium while recording asset responses,
 * scroll and enlarge the viewport to trigger lazy loaders, promote lazy-load
 * attributes in the DOM, snapshot the markup, download discovered assets,
 * rewrite URLs to local paths, neutralise remaining links to the original
 * domain, and strip tracking / lazy-load inline scripts.
 *
 * Exits with code 1 when the argument is missing or is not a valid URL; sets a
 * failing exit code when cloning fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error("Usage: npm run clone-page <url>");
    process.exit(1);
  }

  const targetUrl = rawUrl.trim();
  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`Invalid URL: ${targetUrl}`);
    process.exit(1);
  }

  // Setup Output Directories
  const domainSlug = urlObj.hostname.replace(/^www\./, "");
  const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
  const assetsDir = path.join(domainDir, "assets");
  fs.mkdirSync(assetsDir, { recursive: true });

  const pageSlug =
    urlObj.pathname.split("/").filter(Boolean).join("-") || "index";
  const htmlFilename = `${pageSlug}.html`;

  console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

  const browser = await chromium.launch({ headless: true });
  // Start with a standard viewport; it is enlarged after the scroll pass.
  const context = await browser.newContext({
    userAgent: USER_AGENT,
    viewport: { width: 1920, height: 1080 },
  });
  const page = await context.newPage();

  const urlMap: Record<string, string> = {};
  const foundAssets = new Set<string>();

  /** Resolves `ref` against the page URL and records it; invalid URLs are skipped. */
  const addAsset = (ref: string | undefined): void => {
    if (!ref) return;
    try {
      foundAssets.add(new URL(ref, targetUrl).href);
    } catch {
      // Ignore invalid URLs in content.
    }
  };

  // 1. Live Network Interception
  page.on("response", (response) => {
    const url = response.url();
    // Capture anything that looks like a static asset.
    if (
      response.status() === 200 &&
      /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i.test(
        url,
      )
    ) {
      foundAssets.add(url);
    }
  });

  try {
    console.log("🌐 Loading page (Waiting for Network Idle)...");
    await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });

    console.log(
      '🌊 Executing "Scroll Wave" to trigger all lazy loaders naturally...',
    );
    await page.evaluate(async () => {
      await new Promise<boolean>((resolve) => {
        let totalHeight = 0;
        const distance = 400;
        const timer = setInterval(() => {
          // Re-read each tick: the page may grow while scrolling.
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Reset to top
            resolve(true);
          }
        }, 100);
      });
    });

    console.log(
      '📐 Expanding Viewport to "Giant Mode" for final asset capture...',
    );
    const fullHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
    // Final settlement wait
    await page.waitForTimeout(3000);

    console.log("💧 Final DOM Hydration & Sanitization...");
    // Runs in the browser: must not reference anything from the Node scope.
    await page.evaluate(() => {
      // A. Attribute hydration: promote attributes that look like asset URLs.
      const assetPattern =
        /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
      const skippedTags = [
        "META",
        "LINK",
        "HEAD",
        "SCRIPT",
        "STYLE",
        "SVG",
        "PATH",
      ];

      document.querySelectorAll("*").forEach((el) => {
        // SVG elements report lowercase tag names in HTML documents, so
        // normalise before comparing.
        const tag = el.tagName.toUpperCase();

        // 0. Skip Meta/Head/Script/Style/SVG tags entirely.
        if (skippedTags.includes(tag)) return;

        // 1. Force Visibility (Anti-Flicker)
        const htmlEl = el as HTMLElement;
        const style = window.getComputedStyle(htmlEl);
        if (style.opacity === "0" || style.visibility === "hidden") {
          htmlEl.style.setProperty("opacity", "1", "important");
          htmlEl.style.setProperty("visibility", "visible", "important");
        }

        // 2. Promote Data Attributes
        for (const attr of Array.from(el.attributes)) {
          const name = attr.name.toLowerCase();
          const val = attr.value;
          const looksLikeAsset =
            assetPattern.test(val) ||
            name.includes("src") ||
            name.includes("image");
          if (!looksLikeAsset) continue;

          if (tag === "IMG") {
            const img = el as HTMLImageElement;
            if (name.includes("srcset")) img.srcset = val;
            else if (!img.src || img.src.includes("data:")) img.src = val;
          } else if (tag === "SOURCE") {
            const source = el as HTMLSourceElement;
            if (name.includes("srcset")) source.srcset = val;
          } else if (tag === "VIDEO" || tag === "AUDIO") {
            const media = el as HTMLMediaElement;
            // A poster is a still image, never the media source.
            if (name !== "poster" && !media.src) media.src = val;
          } else if (
            /^(https?:\/\/|\/\/|\/)/.test(val) &&
            !name.includes("href")
          ) {
            // Background promotion applies only to non-media elements;
            // media elements already show the asset natively.
            const bg = htmlEl.style.backgroundImage;
            if (!bg || bg === "none") {
              htmlEl.style.backgroundImage = `url('${val}')`;
            }
          }
        }
      });

      // B. Ensure the body itself is visible post-scroll.
      const body = document.body;
      if (body) {
        body.style.setProperty("opacity", "1", "important");
        body.style.setProperty("visibility", "visible", "important");
      }
    });

    console.log("⏳ Waiting for network idle...");
    await page.waitForLoadState("networkidle");
    // 1.5 FINAL SETTLEMENT: let any scroll-triggered JS finish.
    await page.waitForTimeout(1000);

    // 2. Static Snapshot
    const content = await page.content();

    // 3. Post-Snapshot Asset Discovery (Regex)
    // Catches assets that never triggered a network request but exist in the
    // markup.
    const regexPatterns = [
      /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
      // Capture CSS url() inside style blocks
      /url\(["']?([^"'\)]+)["']?\)/gi,
    ];
    for (const pattern of regexPatterns) {
      for (const match of content.matchAll(pattern)) addAsset(match[1]);
    }

    // srcset parsing: `*` rather than `+` so that plain `srcset="..."` is
    // covered as well as prefixed variants such as `data-srcset`.
    const srcsetRegex = /[a-z0-9-]*srcset=["']([^"']+)["']/gi;
    for (const match of content.matchAll(srcsetRegex)) {
      for (const rule of (match[1] ?? "").split(",")) {
        const candidate = rule.trim().split(/\s+/)[0];
        if (candidate && !candidate.startsWith("data:")) addAsset(candidate);
      }
    }

    console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);

    // 4. Download & Map (sequential, one request at a time)
    for (const url of foundAssets) {
      const local = await downloadFile(url, assetsDir);
      if (!local) continue;

      urlMap[url] = local;
      const clean = url.split("?")[0] ?? url;
      urlMap[clean] = local;

      // Stylesheets are fetched again as text so their url() references can
      // be rewritten, then saved over the raw download.
      if (clean.endsWith(".css")) {
        try {
          const { data } = await axios.get(url, {
            responseType: "text",
            headers: { "User-Agent": USER_AGENT },
            timeout: 15000,
          });
          if (typeof data !== "string") continue;
          const processedCss = await processCssRecursively(
            data,
            url,
            assetsDir,
            urlMap,
          );
          const cssLocation = new URL(url);
          const relPath = sanitizePath(
            cssLocation.hostname + cssLocation.pathname,
          );
          fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
        } catch {
          // Ignore CSS fetch/process errors; the raw download stays in place.
        }
      }
    }

    console.log("🛠️ Finalizing Static Mirror...");
    let finalContent = content;

    // A. Apply URL Map Replacements
    // Longer URLs first so a short URL cannot pre-empt a longer one that
    // starts with it.
    const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
    if (sortedUrls.length > 0) {
      // One alternation for a single-pass replacement.
      const masterRegex = new RegExp(
        `(${sortedUrls.map(escapeRegExp).join("|")})`,
        "g",
      );
      finalContent = finalContent.replace(
        masterRegex,
        (match: string) => urlMap[match] ?? match,
      );
    }

    // B. Global Root-Relative Path Cleanup
    // Catches things like /wp-content/ that weren't distinct assets or were
    // missed.
    const commonDirs = [
      "/wp-content/",
      "/wp-includes/",
      "/assets/",
      "/static/",
      "/images/",
    ];
    const localHostDir = sanitizePath(urlObj.hostname);
    for (const dir of commonDirs) {
      const localDir = `./assets/${localHostDir}${dir}`;
      for (const opener of ['"', "'", "("]) {
        finalContent = finalContent
          .split(`${opener}${dir}`)
          .join(`${opener}${localDir}`);
      }
    }

    // C. Domain Nuke
    // Any absolute link to the original domain that survived step A is a
    // navigation link or an uncaptured asset; point it at "./" so the mirror
    // does not request the live site. The match stops at quotes, whitespace,
    // ")" and angle brackets so an unquoted URL cannot swallow what follows.
    const domainPattern = new RegExp(
      `https?://(?:www\\.)?${escapeRegExp(domainSlug)}[^"'\\s)<>]*`,
      "gi",
    );
    finalContent = finalContent.replace(domainPattern, () => "./");

    // D. Static Stability & Cleanup
    // Remove inline tracking/analytics/lazy-load scripts. Only the inline
    // body is inspected, so scripts loaded via `src` are kept.
    finalContent = finalContent.replace(
      /<script[^>]*>([\s\S]*?)<\/script>/gi,
      (match: string, body: string) => {
        const lower = body.toLowerCase();
        const unwanted =
          lower.includes("google-analytics") ||
          lower.includes("gtag") ||
          lower.includes("fbq") ||
          lower.includes("lazy") ||
          lower.includes("tracker");
        return unwanted ? "" : match;
      },
    );

    // E. CSS Injections for Stability
    // NOTE(review): the injected stylesheet is blank in this copy of the
    // file; its contents appear to have been lost. Restore them here.
    const headEnd = finalContent.indexOf("</head>");
    if (headEnd > -1) {
      const stabilityCss = ` `;
      finalContent =
        finalContent.slice(0, headEnd) +
        stabilityCss +
        finalContent.slice(headEnd);
    }

    // Save
    const finalPath = path.join(domainDir, htmlFilename);
    fs.writeFileSync(finalPath, finalContent);
    console.log(`✅ SUCCESS: Cloned to ${finalPath}`);
  } catch (err) {
    console.error("❌ FATAL ERROR:", err);
    // Report failure to the caller without skipping browser cleanup.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Failures before the try block (e.g. browser launch) would otherwise be
// unhandled rejections.
void run().catch((err: unknown) => {
  console.error("❌ FATAL ERROR:", err);
  process.exitCode = 1;
});