import { chromium } from "playwright";
import type { Page } from "playwright";
import fs from "node:fs";
import path from "node:path";
import axios from "axios";
import { AssetManager, AssetMap } from "./AssetManager.js";

/** Desktop Chrome user agent used when the caller does not supply one. */
const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";

/**
 * Extensions that mark a network response as a downloadable asset.
 * NOTE: the match is an unanchored substring test, so it also accepts URLs
 * such as `data.json` or hosts like `cdn.jsdelivr.net`.
 */
const NETWORK_ASSET_PATTERN =
  /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i;

/** Root-relative directories that are rewritten to point into the local assets folder. */
const COMMON_ASSET_DIRS = [
  "/wp-content/",
  "/wp-includes/",
  "/assets/",
  "/static/",
  "/images/",
];

/** Inline scripts containing any of these (lower-cased) markers are removed from the clone. */
const TRACKING_MARKERS = ["google-analytics", "gtag", "fbq", "lazy", "tracker"];

/**
 * Style block injected right before `</head>`.
 * It mirrors the body opacity/visibility override applied during in-page
 * sanitization so the saved page stays visible even though scripts that
 * would normally reveal it have been stripped.
 */
const STABILITY_CSS =
  "<style>body{opacity:1!important;visibility:visible!important;}</style>\n";

/** Escapes every regular-expression metacharacter in `text`. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Scans serialized HTML for asset references (attribute values ending in a
 * known asset extension, plus CSS `url(...)` expressions) and resolves them
 * against `baseUrl`.
 *
 * Empty references, same-document fragments (`url(#gradient)`) and non-HTTP(S)
 * schemes such as `data:` are skipped: they would otherwise resolve to the
 * page itself or to something that cannot be downloaded.
 */
function extractAssetUrls(html: string, baseUrl: string): string[] {
  const patterns = [
    /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
    /url\(["']?([^"')]*)["']?\)/gi,
  ];
  const urls: string[] = [];
  for (const pattern of patterns) {
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(html)) !== null) {
      const raw = match[1];
      if (!raw || raw.trim() === "" || raw.trim().startsWith("#")) continue;
      try {
        const resolved = new URL(raw, baseUrl);
        if (resolved.protocol === "http:" || resolved.protocol === "https:") {
          urls.push(resolved.href);
        }
      } catch {
        // Not a parseable URL; nothing to download.
      }
    }
  }
  return urls;
}

/**
 * Rewrites remote references in `html` so they point at local copies.
 *
 * 1. Every key of `urlMap` is replaced by its mapped local path (longest keys
 *    first so a short URL never clobbers part of a longer one).
 * 2. Root-relative references into well-known asset directories are pointed
 *    at `./assets/<hostname>/...`.
 * 3. Any remaining absolute URL on the cloned host is collapsed to `./`.
 */
function localizeReferences(
  html: string,
  urlMap: AssetMap,
  hostname: string,
): string {
  let result = html;

  const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
  if (sortedUrls.length > 0) {
    const masterRegex = new RegExp(
      `(${sortedUrls.map(escapeRegExp).join("|")})`,
      "g",
    );
    result = result.replace(masterRegex, (match) => urlMap[match] || match);
  }

  for (const dir of COMMON_ASSET_DIRS) {
    const localDir = `./assets/${hostname}${dir}`;
    // split/join instead of replace() so `$` in paths is never interpreted.
    result = result
      .split(`"${dir}`)
      .join(`"${localDir}`)
      .split(`'${dir}`)
      .join(`'${localDir}`)
      .split(`(${dir}`)
      .join(`(${localDir}`);
  }

  const domainPattern = new RegExp(
    `https?://(www\\.)?${escapeRegExp(hostname)}[^"']*`,
    "gi",
  );
  return result.replace(domainPattern, () => "./");
}

/** Removes `<script>` elements whose inline content mentions a tracking/lazy-load marker. */
function stripTrackingScripts(html: string): string {
  return html.replace(
    /<script\b[^>]*>([\s\S]*?)<\/script>/gi,
    (match: string, scriptContent: string) => {
      const lower = scriptContent.toLowerCase();
      return TRACKING_MARKERS.some((marker) => lower.includes(marker))
        ? ""
        : match;
    },
  );
}

/** Inserts {@link STABILITY_CSS} immediately before `</head>`; returns `html` unchanged when there is no head end tag. */
function injectStabilityCss(html: string): string {
  const headEnd = html.search(/<\/head\s*>/i);
  if (headEnd === -1) return html;
  return html.slice(0, headEnd) + STABILITY_CSS + html.slice(headEnd);
}

/** Configuration for {@link PageCloner}. */
export interface PageClonerOptions {
  /** Directory under which one sub-directory per cloned domain is created. */
  outputDir: string;
  /** User agent for the browser and asset requests; a desktop Chrome UA is used when omitted or empty. */
  userAgent?: string;
}

/**
 * Renders a page in headless Chromium, downloads the assets it references and
 * writes a self-contained HTML snapshot to disk.
 */
export class PageCloner {
  private readonly options: PageClonerOptions;
  private readonly assetManager: AssetManager;
  private readonly userAgent: string;

  constructor(options: PageClonerOptions) {
    this.options = options;
    this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
    this.assetManager = new AssetManager(this.userAgent);
  }

  /**
   * Clones `targetUrl` into `<outputDir>/<domain>/<page-slug>.html`, with
   * assets stored under `<outputDir>/<domain>/assets`.
   *
   * @param targetUrl Absolute URL of the page to clone.
   * @returns Path of the written HTML file.
   * @throws If `targetUrl` is not a valid URL, or if navigation, asset
   *   download or file writing fails. The browser is always closed.
   */
  public async clone(targetUrl: string): Promise<string> {
    const urlObj = new URL(targetUrl);
    // Anchor the match: only a leading "www." is a prefix worth dropping.
    const domainSlug = urlObj.hostname.replace(/^www\./, "");
    const domainDir = path.resolve(this.options.outputDir, domainSlug);
    const assetsDir = path.join(domainDir, "assets");
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(assetsDir, { recursive: true });

    const pageSlug =
      urlObj.pathname.split("/").filter(Boolean).join("-") || "index";
    const htmlFilename = `${pageSlug}.html`;

    console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

    const browser = await chromium.launch({ headless: true });
    try {
      // Context/page creation lives inside the try so the browser is closed
      // even when one of these calls fails.
      const context = await browser.newContext({
        userAgent: this.userAgent,
        viewport: { width: 1920, height: 1080 },
      });
      const page = await context.newPage();

      const urlMap: AssetMap = {};
      const foundAssets = new Set<string>();

      page.on("response", (response) => {
        if (response.status() !== 200) return;
        const url = response.url();
        if (NETWORK_ASSET_PATTERN.test(url)) foundAssets.add(url);
      });

      await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });
      await this.scrollThroughPage(page);

      // Stretch the viewport over the whole document so viewport-triggered
      // content is rendered before the snapshot.
      const fullHeight = await page.evaluate(() => document.body.scrollHeight);
      await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
      await page.waitForTimeout(3000);

      await this.revealHiddenContent(page);
      await page.waitForLoadState("networkidle");
      await page.waitForTimeout(1000);

      const content = await page.content();
      for (const assetUrl of extractAssetUrls(content, targetUrl)) {
        foundAssets.add(assetUrl);
      }

      await this.downloadAssets(foundAssets, assetsDir, urlMap);

      const localized = localizeReferences(content, urlMap, urlObj.hostname);
      const finalContent = injectStabilityCss(stripTrackingScripts(localized));

      const finalPath = path.join(domainDir, htmlFilename);
      fs.writeFileSync(finalPath, finalContent);
      return finalPath;
    } finally {
      await browser.close();
    }
  }

  /** Scrolls down in 400px steps every 100ms until the bottom is reached, then jumps back to the top. */
  private async scrollThroughPage(page: Page): Promise<void> {
    // The callback runs inside the browser and must not reference outer scope.
    await page.evaluate(async () => {
      await new Promise<void>((resolve) => {
        let totalHeight = 0;
        const distance = 400;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, 0);
            resolve();
          }
        }, 100);
      });
    });
  }

  /**
   * In-page sanitization: forces hidden elements visible and promotes
   * asset-looking attribute values (e.g. lazy-load `data-*` attributes) to the
   * real `src`/`srcset`/background-image so they end up in the snapshot.
   */
  private async revealHiddenContent(page: Page): Promise<void> {
    // The callback runs inside the browser and must not reference outer scope.
    await page.evaluate(() => {
      const assetPattern =
        /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
      const skippedTags = [
        "META",
        "LINK",
        "HEAD",
        "SCRIPT",
        "STYLE",
        "SVG",
        "PATH",
      ];

      document.querySelectorAll("*").forEach((el) => {
        // SVG-namespace elements report lower-case tag names ("svg", "path"),
        // so normalise before comparing against the upper-case skip list.
        const tag = el.tagName.toUpperCase();
        if (skippedTags.includes(tag)) return;

        const htmlEl = el as HTMLElement;
        const style = window.getComputedStyle(htmlEl);
        if (style.opacity === "0" || style.visibility === "hidden") {
          htmlEl.style.setProperty("opacity", "1", "important");
          htmlEl.style.setProperty("visibility", "visible", "important");
        }

        for (const attr of Array.from(el.attributes)) {
          const name = attr.name.toLowerCase();
          const val = attr.value;
          const looksLikeAsset =
            assetPattern.test(val) ||
            name.includes("src") ||
            name.includes("image");
          if (!looksLikeAsset) continue;

          if (tag === "IMG") {
            const img = el as HTMLImageElement;
            if (name.includes("srcset")) img.srcset = val;
            else if (!img.src || img.src.includes("data:")) img.src = val;
          }
          if (tag === "SOURCE") (el as HTMLSourceElement).srcset = val;
          if (tag === "VIDEO" || tag === "AUDIO") {
            (el as HTMLMediaElement).src = val;
          }

          // URL-like values on non-link attributes double as a background
          // image when the element does not already have one.
          if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
            const bg = htmlEl.style.backgroundImage;
            if (!bg || bg === "none") {
              htmlEl.style.backgroundImage = `url('${val}')`;
            }
          }
        }
      });

      if (document.body) {
        document.body.style.setProperty("opacity", "1", "important");
        document.body.style.setProperty("visibility", "visible", "important");
      }
    });
  }

  /**
   * Downloads every URL in `assetUrls` through the asset manager and records
   * the remote-to-local mapping (with and without query string) in `urlMap`.
   * Stylesheets are additionally re-fetched and run through
   * `processCssRecursively`, and the processed CSS is written over the
   * downloaded copy. Stylesheet failures are logged and skipped (best effort).
   */
  private async downloadAssets(
    assetUrls: Set<string>,
    assetsDir: string,
    urlMap: AssetMap,
  ): Promise<void> {
    for (const url of assetUrls) {
      const local = await this.assetManager.downloadFile(url, assetsDir);
      if (!local) continue;

      const clean = url.split("?")[0] || url;
      urlMap[url] = local;
      urlMap[clean] = local;

      if (!clean.endsWith(".css")) continue;
      try {
        const { data } = await axios.get(url, {
          headers: { "User-Agent": this.userAgent },
        });
        const processedCss = await this.assetManager.processCssRecursively(
          data,
          url,
          assetsDir,
          urlMap,
        );
        const parsed = new URL(url);
        const relPath = this.assetManager.sanitizePath(
          parsed.hostname + parsed.pathname,
        );
        fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
      } catch (error) {
        // Best effort: keep the unprocessed download, but leave a trace.
        const reason = error instanceof Error ? error.message : String(error);
        console.warn(`⚠️ Stylesheet processing failed for ${url}: ${reason}`);
      }
    }
  }
}