257 lines
8.7 KiB
TypeScript
257 lines
8.7 KiB
TypeScript
import { chromium } from "playwright";
|
|
import fs from "node:fs";
|
|
import path from "node:path";
|
|
import axios from "axios";
|
|
import { AssetManager, AssetMap } from "./AssetManager.js";
|
|
|
|
/**
 * Configuration accepted by the `PageCloner` constructor.
 */
export interface PageClonerOptions {
  /**
   * Root output directory. `PageCloner.clone` resolves a per-domain folder
   * beneath it and writes the HTML snapshot and its `assets/` folder there.
   */
  outputDir: string;

  /**
   * Optional User-Agent override. When omitted (or empty) `PageCloner` falls
   * back to a built-in desktop Chrome User-Agent string.
   */
  userAgent?: string;
}
|
|
|
|
/**
 * Clones a single web page into a local snapshot.
 *
 * The page is rendered in headless Chromium, scrolled to trigger lazy content,
 * "sanitized" in the browser (hidden elements are forced visible and lazy
 * asset attributes are promoted to real `src`/`srcset`), and serialized.
 * Asset URLs seen on the network or found in the markup are downloaded through
 * `AssetManager`, and the HTML is rewritten to point at the local copies.
 */
export class PageCloner {
  private readonly options: PageClonerOptions;
  private readonly assetManager: AssetManager;
  private readonly userAgent: string;

  /**
   * @param options Output location and optional User-Agent override.
   */
  constructor(options: PageClonerOptions) {
    this.options = options;
    // `||` rather than `??` is deliberate: an empty string also falls back
    // to the default User-Agent.
    this.userAgent =
      options.userAgent ||
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
    this.assetManager = new AssetManager(this.userAgent);
  }

  /**
   * Renders `targetUrl`, downloads its assets and writes a localized HTML file.
   *
   * Output layout: `<outputDir>/<host without leading www.>/<slug>.html`, with
   * assets under `<outputDir>/<host>/assets/`. The slug is the URL path with
   * `/` replaced by `-`, or `index` for the root path.
   *
   * @param targetUrl Absolute URL of the page to clone.
   * @returns Absolute path of the written HTML file.
   * @throws TypeError if `targetUrl` is not a valid absolute URL; navigation
   *   and file-system errors propagate after the browser has been closed.
   */
  public async clone(targetUrl: string): Promise<string> {
    const urlObj = new URL(targetUrl);
    // Strip only a *leading* "www." label; a plain string replace would also
    // mangle hosts such as "mywww.example.com".
    const domainSlug = urlObj.hostname.replace(/^www\./, "");
    const domainDir = path.resolve(this.options.outputDir, domainSlug);
    const assetsDir = path.join(domainDir, "assets");

    // Recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(assetsDir, { recursive: true });

    const pageSlug =
      urlObj.pathname.split("/").filter(Boolean).join("-") || "index";
    const htmlFilename = `${pageSlug}.html`;

    console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

    const browser = await chromium.launch({ headless: true });

    // Everything after launch lives inside the try so the browser process is
    // closed even when context/page creation fails.
    try {
      const context = await browser.newContext({
        userAgent: this.userAgent,
        viewport: { width: 1920, height: 1080 },
      });
      const page = await context.newPage();

      const urlMap: AssetMap = {};
      const foundAssets = new Set<string>();

      // Record every successfully loaded response whose URL looks like an asset.
      page.on("response", (response) => {
        if (response.status() !== 200) return;
        const url = response.url();
        if (
          /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i.test(
            url,
          )
        ) {
          foundAssets.add(url);
        }
      });

      await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });

      // Scroll wave: step down the page so scroll-triggered content loads,
      // then return to the top.
      await page.evaluate(async () => {
        await new Promise<void>((resolve) => {
          const distance = 400;
          // Safety cap so infinite-scroll pages cannot keep the loop alive
          // forever (1000 steps = 400,000 px, about 100 s).
          const maxSteps = 1000;
          let totalHeight = 0;
          let steps = 0;
          const timer = setInterval(() => {
            // A document without <body> would otherwise throw inside the
            // interval and leave this promise pending forever.
            const scrollHeight = document.body
              ? document.body.scrollHeight
              : 0;
            window.scrollBy(0, distance);
            totalHeight += distance;
            steps += 1;
            if (totalHeight >= scrollHeight || steps >= maxSteps) {
              clearInterval(timer);
              window.scrollTo(0, 0);
              resolve();
            }
          }, 100);
        });
      });

      // Grow the viewport to the full document height so viewport-dependent
      // content is rendered, then give the page a moment to settle.
      const fullHeight = await page.evaluate(() =>
        document.body ? document.body.scrollHeight : 0,
      );
      await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
      await page.waitForTimeout(3000);

      // Sanitization: runs inside the browser, so the callback must be
      // self-contained (no references to outer variables).
      await page.evaluate(() => {
        const assetPattern =
          /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
        const skippedTags = [
          "META",
          "LINK",
          "HEAD",
          "SCRIPT",
          "STYLE",
          "SVG",
          "PATH",
        ];

        document.querySelectorAll("*").forEach((el) => {
          // SVG elements inside HTML documents report lower-case tag names,
          // so normalize before comparing against the skip list.
          const tag = el.tagName.toUpperCase();
          if (skippedTags.includes(tag)) return;

          const htmlEl = el as HTMLElement;
          const style = window.getComputedStyle(htmlEl);
          // Force elements hidden by reveal/lazy effects to be visible.
          if (style.opacity === "0" || style.visibility === "hidden") {
            htmlEl.style.setProperty("opacity", "1", "important");
            htmlEl.style.setProperty("visibility", "visible", "important");
          }

          for (const attr of Array.from(el.attributes)) {
            const name = attr.name.toLowerCase();
            const val = attr.value;
            const looksLikeAsset =
              assetPattern.test(val) ||
              name.includes("src") ||
              name.includes("image");
            if (!looksLikeAsset) continue;

            // Promote lazy-load attributes (e.g. data-src) to real sources.
            if (tag === "IMG") {
              const img = el as HTMLImageElement;
              if (name.includes("srcset")) img.srcset = val;
              else if (!img.src || img.src.includes("data:")) img.src = val;
            }
            if (tag === "SOURCE") (el as HTMLSourceElement).srcset = val;
            if (tag === "VIDEO" || tag === "AUDIO")
              (el as HTMLMediaElement).src = val;

            // URL-like, non-href attribute values become a background image
            // when the element has no inline background yet.
            if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
              const bg = htmlEl.style.backgroundImage;
              if (!bg || bg === "none")
                htmlEl.style.backgroundImage = `url('${val}')`;
            }
          }
        });

        if (document.body) {
          document.body.style.setProperty("opacity", "1", "important");
          document.body.style.setProperty("visibility", "visible", "important");
        }
      });

      // The sanitization step may have triggered new asset requests.
      await page.waitForLoadState("networkidle");
      await page.waitForTimeout(1000);

      const content = await page.content();

      this.collectAssetUrls(content, targetUrl, foundAssets);
      await this.downloadAssets(foundAssets, assetsDir, urlMap);

      const finalContent = this.localizeHtml(content, urlMap, urlObj.hostname);

      const finalPath = path.join(domainDir, htmlFilename);
      fs.writeFileSync(finalPath, finalContent);
      return finalPath;
    } finally {
      await browser.close();
    }
  }

  /**
   * Scans serialized HTML for asset references and adds them to `found` as
   * absolute URLs.
   *
   * Only http(s) URLs are kept: empty `url()` values and `#fragment`
   * references would resolve to the page itself, and `data:` URIs cannot be
   * downloaded.
   */
  private collectAssetUrls(
    html: string,
    baseUrl: string,
    found: Set<string>,
  ): void {
    const patterns = [
      /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
      /url\(["']?([^"')]*)["']?\)/gi,
    ];

    for (const pattern of patterns) {
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(html)) !== null) {
        const raw = match[1];
        if (!raw || raw.startsWith("#")) continue;
        try {
          const resolved = new URL(raw, baseUrl);
          if (resolved.protocol === "http:" || resolved.protocol === "https:") {
            found.add(resolved.href);
          }
        } catch {
          // Ignore invalid URLs
        }
      }
    }
  }

  /**
   * Downloads every URL in `found` and records URL -> local path in `urlMap`,
   * both with and without the query string. Stylesheets are additionally
   * fetched as text, run through `AssetManager.processCssRecursively`, and
   * written over the downloaded copy.
   *
   * Downloads run sequentially because `urlMap` is shared with the CSS
   * processing step. Stylesheet failures are logged and skipped.
   */
  private async downloadAssets(
    found: Set<string>,
    assetsDir: string,
    urlMap: AssetMap,
  ): Promise<void> {
    for (const url of found) {
      // NOTE(review): `downloadFile` is assumed to return the local path, or a
      // falsy value when the download failed; confirm against AssetManager.
      const local = await this.assetManager.downloadFile(url, assetsDir);
      if (!local) continue;

      urlMap[url] = local;
      const [clean = url] = url.split("?");
      urlMap[clean] = local;

      if (!clean.endsWith(".css")) continue;

      try {
        const { data } = await axios.get(url, {
          headers: { "User-Agent": this.userAgent },
        });
        const processedCss = await this.assetManager.processCssRecursively(
          data,
          url,
          assetsDir,
          urlMap,
        );
        const parsed = new URL(url);
        const relPath = this.assetManager.sanitizePath(
          parsed.hostname + parsed.pathname,
        );
        const cssPath = path.join(assetsDir, relPath);
        // Make sure the target directory exists before writing.
        fs.mkdirSync(path.dirname(cssPath), { recursive: true });
        fs.writeFileSync(cssPath, processedCss);
      } catch (error: unknown) {
        // Best effort: a broken stylesheet must not abort the whole clone,
        // but the failure should be visible.
        const reason = error instanceof Error ? error.message : String(error);
        console.warn(`⚠️ Failed to process stylesheet ${url}: ${reason}`);
      }
    }
  }

  /**
   * Rewrites serialized HTML so it works offline:
   * 1. replaces every downloaded URL with its local path,
   * 2. points well-known root-relative directories at the local assets folder,
   * 3. neutralizes remaining absolute links to the cloned site,
   * 4. removes inline analytics / lazy-loading scripts,
   * 5. injects a stabilizing stylesheet before `</head>`.
   */
  private localizeHtml(
    html: string,
    urlMap: AssetMap,
    hostname: string,
  ): string {
    let result = html;

    // Longest URLs first so a URL with a query string wins over its prefix.
    const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
    if (sortedUrls.length > 0) {
      const escaped = sortedUrls.map((u) => PageCloner.escapeRegExp(u));
      const masterRegex = new RegExp(`(${escaped.join("|")})`, "g");
      result = result.replace(masterRegex, (match) => urlMap[match] || match);
    }

    const commonDirs = [
      "/wp-content/",
      "/wp-includes/",
      "/assets/",
      "/static/",
      "/images/",
    ];
    for (const dir of commonDirs) {
      const localDir = `./assets/${hostname}${dir}`;
      result = result
        .split(`"${dir}`)
        .join(`"${localDir}`)
        .split(`'${dir}`)
        .join(`'${localDir}`)
        .split(`(${dir}`)
        .join(`(${localDir}`);
    }

    // Build the pattern from the bare host so both "example.com" and
    // "www.example.com" links match whichever form the target URL used.
    // The look-ahead stops "example.com.other.org" from matching, and the
    // character class stops at whitespace, ")" and "<"/">" so unquoted CSS
    // url(...) values and plain text are not swallowed up to the next quote.
    const bareHost = PageCloner.escapeRegExp(hostname.replace(/^www\./, ""));
    const domainPattern = new RegExp(
      `https?://(?:www\\.)?${bareHost}(?![\\w.-])[^"'\\s)<>]*`,
      "gi",
    );
    result = result.replace(domainPattern, () => "./");

    // Drop inline scripts that mention analytics or lazy loading.
    result = result.replace(
      /<script\b[^>]*>([\s\S]*?)<\/script>/gi,
      (match: string, scriptContent: string) => {
        const lower = scriptContent.toLowerCase();
        const unwanted =
          lower.includes("google-analytics") ||
          lower.includes("gtag") ||
          lower.includes("fbq") ||
          lower.includes("lazy") ||
          lower.includes("tracker");
        return unwanted ? "" : match;
      },
    );

    const headEnd = result.indexOf("</head>");
    if (headEnd > -1) {
      const stabilityCss = `\n<style>* { transition: none !important; animation: none !important; scroll-behavior: auto !important; } [data-aos], .reveal, .lazypath, .lazy-load, [data-src] { opacity: 1 !important; visibility: visible !important; transform: none !important; clip-path: none !important; } img, video, iframe { max-width: 100%; display: block; } a { pointer-events: none; cursor: default; } </style>`;
      result = result.slice(0, headEnd) + stabilityCss + result.slice(headEnd);
    }

    return result;
  }

  /** Escapes every regular-expression metacharacter in `text`. */
  private static escapeRegExp(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }
}
|