feat(cloner): add cloner-library and finalize pdf-library rename

2026-02-12 21:59:48 +01:00
parent 57ec4d7544
commit 422e4fccba
33 changed files with 5909 additions and 177 deletions
--- a/packages/cloner-library/src/AssetManager.ts
+++ b/packages/cloner-library/src/AssetManager.ts
@@ -0,0 +1,93 @@
+import axios from "axios";
+import fs from "node:fs";
+import path from "node:path";
+
+/**
+ * Lookup table from a remote asset URL to the local path it was mirrored to.
+ *
+ * Keys are the URLs as they were requested; values are the strings returned by
+ * `AssetManager.downloadFile` (of the form `./assets/<sanitized host + path>`).
+ */
+export interface AssetMap {
+    [originalUrl: string]: string;
+}
+
+/**
+ * Mirrors remote assets onto disk and rewrites stylesheets so that their
+ * `url(...)` / `@import` references point at the mirrored copies.
+ *
+ * Files are stored under `<assetsDir>/<sanitized hostname + pathname>`; the
+ * query string and fragment of a URL do not take part in the file name.
+ */
+export class AssetManager {
+    private userAgent: string;
+
+    /**
+     * @param userAgent - Value sent as the `User-Agent` header on every download.
+     */
+    constructor(userAgent: string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36") {
+        this.userAgent = userAgent;
+    }
+
+    /**
+     * Makes every `/`-separated segment of a path safe to use as a file name by
+     * replacing each character outside `[a-z0-9._-]` (case-insensitive) with `_`.
+     *
+     * Dots are preserved, so `.` and `..` segments pass through unchanged; callers
+     * must only feed it paths that are already normalised (e.g. from `new URL()`).
+     *
+     * @param rawPath - Slash-separated path, typically `hostname + pathname`.
+     * @returns The sanitised path with the same number of segments.
+     */
+    public sanitizePath(rawPath: string): string {
+        return rawPath
+            .split("/")
+            .map((p) => p.replace(/[^a-z0-9._-]/gi, "_"))
+            .join("/");
+    }
+
+    /**
+     * Downloads one asset into `assetsDir`, unless a file for it already exists.
+     *
+     * @param url - Absolute `http(s)` URL, or a protocol-relative `//host/path`
+     *   reference (fetched over HTTPS).
+     * @param assetsDir - Root directory of the local mirror.
+     * @returns `./assets/<relative path>` on success or when the file is already
+     *   present; `null` for non-HTTP URLs, non-200 responses and any I/O or
+     *   network failure (the method never throws).
+     */
+    public async downloadFile(url: string, assetsDir: string): Promise<string | null> {
+        const target = url.startsWith("//") ? `https:${url}` : url;
+
+        try {
+            const u = new URL(target);
+            // Decide on the parsed scheme rather than a string prefix so that
+            // e.g. "httpx://" is rejected up front.
+            if (u.protocol !== "http:" && u.protocol !== "https:") return null;
+
+            const relPath = this.sanitizePath(u.hostname + u.pathname);
+            const dest = path.join(assetsDir, relPath);
+
+            if (fs.existsSync(dest)) return `./assets/${relPath}`;
+
+            const res = await axios.get(target, {
+                responseType: "arraybuffer",
+                headers: { "User-Agent": this.userAgent },
+                timeout: 15000,
+                // Never reject on HTTP status; it is inspected manually below.
+                validateStatus: () => true,
+            });
+
+            if (res.status !== 200) return null;
+
+            fs.mkdirSync(path.dirname(dest), { recursive: true });
+            fs.writeFileSync(dest, Buffer.from(res.data));
+            return `./assets/${relPath}`;
+        } catch {
+            // Best effort: an asset that cannot be fetched or stored is skipped.
+            return null;
+        }
+    }
+
+    /**
+     * Downloads every asset referenced by a stylesheet and rewrites the
+     * references to paths relative to the stylesheet's own mirrored location.
+     *
+     * Stylesheets pulled in through `@import` / `url(...)` that were not mirrored
+     * before this call are processed the same way (up to 5 levels deep) and
+     * rewritten in place on disk. `data:` and `blob:` references are left alone,
+     * as are references whose download fails.
+     *
+     * @param cssContent - Source text of the stylesheet.
+     * @param cssUrl - Absolute URL the stylesheet was loaded from; used to
+     *   resolve relative references.
+     * @param assetsDir - Root directory of the local mirror.
+     * @param urlMap - Receives an `absolute URL -> local path` entry for every
+     *   asset that was mirrored.
+     * @param depth - Current nesting level; callers normally omit it.
+     * @returns The stylesheet text with rewritten references.
+     */
+    public async processCssRecursively(
+        cssContent: string,
+        cssUrl: string,
+        assetsDir: string,
+        urlMap: AssetMap,
+        depth = 0,
+    ): Promise<string> {
+        if (depth > 5) return cssContent;
+
+        // Directory of this stylesheet inside the mirror. CSS URLs always use
+        // forward slashes, hence path.posix throughout.
+        let cssDir: string;
+        try {
+            const sheet = new URL(cssUrl);
+            cssDir = path.posix.dirname(this.sanitizePath(sheet.hostname + sheet.pathname));
+        } catch {
+            // Without a valid base URL nothing can be resolved.
+            return cssContent;
+        }
+
+        // Group 1: the `url(` / `@import "` introducer, group 2: the reference.
+        const refPattern = /(url\(["']?|@import\s+["'])([^"'\)]+)/gi;
+
+        // Pass 1: collect each distinct reference once.
+        const references = new Set<string>();
+        let match: RegExpExecArray | null;
+        while ((match = refPattern.exec(cssContent)) !== null) {
+            const ref = match[2];
+            if (!ref || ref.startsWith("data:") || ref.startsWith("blob:")) continue;
+            references.add(ref);
+        }
+
+        // Pass 2: mirror the assets and work out their replacement paths.
+        const replacements = new Map<string, string>();
+        for (const ref of references) {
+            try {
+                const asset = new URL(ref, cssUrl);
+                const assetRel = this.sanitizePath(asset.hostname + asset.pathname);
+                const localFile = path.join(assetsDir, assetRel);
+                const isStylesheet = asset.pathname.toLowerCase().endsWith(".css");
+                // A stylesheet that is already on disk has been (or is being)
+                // handled elsewhere; skipping it also breaks import cycles.
+                const alreadyMirrored = isStylesheet && fs.existsSync(localFile);
+
+                const local = await this.downloadFile(asset.href, assetsDir);
+                if (!local) continue;
+
+                replacements.set(ref, path.posix.relative(cssDir, assetRel));
+                urlMap[asset.href] = local;
+
+                if (isStylesheet && !alreadyMirrored) {
+                    const nested = fs.readFileSync(localFile, "utf8");
+                    const rewritten = await this.processCssRecursively(nested, asset.href, assetsDir, urlMap, depth + 1);
+                    if (rewritten !== nested) fs.writeFileSync(localFile, rewritten);
+                }
+            } catch {
+                // Best effort: an unresolvable or unreadable reference stays as is.
+            }
+        }
+
+        if (replacements.size === 0) return cssContent;
+
+        // Pass 3: rewrite in a single sweep over the ORIGINAL text. Replacing
+        // only inside url()/@import contexts, and only once, prevents a
+        // replacement such as "../fonts/a.woff" from being re-matched by its own
+        // source "/fonts/a.woff" when the same reference occurs several times.
+        return cssContent.replace(refPattern, (whole: string, prefix: string, ref: string) => {
+            const replacement = replacements.get(ref);
+            return replacement === undefined ? whole : prefix + replacement;
+        });
+    }
+}
--- a/packages/cloner-library/src/PageCloner.ts
+++ b/packages/cloner-library/src/PageCloner.ts
@@ -0,0 +1,184 @@
+import { chromium, Browser, BrowserContext, Page } from "playwright";
+import fs from "node:fs";
+import path from "node:path";
+import axios from "axios";
+import { AssetManager, AssetMap } from "./AssetManager.js";
+
+/** Construction options for `PageCloner`. */
+export interface PageClonerOptions {
+    /** Directory under which one sub-directory per cloned domain is created. */
+    outputDir: string;
+    /** User-Agent for the browser context and asset downloads; a desktop Chrome string is used when omitted. */
+    userAgent?: string;
+}
+
+/**
+ * Clones a single rendered page into a static HTML file plus a local `assets`
+ * directory, using a headless Chromium instance driven by Playwright.
+ */
+export class PageCloner {
+    private options: PageClonerOptions;
+    private assetManager: AssetManager;
+    private userAgent: string;
+
+    /**
+     * @param options - Output location and optional User-Agent override.
+     */
+    constructor(options: PageClonerOptions) {
+        this.options = options;
+        this.userAgent = options.userAgent || "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
+        this.assetManager = new AssetManager(this.userAgent);
+    }
+
+    /**
+     * Renders `targetUrl`, mirrors the assets it uses and writes a
+     * self-contained HTML snapshot.
+     *
+     * Steps: load the page, scroll through it to trigger lazy content, force
+     * hidden elements visible and promote lazy-load attributes, download every
+     * discovered asset (rewriting CSS on the way), replace remote URLs in the
+     * markup with local paths, strip tracking/lazy-loading inline scripts and
+     * inject a stylesheet that disables animations and link navigation.
+     *
+     * @param targetUrl - Absolute URL of the page to clone.
+     * @returns Path of the written HTML file:
+     *   `<outputDir>/<domain without leading "www.">/<path segments joined by "-">.html`
+     *   (`index.html` for the root path).
+     * @throws If the URL is invalid, or if navigation or file output fails; the
+     *   browser is closed in every case.
+     */
+    public async clone(targetUrl: string): Promise<string> {
+        const urlObj = new URL(targetUrl);
+        // Anchor the match: only a leading "www." is a prefix worth dropping.
+        const domainSlug = urlObj.hostname.replace(/^www\./, "");
+        const domainDir = path.resolve(this.options.outputDir, domainSlug);
+        const assetsDir = path.join(domainDir, "assets");
+
+        if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true });
+
+        let pageSlug = urlObj.pathname.split("/").filter(Boolean).join("-");
+        if (!pageSlug) pageSlug = "index";
+        const htmlFilename = `${pageSlug}.html`;
+
+        console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);
+
+        const browser = await chromium.launch({ headless: true });
+        const context = await browser.newContext({
+            userAgent: this.userAgent,
+            viewport: { width: 1920, height: 1080 },
+        });
+        const page = await context.newPage();
+
+        const urlMap: AssetMap = {};
+        const foundAssets = new Set<string>();
+
+        // Record every successfully loaded resource that looks like an asset.
+        page.on("response", (response) => {
+            if (response.status() === 200) {
+                const url = response.url();
+                if (url.match(/\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i)) {
+                    foundAssets.add(url);
+                }
+            }
+        });
+
+        try {
+            await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });
+
+            // Scroll Wave: step through the page so scroll-triggered content loads.
+            await page.evaluate(async () => {
+                await new Promise((resolve) => {
+                    let totalHeight = 0;
+                    let steps = 0;
+                    const distance = 400;
+                    // Infinite-scroll pages keep growing, so the height check
+                    // alone may never succeed; give up after ~50 s of scrolling.
+                    const maxSteps = 500;
+                    const timer = setInterval(() => {
+                        const scrollHeight = document.body.scrollHeight;
+                        window.scrollBy(0, distance);
+                        totalHeight += distance;
+                        steps += 1;
+                        if (totalHeight >= scrollHeight || steps >= maxSteps) {
+                            clearInterval(timer);
+                            window.scrollTo(0, 0);
+                            resolve(true);
+                        }
+                    }, 100);
+                });
+            });
+
+            // Make the viewport taller than the document so nothing is "below the fold".
+            const fullHeight = await page.evaluate(() => document.body.scrollHeight);
+            await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
+            await page.waitForTimeout(3000);
+
+            // Sanitization: reveal hidden elements and promote lazy-load attributes
+            // (data-src and friends) to the real src/srcset/background-image.
+            await page.evaluate(() => {
+                const assetPattern = /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
+                document.querySelectorAll("*").forEach((el) => {
+                    if (["META", "LINK", "HEAD", "SCRIPT", "STYLE", "SVG", "PATH"].includes(el.tagName)) return;
+                    const htmlEl = el as HTMLElement;
+                    const style = window.getComputedStyle(htmlEl);
+                    if (style.opacity === "0" || style.visibility === "hidden") {
+                        htmlEl.style.setProperty("opacity", "1", "important");
+                        htmlEl.style.setProperty("visibility", "visible", "important");
+                    }
+                    for (const attr of Array.from(el.attributes)) {
+                        const name = attr.name.toLowerCase();
+                        const val = attr.value;
+                        if (assetPattern.test(val) || name.includes("src") || name.includes("image")) {
+                            if (el.tagName === "IMG") {
+                                const img = el as HTMLImageElement;
+                                if (name.includes("srcset")) img.srcset = val;
+                                else if (!img.src || img.src.includes("data:")) img.src = val;
+                            }
+                            if (el.tagName === "SOURCE") (el as HTMLSourceElement).srcset = val;
+                            if (el.tagName === "VIDEO" || el.tagName === "AUDIO") (el as HTMLMediaElement).src = val;
+                            if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
+                                const bg = htmlEl.style.backgroundImage;
+                                if (!bg || bg === "none") htmlEl.style.backgroundImage = `url('${val}')`;
+                            }
+                        }
+                    }
+                });
+                if (document.body) {
+                    document.body.style.setProperty("opacity", "1", "important");
+                    document.body.style.setProperty("visibility", "visible", "important");
+                }
+            });
+
+            await page.waitForLoadState("networkidle");
+            await page.waitForTimeout(1000);
+
+            // Besides network responses, also harvest asset URLs from the markup itself.
+            const content = await page.content();
+            const regexPatterns = [
+                /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
+                /url\(["']?([^"'\)]+)["']?\)/gi,
+            ];
+
+            for (const pattern of regexPatterns) {
+                let match;
+                while ((match = pattern.exec(content)) !== null) {
+                    try {
+                        foundAssets.add(new URL(match[1], targetUrl).href);
+                    } catch {
+                        // Not a resolvable URL; nothing to download.
+                    }
+                }
+            }
+
+            for (const url of foundAssets) {
+                const local = await this.assetManager.downloadFile(url, assetsDir);
+                if (local) {
+                    urlMap[url] = local;
+                    // Also map the query-less form, which is how the URL may appear in markup.
+                    const clean = url.split("?")[0];
+                    urlMap[clean] = local;
+                    if (clean.endsWith(".css")) {
+                        try {
+                            const { data } = await axios.get(url, { headers: { "User-Agent": this.userAgent } });
+                            const processedCss = await this.assetManager.processCssRecursively(data, url, assetsDir, urlMap);
+                            const relPath = this.assetManager.sanitizePath(new URL(url).hostname + new URL(url).pathname);
+                            fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
+                        } catch {
+                            // Best effort: keep the unprocessed stylesheet saved by downloadFile.
+                        }
+                    }
+                }
+            }
+
+            let finalContent = content;
+            // Longest URLs first so a URL is never shadowed by one of its own prefixes.
+            const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
+            if (sortedUrls.length > 0) {
+                const escaped = sortedUrls.map((u) => u.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
+                const masterRegex = new RegExp(`(${escaped.join("|")})`, "g");
+                finalContent = finalContent.replace(masterRegex, (match) => urlMap[match] || match);
+            }
+
+            // Root-relative references into well-known asset directories are
+            // redirected to the mirrored copy of the page's own host.
+            const commonDirs = ["/wp-content/", "/wp-includes/", "/assets/", "/static/", "/images/"];
+            for (const dir of commonDirs) {
+                const localDir = `./assets/${urlObj.hostname}${dir}`;
+                finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`).split(`'${dir}`).join(`'${localDir}`).split(`(${dir}`).join(`(${localDir}`);
+            }
+
+            // Any remaining absolute link to the cloned site (with or without
+            // "www.") is neutralised. The pattern is built from the bare domain so
+            // the optional "www." prefix is not doubled for "www." targets.
+            const domainPattern = new RegExp(`https?://(www\\.)?${domainSlug.replace(/\./g, "\\.")}[^"']*`, "gi");
+            finalContent = finalContent.replace(domainPattern, () => "./");
+
+            // Drop inline scripts that mention analytics, trackers or lazy loading.
+            finalContent = finalContent.replace(/<script\b[^>]*>([\s\S]*?)<\/script>/gi, (match, scriptContent) => {
+                const lower = scriptContent.toLowerCase();
+                return (lower.includes("google-analytics") || lower.includes("gtag") || lower.includes("fbq") || lower.includes("lazy") || lower.includes("tracker")) ? "" : match;
+            });
+
+            const headEnd = finalContent.indexOf("</head>");
+            if (headEnd > -1) {
+                const stabilityCss = `\n<style>* { transition: none !important; animation: none !important; scroll-behavior: auto !important; } [data-aos], .reveal, .lazypath, .lazy-load, [data-src] { opacity: 1 !important; visibility: visible !important; transform: none !important; clip-path: none !important; } img, video, iframe { max-width: 100%; display: block; } a { pointer-events: none; cursor: default; } </style>`;
+                finalContent = finalContent.slice(0, headEnd) + stabilityCss + finalContent.slice(headEnd);
+            }
+
+            const finalPath = path.join(domainDir, htmlFilename);
+            fs.writeFileSync(finalPath, finalContent);
+            return finalPath;
+        } finally {
+            await browser.close();
+        }
+    }
+}
--- a/packages/cloner-library/src/WebsiteCloner.ts
+++ b/packages/cloner-library/src/WebsiteCloner.ts
@@ -0,0 +1,123 @@
+import { PlaywrightCrawler, RequestQueue } from 'crawlee';
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import { execFileSync, execSync } from 'node:child_process';
+
+/** Construction options for `WebsiteCloner`. */
+export interface WebsiteClonerOptions {
+    /** Directory under which the per-site output directory is created. */
+    baseOutputDir: string;
+    /** Upper bound on crawled pages, passed to the crawler; defaults to 100. */
+    maxRequestsPerCrawl?: number;
+    /** Maximum number of parallel crawler requests; defaults to 3. */
+    maxConcurrency?: number;
+}
+
+/**
+ * Recursively clones a website: every same-domain page found by the crawler is
+ * captured with `single-file-cli`, then links between the captured pages are
+ * rewritten to relative paths so the copy can be browsed offline.
+ */
+export class WebsiteCloner {
+    private options: WebsiteClonerOptions;
+
+    /**
+     * @param options - Output location and crawl limits; `maxRequestsPerCrawl`
+     *   defaults to 100 and `maxConcurrency` to 3.
+     */
+    constructor(options: WebsiteClonerOptions) {
+        this.options = {
+            maxRequestsPerCrawl: 100,
+            maxConcurrency: 3,
+            ...options
+        };
+    }
+
+    /**
+     * Crawls `targetUrl` and writes the offline copy.
+     *
+     * NOTE: an existing output directory is deleted first. `outputDirName` is
+     * resolved against `baseOutputDir` without further checks, so callers must
+     * not pass values such as `..` or untrusted input.
+     *
+     * @param targetUrl - Absolute URL at which the crawl starts.
+     * @param outputDirName - Name of the output directory; defaults to the
+     *   hostname with dots replaced by dashes.
+     * @returns Absolute path of the output directory.
+     */
+    public async clone(targetUrl: string, outputDirName?: string): Promise<string> {
+        const urlObj = new URL(targetUrl);
+        const domain = urlObj.hostname;
+        const finalOutputDirName = outputDirName || domain.replace(/\./g, '-');
+        const baseOutputDir = path.resolve(this.options.baseOutputDir, finalOutputDirName);
+
+        if (fs.existsSync(baseOutputDir)) {
+            fs.rmSync(baseOutputDir, { recursive: true, force: true });
+        }
+        fs.mkdirSync(baseOutputDir, { recursive: true });
+
+        console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
+        console.log(`📂 Output: ${baseOutputDir}`);
+
+        const requestQueue = await RequestQueue.open();
+        await requestQueue.addRequest({ url: targetUrl });
+
+        const crawler = new PlaywrightCrawler({
+            requestQueue,
+            maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
+            maxConcurrency: this.options.maxConcurrency,
+
+            // Arrow function so the private helpers stay reachable through `this`.
+            requestHandler: async ({ request, enqueueLinks, log }) => {
+                const url = request.url;
+                log.info(`Capturing ${url}...`);
+
+                const fullPath = path.join(baseOutputDir, this.toLocalPath(new URL(url).pathname));
+                fs.mkdirSync(path.dirname(fullPath), { recursive: true });
+
+                try {
+                    this.captureWithSingleFile(url, fullPath);
+                } catch {
+                    // A page that fails to capture must not abort the crawl.
+                    log.error(`Failed to capture ${url} with SingleFile`);
+                }
+
+                await enqueueLinks({
+                    strategy: 'same-domain',
+                    transformRequestFunction: (req) => {
+                        // Only HTML pages are crawled; downloads and static assets are skipped.
+                        if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
+                        return req;
+                    }
+                });
+            },
+        });
+
+        await crawler.run();
+
+        console.log('🔗 Rewriting internal links for offline navigation...');
+        const allFiles = this.getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
+
+        for (const file of allFiles) {
+            const content = fs.readFileSync(file, 'utf8');
+            // Directory of this page relative to the clone root, with forward
+            // slashes so the computed hrefs are valid on every platform.
+            const pageDir = path.posix.dirname(
+                path.relative(baseOutputDir, file).split(path.sep).join('/'),
+            );
+
+            const rewritten = content.replace(/href="([^"]+)"/g, (match: string, href: string) => {
+                // In-page anchors already work offline and must stay on this page.
+                if (href.startsWith('#')) return match;
+                try {
+                    const linkUrl = new URL(href, targetUrl);
+                    // mailto:, data:, javascript: etc. have an empty hostname and
+                    // fall through here together with links to other sites.
+                    if (linkUrl.hostname !== domain) return match;
+
+                    const relativeLink = path.posix.relative(pageDir, this.toLocalPath(linkUrl.pathname));
+                    // Keep the fragment so deep links still land on their section.
+                    return `href="${relativeLink}${linkUrl.hash}"`;
+                } catch {
+                    // Unparseable href: leave it untouched.
+                    return match;
+                }
+            });
+
+            fs.writeFileSync(file, rewritten);
+        }
+
+        console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
+        return baseOutputDir;
+    }
+
+    /**
+     * Maps a URL pathname to the file path (relative to the clone root, forward
+     * slashes, no leading slash) under which that page is stored: the root
+     * becomes `index.html`, extension-less paths become `<path>/index.html`.
+     */
+    private toLocalPath(pathname: string): string {
+        let relPath = pathname;
+        if (relPath === '/' || relPath === '') relPath = '/index.html';
+        if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
+        if (relPath.startsWith('/')) relPath = relPath.substring(1);
+        return relPath;
+    }
+
+    /**
+     * Saves one page with `single-file-cli` (assumed to be runnable through
+     * `npx`). Blocks until the capture finishes and throws if it fails.
+     */
+    private captureWithSingleFile(url: string, fullPath: string): void {
+        const flags = ['--browser-headless=true', '--browser-wait-until=networkidle0'];
+
+        if (process.platform === 'win32') {
+            // npx is a .cmd shim on Windows and can only be started through a
+            // shell, so the quoted command line is kept there.
+            // NOTE(review): crawled URLs are still interpolated into a shell
+            // command on this branch.
+            execSync(`npx single-file-cli "${url}" "${fullPath}" ${flags.join(' ')}`, {
+                stdio: 'inherit'
+            });
+            return;
+        }
+
+        // Crawled URLs are untrusted: passing them as an argument vector keeps
+        // `$(...)`, backticks and quotes in a URL from being run by a shell.
+        execFileSync('npx', ['single-file-cli', url, fullPath, ...flags], {
+            stdio: 'inherit'
+        });
+    }
+
+    /**
+     * Recursively lists every file below `dir`.
+     *
+     * @param dir - Directory to walk.
+     * @param fileList - Accumulator used by the recursion; callers omit it.
+     * @returns Paths of all non-directory entries, each prefixed with `dir`.
+     */
+    private getFiles(dir: string, fileList: string[] = []): string[] {
+        const files = fs.readdirSync(dir);
+        for (const file of files) {
+            const name = path.join(dir, file);
+            if (fs.statSync(name).isDirectory()) {
+                this.getFiles(name, fileList);
+            } else {
+                fileList.push(name);
+            }
+        }
+        return fileList;
+    }
+}
--- a/packages/cloner-library/src/index.ts
+++ b/packages/cloner-library/src/index.ts
@@ -0,0 +1,3 @@
+export * from "./AssetManager.js";
+export * from "./PageCloner.js";
+export * from "./WebsiteCloner.js";