feat(cloner): add cloner-library and finalize pdf-library rename

2026-02-12 21:59:48 +01:00
parent 57ec4d7544
commit 422e4fccba
33 changed files with 5909 additions and 177 deletions
--- a/packages/cloner-library/src/WebsiteCloner.ts
+++ b/packages/cloner-library/src/WebsiteCloner.ts
@@ -0,0 +1,123 @@
+import { PlaywrightCrawler, RequestQueue } from 'crawlee';
+import { execFileSync, execSync } from 'node:child_process';
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+
+/**
+ * Settings accepted by the `WebsiteCloner` constructor.
+ */
+export interface WebsiteClonerOptions {
+    /** Directory under which every clone gets its own sub-directory. */
+    baseOutputDir: string;
+    /** Forwarded to the crawler as `maxConcurrency`; the cloner falls back to 3 when omitted. */
+    maxConcurrency?: number;
+    /** Forwarded to the crawler as `maxRequestsPerCrawl`; the cloner falls back to 100 when omitted. */
+    maxRequestsPerCrawl?: number;
+}
+
+/**
+ * Clones a website into a directory of HTML files that can be browsed offline.
+ *
+ * Pages are discovered with a crawlee `PlaywrightCrawler`, each page is captured
+ * by running `single-file-cli` through `npx`, and after the crawl the `href`
+ * attributes of the captured files are rewritten to relative local paths.
+ */
+export class WebsiteCloner {
+    /** Link targets whose path ends in one of these extensions are never enqueued as pages. */
+    private static readonly NON_PAGE_EXTENSIONS = /\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i;
+
+    private readonly options: WebsiteClonerOptions;
+
+    /**
+     * @param options Output location and crawl limits. `maxRequestsPerCrawl`
+     *   defaults to 100 and `maxConcurrency` to 3.
+     */
+    constructor(options: WebsiteClonerOptions) {
+        this.options = {
+            ...options,
+            // Applied after the spread so an explicit `undefined` cannot wipe a default.
+            maxRequestsPerCrawl: options.maxRequestsPerCrawl ?? 100,
+            maxConcurrency: options.maxConcurrency ?? 3
+        };
+    }
+
+    /**
+     * Crawls `targetUrl`, captures every reachable same-domain page and rewrites
+     * the links between the captured pages.
+     *
+     * Any existing output directory for this clone is deleted first.
+     *
+     * @param targetUrl Absolute URL the crawl starts from.
+     * @param outputDirName Sub-directory of `baseOutputDir` to write into; when
+     *   omitted (or empty) the hostname with dots replaced by dashes is used.
+     * @returns Absolute path of the directory containing the clone.
+     * @throws TypeError when `targetUrl` is not a valid absolute URL.
+     */
+    public async clone(targetUrl: string, outputDirName?: string): Promise<string> {
+        const domain = new URL(targetUrl).hostname;
+        const finalOutputDirName = outputDirName || domain.replace(/\./g, '-');
+        const baseOutputDir = path.resolve(this.options.baseOutputDir, finalOutputDirName);
+
+        // `force: true` makes the removal a no-op when the directory does not exist yet.
+        fs.rmSync(baseOutputDir, { recursive: true, force: true });
+        fs.mkdirSync(baseOutputDir, { recursive: true });
+
+        console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
+        console.log(`📂 Output: ${baseOutputDir}`);
+
+        // Local file (relative, '/'-separated) -> URL it was captured from. Needed later
+        // so that relative links are resolved against the page they appear on.
+        const capturedPages = new Map<string, string>();
+
+        const requestQueue = await RequestQueue.open();
+        await requestQueue.addRequest({ url: targetUrl });
+
+        const crawler = new PlaywrightCrawler({
+            requestQueue,
+            maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
+            maxConcurrency: this.options.maxConcurrency,
+
+            async requestHandler({ request, enqueueLinks, log }) {
+                const url = request.url;
+                log.info(`Capturing ${url}...`);
+
+                const relPath = WebsiteCloner.toLocalPath(new URL(url));
+                const fullPath = path.join(baseOutputDir, relPath);
+                fs.mkdirSync(path.dirname(fullPath), { recursive: true });
+
+                try {
+                    // Note: This assumes single-file-cli is available in the environment.
+                    // The URL comes from crawled pages, so it is passed as a separate argv
+                    // entry (no shell) — characters such as `$(...)` or backticks in a
+                    // query string must never be interpreted as shell syntax.
+                    // NOTE(review): on Windows `npx` is a `.cmd` shim that Node may refuse
+                    // to spawn without a shell — confirm the target platforms.
+                    execFileSync(
+                        'npx',
+                        [
+                            'single-file-cli',
+                            url,
+                            fullPath,
+                            '--browser-headless=true',
+                            '--browser-wait-until=networkidle0'
+                        ],
+                        { stdio: 'inherit' }
+                    );
+                    capturedPages.set(relPath, url);
+                } catch (e: unknown) {
+                    // Best effort: one failed page must not abort the whole crawl.
+                    const reason = e instanceof Error ? e.message : String(e);
+                    log.error(`Failed to capture ${url} with SingleFile: ${reason}`);
+                }
+
+                await enqueueLinks({
+                    strategy: 'same-domain',
+                    transformRequestFunction: (req) => {
+                        // Test the pathname only, so "file.pdf?download=1" is skipped as well.
+                        let pathname: string;
+                        try {
+                            pathname = new URL(req.url).pathname;
+                        } catch {
+                            pathname = req.url;
+                        }
+                        return WebsiteCloner.NON_PAGE_EXTENSIONS.test(pathname) ? false : req;
+                    }
+                });
+            },
+        });
+
+        await crawler.run();
+
+        console.log('🔗 Rewriting internal links for offline navigation...');
+        const htmlFiles = this.getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
+
+        for (const file of htmlFiles) {
+            const pageRelPath = path.relative(baseOutputDir, file).split(path.sep).join('/');
+            // Fall back to the start URL for files whose source URL was not recorded.
+            const pageUrl = capturedPages.get(pageRelPath) ?? targetUrl;
+
+            const original = fs.readFileSync(file, 'utf8');
+            const rewritten = WebsiteCloner.rewriteLinks(original, pageUrl, pageRelPath, domain);
+            if (rewritten !== original) {
+                fs.writeFileSync(file, rewritten);
+            }
+        }
+
+        console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
+        return baseOutputDir;
+    }
+
+    /**
+     * Maps a page URL to the file it is stored in, relative to the clone root and
+     * always '/'-separated: `/` -> `index.html`, `/docs` -> `docs/index.html`,
+     * `/a/b.html` -> `a/b.html`. Query string and fragment are ignored.
+     */
+    private static toLocalPath(url: URL): string {
+        let relPath = url.pathname;
+        if (relPath === '/' || relPath === '') relPath = '/index.html';
+        if (!relPath.endsWith('.html') && !path.posix.extname(relPath)) relPath += '/index.html';
+        // Drop every leading slash so the result can never be taken for an absolute path,
+        // then collapse duplicate separators ("docs//index.html").
+        return path.posix.normalize(relPath.replace(/^\/+/, ''));
+    }
+
+    /**
+     * Rewrites every `href="..."` in `html` that points to an http(s) page on
+     * `domain` into a path relative to the file the HTML is stored in.
+     *
+     * @param html Content of a captured page.
+     * @param pageUrl URL the page was captured from; base for relative links.
+     * @param pageRelPath '/'-separated location of the page inside the clone root.
+     * @param domain Hostname whose links are rewritten; all other links are kept.
+     */
+    private static rewriteLinks(html: string, pageUrl: string, pageRelPath: string, domain: string): string {
+        // Anchoring both sides at '/' keeps path.posix.relative independent of the cwd.
+        const pageDir = path.posix.dirname('/' + pageRelPath);
+
+        return html.replace(/href="([^"]+)"/g, (match: string, href: string) => {
+            // Pure in-page anchors already work offline.
+            if (href.startsWith('#')) return match;
+
+            let linkUrl: URL;
+            try {
+                linkUrl = new URL(href, pageUrl);
+            } catch {
+                // Not parseable as a URL: leave the attribute untouched.
+                return match;
+            }
+
+            // mailto:, data:, javascript: etc. and foreign hosts stay as they are.
+            const isWebLink = linkUrl.protocol === 'http:' || linkUrl.protocol === 'https:';
+            if (!isWebLink || linkUrl.hostname !== domain) return match;
+
+            const target = '/' + WebsiteCloner.toLocalPath(linkUrl);
+            const relativeLink = path.posix.relative(pageDir, target);
+            // Keep the fragment so links to a section of another page still land there.
+            return `href="${relativeLink}${linkUrl.hash}"`;
+        });
+    }
+
+    /**
+     * Recursively collects the paths of all non-directory entries below `dir`.
+     *
+     * @param dir Directory to walk.
+     * @param fileList Accumulator the results are appended to.
+     * @returns `fileList`, containing every file found.
+     */
+    private getFiles(dir: string, fileList: string[] = []): string[] {
+        for (const entry of fs.readdirSync(dir)) {
+            const name = path.join(dir, entry);
+            if (fs.statSync(name).isDirectory()) {
+                this.getFiles(name, fileList);
+            } else {
+                fileList.push(name);
+            }
+        }
+        return fileList;
+    }
+}