clones

scripts/clone-website-crawlee.ts (new file)
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import { fileURLToPath, URL } from 'node:url';
import * as fs from 'node:fs';
import { execSync } from 'node:child_process';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * The Ultimate Website Cloner
 * Uses Crawlee for discovery and single-file-cli for perfect page capture.
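 * Usage (assuming a TypeScript runner such as tsx): tsx scripts/clone-website-crawlee.ts <url> [output-dir-name]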
 */
async function cloneWebsite() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }

  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
  const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);

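  // Wipe any previous clone so the output directory starts empty.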
  if (fs.existsSync(baseOutputDir)) {
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
  }
  fs.mkdirSync(baseOutputDir, { recursive: true });

  console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
  console.log(`📂 Output: ${baseOutputDir}`);

  const requestQueue = await RequestQueue.open();
  await requestQueue.addRequest({ url: targetUrl });

  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxRequestsPerCrawl: 100,
    maxConcurrency: 3, // SingleFile is resource intensive

    async requestHandler({ request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Capturing ${url}...`);

      // 1. Determine local path
      const u = new URL(url);
      let relPath = u.pathname;
      if (relPath === '/' || relPath === '') relPath = '/index.html';
      if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
      if (relPath.startsWith('/')) relPath = relPath.substring(1);
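      // e.g. https://example.com/ -> index.html, https://example.com/docs/guide -> docs/guide/index.html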

      const fullPath = path.join(baseOutputDir, relPath);
      fs.mkdirSync(path.dirname(fullPath), { recursive: true });

      // 2. Use single-file-cli to capture the page as a single self-contained HTML file.
      // Internal links are rewritten in the post-processing pass below, for better control.
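      // Note: single-file-cli launches its own headless browser, so each page is loaded
      // twice: once by Crawlee (for link discovery) and once here (for capture).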
      try {
        execSync(`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`, {
          stdio: 'inherit'
        });
      } catch (e) {
        log.error(`Failed to capture ${url} with SingleFile`);
      }

      // 3. Enqueue same-domain subpages for discovery.
      // The Playwright page that Crawlee loaded is used only for this link extraction.
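      // Skip direct links to binaries and assets (SingleFile already inlines a page's own assets).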
      await enqueueLinks({
        strategy: 'same-domain',
        transformRequestFunction: (req) => {
          if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
          return req;
        }
      });
    },
  });

  await crawler.run();

  // 4. Post-processing: Rewrite links between the captured files
  console.log('🔗 Rewriting internal links for offline navigation...');
  const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));

  for (const file of allFiles) {
    let content = fs.readFileSync(file, 'utf8');
    const fileRelToRoot = path.relative(baseOutputDir, file);

    // Simple but effective regex for internal links
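    // Note: only double-quoted href attributes are handled; query strings and fragments are dropped.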
    content = content.replace(/href="([^"]+)"/g, (match, href) => {
      if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
        try {
          const linkUrl = new URL(href, urlObj.href);
          if (linkUrl.hostname === domain) {
            let linkPath = linkUrl.pathname;
            if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
            if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
            if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);

            const relativeLink = path.relative(path.dirname(fileRelToRoot), linkPath);
            return `href="${relativeLink}"`;
          }
        } catch (e) {}
      }
      return match;
    });

    fs.writeFileSync(file, content);
  }

  console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}

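// Recursively collect every file path under dir (used to find the captured HTML files).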
function getFiles(dir: string, fileList: string[] = []) {
  const files = fs.readdirSync(dir);
  for (const file of files) {
    const name = path.join(dir, file);
    if (fs.statSync(name).isDirectory()) {
      getFiles(name, fileList);
    } else {
      fileList.push(name);
    }
  }
  return fileList;
}

cloneWebsite().catch(err => {
  console.error('❌ Fatal error:', err);
  process.exit(1);
});