clones
scripts/clone-recursive.ts (new file, +210 lines)

@@ -0,0 +1,210 @@
// @ts-ignore
import scrape from 'website-scraper';
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// NOTE: kept for reference; the inline FilenamePlugin registered in
// `options.plugins` below is the generator actually in use.
class CustomFilenameGeneratorPlugin {
  apply(registerAction: any) {
    registerAction('generateFilename', ({ resource }: any) => {
      const url = new URL(resource.url);
      const ext = path.extname(url.pathname);

      // Clean the path: directories become index.html and extensionless
      // HTML pages get an .html suffix. Extensionless non-HTML resources
      // are left as-is; guessing an extension from the content type is
      // possible but rarely necessary.
      let safePath = url.pathname;
      if (safePath.endsWith('/')) {
        safePath += 'index.html';
      } else if (!ext && resource.isHtml()) {
        safePath += '.html';
      }

      // Query strings are ignored: for a static local clone, unique query
      // params usually do not determine content. If a site routes via
      // query params (e.g. ?page=2), they would need to be encoded into
      // the filename instead (see the commented sketch after this class).

      // Remove the leading slash so paths are relative to the output dir.
      if (safePath.startsWith('/')) safePath = safePath.substring(1);

      // Strip characters that are invalid in filenames.
      safePath = safePath.replace(/[:*?"<>|]/g, '_');

      // Everything is keyed by pathname under a single root. Identical
      // paths on different domains would collide, but a clone normally
      // targets one site, so this is acceptable.
      return { filename: safePath };
    });
  }
}
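
// A hypothetical variant (not used here) that keeps query-dependent pages
// distinct by folding the query string into the name, placed before the
// extension logic above:
//
//   if (url.search) {
//     safePath += '_' + url.search.slice(1).replace(/[^a-z0-9=&.-]/gi, '_');
//   }
//
// With that change, /list?page=2 would map to list_page=2.html instead of
// colliding with /list.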

async function run() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Usage: npm run clone-website <URL> [output-dir]');
    process.exit(1);
  }

  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, '../cloned-websites', safeDomain);

  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 10,

    // Filename generation is handled by the custom plugin below rather
    // than the built-in 'bySiteStructure'/'byType' behaviors, which
    // produced garbage paths (e.g. "https:/" folders) for this site.
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        },
        // Scroll the page so lazy-loaded content is fetched before capture.
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false
      }),
      new class FilenamePlugin {
        apply(registerAction: any) {
          registerAction('generateFilename', ({ resource }: any) => {
            const u = new URL(resource.url);
            let filename = u.pathname;

            // Normalize: directories map to index.html, and an
            // extensionless internal link is assumed to be a folder-style
            // route.
            if (filename.endsWith('/')) filename += 'index.html';
            else if (!path.extname(filename) && resource.url.includes(domain)) filename += '/index.html';

            // External assets go into a separate folder to avoid
            // collisions with same-named local paths.
            if (u.hostname !== domain) {
              filename = `_external/${u.hostname}${filename}`;
            }

            // Sanitize each path segment.
            filename = filename.split('/').map(part => part.replace(/[^a-z0-9._-]/gi, '_')).join('/');

            // Remove the leading slash.
            if (filename.startsWith('/')) filename = filename.substring(1);

            // Fall back to index.html for an empty ("unnamed") path.
            if (!filename) return { filename: 'index.html' };

            return { filename };
          });
        }
      }
    ],
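    // Illustrative mappings from the plugin above (hypothetical URLs,
    // with domain = "example.com"):
    //   https://example.com/pricing    → pricing/index.html
    //   https://cdn.foo.com/lib/app.js → _external/cdn.foo.com/lib/app.js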

    urlFilter: (url: string) => {
      const u = new URL(url);
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts = u.hostname.includes('fonts.googleapis.com') || u.hostname.includes('fonts.gstatic.com');
      // Allow static assets from any host, matched by extension...
      const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json)$/i.test(u.pathname);
      // ...or by conventional asset paths when the extension check fails.
      const isCommonAsset = u.pathname.includes('/css/') || u.pathname.includes('/js/') || u.pathname.includes('/static/') || u.pathname.includes('/assets/');

      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
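    // Hypothetical examples of the filter's effect:
    //   https://example.com/about         → crawled (target domain)
    //   https://cdn.other.com/app.css     → downloaded (asset extension)
    //   https://cdn.other.com/static/logo → downloaded (asset path)
    //   https://tracker.ads.com/pixel     → skipped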

    // Selector/attribute pairs to download and rewrite for offline use.
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'link[rel*="icon"]', attr: 'href' },
      { selector: 'link[rel="manifest"]', attr: 'href' },
      { selector: 'meta[property="og:image"]', attr: 'content' }
    ],

    // Present a real browser user agent; some hosts block non-browser
    // agents.
    request: {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
      }
    }
  };

  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`);

    // Post-processing: strip Next.js hydration scripts so the static
    // snapshot does not try to re-hydrate and break images/links.
    console.log('🧹 Sanitizing HTML files...');
    sanitizeHtmlFiles(outputDir);

    console.log(`open "${path.join(outputDir, 'index.html')}"`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}

function sanitizeHtmlFiles(dir: string) {
  const files = fs.readdirSync(dir);
  for (const file of files) {
    const fullPath = path.join(dir, file);
    if (fs.statSync(fullPath).isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (file.endsWith('.html')) {
      let content = fs.readFileSync(fullPath, 'utf8');

      // Remove the inline Next.js data script.
      content = content.replace(/<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi, '');

      // Remove Next.js chunk and manifest scripts (hydration), i.e. tags
      // like <script src=".../_next/static/chunks/..."></script>.
      content = content.replace(/<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi, '');
      content = content.replace(/<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi, '');

      // Inject the Inter font as a safe default for modern Next.js/Tailwind
      // sites whose dynamically loaded fonts are missing from the snapshot.
      if (!content.includes('fonts.googleapis.com')) {
        const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap">`;
        const styleBlock = `<style>.body-font{font-family:'Inter',sans-serif;}.title-font{font-family:'Inter',sans-serif;}</style>`;
        content = content.replace('</head>', `${fontLink}${styleBlock}</head>`);
      }

      fs.writeFileSync(fullPath, content);
    }
  }
}

run();
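
// Example invocation (assumes a package.json script named "clone-website"
// points at this file, e.g. via tsx — that wiring is not part of this
// commit):
//
//   npm run clone-website https://example.com my-output-dir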

scripts/clone-website-crawlee.ts (new file, +130 lines)

@@ -0,0 +1,130 @@
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import * as fs from 'node:fs';
import { URL } from 'node:url';
import { execSync } from 'node:child_process';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * The Ultimate Website Cloner
 * Uses Crawlee for page discovery and single-file-cli for high-fidelity
 * page capture.
 */
async function cloneWebsite() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }

  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
  const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);

  if (fs.existsSync(baseOutputDir)) {
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
  }
  fs.mkdirSync(baseOutputDir, { recursive: true });

  console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
  console.log(`📂 Output: ${baseOutputDir}`);

  const requestQueue = await RequestQueue.open();
  await requestQueue.addRequest({ url: targetUrl });

  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxRequestsPerCrawl: 100,
    maxConcurrency: 3, // SingleFile is resource-intensive

    async requestHandler({ request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Capturing ${url}...`);

      // 1. Map the URL to a local path; folder-style routes become
      //    <route>/index.html.
      const u = new URL(url);
      let relPath = u.pathname;
      if (relPath === '/' || relPath === '') relPath = '/index.html';
      if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
      if (relPath.startsWith('/')) relPath = relPath.substring(1);
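      // Illustrative mappings (hypothetical URLs):
      //   https://site.com/          → index.html
      //   https://site.com/about     → about/index.html
      //   https://site.com/a/b.html  → a/b.html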

      const fullPath = path.join(baseOutputDir, relPath);
      fs.mkdirSync(path.dirname(fullPath), { recursive: true });

      // 2. Capture the fully rendered page as a single self-contained
      //    HTML file. Internal links are rewritten in the post-processing
      //    pass below, which gives better control than rewriting here.
      try {
        execSync(`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`, {
          stdio: 'inherit'
        });
      } catch (e) {
        log.error(`Failed to capture ${url} with SingleFile`);
      }

      // 3. Enqueue same-domain subpages for discovery, skipping binary
      //    and asset URLs.
      await enqueueLinks({
        strategy: 'same-domain',
        transformRequestFunction: (req) => {
          if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
          return req;
        }
      });
    },
  });

  await crawler.run();

  // 4. Post-processing: rewrite links between the captured files so they
  //    point at the local copies.
  console.log('🔗 Rewriting internal links for offline navigation...');
  const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));

  for (const file of allFiles) {
    let content = fs.readFileSync(file, 'utf8');
    const fileRelToRoot = path.relative(baseOutputDir, file);

    // Simple but effective regex pass over internal links.
    content = content.replace(/href="([^"]+)"/g, (match, href) => {
      // Absolute same-site, root-relative, and relative hrefs qualify.
      if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
        try {
          const linkUrl = new URL(href, urlObj.href);
          if (linkUrl.hostname === domain) {
            // Apply the same URL→path mapping used during capture, then
            // relativize against the current file's directory.
            let linkPath = linkUrl.pathname;
            if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
            if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
            if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);

            const relativeLink = path.relative(path.dirname(fileRelToRoot), linkPath);
            return `href="${relativeLink}"`;
          }
        } catch (e) {
          // Unparseable href: leave it untouched.
        }
      }
      return match;
    });

    fs.writeFileSync(file, content);
  }
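  // For instance, a page saved at blog/post/index.html that links to
  // /about ends up with href="../../about/index.html".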

  console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}

// Recursively collect every file beneath dir.
function getFiles(dir: string, fileList: string[] = []) {
  const files = fs.readdirSync(dir);
  for (const file of files) {
    const name = path.join(dir, file);
    if (fs.statSync(name).isDirectory()) {
      getFiles(name, fileList);
    } else {
      fileList.push(name);
    }
  }
  return fileList;
}

cloneWebsite().catch(err => {
  console.error('❌ Fatal error:', err);
  process.exit(1);
});
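
// Example invocation (assuming the repo runs TypeScript scripts with tsx;
// the exact runner is not shown in this commit):
//
//   npx tsx scripts/clone-website-crawlee.ts https://example.com my-clone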