// mintel.me/apps/web/scripts/clone-website-crawlee.ts

import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import * as fs from 'node:fs';
import { URL } from 'node:url';
import { execFileSync, execSync } from 'node:child_process';

// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
// Directory containing this script; the clone output directory is resolved relative to it.
const __dirname = path.dirname(__filename);

/**
 * Clones a website into a local directory that can be browsed offline.
 *
 * Crawlee (Playwright) discovers pages on the start URL's hostname,
 * `single-file-cli` captures each page as one self-contained HTML file, and a
 * final pass rewrites links between captured pages into relative file links.
 *
 * CLI arguments:
 *   - `process.argv[2]`: start URL (required; exits with code 1 when missing).
 *   - `process.argv[3]`: output directory name below `../cloned-websites`
 *     (defaults to the hostname with dots replaced by dashes).
 *
 * An existing output directory with the same name is deleted before cloning.
 *
 * @throws TypeError when the start URL cannot be parsed.
 */
async function cloneWebsite(): Promise<void> {
    const targetUrl = process.argv[2];
    if (!targetUrl) {
        console.error('Please provide a URL as an argument.');
        process.exit(1);
    }

    const urlObj = new URL(targetUrl);
    const domain = urlObj.hostname;
    const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
    const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);

    // Always start from an empty output directory.
    if (fs.existsSync(baseOutputDir)) {
        fs.rmSync(baseOutputDir, { recursive: true, force: true });
    }
    fs.mkdirSync(baseOutputDir, { recursive: true });

    console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
    console.log(`📂 Output: ${baseOutputDir}`);

    // Remembers which page URL produced each captured file, so that relative
    // links can later be resolved against the page they actually appear on.
    const pageUrlByFile = new Map<string, string>();

    const requestQueue = await RequestQueue.open();
    await requestQueue.addRequest({ url: targetUrl });

    const crawler = new PlaywrightCrawler({
        requestQueue,
        maxRequestsPerCrawl: 100,
        maxConcurrency: 3, // SingleFile is resource intensive

        async requestHandler({ request, enqueueLinks, log }) {
            const url = request.url;
            log.info(`Capturing ${url}...`);

            // 1. Determine local path
            const fullPath = path.join(baseOutputDir, toLocalPath(new URL(url).pathname));
            fs.mkdirSync(path.dirname(fullPath), { recursive: true });

            // 2. Capture the page with single-file-cli. Links between pages are
            //    rewritten in the post-processing step below.
            try {
                runSingleFile(url, fullPath);
                pageUrlByFile.set(fullPath, request.loadedUrl ?? url);
            } catch (error: unknown) {
                const reason = error instanceof Error ? error.message : String(error);
                log.error(`Failed to capture ${url} with SingleFile: ${reason}`);
            }

            // 3. Enqueue subpages (discovery) from the page Playwright has loaded.
            await enqueueLinks({
                // Stay on the exact hostname: files are stored by URL path only,
                // so pages from other subdomains would overwrite each other and
                // would never be linked by the rewrite step.
                strategy: 'same-hostname',
                transformRequestFunction: (req) => {
                    if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
                    return req;
                }
            });
        },
    });

    await crawler.run();

    // 4. Post-processing: Rewrite links between the captured files
    console.log('🔗 Rewriting internal links for offline navigation...');
    const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));

    for (const file of allFiles) {
        // Always use forward slashes: this path ends up inside href attributes.
        const fileRelToRoot = path.relative(baseOutputDir, file).split(path.sep).join('/');
        // Fall back to a URL derived from the file location for files without a recorded page URL.
        const pageUrl = pageUrlByFile.get(file) ?? new URL(fileRelToRoot, `${urlObj.origin}/`).href;

        const content = fs.readFileSync(file, 'utf8');
        fs.writeFileSync(file, rewriteInternalLinks(content, pageUrl, fileRelToRoot, domain));
    }

    console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}

/** Maps a URL pathname to the relative file path (no leading slash) used inside the clone. */
function toLocalPath(pathname: string): string {
    let relPath = pathname;
    if (relPath === '/' || relPath === '') relPath = '/index.html';
    // Extension-less paths are treated as directories holding an index.html.
    if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
    if (relPath.startsWith('/')) relPath = relPath.substring(1);
    return relPath;
}

/** Runs single-file-cli synchronously to capture `url` into `outputFile`; throws when the command fails. */
function runSingleFile(url: string, outputFile: string): void {
    const pageUrl = new URL(url).href;
    const flags = ['--browser-headless=true', '--browser-wait-until=networkidle0'];

    if (process.platform === 'win32') {
        // npx is a .cmd shim on Windows and cannot be spawned without a shell.
        // A serialized URL contains no raw double quote (the URL parser
        // percent-encodes it), so the quoted form cannot be broken out of.
        execSync(`npx single-file-cli "${pageUrl}" "${outputFile}" ${flags.join(' ')}`, {
            stdio: 'inherit'
        });
        return;
    }

    // Pass arguments as an argv array so that no shell ever interprets the
    // crawled URL: characters such as `$(`…`)` are legal in URLs and would
    // otherwise be executed as commands.
    execFileSync('npx', ['single-file-cli', pageUrl, outputFile, ...flags], {
        stdio: 'inherit'
    });
}

/**
 * Rewrites `href="…"` attributes that point to pages on `hostname` into
 * relative links between the captured files.
 *
 * @param html          Content of the captured page.
 * @param pageUrl       URL the page was captured from; base for relative links.
 * @param fileRelToRoot Forward-slash path of the page's file relative to the clone root.
 * @param hostname      Hostname whose pages are part of the clone.
 * @returns The HTML with internal links rewritten; all other links untouched.
 */
function rewriteInternalLinks(html: string, pageUrl: string, fileRelToRoot: string, hostname: string): string {
    // Anchor both sides at '/' so the result does not depend on the working directory.
    const fromDir = path.posix.join('/', path.posix.dirname(fileRelToRoot));

    // Simple regex-based rewrite; only double-quoted href attributes are handled.
    return html.replace(/href="([^"]+)"/g, (match: string, href: string) => {
        // In-page anchors and data URIs already work offline.
        if (href.startsWith('#') || href.startsWith('data:')) return match;

        let linkUrl: URL;
        try {
            linkUrl = new URL(href, pageUrl);
        } catch {
            // Best effort: leave hrefs that cannot be parsed exactly as they are.
            return match;
        }

        const isWebLink = linkUrl.protocol === 'http:' || linkUrl.protocol === 'https:';
        if (!isWebLink || linkUrl.hostname !== hostname) return match;

        const toPath = path.posix.join('/', toLocalPath(linkUrl.pathname));
        // Keep the fragment so links to a section of another page still land on it.
        return `href="${path.posix.relative(fromDir, toPath)}${linkUrl.hash}"`;
    });
}

/**
 * Recursively collects the paths of all non-directory entries below `dir`.
 *
 * @param dir      Directory to walk.
 * @param fileList Accumulator the found paths are appended to (mutated in place).
 * @returns The same `fileList` array, containing every path found.
 */
function getFiles(dir: string, fileList: string[] = []) {
    fs.readdirSync(dir).forEach((entry) => {
        const entryPath = path.join(dir, entry);
        const isDirectory = fs.statSync(entryPath).isDirectory();
        if (!isDirectory) {
            fileList.push(entryPath);
            return;
        }
        // Descend, sharing the accumulator so results keep traversal order.
        getFiles(entryPath, fileList);
    });
    return fileList;
}

// Script entry point: run the clone and exit with a non-zero code on any unhandled failure.
cloneWebsite().then(undefined, (fatal) => {
    console.error('❌ Fatal error:', fatal);
    process.exit(1);
});