import scrape from 'website-scraper';
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'path';
import { fileURLToPath } from 'url';
import fs from 'fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** A selector/attribute pair telling the scraper where to look for resource URLs. */
interface SourceRule {
  selector: string;
  attr: string;
}

/** The parts of the `beforeStart` action payload this plugin touches. */
interface BeforeStartParams {
  options: { sources?: SourceRule[] };
}

/** The parts of the `generateFilename` action payload this plugin touches. */
interface GenerateFilenameParams {
  resource: { getUrl(): string };
  filename: string;
}

/**
 * Extra source rules registered by {@link PortfolioPlugin} on top of the ones
 * passed to `scrape()`.
 *
 * `img[data-nimg]` is intentionally not listed: Next.js sets that attribute to
 * a marker value (e.g. "1" or "fill"), not a URL, so treating it as a source
 * would make the scraper request bogus resources.
 */
const EXTRA_SOURCES: readonly SourceRule[] = [
  { selector: 'img', attr: 'data-src' },
  { selector: 'img', attr: 'data-srcset' },
  { selector: 'video', attr: 'poster' },
  { selector: 'source', attr: 'data-srcset' },
  { selector: '[style*="background-image"]', attr: 'style' },
  { selector: 'link[as="font"]', attr: 'href' },
  { selector: 'link[as="image"]', attr: 'href' },
  { selector: 'link[as="style"]', attr: 'href' },
  { selector: 'link[as="script"]', attr: 'href' },
];

/** Third-party hosts whose resources are downloaded in addition to same-site ones. */
const EXTRA_ALLOWED_HOSTS: readonly string[] = [
  'googletagmanager.com',
  'analytics.mintel.me',
  'vercel.app',
];

/** File extensions (optionally followed by a query string) that mark a link as a static asset. */
const ASSET_PATTERN =
  /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i;

/**
 * Replaces a trailing `.app` with `-app` in every `/`-separated segment.
 *
 * macOS shows any directory whose name ends in `.app` as an Application
 * Bundle, which hides its contents in Finder.
 */
function sanitizeAppSuffix(filePath: string): string {
  return filePath
    .split('/')
    .map((segment) => (segment.endsWith('.app') ? `${segment.slice(0, -'.app'.length)}-app` : segment))
    .join('/');
}

/**
 * Builds a stable filename for a Next.js optimized image URL
 * (`/_next/image?url=...&w=...`).
 *
 * @returns `_next/optimized/<name>-<width><ext>`, or `undefined` when `url`
 *   is not a Next.js image URL, has no `url` parameter, or cannot be parsed.
 */
function nextImageFilename(url: string): string | undefined {
  if (!url.includes('/_next/image')) return undefined;

  try {
    const params = new URL(url).searchParams;
    const originalUrl = params.get('url');
    if (!originalUrl) return undefined;

    const [cleanPath = originalUrl] = originalUrl.split('?');
    const ext = path.extname(cleanPath) || '.webp';
    const name = path.basename(cleanPath, ext);
    // `||` on purpose: an empty `w=` should fall back to "auto" as well.
    const width = params.get('w') || 'auto';
    return `_next/optimized/${name}-${width}${ext}`;
  } catch (error: unknown) {
    // Best effort: keep the scraper's default filename, but do not hide the failure.
    console.warn(`Could not derive a filename for Next.js image ${url}:`, error);
    return undefined;
  }
}

/**
 * Tells whether `link` points at `domain` or one of its subdomains.
 *
 * The link is resolved against `baseUrl` first, so relative links count as
 * same-site while protocol-relative links to other hosts (`//cdn.example/...`)
 * and URLs that merely mention the domain in their path or query do not.
 */
function isSameSite(link: string, baseUrl: string, domain: string): boolean {
  try {
    const host = new URL(link, baseUrl).hostname;
    return host === domain || host.endsWith(`.${domain}`);
  } catch {
    return false;
  }
}

// Custom plugin to handle Next.js and Mac-specific path issues
class PortfolioPlugin {
  /**
   * Registers this plugin's actions with the scraper.
   *
   * @param registerAction - the registration callback supplied by
   *   website-scraper; kept as `any` so the class stays assignable to the
   *   library's own plugin typings.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  apply(registerAction: any): void {
    // 1. Add more sources before starting, skipping rules that are already configured
    registerAction('beforeStart', ({ options }: BeforeStartParams) => {
      if (!options.sources) options.sources = [];
      const sources = options.sources;

      for (const extra of EXTRA_SOURCES) {
        const alreadyPresent = sources.some(
          (rule) => rule.selector === extra.selector && rule.attr === extra.attr,
        );
        if (!alreadyPresent) sources.push({ ...extra });
      }
    });

    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction('generateFilename', ({ resource, filename }: GenerateFilenameParams) => {
      const optimized = nextImageFilename(resource.getUrl());

      // CRITICAL MAC FIX: Replace .app with -app in all paths to prevent hidden Application Bundles
      return { filename: sanitizeAppSuffix(optimized ?? filename) };
    });
  }
}

/**
 * CLI entry point: clones the site given as `process.argv[2]` into
 * `../cloned-websites/<name>` relative to this file.
 *
 * `process.argv[3]` optionally overrides `<name>`; by default it is the
 * hostname with dots replaced by dashes. An existing target directory is
 * deleted first. The process exits with code 1 on a missing or invalid URL,
 * on a target directory outside `cloned-websites`, or when scraping fails.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }

  let domain: string;
  try {
    domain = new URL(url).hostname;
  } catch {
    console.error(`Invalid URL: ${url}`);
    process.exit(1);
  }

  // Sanitize folder name for Mac (see sanitizeAppSuffix).
  // `||` on purpose: an empty argument falls back to the domain-based name.
  const outputDirName = sanitizeAppSuffix(process.argv[3] || domain.replace(/\./g, '-'));

  const outputRoot = path.resolve(__dirname, '../cloned-websites');
  const outputDir = path.resolve(outputRoot, outputDirName);

  // The target is deleted recursively below, so refuse anything that is not
  // strictly inside the clone root (e.g. "..", "." or an absolute path).
  const relativeToRoot = path.relative(outputRoot, outputDir);
  const escapesRoot =
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot);
  if (escapesRoot) {
    console.error(`Refusing to use output directory outside ${outputRoot}: ${outputDir}`);
    process.exit(1);
  }

  if (fs.existsSync(outputDir)) {
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`Cloning ${url} to ${outputDir}...`);

  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ['--no-sandbox'] },
          gotoOptions: { waitUntil: 'networkidle0', timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: 'img', attr: 'src' },
        { selector: 'img', attr: 'srcset' },
        { selector: 'img', attr: 'data-src' },
        { selector: 'img', attr: 'data-srcset' },
        { selector: 'link[rel="stylesheet"]', attr: 'href' },
        { selector: 'link[rel*="icon"]', attr: 'href' },
        { selector: 'script', attr: 'src' },
        { selector: 'link[rel="preload"]', attr: 'href' },
        { selector: 'link[rel="prefetch"]', attr: 'href' },
        { selector: 'link[rel="modulepreload"]', attr: 'href' },
        { selector: 'link[rel="apple-touch-icon"]', attr: 'href' },
        { selector: 'link[rel="mask-icon"]', attr: 'href' },
        { selector: 'source', attr: 'src' },
        { selector: 'source', attr: 'srcset' },
        { selector: 'video', attr: 'src' },
        { selector: 'video', attr: 'poster' },
        { selector: 'audio', attr: 'src' },
        { selector: 'iframe', attr: 'src' },
        { selector: 'meta[property="og:image"]', attr: 'content' },
        { selector: 'meta[name="twitter:image"]', attr: 'content' },
        { selector: '[style]', attr: 'style' },
      ],
      urlFilter: (link: string): boolean => {
        // Never follow pseudo-links, whatever else they happen to match.
        if (link.startsWith('data:') || link.startsWith('mailto:') || link.startsWith('tel:')) {
          return false;
        }

        const isAsset = ASSET_PATTERN.test(link);
        const isNextAsset = link.includes('/_next/');
        const isAllowedThirdParty = EXTRA_ALLOWED_HOSTS.some((host) => link.includes(host));

        return isAsset || isNextAsset || isAllowedThirdParty || isSameSite(link, url, domain);
      },
      filenameGenerator: 'bySiteStructure',
      // NOTE(review): confirm that `subdirectories` has any effect together
      // with the 'bySiteStructure' generator; it may only apply to 'byType'.
      subdirectories: [
        { directory: 'img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
        { directory: 'js', extensions: ['.js'] },
        { directory: 'css', extensions: ['.css'] },
        { directory: 'fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot', '.otf'] },
        { directory: 'videos', extensions: ['.mp4', '.webm', '.mov', '.ogg'] },
      ],
    });

    console.log('✅ Website cloned successfully!');
    console.log(`Location: ${outputDir}`);
  } catch (error: unknown) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}

// Anything thrown outside the scrape call (e.g. by fs.rmSync) ends up here
// instead of surfacing as an unhandled rejection.
cloneWebsite().catch((error: unknown) => {
  console.error('❌ Error cloning website:', error);
  process.exit(1);
});