// @ts-ignore
import scrape from 'website-scraper';
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Command-line synopsis printed when the arguments are missing or invalid. */
const USAGE = 'Usage: npm run clone-website <url> [output-dir]';

/** Minimal view of a scraper resource, limited to the members this file uses. */
interface ScraperResource {
  url: string;
  filename: string;
  isHtml(): boolean;
}

/**
 * Callback handed to a plugin's `apply` method for subscribing to scraper actions.
 * The payload differs per action, so each handler annotates the part it reads.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- payload shape varies per action
type RegisterAction = (action: string, handler: (payload: any) => unknown) => void;

// ---------------------------------------------------------------------------
// Post-processing markup
//
// NOTE(review): the HTML/regex literals in the original post-processing step
// were lost (tag-like substrings had been stripped from the file). The values
// below are reconstructed from the surviving comments and code fragments and
// must be confirmed against version control before relying on them.
// ---------------------------------------------------------------------------

/** Matches the inline `__NEXT_DATA__` script emitted by Next.js. NOTE(review): reconstructed. */
const NEXT_DATA_SCRIPT = /<script\b[^>]*\bid=["']__NEXT_DATA__["'][^>]*>[\s\S]*?<\/script>/gi;

/** Detects an existing Google Fonts stylesheet link (only the leading `<link[^>` was reconstructed). */
const GOOGLE_FONT_STYLESHEET = /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i;

/** Stylesheet link for the default families (Inter, Montserrat). NOTE(review): weights are a guess. */
const FALLBACK_FONT_LINK =
  '<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&amp;family=Montserrat:wght@400;500;600;700&amp;display=swap">';

/** Style block applying the default families. NOTE(review): original rules are unknown. */
const FALLBACK_FONT_STYLE =
  "<style>body{font-family:'Inter',sans-serif}h1,h2,h3,h4,h5,h6{font-family:'Montserrat',sans-serif}</style>";

/**
 * Snippet that forced a column layout on product listing pages.
 * NOTE(review): the original snippet could not be recovered; injection is
 * skipped while this is empty. Restore the real markup here.
 */
const PRODUCT_LAYOUT_SNIPPET: string = '';

/**
 * Alternative filename generator that maps a resource URL to a path relative
 * to the output root, appending `index.html` / `.html` for HTML pages.
 *
 * Not registered by `run()`; the plugin list there uses `FilenamePlugin`.
 * Query strings are ignored and all hosts share one root, so identical paths
 * on different hosts would collide.
 */
class CustomFilenameGeneratorPlugin {
  apply(registerAction: RegisterAction): void {
    registerAction('generateFilename', ({ resource }: { resource: ScraperResource }) => {
      const url = new URL(resource.url);
      let safePath = url.pathname;

      if (safePath.endsWith('/')) {
        safePath += 'index.html';
      } else if (!path.extname(url.pathname) && resource.isHtml()) {
        // Extension-less non-HTML resources are deliberately left untouched.
        safePath += '.html';
      }

      if (safePath.startsWith('/')) safePath = safePath.substring(1);

      // Replace characters that are invalid in file names on common filesystems.
      safePath = safePath.replace(/[:*?"<>|]/g, '_');

      return { filename: safePath };
    });
  }
}

/**
 * Computes the local file path (relative to the output directory) for a
 * downloaded resource.
 *
 * @param resourceUrl Absolute URL of the resource.
 * @param siteHost Hostname of the site being cloned.
 * @returns A sanitized relative path; resources from other hosts are placed
 *   under `_external/<hostname>/`.
 */
function toLocalFilename(resourceUrl: string, siteHost: string): string {
  const u = new URL(resourceUrl);
  // Compare hostnames exactly: a substring test on the whole URL also matches
  // foreign URLs that merely mention the site in their path or query string.
  const isInternal = u.hostname === siteHost;

  let filename = u.pathname;
  if (filename.endsWith('/')) {
    filename += 'index.html';
  } else if (!path.extname(filename) && isInternal) {
    // Internal link without an extension: treat it as a folder.
    filename += '/index.html';
  }

  if (!isInternal) {
    filename = `_external/${u.hostname}${filename}`;
  }

  filename = filename
    .split('/')
    .map((segment) => segment.replace(/[^a-z0-9._-]/gi, '_'))
    .join('/');

  if (filename.startsWith('/')) filename = filename.substring(1);

  return filename || 'index.html';
}

/**
 * Decides whether the scraper should download `url`.
 *
 * Everything on the target host is allowed; from other hosts only Google
 * Fonts, files with a known asset extension, and paths that look like asset
 * folders are allowed.
 *
 * @param url Candidate URL as discovered by the scraper.
 * @param siteHost Hostname of the site being cloned.
 * @returns `true` to download, `false` to skip (including unparseable URLs).
 */
function shouldDownload(url: string, siteHost: string): boolean {
  let u: URL;
  try {
    u = new URL(url);
  } catch {
    return false;
  }

  const isTargetDomain = u.hostname === siteHost;
  const isGoogleFonts = u.hostname === 'fonts.googleapis.com' || u.hostname === 'fonts.gstatic.com';
  const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(u.pathname);
  // Catches CDN assets whose URL carries no recognizable extension.
  const isCommonAsset = ['/css/', '/js/', '/static/', '/assets/', '/uploads/'].some((dir) =>
    u.pathname.includes(dir),
  );

  return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
}

/** Logs every saved resource and every resource error to the console. */
class LoggerPlugin {
  apply(registerAction: RegisterAction): void {
    registerAction('onResourceSaved', ({ resource }: { resource: ScraperResource }) => {
      console.log(`  💾 Saved: ${resource.url} -> ${resource.filename}`);
    });
    registerAction('onResourceError', ({ resource, error }: { resource: ScraperResource; error: Error }) => {
      console.error(`  ❌ Error: ${resource.url} - ${error.message}`);
    });
  }
}

/** Registers `toLocalFilename` as the scraper's filename generator. */
class FilenamePlugin {
  constructor(private readonly siteHost: string) {}

  apply(registerAction: RegisterAction): void {
    registerAction('generateFilename', ({ resource }: { resource: ScraperResource }) => ({
      filename: toLocalFilename(resource.url, this.siteHost),
    }));
  }
}

/** Parses `raw` as an absolute http(s) URL; returns `undefined` for anything else. */
function parseHttpUrl(raw: string): URL | undefined {
  try {
    const parsed = new URL(raw);
    return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : undefined;
  } catch {
    return undefined;
  }
}

/** Returns `true` when `child` is `parent` itself or is located inside it. */
function containsPath(parent: string, child: string): boolean {
  const rel = path.relative(parent, child);
  return rel === '' || (rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel));
}

/**
 * CLI entry point: clones the site given as the first argument into the
 * directory given as the second argument (default:
 * `../cloned-websites/<sanitized-hostname>` relative to this file), then
 * post-processes the saved HTML.
 *
 * An existing output directory is deleted first. Exits the process with
 * status 1 on invalid arguments or when cloning fails.
 */
async function run(): Promise<void> {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error(USAGE);
    process.exit(1);
  }

  // Reject non-http(s) input up front: such URLs have an empty hostname, which
  // would make the default output directory the shared clone root.
  const urlObj = parseHttpUrl(targetUrl);
  if (!urlObj) {
    console.error(`Invalid URL (expected http:// or https://): ${targetUrl}`);
    console.error(USAGE);
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, '../cloned-websites', safeDomain);

  if (fs.existsSync(outputDir)) {
    // The directory is wiped recursively, so never accept the working
    // directory or one of its ancestors (which includes the filesystem root).
    if (containsPath(outputDir, process.cwd())) {
      console.error(`Refusing to delete ${outputDir}: it contains the current working directory.`);
      process.exit(1);
    }
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false,
      }),
      new LoggerPlugin(),
      // Custom filename generation to avoid "https:/" folders.
      new FilenamePlugin(domain),
    ],
    urlFilter: (url: string) => shouldDownload(url, domain),
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'link[rel="preload"]', attr: 'href' },
      { selector: 'link[rel="prefetch"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'link[rel*="icon"]', attr: 'href' },
      { selector: 'link[rel="manifest"]', attr: 'href' },
      { selector: 'meta[property="og:image"]', attr: 'content' },
    ],
    request: {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
      },
    },
  };

  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`);

    // Strip the Next.js data script so the static copy does not try to
    // "hydrate" and break images/links.
    console.log('🧹 Sanitizing HTML files...');
    sanitizeHtmlFiles(outputDir);

    console.log(`open "${path.join(outputDir, 'index.html')}"`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}

/**
 * Applies the post-clone clean-ups to a single HTML document.
 *
 * @param html Original document text.
 * @returns The cleaned document text.
 */
function sanitizeHtml(html: string): string {
  // Remove the Next.js data script.
  // NOTE(review): the original also ran at least one further callback-based
  // `replace` here whose pattern and output were lost; it is not reproduced.
  let content = html.replace(NEXT_DATA_SCRIPT, '');

  // Inject default fonts when the page has no Google Fonts stylesheet of its
  // own (fix for fonts that were loaded dynamically on the live site).
  // Function replacers are used so `$` sequences in the markup stay literal.
  if (!GOOGLE_FONT_STYLESHEET.test(content)) {
    content = content.replace('</head>', () => `${FALLBACK_FONT_LINK}${FALLBACK_FONT_STYLE}</head>`);
  }

  // Force column layout on product pages (inactive until the snippet is restored).
  if (PRODUCT_LAYOUT_SNIPPET !== '' && content.includes('class="products')) {
    content = content.replace('</body>', () => `${PRODUCT_LAYOUT_SNIPPET}</body>`);
  }

  return content;
}

/**
 * Recursively rewrites every `.html` file below `dir` with `sanitizeHtml`.
 * Files whose content is unchanged are not rewritten.
 *
 * @param dir Directory to walk.
 */
function sanitizeHtmlFiles(dir: string): void {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (entry.name.endsWith('.html')) {
      const original = fs.readFileSync(fullPath, 'utf8');
      const sanitized = sanitizeHtml(original);
      if (sanitized !== original) {
        fs.writeFileSync(fullPath, sanitized);
      }
    }
  }
}

run().catch((error: unknown) => {
  console.error('❌ Error cloning website:', error);
  process.exit(1);
});