import scrape from 'website-scraper';
// NOTE(review): the suppression below presumably exists because the plugin
// has no bundled type declarations — confirm before switching to @ts-expect-error.
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// NOTE(review): the "<url>" placeholder was missing from the copy of this file
// under review (angle-bracket text appears to have been stripped) — confirm wording.
const USAGE = 'Usage: npm run clone-page <url>';

/**
 * Decides whether the scraper may download `url`.
 *
 * Allowed: anything on the page's own host, Google Fonts hosts, and any URL
 * containing `wp-content` or `wp-includes`.
 * A URL that cannot be parsed is rejected instead of throwing, so a single
 * malformed reference cannot abort the whole scrape.
 */
function isAllowedAssetUrl(url: string, domain: string): boolean {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }
  return (
    hostname === domain ||
    hostname.includes('fonts.googleapis.com') ||
    hostname.includes('fonts.gstatic.com') ||
    url.includes('wp-content') ||
    url.includes('wp-includes')
  );
}

/**
 * Turns a URL pathname into a flat file slug: leading/trailing slashes are
 * dropped and inner slashes become dashes. An empty path yields `index`.
 */
function slugFromPathname(pathname: string): string {
  const slug = pathname.replace(/^\/|\/$/g, '').replace(/\//g, '-');
  return slug || 'index';
}

/**
 * Applies the post-scrape rewrites to the saved page HTML and returns the result.
 *
 * NOTE(review): in the copy of this file under review, every HTML-like span
 * inside the string and regex literals of this step had been stripped. The
 * literals below are reconstructions from the surviving fragments and the
 * original comments; verify each against version control before relying on them.
 */
function postProcessHtml(html: string): string {
  // NUKE TYPOGRAPHY: strong overrides for the Salient theme.
  // NOTE(review): the injected markup itself was lost; only a single space
  // survives. Restore the original font/style markup here.
  const fontInjection = ` `;

  // NOTE(review): the injection anchor was lost; '</head>' is assumed so the
  // overrides land after the page's own styles — confirm.
  // A replacer function is used so `$` sequences in the markup stay literal.
  let content = html.replace('</head>', () => `${fontInjection}</head>`);

  // Link fix: replace all anchor hrefs with # to prevent unintentional navigation/leaks.
  // NOTE(review): the opening of this pattern and its replacement are reconstructed.
  content = content.replace(/<a\b([^>]*)\bhref=["'][^"']*["']/gi, '<a$1href="#"');

  // Elements with class "breeze-scripts-load" carry an asset URL as their text;
  // swap each for a tag that loads that URL (stylesheet for .css, script otherwise).
  // NOTE(review): the leading "<div" and both returned tags are reconstructed.
  content = content.replace(
    /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
    (_match: string, url: string) => {
      const u = url.trim();
      const cleanUrl = u.split('?')[0];
      if (cleanUrl.endsWith('.css')) return `<link rel="stylesheet" href="${u}">`;
      return `<script src="${u}"></script>`;
    }
  );

  return content;
}

/**
 * Recursively copies `src` into `dest`, creating directories as needed.
 * HTML files always overwrite; any other file is copied only when it does not
 * already exist at the destination (skips assets saved by earlier runs).
 */
function mergeDirectories(src: string, dest: string): void {
  fs.mkdirSync(dest, { recursive: true });
  for (const item of fs.readdirSync(src)) {
    const from = path.join(src, item);
    const to = path.join(dest, item);
    if (fs.statSync(from).isDirectory()) {
      mergeDirectories(from, to);
    } else if (item.endsWith('.html') || !fs.existsSync(to)) {
      fs.copyFileSync(from, to);
    }
  }
}

/**
 * CLI entry point: clones the single page given as the first argument into
 * `../cloned-websites/<host>/<slug>.html`, merging its assets with those of
 * pages cloned earlier from the same host.
 *
 * Exits with status 1 on a missing or invalid URL, and sets a non-zero exit
 * code when the scrape or post-processing fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error(USAGE);
    process.exit(1);
  }

  // CLEANUP: aggressively strip shell noise like ; or trailing quotes.
  const targetUrl = rawUrl.trim().replace(/[;'"]+$/, '');

  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`Invalid URL: ${targetUrl}`);
    console.error(USAGE);
    process.exit(1);
  }
  if (urlObj.protocol !== 'http:' && urlObj.protocol !== 'https:') {
    console.error(`Unsupported URL scheme "${urlObj.protocol}" in: ${targetUrl}`);
    console.error(USAGE);
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
  fs.mkdirSync(domainDir, { recursive: true });

  // Determine slug for filename.
  const htmlFilename = `${slugFromPathname(urlObj.pathname)}.html`;

  console.log(`🚀 CLONING PAGE: ${targetUrl}`);
  console.log(`📂 SAVING AS: ${htmlFilename} in ${domainDir}`);

  // website-scraper needs an empty directory for each 'scrape' call if we use
  // its defaults, but we want to MERGE assets. So we scrape to a temp dir and
  // then move.
  const tempDir = path.join(domainDir, `_temp_${Date.now()}`);

  const options = {
    urls: [targetUrl],
    directory: tempDir,
    recursive: false,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: { headless: true, args: ['--no-sandbox'] },
        scrollToBottom: { timeout: 15000, viewportN: 10 },
        blockNavigation: false
      })
    ],
    // Sources list covering Salient/WooCommerce lazy assets.
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'img', attr: 'data-src' },
      { selector: 'img', attr: 'data-lazy-src' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'link[rel="preload"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'meta[property="og:image"]', attr: 'content' },
      { selector: '[style*="background-image"]', attr: 'style' }
    ],
    // Shared directory for assets.
    subdirectories: [
      { directory: 'assets/img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
      { directory: 'assets/js', extensions: ['.js'] },
      { directory: 'assets/css', extensions: ['.css'] },
      { directory: 'assets/fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot'] },
      { directory: 'assets/media', extensions: ['.mp4', '.webm'] }
    ],
    request: {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
      }
    },
    // Allow domain assets, google fonts, and common cdn/upload patterns.
    urlFilter: (url: string) => isAllowedAssetUrl(url, domain)
  };

  try {
    await scrape(options);

    // Rename the downloaded index.html to our slug.html.
    const downloadedHtml = path.join(tempDir, 'index.html');
    const targetHtmlPath = path.join(tempDir, htmlFilename);
    if (fs.existsSync(downloadedHtml)) {
      fs.renameSync(downloadedHtml, targetHtmlPath);
    }

    // POST-PROCESS: inject fonts and fix paths in the HTML.
    if (fs.existsSync(targetHtmlPath)) {
      const content = fs.readFileSync(targetHtmlPath, 'utf8');
      fs.writeFileSync(targetHtmlPath, postProcessHtml(content));
    }

    // MOVE AND MERGE into domainDir.
    mergeDirectories(tempDir, domainDir);

    console.log(`\n✅ SUCCESS: ${path.join(domainDir, htmlFilename)}`);
  } catch (e) {
    console.error('❌ CLONE FAILED:', e);
    // Report the failure to the calling shell instead of exiting with status 0.
    process.exitCode = 1;
  } finally {
    // Single cleanup point for both outcomes; `force` tolerates a missing dir.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

run().catch((e: unknown) => {
  // Last-resort handler so nothing escapes as an unhandled rejection.
  console.error('❌ CLONE FAILED:', e);
  process.exitCode = 1;
});