// @ts-ignore
import scrape from "website-scraper";
// @ts-ignore
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** The subset of a scraper resource object that this script reads. */
interface ScrapedResource {
  url: string;
  filename: string;
}

/** Payload shape consumed by the plugin handlers registered below. */
interface ActionPayload {
  resource: ScrapedResource;
  error?: unknown;
}

/** Signature of the `registerAction` callback passed to a plugin's `apply`. */
type RegisterAction = (
  action: string,
  handler: (payload: ActionPayload) => unknown,
) => void;

/** File extensions that are downloaded regardless of which host serves them. */
const ASSET_EXTENSION =
  /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i;

/** Path fragments that mark a URL as a probable asset even without a known extension. */
const COMMON_ASSET_DIRS = ["/css/", "/js/", "/static/", "/assets/", "/uploads/"];

/** Hosts that serve Google Fonts stylesheets and font files. */
const GOOGLE_FONT_HOSTS = new Set(["fonts.googleapis.com", "fonts.gstatic.com"]);

/**
 * Parses the command-line URL.
 *
 * @param raw - The URL string supplied by the user.
 * @returns The parsed URL, or `undefined` when it is malformed, is not
 *   http(s), or has no hostname. An empty hostname must be rejected because
 *   the default output directory is derived from it and is deleted before
 *   cloning starts.
 */
function parseTargetUrl(raw: string): URL | undefined {
  try {
    const parsed = new URL(raw);
    const isHttp = parsed.protocol === "http:" || parsed.protocol === "https:";
    return isHttp && parsed.hostname ? parsed : undefined;
  } catch {
    return undefined;
  }
}

/** Returns true when `hostname` is `domain` itself or one of its subdomains. */
function isSameSite(hostname: string, domain: string): boolean {
  return hostname === domain || hostname.endsWith(`.${domain}`);
}

/**
 * Maps a resource URL to a relative path inside the output directory.
 *
 * @param resourceUrl - Absolute URL of the resource being saved.
 * @param domain - Hostname of the site being cloned.
 * @returns A sanitized relative path; never empty.
 */
function buildFilename(resourceUrl: string, domain: string): string {
  const u = new URL(resourceUrl);
  let filename = u.pathname;

  if (filename.endsWith("/")) {
    filename += "index.html";
  } else if (!path.extname(filename) && isSameSite(u.hostname, domain)) {
    // Assume an extension-less link on the cloned site is a page "folder".
    // The host is compared instead of substring-matching the whole URL, so an
    // external URL that merely mentions the domain is not treated as a page.
    filename += "/index.html";
  }

  // Assets from any other host are kept apart under `_external/<host>/`.
  if (u.hostname !== domain) {
    filename = `_external/${u.hostname}${filename}`;
  }

  // Replace every character outside [a-z0-9._-] in each path segment.
  filename = filename
    .split("/")
    .map((part) => part.replace(/[^a-z0-9._-]/gi, "_"))
    .join("/");

  if (filename.startsWith("/")) filename = filename.substring(1);

  return filename || "index.html";
}

/**
 * Decides whether the scraper should download `url`.
 *
 * @param url - Candidate URL.
 * @param domain - Hostname of the site being cloned.
 * @returns True for URLs on the target host, Google Fonts hosts, URLs with a
 *   known asset extension, or URLs under a common asset directory. False for
 *   anything that cannot be parsed as a URL.
 */
function shouldDownload(url: string, domain: string): boolean {
  let u: URL;
  try {
    u = new URL(url);
  } catch {
    return false;
  }

  const isTargetDomain = u.hostname === domain;
  const isGoogleFonts = GOOGLE_FONT_HOSTS.has(u.hostname);
  const isAsset = ASSET_EXTENSION.test(u.pathname);
  const isCommonAsset = COMMON_ASSET_DIRS.some((dir) =>
    u.pathname.includes(dir),
  );

  return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
}

/**
 * CLI entry point: clones the site given in `process.argv[2]` into
 * `process.argv[3]` (resolved against the current working directory) or, when
 * omitted, into `../cloned-websites/<sanitized-hostname>` next to this script.
 * An existing output directory is removed first. Exits with status 1 on a
 * missing or invalid URL and on any scraping failure.
 */
async function run(): Promise<void> {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error("Usage: npm run clone-website <url> [output-dir]");
    process.exit(1);
  }

  const urlObj = parseTargetUrl(targetUrl);
  if (!urlObj) {
    console.error(`Invalid URL (expected http:// or https://): ${targetUrl}`);
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_");
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, "../cloned-websites", safeDomain);

  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
          ],
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false,
      }),
      // Logs every saved resource and every resource error.
      new (class LoggerPlugin {
        apply(registerAction: RegisterAction) {
          registerAction("onResourceSaved", ({ resource }) => {
            console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
          });
          registerAction("onResourceError", ({ resource, error }) => {
            const reason =
              error instanceof Error ? error.message : String(error);
            console.error(` ❌ Error: ${resource.url} - ${reason}`);
          });
        }
      })(),
      // Custom filename generation to avoid "https:/" folders.
      new (class FilenamePlugin {
        apply(registerAction: RegisterAction) {
          registerAction("generateFilename", ({ resource }) => ({
            filename: buildFilename(resource.url, domain),
          }));
        }
      })(),
    ],
    urlFilter: (url: string) => shouldDownload(url, domain),
    sources: [
      { selector: "img", attr: "src" },
      { selector: "img", attr: "srcset" },
      { selector: "source", attr: "src" },
      { selector: "source", attr: "srcset" },
      { selector: 'link[rel="stylesheet"]', attr: "href" },
      { selector: 'link[rel="preload"]', attr: "href" },
      { selector: 'link[rel="prefetch"]', attr: "href" },
      { selector: "script", attr: "src" },
      { selector: "video", attr: "src" },
      { selector: "video", attr: "poster" },
      { selector: "iframe", attr: "src" },
      { selector: 'link[rel*="icon"]', attr: "href" },
      { selector: 'link[rel="manifest"]', attr: "href" },
      { selector: 'meta[property="og:image"]', attr: "content" },
    ],
    request: {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
      },
    },
  };

  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(
      `\n✅ Successfully cloned ${result.length} resources to ${outputDir}`,
    );

    // Post-processing: sanitize HTML to remove Next.js hydration scripts.
    // This prevents the static site from trying to "hydrate" and breaking
    // images/links.
    console.log("🧹 Sanitizing HTML files...");
    sanitizeHtmlFiles(outputDir);

    console.log(`open "${path.join(outputDir, "index.html")}"`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}

// NOTE(review): every HTML snippet and tag-matching regex below was missing
// from the reviewed copy of this file (the markup had been stripped). They are
// reconstructed from the surrounding comments — confirm each one against the
// original before relying on the exact output.

/** Matches the Next.js `__NEXT_DATA__` script element, including its body. */
const NEXT_DATA_SCRIPT =
  /<script\b[^>]*\bid=["']__NEXT_DATA__["'][^>]*>[\s\S]*?<\/script>/gi;

/** Stylesheet link for the default fonts (Inter and Montserrat). */
const FONT_LINK =
  '<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Montserrat:wght@400;500;600;700&display=swap" rel="stylesheet">';

/** Style rules that apply the default fonts. */
const FONT_STYLE =
  "<style>body{font-family:'Inter',sans-serif;}h1,h2,h3,h4,h5,h6{font-family:'Montserrat',sans-serif;}</style>";

/** Script that forces a column layout on elements whose class starts with "products". */
const LAYOUT_SCRIPT = `<script>
document.addEventListener("DOMContentLoaded", function () {
  document.querySelectorAll('[class^="products"]').forEach(function (el) {
    el.style.display = "flex";
    el.style.flexDirection = "column";
  });
});
</script>`;

/**
 * Reports whether `html` already links a Google Fonts stylesheet. Each
 * `<link>` tag is inspected on its own so the result does not depend on the
 * order of the `rel` and `href` attributes.
 */
function hasGoogleFontStylesheet(html: string): boolean {
  const linkTags = html.match(/<link\b[^>]*>/gi) ?? [];
  return linkTags.some(
    (tag) =>
      /\brel=["']stylesheet["']/i.test(tag) &&
      /\bhref=["'][^"']*fonts\.googleapis\.com/i.test(tag),
  );
}

/**
 * Applies the static-site fixes to one HTML document.
 *
 * @param html - Original document text.
 * @returns The sanitized document text.
 */
function sanitizeHtml(html: string): string {
  // Remove Next.js data script.
  let content = html.replace(NEXT_DATA_SCRIPT, "");

  // TODO(review): the original performed a further regex replace with a
  // callback at this point; its pattern and output were unrecoverable and are
  // intentionally not guessed at here.

  // Inject fonts (fix for missing dynamic fonts): Inter and Montserrat are
  // used as safe defaults when no Google Fonts stylesheet is linked.
  if (!hasGoogleFontStylesheet(content)) {
    // Function replacers keep "$" sequences in the snippets literal.
    content = content.replace(
      "</head>",
      () => `${FONT_LINK}${FONT_STYLE}</head>`,
    );
  }

  // Force column layout on product pages.
  if (content.includes('class="products')) {
    content = content.replace("</body>", () => `${LAYOUT_SCRIPT}</body>`);
  }

  return content;
}

/**
 * Recursively sanitizes every `.html` file under `dir` in place.
 *
 * @param dir - Directory to walk.
 */
function sanitizeHtmlFiles(dir: string): void {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (entry.name.endsWith(".html")) {
      const original = fs.readFileSync(fullPath, "utf8");
      const sanitized = sanitizeHtml(original);
      if (sanitized !== original) fs.writeFileSync(fullPath, sanitized);
    }
  }
}

run().catch((error: unknown) => {
  console.error("❌ Error cloning website:", error);
  process.exit(1);
});