import scrape from "website-scraper";
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "path";
import { fileURLToPath } from "url";
import fs from "fs";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** A scraper source rule: which attribute of which elements holds a URL to download. */
interface SourceRule {
  selector: string;
  attr?: string;
}

/** Parameters this script reads from the `beforeStart` action. */
interface BeforeStartParams {
  options: { sources?: SourceRule[] };
}

/** Parameters this script reads from the `generateFilename` action. */
interface GenerateFilenameParams {
  resource: { getUrl(): string };
  filename: string;
}

/**
 * Extra source rules registered by {@link PortfolioPlugin} on top of the list
 * passed to `scrape()`.
 *
 * `img[data-nimg]` is intentionally NOT listed: Next.js sets that attribute to a
 * marker value ("1", "fill", a layout name), not a URL, so treating it as a
 * source only produces bogus requests such as `<page>/1`.
 */
const EXTRA_SOURCES: readonly SourceRule[] = [
  { selector: "img", attr: "data-src" },
  { selector: "img", attr: "data-srcset" },
  { selector: "video", attr: "poster" },
  { selector: "source", attr: "data-srcset" },
  { selector: '[style*="background-image"]', attr: "style" },
  { selector: 'link[as="font"]', attr: "href" },
  { selector: 'link[as="image"]', attr: "href" },
  { selector: 'link[as="style"]', attr: "href" },
  { selector: 'link[as="script"]', attr: "href" },
];

/** File extensions (optionally followed by a query string) that are always downloaded. */
const ASSET_PATTERN =
  /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i;

/** Third-party hosts (and their subdomains) that are downloaded in addition to the target site. */
const ALLOWED_EXTERNAL_HOSTS: readonly string[] = [
  "googletagmanager.com",
  "analytics.mintel.me",
  "vercel.app",
];

/**
 * Rewrites a trailing `.app` in a single path segment to `-app`.
 *
 * macOS treats any directory whose name ends in `.app` as an application
 * bundle, which hides its contents in Finder.
 */
function sanitizeAppSuffix(segment: string): string {
  return segment.replace(/\.app$/, "-app");
}

/**
 * Returns the hostname of `link` (resolved against `base` when given), or
 * `null` when the link cannot be parsed or has no host (e.g. `mailto:`, `file:`).
 */
function hostnameOf(link: string, base?: string): string | null {
  try {
    const { hostname } = new URL(link, base);
    return hostname === "" ? null : hostname;
  } catch {
    return null;
  }
}

/** True when `hostname` is exactly `domain` or one of its subdomains. */
function isHostWithin(hostname: string | null, domain: string): boolean {
  return (
    hostname !== null &&
    (hostname === domain || hostname.endsWith(`.${domain}`))
  );
}

/**
 * Builds a stable local filename for a Next.js optimized image URL
 * (`/_next/image?url=...&w=...`).
 *
 * @returns `_next/optimized/<name>-<width><ext>`, or `null` when the URL is not
 *   an optimized-image URL, cannot be parsed, or has no `url` parameter.
 */
function nextImageFilename(resourceUrl: string): string | null {
  if (!resourceUrl.includes("/_next/image")) return null;

  let params: URLSearchParams;
  try {
    params = new URL(resourceUrl).searchParams;
  } catch {
    // Best effort: an unparsable URL keeps the scraper's default filename.
    return null;
  }

  const originalUrl = params.get("url");
  if (!originalUrl) return null;

  // Drop any query string carried by the original image URL.
  const [cleanPath = originalUrl] = originalUrl.split("?");
  const ext = path.extname(cleanPath) || ".webp";
  // Fall back to a fixed name so a path-less `url` never yields "-<width>.webp".
  const name = path.basename(cleanPath, ext) || "image";
  const width = params.get("w") || "auto";
  return `_next/optimized/${name}-${width}${ext}`;
}

/**
 * Resolves `dirName` inside `baseDir`.
 *
 * @returns the absolute path, or `null` when the result would be `baseDir`
 *   itself or lie outside it (e.g. `..`, `.`, or an absolute path). The caller
 *   deletes the returned directory recursively, so escaping must be refused.
 */
function resolveOutputDir(baseDir: string, dirName: string): string | null {
  const resolved = path.resolve(baseDir, dirName);
  const relative = path.relative(baseDir, resolved);
  const escapesBase =
    relative === "" ||
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative); // e.g. a different drive on Windows
  return escapesBase ? null : resolved;
}

/**
 * Scraper plugin that registers extra asset sources and rewrites generated
 * filenames for Next.js optimized images and macOS-unsafe `.app` names.
 */
class PortfolioPlugin {
  /**
   * Registers this plugin's `beforeStart` and `generateFilename` handlers.
   *
   * @param registerAction - function taking an action name and a handler.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- signature is owned by the scraper library
  apply(registerAction: any) {
    // 1. Add more sources before starting, skipping rules that are already present.
    registerAction("beforeStart", ({ options }: BeforeStartParams) => {
      const sources = (options.sources ??= []);
      for (const rule of EXTRA_SOURCES) {
        const alreadyListed = sources.some(
          (existing) =>
            existing.selector === rule.selector && existing.attr === rule.attr,
        );
        if (!alreadyListed) sources.push({ ...rule });
      }
    });

    // 2. Sanitize filenames and handle Next.js optimized images.
    registerAction(
      "generateFilename",
      ({ resource, filename }: GenerateFilenameParams) => {
        const base = nextImageFilename(resource.getUrl()) ?? filename;
        // Split on "/" so only a `.app` at the end of a directory or file name is replaced.
        const result = base.split("/").map(sanitizeAppSuffix).join("/");
        return { filename: result };
      },
    );
  }
}

/**
 * Command-line entry point: clones the site at `process.argv[2]` into
 * `../cloned-websites/<name>` relative to this file.
 *
 * `<name>` is `process.argv[3]` when given, otherwise the URL's hostname with
 * dots replaced by dashes. Any existing directory at that location is removed
 * first. Exits the process with code 1 on a missing or invalid URL, an output
 * name that escapes the `cloned-websites` directory, or a scraping failure.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error("Please provide a URL as an argument.");
    process.exit(1);
  }

  const domain = hostnameOf(url);
  if (domain === null) {
    console.error(`Invalid URL (expected something like https://example.com): ${url}`);
    process.exit(1);
  }

  // `||` on purpose: an empty-string argument falls back to the domain-based name.
  const outputDirName = sanitizeAppSuffix(
    process.argv[3] || domain.replace(/\./g, "-"),
  );

  const baseDir = path.resolve(__dirname, "../cloned-websites");
  const outputDir = resolveOutputDir(baseDir, outputDirName);
  if (outputDir === null) {
    console.error(
      `Refusing to use output directory "${outputDirName}": it must be a folder inside ${baseDir}.`,
    );
    process.exit(1);
  }

  // `force: true` makes this a no-op when the directory does not exist yet.
  fs.rmSync(outputDir, { recursive: true, force: true });

  console.log(`Cloning ${url} to ${outputDir}...`);

  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ["--no-sandbox"] },
          gotoOptions: { waitUntil: "networkidle0", timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: "img", attr: "src" },
        { selector: "img", attr: "srcset" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: 'link[rel="stylesheet"]', attr: "href" },
        { selector: 'link[rel*="icon"]', attr: "href" },
        { selector: "script", attr: "src" },
        { selector: 'link[rel="preload"]', attr: "href" },
        { selector: 'link[rel="prefetch"]', attr: "href" },
        { selector: 'link[rel="modulepreload"]', attr: "href" },
        { selector: 'link[rel="apple-touch-icon"]', attr: "href" },
        { selector: 'link[rel="mask-icon"]', attr: "href" },
        { selector: "source", attr: "src" },
        { selector: "source", attr: "srcset" },
        { selector: "video", attr: "src" },
        { selector: "video", attr: "poster" },
        { selector: "audio", attr: "src" },
        { selector: "iframe", attr: "src" },
        { selector: 'meta[property="og:image"]', attr: "content" },
        { selector: 'meta[name="twitter:image"]', attr: "content" },
        { selector: "[style]", attr: "style" },
      ],
      urlFilter: (link: string) => {
        // Never follow inline data or contact links.
        if (
          link.startsWith("data:") ||
          link.startsWith("mailto:") ||
          link.startsWith("tel:")
        ) {
          return false;
        }

        const isAsset = ASSET_PATTERN.test(link);
        const isNextAsset = link.includes("/_next/");

        // Compare real hostnames (relative links resolve against the start URL)
        // so that a domain merely mentioned in a path or query string does not match.
        const host = hostnameOf(link, url);
        const isSameDomain = isHostWithin(host, domain);
        const isAllowedExternal = ALLOWED_EXTERNAL_HOSTS.some((allowed) =>
          isHostWithin(host, allowed),
        );

        return isAsset || isNextAsset || isSameDomain || isAllowedExternal;
      },
      filenameGenerator: "bySiteStructure",
      subdirectories: [
        {
          directory: "img",
          extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"],
        },
        { directory: "js", extensions: [".js"] },
        { directory: "css", extensions: [".css"] },
        {
          directory: "fonts",
          extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"],
        },
        { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] },
      ],
    });
    console.log("✅ Website cloned successfully!");
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}

// Surface anything thrown outside the scrape try/catch (e.g. a failed directory removal).
cloneWebsite().catch((error: unknown) => {
  console.error("❌ Error cloning website:", error);
  process.exit(1);
});