feat(cloner): remove cloner from mintel.me and use registry versions for @mintel/pdf and @mintel/cloner
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 10s
Build & Deploy / 🧪 QA (push) Failing after 16s
Build & Deploy / 🏗️ Build (push) Failing after 19s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 1s

This commit is contained in:
2026-02-12 22:00:36 +01:00
parent 0fed92ca8c
commit 99e392ce08
7 changed files with 5 additions and 1002 deletions

View File

@@ -1,436 +0,0 @@
import { chromium } from "playwright";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
import axios from "axios";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Desktop Chrome UA sent with every request so sites serve their normal markup.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
/**
 * Make a URL-derived path safe to store on disk: within each "/"-separated
 * segment, every character outside [a-z0-9._-] (case-insensitive) becomes
 * "_". Slashes are preserved so the directory structure survives.
 */
function sanitizePath(rawPath: string) {
  const segments = rawPath.split("/");
  const cleaned = segments.map((segment) =>
    segment.replace(/[^a-z0-9._-]/gi, "_"),
  );
  return cleaned.join("/");
}
/**
 * Download one remote asset into assetsDir, mirroring host+path on disk.
 * Returns the "./assets/<relPath>" reference to embed in HTML/CSS, or null
 * when the URL is unsupported or the request fails (caller then keeps the
 * original URL).
 */
async function downloadFile(url: string, assetsDir: string) {
  // Protocol-relative URLs get https; anything else non-http(s) is skipped.
  if (url.startsWith("//")) url = `https:${url}`;
  if (!url.startsWith("http")) return null;
  try {
    const parsed = new URL(url);
    // Create a collision-resistant local path
    const relPath = sanitizePath(parsed.hostname + parsed.pathname);
    const dest = path.join(assetsDir, relPath);
    // Already mirrored earlier in this run — reuse it.
    if (fs.existsSync(dest)) return `./assets/${relPath}`;
    const res = await axios.get(url, {
      responseType: "arraybuffer",
      headers: { "User-Agent": USER_AGENT },
      timeout: 15000,
      validateStatus: () => true, // never throw on HTTP status; checked below
    });
    if (res.status !== 200) return null;
    const destDir = path.dirname(dest);
    if (!fs.existsSync(destDir)) {
      fs.mkdirSync(destDir, { recursive: true });
    }
    fs.writeFileSync(dest, Buffer.from(res.data));
    return `./assets/${relPath}`;
  } catch {
    return null; // Fail silently, proceed with original URL
  }
}
async function processCssRecursively(
cssContent: string,
cssUrl: string,
assetsDir: string,
urlMap: Record<string, string>,
depth = 0,
) {
if (depth > 5) return cssContent;
// Capture both standard url(...) and @import url(...)
const urlRegex = /(?:url\(["']?|@import\s+["'])([^"'\)]+)["']?\)?/gi;
let match;
let newContent = cssContent;
while ((match = urlRegex.exec(cssContent)) !== null) {
const originalUrl = match[1];
if (originalUrl.startsWith("data:") || originalUrl.startsWith("blob:"))
continue;
try {
const absUrl = new URL(originalUrl, cssUrl).href;
const local = await downloadFile(absUrl, assetsDir);
if (local) {
// Calculate relative path from CSS file to Asset
const u = new URL(cssUrl);
const cssPath = u.hostname + u.pathname;
const assetPath = new URL(absUrl).hostname + new URL(absUrl).pathname;
// We need to route from the folder containing the CSS to the asset
const rel = path.relative(
path.dirname(sanitizePath(cssPath)),
sanitizePath(assetPath),
);
// Replace strictly the URL part
newContent = newContent.split(originalUrl).join(rel);
urlMap[absUrl] = local;
}
} catch {
// Ignore URL resolution errors
}
}
return newContent;
}
/**
 * CLI entry point: clone the single page given as argv[2] into a static,
 * self-contained mirror under public/showcase/<domain>/.
 *
 * Pipeline: load with Playwright while intercepting responses, scroll the
 * whole page to trigger lazy loaders, force-hydrate lazy attributes in the
 * DOM, snapshot the HTML, download every discovered asset, then rewrite the
 * snapshot so all references are local and inject stabilization CSS.
 */
async function run() {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error("Usage: npm run clone-page <url>");
    process.exit(1);
  }
  const targetUrl = rawUrl.trim();
  const urlObj = new URL(targetUrl);
  // Setup Output Directories
  const domainSlug = urlObj.hostname.replace("www.", "");
  const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
  const assetsDir = path.join(domainDir, "assets");
  if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true });
  // Page slug from the URL path, e.g. /a/b -> "a-b"; the root page is "index".
  let pageSlug = urlObj.pathname.split("/").filter(Boolean).join("-");
  if (!pageSlug) pageSlug = "index";
  const htmlFilename = `${pageSlug}.html`;
  console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);
  const browser = await chromium.launch({ headless: true });
  // Start with a standard viewport, we will resize widely later
  const context = await browser.newContext({
    userAgent: USER_AGENT,
    viewport: { width: 1920, height: 1080 },
  });
  const page = await context.newPage();
  // urlMap: absolute remote URL -> "./assets/..." local reference.
  const urlMap: Record<string, string> = {};
  const foundAssets = new Set<string>();
  // 1. Live Network Interception
  page.on("response", (response) => {
    const url = response.url();
    if (response.status() === 200) {
      // Capture anything that looks like a static asset
      if (
        url.match(
          /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i,
        )
      ) {
        foundAssets.add(url);
      }
    }
  });
  try {
    console.log("🌐 Loading page (Waiting for Network Idle)...");
    await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });
    console.log(
      '🌊 Executing "Scroll Wave" to trigger all lazy loaders naturally...',
    );
    // Scroll down in 400px steps every 100ms until the page bottom is reached,
    // then jump back to the top; runs inside the page context.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 400;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Reset to top
            resolve(true);
          }
        }, 100);
      });
    });
    console.log(
      '📐 Expanding Viewport to "Giant Mode" for final asset capture...',
    );
    // Make the viewport taller than the document so everything is "in view"
    // for viewport-based lazy loaders.
    const fullHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
    // Final settlement wait
    await page.waitForTimeout(3000);
    console.log("💧 Final DOM Hydration & Sanitization...");
    await page.evaluate(() => {
      // A. Deterministic Attribute Hydration (Generic)
      // Scours every element for attributes that look like asset URLs and promotes them
      const assetPattern =
        /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
      document.querySelectorAll("*").forEach((el) => {
        // 0. Skip Meta/Head/Script/Style/SVG tags for attribute promotion
        if (
          ["META", "LINK", "HEAD", "SCRIPT", "STYLE", "SVG", "PATH"].includes(
            el.tagName,
          )
        )
          return;
        // 1. Force Visibility (Anti-Flicker)
        const htmlEl = el as HTMLElement;
        const style = window.getComputedStyle(htmlEl);
        if (style.opacity === "0" || style.visibility === "hidden") {
          htmlEl.style.setProperty("opacity", "1", "important");
          htmlEl.style.setProperty("visibility", "visible", "important");
        }
        // 2. Promote Data Attributes
        for (const attr of Array.from(el.attributes)) {
          const name = attr.name.toLowerCase();
          const val = attr.value;
          if (
            assetPattern.test(val) ||
            name.includes("src") ||
            name.includes("image")
          ) {
            // Standard Image/Video/Source promotion
            if (el.tagName === "IMG") {
              const img = el as HTMLImageElement;
              if (name.includes("srcset")) img.srcset = val;
              else if (!img.src || img.src.includes("data:")) img.src = val;
            }
            if (el.tagName === "SOURCE") {
              const source = el as HTMLSourceElement;
              if (name.includes("srcset")) source.srcset = val;
            }
            if (el.tagName === "VIDEO" || el.tagName === "AUDIO") {
              const media = el as HTMLMediaElement;
              if (!media.src) media.src = val;
            }
            // Background Image Promotion
            if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
              const bg = htmlEl.style.backgroundImage;
              if (!bg || bg === "none") {
                htmlEl.style.backgroundImage = `url('${val}')`;
              }
            }
          }
        }
      });
      // B. Ensure basic structural elements are visible post-scroll
      const body = document.body;
      if (body) {
        body.style.setProperty("opacity", "1", "important");
        body.style.setProperty("visibility", "visible", "important");
      }
    });
    console.log("⏳ Waiting for network idle...");
    await page.waitForLoadState("networkidle");
    // 1.5 FINAL SETTLEMENT: Let any scroll-triggered JS finish
    await page.waitForTimeout(1000);
    // 2. Static Snapshot
    let content = await page.content();
    // 3. Post-Snapshot Asset Discovery (Regex)
    // Catches assets that never triggered a network request but exist in the markup
    const regexPatterns = [
      /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
      // Capture CSS url() inside style blocks
      /url\(["']?([^"'\)]+)["']?\)/gi,
    ];
    for (const pattern of regexPatterns) {
      let match;
      while ((match = pattern.exec(content)) !== null) {
        try {
          foundAssets.add(new URL(match[1], targetUrl).href);
        } catch {
          // Ignore invalid URLs in content
        }
      }
    }
    // Specific srcset parsing
    const srcsetRegex = /[a-z0-9-]+srcset=["']([^"']+)["']/gi;
    let match;
    while ((match = srcsetRegex.exec(content)) !== null) {
      match[1].split(",").forEach((rule) => {
        const parts = rule.trim().split(/\s+/);
        if (parts[0] && !parts[0].startsWith("data:")) {
          try {
            foundAssets.add(new URL(parts[0], targetUrl).href);
          } catch {
            // Ignore invalid srcset URLs
          }
        }
      });
    }
    console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);
    // 4. Download & Map
    for (const url of foundAssets) {
      const local = await downloadFile(url, assetsDir);
      if (local) {
        // Map both the exact URL and its query-stripped form to the local copy.
        urlMap[url] = local;
        const clean = url.split("?")[0];
        urlMap[clean] = local;
        // Handle CSS recursively
        if (clean.endsWith(".css")) {
          try {
            const { data } = await axios.get(url, {
              headers: { "User-Agent": USER_AGENT },
            });
            // Process CSS and save it
            const processedCss = await processCssRecursively(
              data,
              url,
              assetsDir,
              urlMap,
            );
            const relPath = sanitizePath(
              new URL(url).hostname + new URL(url).pathname,
            );
            fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
          } catch {
            // Ignore CSS fetch/process errors
          }
        }
      }
    }
    console.log("🛠️ Finalizing Static Mirror...");
    let finalContent = content;
    // A. Apply URL Map Replacements
    // Longer paths first to prevent partial replacement errors
    const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
    if (sortedUrls.length > 0) {
      // Regex-escape each mapped URL before joining them into one pattern.
      const escaped = sortedUrls.map((u) =>
        u.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"),
      );
      // Create a massive regex for single-pass replacement
      const masterRegex = new RegExp(`(${escaped.join("|")})`, "g");
      finalContent = finalContent.replace(
        masterRegex,
        (match) => urlMap[match] || match,
      );
    }
    // B. Global Root-Relative Path Cleanup
    // Catches things like /wp-content/ that weren't distinct assets or were missed
    const commonDirs = [
      "/wp-content/",
      "/wp-includes/",
      "/assets/",
      "/static/",
      "/images/",
    ];
    for (const dir of commonDirs) {
      const localDir = `./assets/${urlObj.hostname}${dir}`;
      finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`);
      finalContent = finalContent.split(`'${dir}`).join(`'${localDir}`);
      finalContent = finalContent.split(`(${dir}`).join(`(${localDir}`);
    }
    // C. Domain Nuke
    // Replace absolute links to the original domain with relative or #
    const domainPattern = new RegExp(
      `https?://(www\\.)?${urlObj.hostname.replace(/\./g, "\\.")}[^"']*`,
      "gi",
    );
    // We carefully only replace if it looks like a resource link, or neutralize if it's a navigation link
    // For simplicity and "solidness", we'll rely on the specific replacements above first.
    // This catch-all nuke ensures we don't leak requests.
    // Convert remaining absolute domain links to relative .
    finalContent = finalContent.replace(domainPattern, (match) => {
      // If we have a map for it, it should have been replaced.
      // If not, it's likely a navigation link or an uncaptured asset.
      // Safe fallback:
      return "./";
    });
    // D. Static Stability & Cleanup
    // Remove tracking/analytics/lazy-load scripts that ruin static stability.
    // NOTE(review): matching on "lazy"/"tracker" substrings is broad and may
    // strip unrelated inline scripts — verify against cloned output.
    finalContent = finalContent.replace(
      /<script\b[^>]*>([\s\S]*?)<\/script>/gi,
      (match, content) => {
        const lower = content.toLowerCase();
        if (
          lower.includes("google-analytics") ||
          lower.includes("gtag") ||
          lower.includes("fbq") ||
          lower.includes("lazy") ||
          lower.includes("tracker")
        ) {
          return "";
        }
        return match;
      },
    );
    // E. CSS Injections for Stability
    const headEnd = finalContent.indexOf("</head>");
    if (headEnd > -1) {
      const stabilityCss = `
<style>
/* UNIVERSAL CLONE STABILIZATION */
* {
transition: none !important;
animation: none !important;
scroll-behavior: auto !important;
}
[data-aos], .reveal, .lazypath, .lazy-load, [data-src] {
opacity: 1 !important;
visibility: visible !important;
transform: none !important;
clip-path: none !important;
}
img, video, iframe {
max-width: 100%;
display: block;
}
a {
pointer-events: none;
cursor: default;
}
</style>`;
      finalContent =
        finalContent.slice(0, headEnd) +
        stabilityCss +
        finalContent.slice(headEnd);
    }
    // Save
    const finalPath = path.join(domainDir, htmlFilename);
    fs.writeFileSync(finalPath, finalContent);
    console.log(`✅ SUCCESS: Cloned to ${finalPath}`);
  } catch (err) {
    console.error("❌ FATAL ERROR:", err);
  } finally {
    await browser.close();
  }
}
run();

View File

@@ -1,239 +0,0 @@
// @ts-ignore
import scrape from "website-scraper";
// @ts-ignore
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * CLI entry point: recursively clone argv[2] (up to depth 5) into
 * ../cloned-websites/<domain> (or argv[3]) using website-scraper rendered
 * through Puppeteer, then sanitize the saved HTML for static hosting.
 *
 * Fix: the external-asset filename template contained a corrupted
 * "$(unknown)" placeholder where the asset path interpolation belonged;
 * external assets are now saved under _external/<host>/<normalized path>.
 */
async function run() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error("Usage: npm run clone-website <URL> [output-dir]");
    process.exit(1);
  }
  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_");
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, "../cloned-websites", safeDomain);
  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }
  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);
  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    // Custom filename generation to avoid "https:/" folders
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
          ],
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false,
      }),
      // Progress logging for each saved/failed resource.
      new (class LoggerPlugin {
        apply(registerAction: any) {
          registerAction("onResourceSaved", ({ resource }: any) => {
            console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
          });
          registerAction("onResourceError", ({ resource, error }: any) => {
            console.error(` ❌ Error: ${resource.url} - ${error.message}`);
          });
        }
      })(),
      // Maps resource URLs to sane on-disk filenames.
      new (class FilenamePlugin {
        apply(registerAction: any) {
          registerAction("generateFilename", ({ resource }: any) => {
            const u = new URL(resource.url);
            let filename = u.pathname;
            // normalize
            if (filename.endsWith("/")) filename += "index.html";
            else if (!path.extname(filename) && resource.url.includes(domain))
              filename += "/index.html"; // Assume folder if internal link without ext
            // If it's an external asset, put it in a separate folder.
            // FIX: this template previously read `_external/${u.hostname}$(unknown)`
            // (corrupted interpolation); the normalized path is the intended tail.
            if (u.hostname !== domain) {
              filename = `_external/${u.hostname}${filename}`;
            }
            // Sanitize filename
            filename = filename
              .split("/")
              .map((part) => part.replace(/[^a-z0-9._-]/gi, "_"))
              .join("/");
            // Remove leading slash
            if (filename.startsWith("/")) filename = filename.substring(1);
            // Handle "Unnamed page" by checking if empty
            if (!filename || filename === "index.html")
              return { filename: "index.html" };
            return { filename };
          });
        }
      })(),
    ],
    // Keep the crawl on the target domain but allow assets from anywhere.
    urlFilter: (url: string) => {
      const u = new URL(url);
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts =
        u.hostname.includes("fonts.googleapis.com") ||
        u.hostname.includes("fonts.gstatic.com");
      // Allow assets from anywhere
      const isAsset =
        /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(
          u.pathname,
        );
      // Allow fonts/css from common CDNs if standard extension check fails
      const isCommonAsset =
        u.pathname.includes("/css/") ||
        u.pathname.includes("/js/") ||
        u.pathname.includes("/static/") ||
        u.pathname.includes("/assets/") ||
        u.pathname.includes("/uploads/");
      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
    sources: [
      { selector: "img", attr: "src" },
      { selector: "img", attr: "srcset" },
      { selector: "source", attr: "src" },
      { selector: "source", attr: "srcset" },
      { selector: 'link[rel="stylesheet"]', attr: "href" },
      { selector: 'link[rel="preload"]', attr: "href" },
      { selector: 'link[rel="prefetch"]', attr: "href" },
      { selector: "script", attr: "src" },
      { selector: "video", attr: "src" },
      { selector: "video", attr: "poster" },
      { selector: "iframe", attr: "src" },
      { selector: 'link[rel*="icon"]', attr: "href" },
      { selector: 'link[rel="manifest"]', attr: "href" },
      { selector: 'meta[property="og:image"]', attr: "content" },
    ],
    request: {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
      },
    },
  };
  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(
      `\n✅ Successfully cloned ${result.length} resources to ${outputDir}`,
    );
    // Post-processing: Sanitize HTML to remove Next.js hydration scripts
    // This prevents the static site from trying to "hydrate" and breaking images/links
    console.log("🧹 Sanitizing HTML files...");
    sanitizeHtmlFiles(outputDir);
    console.log(`open "${path.join(outputDir, "index.html")}"`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
/**
 * Recursively walk `dir` and rewrite every .html file in place so the clone
 * works as a purely static site: strips Next.js hydration scripts, expands
 * Breeze cache placeholders into real tags, injects fallback Google Fonts,
 * and forces a single-column layout on product listings.
 */
function sanitizeHtmlFiles(dir: string) {
  const files = fs.readdirSync(dir);
  for (const file of files) {
    const fullPath = path.join(dir, file);
    if (fs.statSync(fullPath).isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (file.endsWith(".html")) {
      let content = fs.readFileSync(fullPath, "utf8");
      // Remove Next.js data script
      content = content.replace(
        /<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi,
        "",
      );
      // Remove Next.js chunk scripts (hydration)
      // match <script src="..._next/static/chunks..." ...
      content = content.replace(
        /<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi,
        "",
      );
      content = content.replace(
        /<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi,
        "",
      );
      // Convert Breeze dynamic script/styles into actual tags if possible
      // match <div class="breeze-scripts-load" ...>URL</div>
      content = content.replace(
        /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
        (match, url) => {
          if (url.endsWith(".css"))
            return `<link rel="stylesheet" href="${url}">`;
          return `<script src="${url}"></script>`;
        },
      );
      // Inject Fonts (Fix for missing dynamic fonts)
      // We inject Inter and Montserrat as safe defaults for industrial/modern sites
      // Check specifically for a stylesheet link to google fonts
      const hasGoogleFontStylesheet =
        /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test(
          content,
        );
      if (!hasGoogleFontStylesheet) {
        const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap">`;
        const styleBlock = `<style>
:root { --main-font: 'Inter', sans-serif; --heading-font: 'Montserrat', sans-serif; }
body, .body-font, p, span, li, a { font-family: var(--main-font) !important; }
h1, h2, h3, h4, h5, h6, .title-font, .heading-font { font-family: var(--heading-font) !important; }
</style>`;
        // String .replace swaps only the FIRST </head> — fine for one document.
        content = content.replace("</head>", `${fontLink}${styleBlock}</head>`);
      }
      // Force column layout on product pages
      if (content.includes('class="products')) {
        const layoutScript = `
<script>
document.addEventListener('DOMContentLoaded', function() {
const products = document.querySelector('.products');
if (products) {
products.classList.remove(...Array.from(products.classList).filter(c => c.startsWith('columns-')));
products.classList.add('columns-1');
products.setAttribute('data-n-desktop-columns', '1');
}
});
</script>`;
        content = content.replace("</body>", `${layoutScript}</body>`);
      }
      fs.writeFileSync(fullPath, content);
    }
  }
}
run();

View File

@@ -1,130 +0,0 @@
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import * as fs from 'node:fs';
import { URL } from 'node:url';
import { execSync, execFileSync } from 'node:child_process';
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * The Ultimate Website Cloner
 * Uses Crawlee for discovery and single-file-cli for perfect page capture.
 *
 * Fixes:
 *  - crawled page URLs are untrusted input and were interpolated into an
 *    execSync shell string (command injection); single-file-cli is now
 *    invoked via execFileSync with an argument vector, so no shell parses
 *    the URL;
 *  - rewritten internal hrefs always use forward slashes (path.relative
 *    returns backslashes on Windows, which would break links).
 */
async function cloneWebsite() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }
  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
  const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);
  // Start from a clean output directory on every run.
  if (fs.existsSync(baseOutputDir)) {
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
  }
  fs.mkdirSync(baseOutputDir, { recursive: true });
  console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
  console.log(`📂 Output: ${baseOutputDir}`);
  const requestQueue = await RequestQueue.open();
  await requestQueue.addRequest({ url: targetUrl });
  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxRequestsPerCrawl: 100,
    maxConcurrency: 3, // SingleFile is resource intensive
    async requestHandler({ request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Capturing ${url}...`);
      // 1. Determine local path: mirror the URL path; extensionless paths
      // and the root become <path>/index.html.
      const u = new URL(url);
      let relPath = u.pathname;
      if (relPath === '/' || relPath === '') relPath = '/index.html';
      if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
      if (relPath.startsWith('/')) relPath = relPath.substring(1);
      const fullPath = path.join(baseOutputDir, relPath);
      fs.mkdirSync(path.dirname(fullPath), { recursive: true });
      // 2. Use single-file-cli for perfect capture.
      // execFileSync with an argv array: the crawled URL never touches a shell.
      try {
        execFileSync(
          'npx',
          [
            'single-file-cli',
            url,
            fullPath,
            '--browser-headless=true',
            '--browser-wait-until=networkidle0',
          ],
          { stdio: 'inherit' },
        );
      } catch (e) {
        log.error(`Failed to capture ${url} with SingleFile`);
      }
      // 3. Enqueue subpages (discovery); asset/binary links are filtered out.
      await enqueueLinks({
        strategy: 'same-domain',
        transformRequestFunction: (req) => {
          if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
          return req;
        }
      });
    },
  });
  await crawler.run();
  // 4. Post-processing: Rewrite links between the captured files
  console.log('🔗 Rewriting internal links for offline navigation...');
  const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
  for (const file of allFiles) {
    let content = fs.readFileSync(file, 'utf8');
    const fileRelToRoot = path.relative(baseOutputDir, file);
    // Simple but effective regex for internal links
    content = content.replace(/href="([^"]+)"/g, (match, href) => {
      if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
        try {
          const linkUrl = new URL(href, urlObj.href);
          if (linkUrl.hostname === domain) {
            let linkPath = linkUrl.pathname;
            if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
            if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
            if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);
            // hrefs are URLs, not OS paths — force forward slashes.
            const relativeLink = path
              .relative(path.dirname(fileRelToRoot), linkPath)
              .split(path.sep)
              .join('/');
            return `href="${relativeLink}"`;
          }
        } catch (e) {
          // Unparseable href — leave the link untouched.
        }
      }
      return match;
    });
    fs.writeFileSync(file, content);
  }
  console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}
/**
 * Depth-first directory walk: collect the absolute path of every regular
 * file under `dir` into `fileList` (mutated and returned).
 */
function getFiles(dir: string, fileList: string[] = []) {
  for (const entryName of fs.readdirSync(dir)) {
    const entryPath = path.join(dir, entryName);
    if (fs.statSync(entryPath).isDirectory()) {
      getFiles(entryPath, fileList);
    } else {
      fileList.push(entryPath);
    }
  }
  return fileList;
}
// Entry point: exit non-zero on any unhandled error so scripted runs fail loudly.
cloneWebsite().catch(err => {
  console.error('❌ Fatal error:', err);
  process.exit(1);
});

View File

@@ -1,187 +0,0 @@
import scrape from "website-scraper";
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "path";
import { fileURLToPath } from "url";
import fs from "fs";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Custom website-scraper plugin: registers extra lazy-load asset sources and
// generates Next.js- and macOS-safe filenames.
class PortfolioPlugin {
  apply(registerAction: any) {
    // 1. Register additional asset sources before the crawl starts.
    registerAction("beforeStart", ({ options }: any) => {
      if (!options.sources) options.sources = [];
      const extraSources = [
        { selector: "img", attr: "data-nimg" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: "video", attr: "poster" },
        { selector: "source", attr: "data-srcset" },
        { selector: '[style*="background-image"]', attr: "style" },
        { selector: 'link[as="font"]', attr: "href" },
        { selector: 'link[as="image"]', attr: "href" },
        { selector: 'link[as="style"]', attr: "href" },
        { selector: 'link[as="script"]', attr: "href" },
      ];
      for (const extra of extraSources) options.sources.push(extra);
    });
    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction("generateFilename", ({ resource, filename }: any) => {
      const resourceUrl = resource.getUrl();
      let generated = filename;
      // Next.js image-optimizer URLs (/_next/image?url=...&w=...) are mapped
      // to a readable "<name>-<width><ext>" file under _next/optimized/.
      if (resourceUrl.includes("/_next/image")) {
        try {
          const params = new URL(resourceUrl).searchParams;
          const sourceUrl = params.get("url");
          if (sourceUrl) {
            const cleanPath = sourceUrl.split("?")[0];
            const ext = path.extname(cleanPath) || ".webp";
            const baseName = path.basename(cleanPath, ext);
            const width = params.get("w") || "auto";
            generated = `_next/optimized/${baseName}-${width}${ext}`;
          }
        } catch (e) {
          // Ignore invalid optimized image URLs
        }
      }
      // CRITICAL MAC FIX: a path segment ending in ".app" would be treated as
      // an Application Bundle by Finder, hiding the folder — rename to "-app".
      // Splitting on "/" ensures only whole segment suffixes are rewritten.
      const safeSegments = generated
        .split("/")
        .map((segment: string) =>
          segment.endsWith(".app") ? segment.replace(/\.app$/, "-app") : segment,
        );
      return { filename: safeSegments.join("/") };
    });
  }
}
/**
 * CLI entry point: recursively clone argv[2] (depth <= 5) into
 * ../cloned-websites/<name> (argv[3] or the dash-joined domain) using
 * website-scraper rendered through Puppeteer, with PortfolioPlugin handling
 * Next.js image URLs and macOS-unsafe ".app" path segments.
 */
async function cloneWebsite() {
  const url = process.argv[2];
  if (!url) {
    console.error("Please provide a URL as an argument.");
    process.exit(1);
  }
  const domain = new URL(url).hostname;
  let outputDirName = process.argv[3] || domain.replace(/\./g, "-");
  // Sanitize top-level folder name for Mac
  if (outputDirName.endsWith(".app")) {
    outputDirName = outputDirName.replace(/\.app$/, "-app");
  }
  const outputDir = path.resolve(
    __dirname,
    "../cloned-websites",
    outputDirName,
  );
  // Start from a clean output directory on every run.
  if (fs.existsSync(outputDir)) {
    fs.rmSync(outputDir, { recursive: true, force: true });
  }
  console.log(`Cloning ${url} to ${outputDir}...`);
  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        // Render pages in headless Chrome and scroll to trigger lazy loading.
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ["--no-sandbox"] },
          gotoOptions: { waitUntil: "networkidle0", timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      // Every attribute the scraper should treat as a downloadable reference.
      sources: [
        { selector: "img", attr: "src" },
        { selector: "img", attr: "srcset" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: 'link[rel="stylesheet"]', attr: "href" },
        { selector: 'link[rel*="icon"]', attr: "href" },
        { selector: "script", attr: "src" },
        { selector: 'link[rel="preload"]', attr: "href" },
        { selector: 'link[rel="prefetch"]', attr: "href" },
        { selector: 'link[rel="modulepreload"]', attr: "href" },
        { selector: 'link[rel="apple-touch-icon"]', attr: "href" },
        { selector: 'link[rel="mask-icon"]', attr: "href" },
        { selector: "source", attr: "src" },
        { selector: "source", attr: "srcset" },
        { selector: "video", attr: "src" },
        { selector: "video", attr: "poster" },
        { selector: "audio", attr: "src" },
        { selector: "iframe", attr: "src" },
        { selector: 'meta[property="og:image"]', attr: "content" },
        { selector: 'meta[name="twitter:image"]', attr: "content" },
        { selector: "[style]", attr: "style" },
      ],
      // Allow same-domain pages plus assets/CDNs; block data:/mailto:/tel:.
      urlFilter: (link: string) => {
        const isAsset =
          /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i.test(
            link,
          );
        const isNextAsset = link.includes("/_next/");
        const isSameDomain =
          link.startsWith(url) ||
          link.startsWith("/") ||
          !link.includes("://") ||
          link.includes(domain);
        const isGoogleTagManager = link.includes("googletagmanager.com");
        const isAnalytics = link.includes("analytics.mintel.me");
        const isVercelApp = link.includes("vercel.app");
        const isDataUrl = link.startsWith("data:");
        const isMailto = link.startsWith("mailto:");
        const isTel = link.startsWith("tel:");
        return (
          (isAsset ||
            isNextAsset ||
            isSameDomain ||
            isGoogleTagManager ||
            isAnalytics ||
            isVercelApp) &&
          !isDataUrl &&
          !isMailto &&
          !isTel
        );
      },
      filenameGenerator: "bySiteStructure",
      // Group downloads by media type on disk.
      subdirectories: [
        {
          directory: "img",
          extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"],
        },
        { directory: "js", extensions: [".js"] },
        { directory: "css", extensions: [".css"] },
        {
          directory: "fonts",
          extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"],
        },
        { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] },
      ],
    });
    console.log("✅ Website cloned successfully!");
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
cloneWebsite();