diff --git a/apps/web/next.config.mjs b/apps/web/next.config.mjs index 69321dd..ddd3a00 100644 --- a/apps/web/next.config.mjs +++ b/apps/web/next.config.mjs @@ -4,7 +4,6 @@ import withMintelConfig from "@mintel/next-config"; const nextConfig = { reactStrictMode: true, output: 'standalone', - transpilePackages: ["@mintel/pdf"], async rewrites() { const umamiUrl = process.env.UMAMI_API_ENDPOINT || diff --git a/apps/web/package.json b/apps/web/package.json index 7d9f368..ea16cea 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -11,8 +11,6 @@ "test": "npm run test:links", "test:links": "tsx ./scripts/test-links.ts", "test:file-examples": "tsx ./scripts/test-file-examples-comprehensive.ts", - "clone-website": "tsx ./scripts/clone-recursive.ts", - "clone-page": "tsx ./scripts/clone-page.ts", "generate-estimate": "tsx ./scripts/generate-estimate.ts", "ai-estimate": "tsx ./scripts/ai-estimate.ts", "video:preview": "remotion preview video/index.ts", @@ -31,7 +29,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@mintel/pdf": "link:../../../at-mintel/packages/pdf-library", + "@mintel/pdf": "^1.8.0", "@react-pdf/renderer": "^4.3.2", "@remotion/bundler": "^4.0.414", "@remotion/cli": "^4.0.414", @@ -48,15 +46,12 @@ "axios": "^1.13.4", "canvas-confetti": "^1.9.4", "clsx": "^2.1.1", - "crawlee": "^3.15.3", "framer-motion": "^12.29.2", "ioredis": "^5.9.1", "lucide-react": "^0.468.0", "mermaid": "^11.12.2", "next": "^16.1.6", - "playwright": "^1.58.1", "prismjs": "^1.30.0", - "puppeteer": "^24.36.1", "qrcode": "^1.5.4", "react": "^19.2.3", "react-dom": "^19.2.3", @@ -64,8 +59,6 @@ "shiki": "^1.24.2", "tailwind-merge": "^3.4.0", "tailwindcss": "^3.4.0", - "website-scraper": "^6.0.0", - "website-scraper-puppeteer": "^2.0.0", "zod": "3.22.3", "@directus/sdk": "21.0.0", "@opentelemetry/api": "^1.9.0", @@ -99,4 +92,4 @@ "typescript": "5.9.3", "typescript-eslint": "^8.54.0" } -} +} \ No newline at end of file diff --git a/apps/web/scripts/clone-page.ts b/apps/web/scripts/clone-page.ts deleted file mode 100644 index 745fb6b..0000000 --- a/apps/web/scripts/clone-page.ts +++ /dev/null @@ -1,436 +0,0 @@ -import { chromium } from "playwright"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; -import fs from "node:fs"; -import axios from "axios"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const USER_AGENT = - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"; - -function sanitizePath(rawPath: string) { - return rawPath - .split("/") - .map((p) => p.replace(/[^a-z0-9._-]/gi, "_")) - .join("/"); -} - -async function downloadFile(url: string, assetsDir: string) { - if (url.startsWith("//")) url = `https:${url}`; - if (!url.startsWith("http")) return null; - - try { - const u = new URL(url); - // Create a collision-resistant local path - const relPath = sanitizePath(u.hostname + u.pathname); - const dest = path.join(assetsDir, relPath); - - if (fs.existsSync(dest)) return `./assets/${relPath}`; - - const res = await axios.get(url, { - responseType: "arraybuffer", - headers: { "User-Agent": USER_AGENT }, - timeout: 15000, - validateStatus: () => true, - }); - - if (res.status !== 200) return null; - - if (!fs.existsSync(path.dirname(dest))) - fs.mkdirSync(path.dirname(dest), { recursive: true }); - fs.writeFileSync(dest, Buffer.from(res.data)); - return `./assets/${relPath}`; - } catch { - return null; // Fail silently, proceed with original URL - } -} - -async function processCssRecursively( - cssContent: string, - cssUrl: string, - assetsDir: string, - urlMap: Record, - depth = 0, -) { - if (depth > 5) return cssContent; - - // Capture both standard url(...) and @import url(...) - const urlRegex = /(?:url\(["']?|@import\s+["'])([^"'\)]+)["']?\)?/gi; - let match; - let newContent = cssContent; - - while ((match = urlRegex.exec(cssContent)) !== null) { - const originalUrl = match[1]; - if (originalUrl.startsWith("data:") || originalUrl.startsWith("blob:")) - continue; - - try { - const absUrl = new URL(originalUrl, cssUrl).href; - const local = await downloadFile(absUrl, assetsDir); - - if (local) { - // Calculate relative path from CSS file to Asset - const u = new URL(cssUrl); - const cssPath = u.hostname + u.pathname; - const assetPath = new URL(absUrl).hostname + new URL(absUrl).pathname; - - // We need to route from the folder containing the CSS to the asset - const rel = path.relative( - path.dirname(sanitizePath(cssPath)), - sanitizePath(assetPath), - ); - - // Replace strictly the URL part - newContent = newContent.split(originalUrl).join(rel); - urlMap[absUrl] = local; - } - } catch { - // Ignore URL resolution errors - } - } - return newContent; -} - -async function run() { - const rawUrl = process.argv[2]; - if (!rawUrl) { - console.error("Usage: npm run clone-page "); - process.exit(1); - } - const targetUrl = rawUrl.trim(); - const urlObj = new URL(targetUrl); - - // Setup Output Directories - const domainSlug = urlObj.hostname.replace("www.", ""); - const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`); - const assetsDir = path.join(domainDir, "assets"); - if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true }); - - let pageSlug = urlObj.pathname.split("/").filter(Boolean).join("-"); - if (!pageSlug) pageSlug = "index"; - const htmlFilename = `${pageSlug}.html`; - - console.log(`๐Ÿš€ INDUSTRIAL CLONE: ${targetUrl}`); - - const browser = await chromium.launch({ headless: true }); - // Start with a standard viewport, we will resize widely later - const context = await browser.newContext({ - userAgent: USER_AGENT, - viewport: { width: 1920, height: 1080 }, - }); - const page = await context.newPage(); - - const urlMap: Record = {}; - const foundAssets = new Set(); - - // 1. Live Network Interception - page.on("response", (response) => { - const url = response.url(); - if (response.status() === 200) { - // Capture anything that looks like a static asset - if ( - url.match( - /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i, - ) - ) { - foundAssets.add(url); - } - } - }); - - try { - console.log("๐ŸŒ Loading page (Waiting for Network Idle)..."); - await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 }); - - console.log( - '๐ŸŒŠ Executing "Scroll Wave" to trigger all lazy loaders naturally...', - ); - await page.evaluate(async () => { - await new Promise((resolve) => { - let totalHeight = 0; - const distance = 400; - const timer = setInterval(() => { - const scrollHeight = document.body.scrollHeight; - window.scrollBy(0, distance); - totalHeight += distance; - - if (totalHeight >= scrollHeight) { - clearInterval(timer); - window.scrollTo(0, 0); // Reset to top - resolve(true); - } - }, 100); - }); - }); - - console.log( - '๐Ÿ“ Expanding Viewport to "Giant Mode" for final asset capture...', - ); - const fullHeight = await page.evaluate(() => document.body.scrollHeight); - await page.setViewportSize({ width: 1920, height: fullHeight + 1000 }); - - // Final settlement wait - await page.waitForTimeout(3000); - - console.log("๐Ÿ’ง Final DOM Hydration & Sanitization..."); - await page.evaluate(() => { - // A. Deterministic Attribute Hydration (Generic) - // Scours every element for attributes that look like asset URLs and promotes them - const assetPattern = - /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i; - - document.querySelectorAll("*").forEach((el) => { - // 0. Skip Meta/Head/Script/Style/SVG tags for attribute promotion - if ( - ["META", "LINK", "HEAD", "SCRIPT", "STYLE", "SVG", "PATH"].includes( - el.tagName, - ) - ) - return; - - // 1. Force Visibility (Anti-Flicker) - const htmlEl = el as HTMLElement; - const style = window.getComputedStyle(htmlEl); - if (style.opacity === "0" || style.visibility === "hidden") { - htmlEl.style.setProperty("opacity", "1", "important"); - htmlEl.style.setProperty("visibility", "visible", "important"); - } - - // 2. Promote Data Attributes - for (const attr of Array.from(el.attributes)) { - const name = attr.name.toLowerCase(); - const val = attr.value; - - if ( - assetPattern.test(val) || - name.includes("src") || - name.includes("image") - ) { - // Standard Image/Video/Source promotion - if (el.tagName === "IMG") { - const img = el as HTMLImageElement; - if (name.includes("srcset")) img.srcset = val; - else if (!img.src || img.src.includes("data:")) img.src = val; - } - if (el.tagName === "SOURCE") { - const source = el as HTMLSourceElement; - if (name.includes("srcset")) source.srcset = val; - } - if (el.tagName === "VIDEO" || el.tagName === "AUDIO") { - const media = el as HTMLMediaElement; - if (!media.src) media.src = val; - } - - // Background Image Promotion - if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) { - const bg = htmlEl.style.backgroundImage; - if (!bg || bg === "none") { - htmlEl.style.backgroundImage = `url('${val}')`; - } - } - } - } - }); - - // B. Ensure basic structural elements are visible post-scroll - const body = document.body; - if (body) { - body.style.setProperty("opacity", "1", "important"); - body.style.setProperty("visibility", "visible", "important"); - } - }); - - console.log("โณ Waiting for network idle..."); - await page.waitForLoadState("networkidle"); - - // 1.5 FINAL SETTLEMENT: Let any scroll-triggered JS finish - await page.waitForTimeout(1000); - - // 2. Static Snapshot - let content = await page.content(); - - // 3. Post-Snapshot Asset Discovery (Regex) - // Catches assets that never triggered a network request but exist in the markup - const regexPatterns = [ - /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi, - // Capture CSS url() inside style blocks - /url\(["']?([^"'\)]+)["']?\)/gi, - ]; - - for (const pattern of regexPatterns) { - let match; - while ((match = pattern.exec(content)) !== null) { - try { - foundAssets.add(new URL(match[1], targetUrl).href); - } catch { - // Ignore invalid URLs in content - } - } - } - - // Specific srcset parsing - const srcsetRegex = /[a-z0-9-]+srcset=["']([^"']+)["']/gi; - let match; - while ((match = srcsetRegex.exec(content)) !== null) { - match[1].split(",").forEach((rule) => { - const parts = rule.trim().split(/\s+/); - if (parts[0] && !parts[0].startsWith("data:")) { - try { - foundAssets.add(new URL(parts[0], targetUrl).href); - } catch { - // Ignore invalid srcset URLs - } - } - }); - } - - console.log(`๐Ÿ” Processing ${foundAssets.size} discovered assets...`); - - // 4. Download & Map - for (const url of foundAssets) { - const local = await downloadFile(url, assetsDir); - if (local) { - urlMap[url] = local; - const clean = url.split("?")[0]; - urlMap[clean] = local; - - // Handle CSS recursively - if (clean.endsWith(".css")) { - try { - const { data } = await axios.get(url, { - headers: { "User-Agent": USER_AGENT }, - }); - // Process CSS and save it - const processedCss = await processCssRecursively( - data, - url, - assetsDir, - urlMap, - ); - const relPath = sanitizePath( - new URL(url).hostname + new URL(url).pathname, - ); - fs.writeFileSync(path.join(assetsDir, relPath), processedCss); - } catch { - // Ignore CSS fetch/process errors - } - } - } - } - - console.log("๐Ÿ› ๏ธ Finalizing Static Mirror..."); - let finalContent = content; - - // A. Apply URL Map Replacements - // Longer paths first to prevent partial replacement errors - const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length); - if (sortedUrls.length > 0) { - const escaped = sortedUrls.map((u) => - u.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), - ); - // Create a massive regex for single-pass replacement - const masterRegex = new RegExp(`(${escaped.join("|")})`, "g"); - finalContent = finalContent.replace( - masterRegex, - (match) => urlMap[match] || match, - ); - } - - // B. Global Root-Relative Path Cleanup - // Catches things like /wp-content/ that weren't distinct assets or were missed - const commonDirs = [ - "/wp-content/", - "/wp-includes/", - "/assets/", - "/static/", - "/images/", - ]; - for (const dir of commonDirs) { - const localDir = `./assets/${urlObj.hostname}${dir}`; - finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`); - finalContent = finalContent.split(`'${dir}`).join(`'${localDir}`); - finalContent = finalContent.split(`(${dir}`).join(`(${localDir}`); - } - - // C. Domain Nuke - // Replace absolute links to the original domain with relative or # - const domainPattern = new RegExp( - `https?://(www\\.)?${urlObj.hostname.replace(/\./g, "\\.")}[^"']*`, - "gi", - ); - // We carefully only replace if it looks like a resource link, or neutralize if it's a navigation link - // For simplicity and "solidness", we'll rely on the specific replacements above first. - // This catch-all nuke ensures we don't leak requests. - // Convert remaining absolute domain links to relative . - finalContent = finalContent.replace(domainPattern, (match) => { - // If we have a map for it, it should have been replaced. - // If not, it's likely a navigation link or an uncaptured asset. - // Safe fallback: - return "./"; - }); - - // D. Static Stability & Cleanup - // Remove tracking/analytics/lazy-load scripts that ruins stability - finalContent = finalContent.replace( - /]*>([\s\S]*?)<\/script>/gi, - (match, content) => { - const lower = content.toLowerCase(); - if ( - lower.includes("google-analytics") || - lower.includes("gtag") || - lower.includes("fbq") || - lower.includes("lazy") || - lower.includes("tracker") - ) { - return ""; - } - return match; - }, - ); - - // E. CSS Injections for Stability - const headEnd = finalContent.indexOf(""); - if (headEnd > -1) { - const stabilityCss = ` - `; - finalContent = - finalContent.slice(0, headEnd) + - stabilityCss + - finalContent.slice(headEnd); - } - - // Save - const finalPath = path.join(domainDir, htmlFilename); - fs.writeFileSync(finalPath, finalContent); - console.log(`โœ… SUCCESS: Cloned to ${finalPath}`); - } catch (err) { - console.error("โŒ FATAL ERROR:", err); - } finally { - await browser.close(); - } -} - -run(); diff --git a/apps/web/scripts/clone-recursive.ts b/apps/web/scripts/clone-recursive.ts deleted file mode 100644 index 0409ceb..0000000 --- a/apps/web/scripts/clone-recursive.ts +++ /dev/null @@ -1,239 +0,0 @@ -// @ts-ignore -import scrape from "website-scraper"; -// @ts-ignore -import PuppeteerPlugin from "website-scraper-puppeteer"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; -import fs from "node:fs"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -async function run() { - const targetUrl = process.argv[2]; - if (!targetUrl) { - console.error("Usage: npm run clone-website [output-dir]"); - process.exit(1); - } - - const urlObj = new URL(targetUrl); - const domain = urlObj.hostname; - const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_"); - const outputDir = process.argv[3] - ? path.resolve(process.cwd(), process.argv[3]) - : path.resolve(__dirname, "../cloned-websites", safeDomain); - - if (fs.existsSync(outputDir)) { - console.log(`Cleaning existing directory: ${outputDir}`); - fs.rmSync(outputDir, { recursive: true, force: true }); - } - - console.log(`๐Ÿš€ Starting recursive clone of ${targetUrl}`); - console.log(`๐Ÿ“‚ Output: ${outputDir}`); - - const options = { - urls: [targetUrl], - directory: outputDir, - recursive: true, - maxDepth: 5, - // Custom filename generation to avoid "https:/" folders - plugins: [ - new PuppeteerPlugin({ - launchOptions: { - headless: true, - args: [ - "--no-sandbox", - "--disable-setuid-sandbox", - "--disable-dev-shm-usage", - ], - }, - scrollToBottom: { timeout: 10000, viewportN: 10 }, - blockNavigation: false, - }), - new (class LoggerPlugin { - apply(registerAction: any) { - registerAction("onResourceSaved", ({ resource }: any) => { - console.log(` ๐Ÿ’พ Saved: ${resource.url} -> ${resource.filename}`); - }); - registerAction("onResourceError", ({ resource, error }: any) => { - console.error(` โŒ Error: ${resource.url} - ${error.message}`); - }); - } - })(), - new (class FilenamePlugin { - apply(registerAction: any) { - registerAction("generateFilename", ({ resource }: any) => { - const u = new URL(resource.url); - let filename = u.pathname; - - // normalize - if (filename.endsWith("/")) filename += "index.html"; - else if (!path.extname(filename) && resource.url.includes(domain)) - filename += "/index.html"; // Assume folder if internal link without ext - - // If it's an external asset, put it in a separate folder - if (u.hostname !== domain) { - filename = `_external/${u.hostname}${filename}`; - } - - // Sanitize filename - filename = filename - .split("/") - .map((part) => part.replace(/[^a-z0-9._-]/gi, "_")) - .join("/"); - - // Remove leading slash - if (filename.startsWith("/")) filename = filename.substring(1); - - // Handle "Unnamed page" by checking if empty - if (!filename || filename === "index.html") - return { filename: "index.html" }; - - return { filename }; - }); - } - })(), - ], - - urlFilter: (url: string) => { - const u = new URL(url); - const isTargetDomain = u.hostname === domain; - const isGoogleFonts = - u.hostname.includes("fonts.googleapis.com") || - u.hostname.includes("fonts.gstatic.com"); - // Allow assets from anywhere - const isAsset = - /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test( - u.pathname, - ); - // Allow fonts/css from common CDNs if standard extension check fails - const isCommonAsset = - u.pathname.includes("/css/") || - u.pathname.includes("/js/") || - u.pathname.includes("/static/") || - u.pathname.includes("/assets/") || - u.pathname.includes("/uploads/"); - - return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts; - }, - - sources: [ - { selector: "img", attr: "src" }, - { selector: "img", attr: "srcset" }, - { selector: "source", attr: "src" }, - { selector: "source", attr: "srcset" }, - { selector: 'link[rel="stylesheet"]', attr: "href" }, - { selector: 'link[rel="preload"]', attr: "href" }, - { selector: 'link[rel="prefetch"]', attr: "href" }, - { selector: "script", attr: "src" }, - { selector: "video", attr: "src" }, - { selector: "video", attr: "poster" }, - { selector: "iframe", attr: "src" }, - { selector: 'link[rel*="icon"]', attr: "href" }, - { selector: 'link[rel="manifest"]', attr: "href" }, - { selector: 'meta[property="og:image"]', attr: "content" }, - ], - - request: { - headers: { - "User-Agent": - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - }, - }, - }; - - try { - // @ts-ignore - const result = await scrape(options); - console.log( - `\nโœ… Successfully cloned ${result.length} resources to ${outputDir}`, - ); - - // Post-processing: Sanitize HTML to remove Next.js hydration scripts - // This prevents the static site from trying to "hydrate" and breaking images/links - console.log("๐Ÿงน Sanitizing HTML files..."); - sanitizeHtmlFiles(outputDir); - - console.log(`open "${path.join(outputDir, "index.html")}"`); - } catch (error) { - console.error("โŒ Error cloning website:", error); - process.exit(1); - } -} - -function sanitizeHtmlFiles(dir: string) { - const files = fs.readdirSync(dir); - for (const file of files) { - const fullPath = path.join(dir, file); - if (fs.statSync(fullPath).isDirectory()) { - sanitizeHtmlFiles(fullPath); - } else if (file.endsWith(".html")) { - let content = fs.readFileSync(fullPath, "utf8"); - - // Remove Next.js data script - content = content.replace( - /`; - }, - ); - - // Inject Fonts (Fix for missing dynamic fonts) - // We inject Inter and Montserrat as safe defaults for industrial/modern sites - // Check specifically for a stylesheet link to google fonts - const hasGoogleFontStylesheet = - /]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test( - content, - ); - if (!hasGoogleFontStylesheet) { - const fontLink = ``; - const styleBlock = ``; - content = content.replace("", `${fontLink}${styleBlock}`); - } - - // Force column layout on product pages - if (content.includes('class="products')) { - const layoutScript = ` - `; - content = content.replace("", `${layoutScript}`); - } - - fs.writeFileSync(fullPath, content); - } - } -} - -run(); diff --git a/apps/web/scripts/clone-website-crawlee.ts b/apps/web/scripts/clone-website-crawlee.ts deleted file mode 100644 index 39e373c..0000000 --- a/apps/web/scripts/clone-website-crawlee.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { PlaywrightCrawler, RequestQueue } from 'crawlee'; -import * as path from 'node:path'; -import { fileURLToPath } from 'node:url'; -import * as fs from 'node:fs'; -import { URL } from 'node:url'; -import { execSync } from 'node:child_process'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -/** - * The Ultimate Website Cloner - * Uses Crawlee for discovery and single-file-cli for perfect page capture. - */ -async function cloneWebsite() { - const targetUrl = process.argv[2]; - if (!targetUrl) { - console.error('Please provide a URL as an argument.'); - process.exit(1); - } - - const urlObj = new URL(targetUrl); - const domain = urlObj.hostname; - const outputDirName = process.argv[3] || domain.replace(/\./g, '-'); - const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName); - - if (fs.existsSync(baseOutputDir)) { - fs.rmSync(baseOutputDir, { recursive: true, force: true }); - } - fs.mkdirSync(baseOutputDir, { recursive: true }); - - console.log(`๐Ÿš€ Starting perfect recursive clone of ${targetUrl}...`); - console.log(`๐Ÿ“‚ Output: ${baseOutputDir}`); - - const requestQueue = await RequestQueue.open(); - await requestQueue.addRequest({ url: targetUrl }); - - const crawler = new PlaywrightCrawler({ - requestQueue, - maxRequestsPerCrawl: 100, - maxConcurrency: 3, // SingleFile is resource intensive - - async requestHandler({ request, enqueueLinks, log }) { - const url = request.url; - log.info(`Capturing ${url}...`); - - // 1. Determine local path - const u = new URL(url); - let relPath = u.pathname; - if (relPath === '/' || relPath === '') relPath = '/index.html'; - if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html'; - if (relPath.startsWith('/')) relPath = relPath.substring(1); - - const fullPath = path.join(baseOutputDir, relPath); - fs.mkdirSync(path.dirname(fullPath), { recursive: true }); - - // 2. Use single-file-cli for perfect capture - // We use --back-links-rewrite=false because we handle link rewriting ourselves for better control - try { - execSync(`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`, { - stdio: 'inherit' - }); - } catch (e) { - log.error(`Failed to capture ${url} with SingleFile`); - } - - // 3. Enqueue subpages (discovery) - // We use a separate lightweight crawl for link discovery - await enqueueLinks({ - strategy: 'same-domain', - transformRequestFunction: (req) => { - if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false; - return req; - } - }); - }, - }); - - await crawler.run(); - - // 4. Post-processing: Rewrite links between the captured files - console.log('๐Ÿ”— Rewriting internal links for offline navigation...'); - const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html')); - - for (const file of allFiles) { - let content = fs.readFileSync(file, 'utf8'); - const fileRelToRoot = path.relative(baseOutputDir, file); - - // Simple but effective regex for internal links - content = content.replace(/href="([^"]+)"/g, (match, href) => { - if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) { - try { - const linkUrl = new URL(href, urlObj.href); - if (linkUrl.hostname === domain) { - let linkPath = linkUrl.pathname; - if (linkPath === '/' || linkPath === '') linkPath = '/index.html'; - if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html'; - if (linkPath.startsWith('/')) linkPath = linkPath.substring(1); - - const relativeLink = path.relative(path.dirname(fileRelToRoot), linkPath); - return `href="${relativeLink}"`; - } - } catch (e) {} - } - return match; - }); - - fs.writeFileSync(file, content); - } - - console.log(`\nโœ… Done! Perfect clone complete in: ${baseOutputDir}`); -} - -function getFiles(dir: string, fileList: string[] = []) { - const files = fs.readdirSync(dir); - for (const file of files) { - const name = path.join(dir, file); - if (fs.statSync(name).isDirectory()) { - getFiles(name, fileList); - } else { - fileList.push(name); - } - } - return fileList; -} - -cloneWebsite().catch(err => { - console.error('โŒ Fatal error:', err); - process.exit(1); -}); diff --git a/apps/web/scripts/clone-website.ts b/apps/web/scripts/clone-website.ts deleted file mode 100644 index 21028a0..0000000 --- a/apps/web/scripts/clone-website.ts +++ /dev/null @@ -1,187 +0,0 @@ -import scrape from "website-scraper"; -import PuppeteerPlugin from "website-scraper-puppeteer"; -import path from "path"; -import { fileURLToPath } from "url"; -import fs from "fs"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -// Custom plugin to handle Next.js and Mac-specific path issues -class PortfolioPlugin { - apply(registerAction: any) { - // 1. Add more sources before starting - registerAction("beforeStart", ({ options }: any) => { - if (!options.sources) options.sources = []; - options.sources.push({ selector: "img", attr: "data-nimg" }); - options.sources.push({ selector: "img", attr: "data-src" }); - options.sources.push({ selector: "img", attr: "data-srcset" }); - options.sources.push({ selector: "video", attr: "poster" }); - options.sources.push({ selector: "source", attr: "data-srcset" }); - options.sources.push({ - selector: '[style*="background-image"]', - attr: "style", - }); - options.sources.push({ selector: 'link[as="font"]', attr: "href" }); - options.sources.push({ selector: 'link[as="image"]', attr: "href" }); - options.sources.push({ selector: 'link[as="style"]', attr: "href" }); - options.sources.push({ selector: 'link[as="script"]', attr: "href" }); - }); - - // 2. Sanitize filenames and handle Next.js optimized images - registerAction("generateFilename", ({ resource, filename }: any) => { - const url = resource.getUrl(); - let result = filename; - - // Handle Next.js optimized images: /_next/image?url=...&w=... - if (url.includes("/_next/image")) { - try { - const urlParams = new URL(url).searchParams; - const originalUrl = urlParams.get("url"); - if (originalUrl) { - const cleanPath = originalUrl.split("?")[0]; - const ext = path.extname(cleanPath) || ".webp"; - const name = path.basename(cleanPath, ext); - const width = urlParams.get("w") || "auto"; - result = `_next/optimized/${name}-${width}${ext}`; - } - } catch (e) { - // Ignore invalid optimized image URLs - } - } - - // CRITICAL MAC FIX: Replace .app with -app in all paths to prevent hidden Application Bundles - // We split by / to ensure we only replace .app at the end of a directory name or filename - result = result - .split("/") - .map((segment: string) => - segment.endsWith(".app") - ? segment.replace(/\.app$/, "-app") - : segment, - ) - .join("/"); - - return { filename: result }; - }); - } -} - -async function cloneWebsite() { - const url = process.argv[2]; - if (!url) { - console.error("Please provide a URL as an argument."); - process.exit(1); - } - - const domain = new URL(url).hostname; - let outputDirName = process.argv[3] || domain.replace(/\./g, "-"); - - // Sanitize top-level folder name for Mac - if (outputDirName.endsWith(".app")) { - outputDirName = outputDirName.replace(/\.app$/, "-app"); - } - - const outputDir = path.resolve( - __dirname, - "../cloned-websites", - outputDirName, - ); - - if (fs.existsSync(outputDir)) { - fs.rmSync(outputDir, { recursive: true, force: true }); - } - - console.log(`Cloning ${url} to ${outputDir}...`); - - try { - await scrape({ - urls: [url], - directory: outputDir, - recursive: true, - maxRecursiveDepth: 5, - requestConcurrency: 10, - plugins: [ - new PuppeteerPlugin({ - launchOptions: { headless: true, args: ["--no-sandbox"] }, - gotoOptions: { waitUntil: "networkidle0", timeout: 60000 }, - scrollToBottom: { timeout: 20000, viewportN: 20 }, - }), - new PortfolioPlugin(), - ], - sources: [ - { selector: "img", attr: "src" }, - { selector: "img", attr: "srcset" }, - { selector: "img", attr: "data-src" }, - { selector: "img", attr: "data-srcset" }, - { selector: 'link[rel="stylesheet"]', attr: "href" }, - { selector: 'link[rel*="icon"]', attr: "href" }, - { selector: "script", attr: "src" }, - { selector: 'link[rel="preload"]', attr: "href" }, - { selector: 'link[rel="prefetch"]', attr: "href" }, - { selector: 'link[rel="modulepreload"]', attr: "href" }, - { selector: 'link[rel="apple-touch-icon"]', attr: "href" }, - { selector: 'link[rel="mask-icon"]', attr: "href" }, - { selector: "source", attr: "src" }, - { selector: "source", attr: "srcset" }, - { selector: "video", attr: "src" }, - { selector: "video", attr: "poster" }, - { selector: "audio", attr: "src" }, - { selector: "iframe", attr: "src" }, - { selector: 'meta[property="og:image"]', attr: "content" }, - { selector: 'meta[name="twitter:image"]', attr: "content" }, - { selector: "[style]", attr: "style" }, - ], - urlFilter: (link: string) => { - const isAsset = - /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i.test( - link, - ); - const isNextAsset = link.includes("/_next/"); - const isSameDomain = - link.startsWith(url) || - link.startsWith("/") || - !link.includes("://") || - link.includes(domain); - const isGoogleTagManager = link.includes("googletagmanager.com"); - const isAnalytics = link.includes("analytics.mintel.me"); - const isVercelApp = link.includes("vercel.app"); - const isDataUrl = link.startsWith("data:"); - const isMailto = link.startsWith("mailto:"); - const isTel = link.startsWith("tel:"); - return ( - (isAsset || - isNextAsset || - isSameDomain || - isGoogleTagManager || - isAnalytics || - isVercelApp) && - !isDataUrl && - !isMailto && - !isTel - ); - }, - filenameGenerator: "bySiteStructure", - subdirectories: [ - { - directory: "img", - extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"], - }, - { directory: "js", extensions: [".js"] }, - { directory: "css", extensions: [".css"] }, - { - directory: "fonts", - extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"], - }, - { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] }, - ], - }); - - console.log("โœ… Website cloned successfully!"); - console.log(`Location: ${outputDir}`); - } catch (error) { - console.error("โŒ Error cloning website:", error); - process.exit(1); - } -} - -cloneWebsite(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ada7456..07b350e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -94,6 +94,9 @@ importers: "@directus/sdk": specifier: 21.0.0 version: 21.0.0 + "@mintel/cloner": + specifier: link:../../../at-mintel/packages/cloner-library + version: link:../../../at-mintel/packages/cloner-library "@mintel/pdf": specifier: link:../../../at-mintel/packages/pdf-library version: link:../../../at-mintel/packages/pdf-library