feat(cloner): remove cloner from mintel.me and use registry versions for @mintel/pdf and @mintel/cloner
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 10s
Build & Deploy / 🧪 QA (push) Failing after 16s
Build & Deploy / 🏗️ Build (push) Failing after 19s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 1s

This commit is contained in:
2026-02-12 22:00:36 +01:00
parent 0fed92ca8c
commit 99e392ce08
7 changed files with 5 additions and 1002 deletions

View File

@@ -1,436 +0,0 @@
import { chromium } from "playwright";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
import axios from "axios";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Desktop Chrome UA sent with every request so sites serve their normal markup.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
/**
 * Make a URL-derived path safe to store on disk: within each "/"-separated
 * segment, every character outside [a-z0-9._-] (case-insensitive) becomes
 * "_". Slashes are preserved so the directory structure survives.
 */
function sanitizePath(rawPath: string) {
  const segments = rawPath.split("/");
  const cleaned = segments.map((segment) =>
    segment.replace(/[^a-z0-9._-]/gi, "_"),
  );
  return cleaned.join("/");
}
/**
 * Download one remote asset into assetsDir, mirroring host+path on disk.
 * Returns the "./assets/<relPath>" reference to embed in HTML/CSS, or null
 * when the URL is unsupported or the request fails (caller then keeps the
 * original URL).
 */
async function downloadFile(url: string, assetsDir: string) {
  // Protocol-relative URLs get https; anything else non-http(s) is skipped.
  if (url.startsWith("//")) url = `https:${url}`;
  if (!url.startsWith("http")) return null;
  try {
    const parsed = new URL(url);
    // Create a collision-resistant local path
    const relPath = sanitizePath(parsed.hostname + parsed.pathname);
    const dest = path.join(assetsDir, relPath);
    // Already mirrored earlier in this run — reuse it.
    if (fs.existsSync(dest)) return `./assets/${relPath}`;
    const res = await axios.get(url, {
      responseType: "arraybuffer",
      headers: { "User-Agent": USER_AGENT },
      timeout: 15000,
      validateStatus: () => true, // never throw on HTTP status; checked below
    });
    if (res.status !== 200) return null;
    const destDir = path.dirname(dest);
    if (!fs.existsSync(destDir)) {
      fs.mkdirSync(destDir, { recursive: true });
    }
    fs.writeFileSync(dest, Buffer.from(res.data));
    return `./assets/${relPath}`;
  } catch {
    return null; // Fail silently, proceed with original URL
  }
}
async function processCssRecursively(
cssContent: string,
cssUrl: string,
assetsDir: string,
urlMap: Record<string, string>,
depth = 0,
) {
if (depth > 5) return cssContent;
// Capture both standard url(...) and @import url(...)
const urlRegex = /(?:url\(["']?|@import\s+["'])([^"'\)]+)["']?\)?/gi;
let match;
let newContent = cssContent;
while ((match = urlRegex.exec(cssContent)) !== null) {
const originalUrl = match[1];
if (originalUrl.startsWith("data:") || originalUrl.startsWith("blob:"))
continue;
try {
const absUrl = new URL(originalUrl, cssUrl).href;
const local = await downloadFile(absUrl, assetsDir);
if (local) {
// Calculate relative path from CSS file to Asset
const u = new URL(cssUrl);
const cssPath = u.hostname + u.pathname;
const assetPath = new URL(absUrl).hostname + new URL(absUrl).pathname;
// We need to route from the folder containing the CSS to the asset
const rel = path.relative(
path.dirname(sanitizePath(cssPath)),
sanitizePath(assetPath),
);
// Replace strictly the URL part
newContent = newContent.split(originalUrl).join(rel);
urlMap[absUrl] = local;
}
} catch {
// Ignore URL resolution errors
}
}
return newContent;
}
/**
 * CLI entry point: clone the single page given as argv[2] into a static,
 * self-contained mirror under public/showcase/<domain>/.
 *
 * Pipeline: load with Playwright while intercepting responses, scroll the
 * whole page to trigger lazy loaders, force-hydrate lazy attributes in the
 * DOM, snapshot the HTML, download every discovered asset, then rewrite the
 * snapshot so all references are local and inject stabilization CSS.
 */
async function run() {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error("Usage: npm run clone-page <url>");
    process.exit(1);
  }
  const targetUrl = rawUrl.trim();
  const urlObj = new URL(targetUrl);
  // Setup Output Directories
  const domainSlug = urlObj.hostname.replace("www.", "");
  const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
  const assetsDir = path.join(domainDir, "assets");
  if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true });
  // Page slug from the URL path, e.g. /a/b -> "a-b"; the root page is "index".
  let pageSlug = urlObj.pathname.split("/").filter(Boolean).join("-");
  if (!pageSlug) pageSlug = "index";
  const htmlFilename = `${pageSlug}.html`;
  console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);
  const browser = await chromium.launch({ headless: true });
  // Start with a standard viewport, we will resize widely later
  const context = await browser.newContext({
    userAgent: USER_AGENT,
    viewport: { width: 1920, height: 1080 },
  });
  const page = await context.newPage();
  // urlMap: absolute remote URL -> "./assets/..." local reference.
  const urlMap: Record<string, string> = {};
  const foundAssets = new Set<string>();
  // 1. Live Network Interception
  page.on("response", (response) => {
    const url = response.url();
    if (response.status() === 200) {
      // Capture anything that looks like a static asset
      if (
        url.match(
          /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i,
        )
      ) {
        foundAssets.add(url);
      }
    }
  });
  try {
    console.log("🌐 Loading page (Waiting for Network Idle)...");
    await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });
    console.log(
      '🌊 Executing "Scroll Wave" to trigger all lazy loaders naturally...',
    );
    // Scroll down in 400px steps every 100ms until the page bottom is reached,
    // then jump back to the top; runs inside the page context.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 400;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Reset to top
            resolve(true);
          }
        }, 100);
      });
    });
    console.log(
      '📐 Expanding Viewport to "Giant Mode" for final asset capture...',
    );
    // Make the viewport taller than the document so everything is "in view"
    // for viewport-based lazy loaders.
    const fullHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
    // Final settlement wait
    await page.waitForTimeout(3000);
    console.log("💧 Final DOM Hydration & Sanitization...");
    await page.evaluate(() => {
      // A. Deterministic Attribute Hydration (Generic)
      // Scours every element for attributes that look like asset URLs and promotes them
      const assetPattern =
        /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
      document.querySelectorAll("*").forEach((el) => {
        // 0. Skip Meta/Head/Script/Style/SVG tags for attribute promotion
        if (
          ["META", "LINK", "HEAD", "SCRIPT", "STYLE", "SVG", "PATH"].includes(
            el.tagName,
          )
        )
          return;
        // 1. Force Visibility (Anti-Flicker)
        const htmlEl = el as HTMLElement;
        const style = window.getComputedStyle(htmlEl);
        if (style.opacity === "0" || style.visibility === "hidden") {
          htmlEl.style.setProperty("opacity", "1", "important");
          htmlEl.style.setProperty("visibility", "visible", "important");
        }
        // 2. Promote Data Attributes
        for (const attr of Array.from(el.attributes)) {
          const name = attr.name.toLowerCase();
          const val = attr.value;
          if (
            assetPattern.test(val) ||
            name.includes("src") ||
            name.includes("image")
          ) {
            // Standard Image/Video/Source promotion
            if (el.tagName === "IMG") {
              const img = el as HTMLImageElement;
              if (name.includes("srcset")) img.srcset = val;
              else if (!img.src || img.src.includes("data:")) img.src = val;
            }
            if (el.tagName === "SOURCE") {
              const source = el as HTMLSourceElement;
              if (name.includes("srcset")) source.srcset = val;
            }
            if (el.tagName === "VIDEO" || el.tagName === "AUDIO") {
              const media = el as HTMLMediaElement;
              if (!media.src) media.src = val;
            }
            // Background Image Promotion
            if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
              const bg = htmlEl.style.backgroundImage;
              if (!bg || bg === "none") {
                htmlEl.style.backgroundImage = `url('${val}')`;
              }
            }
          }
        }
      });
      // B. Ensure basic structural elements are visible post-scroll
      const body = document.body;
      if (body) {
        body.style.setProperty("opacity", "1", "important");
        body.style.setProperty("visibility", "visible", "important");
      }
    });
    console.log("⏳ Waiting for network idle...");
    await page.waitForLoadState("networkidle");
    // 1.5 FINAL SETTLEMENT: Let any scroll-triggered JS finish
    await page.waitForTimeout(1000);
    // 2. Static Snapshot
    let content = await page.content();
    // 3. Post-Snapshot Asset Discovery (Regex)
    // Catches assets that never triggered a network request but exist in the markup
    const regexPatterns = [
      /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
      // Capture CSS url() inside style blocks
      /url\(["']?([^"'\)]+)["']?\)/gi,
    ];
    for (const pattern of regexPatterns) {
      let match;
      while ((match = pattern.exec(content)) !== null) {
        try {
          foundAssets.add(new URL(match[1], targetUrl).href);
        } catch {
          // Ignore invalid URLs in content
        }
      }
    }
    // Specific srcset parsing
    const srcsetRegex = /[a-z0-9-]+srcset=["']([^"']+)["']/gi;
    let match;
    while ((match = srcsetRegex.exec(content)) !== null) {
      match[1].split(",").forEach((rule) => {
        const parts = rule.trim().split(/\s+/);
        if (parts[0] && !parts[0].startsWith("data:")) {
          try {
            foundAssets.add(new URL(parts[0], targetUrl).href);
          } catch {
            // Ignore invalid srcset URLs
          }
        }
      });
    }
    console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);
    // 4. Download & Map
    for (const url of foundAssets) {
      const local = await downloadFile(url, assetsDir);
      if (local) {
        // Map both the exact URL and its query-stripped form to the local copy.
        urlMap[url] = local;
        const clean = url.split("?")[0];
        urlMap[clean] = local;
        // Handle CSS recursively
        if (clean.endsWith(".css")) {
          try {
            const { data } = await axios.get(url, {
              headers: { "User-Agent": USER_AGENT },
            });
            // Process CSS and save it
            const processedCss = await processCssRecursively(
              data,
              url,
              assetsDir,
              urlMap,
            );
            const relPath = sanitizePath(
              new URL(url).hostname + new URL(url).pathname,
            );
            fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
          } catch {
            // Ignore CSS fetch/process errors
          }
        }
      }
    }
    console.log("🛠️ Finalizing Static Mirror...");
    let finalContent = content;
    // A. Apply URL Map Replacements
    // Longer paths first to prevent partial replacement errors
    const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
    if (sortedUrls.length > 0) {
      // Regex-escape each mapped URL before joining them into one pattern.
      const escaped = sortedUrls.map((u) =>
        u.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"),
      );
      // Create a massive regex for single-pass replacement
      const masterRegex = new RegExp(`(${escaped.join("|")})`, "g");
      finalContent = finalContent.replace(
        masterRegex,
        (match) => urlMap[match] || match,
      );
    }
    // B. Global Root-Relative Path Cleanup
    // Catches things like /wp-content/ that weren't distinct assets or were missed
    const commonDirs = [
      "/wp-content/",
      "/wp-includes/",
      "/assets/",
      "/static/",
      "/images/",
    ];
    for (const dir of commonDirs) {
      const localDir = `./assets/${urlObj.hostname}${dir}`;
      finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`);
      finalContent = finalContent.split(`'${dir}`).join(`'${localDir}`);
      finalContent = finalContent.split(`(${dir}`).join(`(${localDir}`);
    }
    // C. Domain Nuke
    // Replace absolute links to the original domain with relative or #
    const domainPattern = new RegExp(
      `https?://(www\\.)?${urlObj.hostname.replace(/\./g, "\\.")}[^"']*`,
      "gi",
    );
    // We carefully only replace if it looks like a resource link, or neutralize if it's a navigation link
    // For simplicity and "solidness", we'll rely on the specific replacements above first.
    // This catch-all nuke ensures we don't leak requests.
    // Convert remaining absolute domain links to relative .
    finalContent = finalContent.replace(domainPattern, (match) => {
      // If we have a map for it, it should have been replaced.
      // If not, it's likely a navigation link or an uncaptured asset.
      // Safe fallback:
      return "./";
    });
    // D. Static Stability & Cleanup
    // Remove tracking/analytics/lazy-load scripts that ruin static stability.
    // NOTE(review): matching on "lazy"/"tracker" substrings is broad and may
    // strip unrelated inline scripts — verify against cloned output.
    finalContent = finalContent.replace(
      /<script\b[^>]*>([\s\S]*?)<\/script>/gi,
      (match, content) => {
        const lower = content.toLowerCase();
        if (
          lower.includes("google-analytics") ||
          lower.includes("gtag") ||
          lower.includes("fbq") ||
          lower.includes("lazy") ||
          lower.includes("tracker")
        ) {
          return "";
        }
        return match;
      },
    );
    // E. CSS Injections for Stability
    const headEnd = finalContent.indexOf("</head>");
    if (headEnd > -1) {
      const stabilityCss = `
<style>
/* UNIVERSAL CLONE STABILIZATION */
* {
transition: none !important;
animation: none !important;
scroll-behavior: auto !important;
}
[data-aos], .reveal, .lazypath, .lazy-load, [data-src] {
opacity: 1 !important;
visibility: visible !important;
transform: none !important;
clip-path: none !important;
}
img, video, iframe {
max-width: 100%;
display: block;
}
a {
pointer-events: none;
cursor: default;
}
</style>`;
      finalContent =
        finalContent.slice(0, headEnd) +
        stabilityCss +
        finalContent.slice(headEnd);
    }
    // Save
    const finalPath = path.join(domainDir, htmlFilename);
    fs.writeFileSync(finalPath, finalContent);
    console.log(`✅ SUCCESS: Cloned to ${finalPath}`);
  } catch (err) {
    console.error("❌ FATAL ERROR:", err);
  } finally {
    await browser.close();
  }
}
run();

View File

@@ -1,239 +0,0 @@
// @ts-ignore
import scrape from "website-scraper";
// @ts-ignore
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "node:path";
import { fileURLToPath } from "node:url";
import fs from "node:fs";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * CLI entry point: recursively clone argv[2] (up to depth 5) into
 * ../cloned-websites/<domain> (or argv[3]) using website-scraper rendered
 * through Puppeteer, then sanitize the saved HTML for static hosting.
 *
 * Fix: the external-asset filename template contained a corrupted
 * "$(unknown)" placeholder where the asset path interpolation belonged;
 * external assets are now saved under _external/<host>/<normalized path>.
 */
async function run() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error("Usage: npm run clone-website <URL> [output-dir]");
    process.exit(1);
  }
  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_");
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, "../cloned-websites", safeDomain);
  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }
  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);
  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 5,
    // Custom filename generation to avoid "https:/" folders
    plugins: [
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
          ],
        },
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false,
      }),
      // Progress logging for each saved/failed resource.
      new (class LoggerPlugin {
        apply(registerAction: any) {
          registerAction("onResourceSaved", ({ resource }: any) => {
            console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
          });
          registerAction("onResourceError", ({ resource, error }: any) => {
            console.error(` ❌ Error: ${resource.url} - ${error.message}`);
          });
        }
      })(),
      // Maps resource URLs to sane on-disk filenames.
      new (class FilenamePlugin {
        apply(registerAction: any) {
          registerAction("generateFilename", ({ resource }: any) => {
            const u = new URL(resource.url);
            let filename = u.pathname;
            // normalize
            if (filename.endsWith("/")) filename += "index.html";
            else if (!path.extname(filename) && resource.url.includes(domain))
              filename += "/index.html"; // Assume folder if internal link without ext
            // If it's an external asset, put it in a separate folder.
            // FIX: this template previously read `_external/${u.hostname}$(unknown)`
            // (corrupted interpolation); the normalized path is the intended tail.
            if (u.hostname !== domain) {
              filename = `_external/${u.hostname}${filename}`;
            }
            // Sanitize filename
            filename = filename
              .split("/")
              .map((part) => part.replace(/[^a-z0-9._-]/gi, "_"))
              .join("/");
            // Remove leading slash
            if (filename.startsWith("/")) filename = filename.substring(1);
            // Handle "Unnamed page" by checking if empty
            if (!filename || filename === "index.html")
              return { filename: "index.html" };
            return { filename };
          });
        }
      })(),
    ],
    // Keep the crawl on the target domain but allow assets from anywhere.
    urlFilter: (url: string) => {
      const u = new URL(url);
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts =
        u.hostname.includes("fonts.googleapis.com") ||
        u.hostname.includes("fonts.gstatic.com");
      // Allow assets from anywhere
      const isAsset =
        /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(
          u.pathname,
        );
      // Allow fonts/css from common CDNs if standard extension check fails
      const isCommonAsset =
        u.pathname.includes("/css/") ||
        u.pathname.includes("/js/") ||
        u.pathname.includes("/static/") ||
        u.pathname.includes("/assets/") ||
        u.pathname.includes("/uploads/");
      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
    sources: [
      { selector: "img", attr: "src" },
      { selector: "img", attr: "srcset" },
      { selector: "source", attr: "src" },
      { selector: "source", attr: "srcset" },
      { selector: 'link[rel="stylesheet"]', attr: "href" },
      { selector: 'link[rel="preload"]', attr: "href" },
      { selector: 'link[rel="prefetch"]', attr: "href" },
      { selector: "script", attr: "src" },
      { selector: "video", attr: "src" },
      { selector: "video", attr: "poster" },
      { selector: "iframe", attr: "src" },
      { selector: 'link[rel*="icon"]', attr: "href" },
      { selector: 'link[rel="manifest"]', attr: "href" },
      { selector: 'meta[property="og:image"]', attr: "content" },
    ],
    request: {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
      },
    },
  };
  try {
    // @ts-ignore
    const result = await scrape(options);
    console.log(
      `\n✅ Successfully cloned ${result.length} resources to ${outputDir}`,
    );
    // Post-processing: Sanitize HTML to remove Next.js hydration scripts
    // This prevents the static site from trying to "hydrate" and breaking images/links
    console.log("🧹 Sanitizing HTML files...");
    sanitizeHtmlFiles(outputDir);
    console.log(`open "${path.join(outputDir, "index.html")}"`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
/**
 * Recursively walk `dir` and rewrite every .html file in place so the clone
 * works as a purely static site: strips Next.js hydration scripts, expands
 * Breeze cache placeholders into real tags, injects fallback Google Fonts,
 * and forces a single-column layout on product listings.
 */
function sanitizeHtmlFiles(dir: string) {
  const files = fs.readdirSync(dir);
  for (const file of files) {
    const fullPath = path.join(dir, file);
    if (fs.statSync(fullPath).isDirectory()) {
      sanitizeHtmlFiles(fullPath);
    } else if (file.endsWith(".html")) {
      let content = fs.readFileSync(fullPath, "utf8");
      // Remove Next.js data script
      content = content.replace(
        /<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi,
        "",
      );
      // Remove Next.js chunk scripts (hydration)
      // match <script src="..._next/static/chunks..." ...
      content = content.replace(
        /<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi,
        "",
      );
      content = content.replace(
        /<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi,
        "",
      );
      // Convert Breeze dynamic script/styles into actual tags if possible
      // match <div class="breeze-scripts-load" ...>URL</div>
      content = content.replace(
        /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
        (match, url) => {
          if (url.endsWith(".css"))
            return `<link rel="stylesheet" href="${url}">`;
          return `<script src="${url}"></script>`;
        },
      );
      // Inject Fonts (Fix for missing dynamic fonts)
      // We inject Inter and Montserrat as safe defaults for industrial/modern sites
      // Check specifically for a stylesheet link to google fonts
      const hasGoogleFontStylesheet =
        /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test(
          content,
        );
      if (!hasGoogleFontStylesheet) {
        const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap">`;
        const styleBlock = `<style>
:root { --main-font: 'Inter', sans-serif; --heading-font: 'Montserrat', sans-serif; }
body, .body-font, p, span, li, a { font-family: var(--main-font) !important; }
h1, h2, h3, h4, h5, h6, .title-font, .heading-font { font-family: var(--heading-font) !important; }
</style>`;
        // String .replace swaps only the FIRST </head> — fine for one document.
        content = content.replace("</head>", `${fontLink}${styleBlock}</head>`);
      }
      // Force column layout on product pages
      if (content.includes('class="products')) {
        const layoutScript = `
<script>
document.addEventListener('DOMContentLoaded', function() {
const products = document.querySelector('.products');
if (products) {
products.classList.remove(...Array.from(products.classList).filter(c => c.startsWith('columns-')));
products.classList.add('columns-1');
products.setAttribute('data-n-desktop-columns', '1');
}
});
</script>`;
        content = content.replace("</body>", `${layoutScript}</body>`);
      }
      fs.writeFileSync(fullPath, content);
    }
  }
}
run();

View File

@@ -1,130 +0,0 @@
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import * as fs from 'node:fs';
import { URL } from 'node:url';
import { execSync, execFileSync } from 'node:child_process';
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * The Ultimate Website Cloner
 * Uses Crawlee for discovery and single-file-cli for perfect page capture.
 *
 * Fixes:
 *  - crawled page URLs are untrusted input and were interpolated into an
 *    execSync shell string (command injection); single-file-cli is now
 *    invoked via execFileSync with an argument vector, so no shell parses
 *    the URL;
 *  - rewritten internal hrefs always use forward slashes (path.relative
 *    returns backslashes on Windows, which would break links).
 */
async function cloneWebsite() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }
  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
  const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);
  // Start from a clean output directory on every run.
  if (fs.existsSync(baseOutputDir)) {
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
  }
  fs.mkdirSync(baseOutputDir, { recursive: true });
  console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
  console.log(`📂 Output: ${baseOutputDir}`);
  const requestQueue = await RequestQueue.open();
  await requestQueue.addRequest({ url: targetUrl });
  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxRequestsPerCrawl: 100,
    maxConcurrency: 3, // SingleFile is resource intensive
    async requestHandler({ request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Capturing ${url}...`);
      // 1. Determine local path: mirror the URL path; extensionless paths
      // and the root become <path>/index.html.
      const u = new URL(url);
      let relPath = u.pathname;
      if (relPath === '/' || relPath === '') relPath = '/index.html';
      if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
      if (relPath.startsWith('/')) relPath = relPath.substring(1);
      const fullPath = path.join(baseOutputDir, relPath);
      fs.mkdirSync(path.dirname(fullPath), { recursive: true });
      // 2. Use single-file-cli for perfect capture.
      // execFileSync with an argv array: the crawled URL never touches a shell.
      try {
        execFileSync(
          'npx',
          [
            'single-file-cli',
            url,
            fullPath,
            '--browser-headless=true',
            '--browser-wait-until=networkidle0',
          ],
          { stdio: 'inherit' },
        );
      } catch (e) {
        log.error(`Failed to capture ${url} with SingleFile`);
      }
      // 3. Enqueue subpages (discovery); asset/binary links are filtered out.
      await enqueueLinks({
        strategy: 'same-domain',
        transformRequestFunction: (req) => {
          if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
          return req;
        }
      });
    },
  });
  await crawler.run();
  // 4. Post-processing: Rewrite links between the captured files
  console.log('🔗 Rewriting internal links for offline navigation...');
  const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
  for (const file of allFiles) {
    let content = fs.readFileSync(file, 'utf8');
    const fileRelToRoot = path.relative(baseOutputDir, file);
    // Simple but effective regex for internal links
    content = content.replace(/href="([^"]+)"/g, (match, href) => {
      if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
        try {
          const linkUrl = new URL(href, urlObj.href);
          if (linkUrl.hostname === domain) {
            let linkPath = linkUrl.pathname;
            if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
            if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
            if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);
            // hrefs are URLs, not OS paths — force forward slashes.
            const relativeLink = path
              .relative(path.dirname(fileRelToRoot), linkPath)
              .split(path.sep)
              .join('/');
            return `href="${relativeLink}"`;
          }
        } catch (e) {
          // Unparseable href — leave the link untouched.
        }
      }
      return match;
    });
    fs.writeFileSync(file, content);
  }
  console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}
/**
 * Depth-first directory walk: collect the absolute path of every regular
 * file under `dir` into `fileList` (mutated and returned).
 */
function getFiles(dir: string, fileList: string[] = []) {
  for (const entryName of fs.readdirSync(dir)) {
    const entryPath = path.join(dir, entryName);
    if (fs.statSync(entryPath).isDirectory()) {
      getFiles(entryPath, fileList);
    } else {
      fileList.push(entryPath);
    }
  }
  return fileList;
}
// Entry point: exit non-zero on any unhandled error so scripted runs fail loudly.
cloneWebsite().catch(err => {
  console.error('❌ Fatal error:', err);
  process.exit(1);
});

View File

@@ -1,187 +0,0 @@
import scrape from "website-scraper";
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "path";
import { fileURLToPath } from "url";
import fs from "fs";
// Resolve this module's file/dir (ESM has no built-in __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Custom website-scraper plugin: registers extra lazy-load asset sources and
// generates Next.js- and macOS-safe filenames.
class PortfolioPlugin {
  apply(registerAction: any) {
    // 1. Register additional asset sources before the crawl starts.
    registerAction("beforeStart", ({ options }: any) => {
      if (!options.sources) options.sources = [];
      const extraSources = [
        { selector: "img", attr: "data-nimg" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: "video", attr: "poster" },
        { selector: "source", attr: "data-srcset" },
        { selector: '[style*="background-image"]', attr: "style" },
        { selector: 'link[as="font"]', attr: "href" },
        { selector: 'link[as="image"]', attr: "href" },
        { selector: 'link[as="style"]', attr: "href" },
        { selector: 'link[as="script"]', attr: "href" },
      ];
      for (const extra of extraSources) options.sources.push(extra);
    });
    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction("generateFilename", ({ resource, filename }: any) => {
      const resourceUrl = resource.getUrl();
      let generated = filename;
      // Next.js image-optimizer URLs (/_next/image?url=...&w=...) are mapped
      // to a readable "<name>-<width><ext>" file under _next/optimized/.
      if (resourceUrl.includes("/_next/image")) {
        try {
          const params = new URL(resourceUrl).searchParams;
          const sourceUrl = params.get("url");
          if (sourceUrl) {
            const cleanPath = sourceUrl.split("?")[0];
            const ext = path.extname(cleanPath) || ".webp";
            const baseName = path.basename(cleanPath, ext);
            const width = params.get("w") || "auto";
            generated = `_next/optimized/${baseName}-${width}${ext}`;
          }
        } catch (e) {
          // Ignore invalid optimized image URLs
        }
      }
      // CRITICAL MAC FIX: a path segment ending in ".app" would be treated as
      // an Application Bundle by Finder, hiding the folder — rename to "-app".
      // Splitting on "/" ensures only whole segment suffixes are rewritten.
      const safeSegments = generated
        .split("/")
        .map((segment: string) =>
          segment.endsWith(".app") ? segment.replace(/\.app$/, "-app") : segment,
        );
      return { filename: safeSegments.join("/") };
    });
  }
}
/**
 * CLI entry point: recursively clone argv[2] (depth <= 5) into
 * ../cloned-websites/<name> (argv[3] or the dash-joined domain) using
 * website-scraper rendered through Puppeteer, with PortfolioPlugin handling
 * Next.js image URLs and macOS-unsafe ".app" path segments.
 */
async function cloneWebsite() {
  const url = process.argv[2];
  if (!url) {
    console.error("Please provide a URL as an argument.");
    process.exit(1);
  }
  const domain = new URL(url).hostname;
  let outputDirName = process.argv[3] || domain.replace(/\./g, "-");
  // Sanitize top-level folder name for Mac
  if (outputDirName.endsWith(".app")) {
    outputDirName = outputDirName.replace(/\.app$/, "-app");
  }
  const outputDir = path.resolve(
    __dirname,
    "../cloned-websites",
    outputDirName,
  );
  // Start from a clean output directory on every run.
  if (fs.existsSync(outputDir)) {
    fs.rmSync(outputDir, { recursive: true, force: true });
  }
  console.log(`Cloning ${url} to ${outputDir}...`);
  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        // Render pages in headless Chrome and scroll to trigger lazy loading.
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ["--no-sandbox"] },
          gotoOptions: { waitUntil: "networkidle0", timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      // Every attribute the scraper should treat as a downloadable reference.
      sources: [
        { selector: "img", attr: "src" },
        { selector: "img", attr: "srcset" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: 'link[rel="stylesheet"]', attr: "href" },
        { selector: 'link[rel*="icon"]', attr: "href" },
        { selector: "script", attr: "src" },
        { selector: 'link[rel="preload"]', attr: "href" },
        { selector: 'link[rel="prefetch"]', attr: "href" },
        { selector: 'link[rel="modulepreload"]', attr: "href" },
        { selector: 'link[rel="apple-touch-icon"]', attr: "href" },
        { selector: 'link[rel="mask-icon"]', attr: "href" },
        { selector: "source", attr: "src" },
        { selector: "source", attr: "srcset" },
        { selector: "video", attr: "src" },
        { selector: "video", attr: "poster" },
        { selector: "audio", attr: "src" },
        { selector: "iframe", attr: "src" },
        { selector: 'meta[property="og:image"]', attr: "content" },
        { selector: 'meta[name="twitter:image"]', attr: "content" },
        { selector: "[style]", attr: "style" },
      ],
      // Allow same-domain pages plus assets/CDNs; block data:/mailto:/tel:.
      urlFilter: (link: string) => {
        const isAsset =
          /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i.test(
            link,
          );
        const isNextAsset = link.includes("/_next/");
        const isSameDomain =
          link.startsWith(url) ||
          link.startsWith("/") ||
          !link.includes("://") ||
          link.includes(domain);
        const isGoogleTagManager = link.includes("googletagmanager.com");
        const isAnalytics = link.includes("analytics.mintel.me");
        const isVercelApp = link.includes("vercel.app");
        const isDataUrl = link.startsWith("data:");
        const isMailto = link.startsWith("mailto:");
        const isTel = link.startsWith("tel:");
        return (
          (isAsset ||
            isNextAsset ||
            isSameDomain ||
            isGoogleTagManager ||
            isAnalytics ||
            isVercelApp) &&
          !isDataUrl &&
          !isMailto &&
          !isTel
        );
      },
      filenameGenerator: "bySiteStructure",
      // Group downloads by media type on disk.
      subdirectories: [
        {
          directory: "img",
          extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"],
        },
        { directory: "js", extensions: [".js"] },
        { directory: "css", extensions: [".css"] },
        {
          directory: "fonts",
          extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"],
        },
        { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] },
      ],
    });
    console.log("✅ Website cloned successfully!");
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
cloneWebsite();