Files
mintel.me/apps/web/scripts/clone-website.ts
Marc Mintel ecea90dc91
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 1m27s
Build & Deploy / 🏗️ Build (push) Failing after 1m31s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
chore: stabilize apps/web (lint, build, typecheck fixes)
2026-02-11 11:56:13 +01:00

188 lines
6.5 KiB
TypeScript

import scrape from "website-scraper";
import PuppeteerPlugin from "website-scraper-puppeteer";
import path from "path";
import { fileURLToPath } from "url";
import fs from "fs";
// This file is an ES module, so the CommonJS path globals are rebuilt here
// from the module's own URL: first the file path, then its directory.
const __filename = fileURLToPath(import.meta.url),
  __dirname = path.dirname(__filename);
/**
 * Custom scraper plugin that handles Next.js and Mac-specific path issues.
 *
 * It registers two actions through the `registerAction` callback it is given:
 *  - `beforeStart`: appends extra source rules (lazy-load attributes, inline
 *    background images, `<link as="…">` hints) to `options.sources`.
 *  - `generateFilename`: gives Next.js optimized images
 *    (`/_next/image?url=…&w=…`) a readable filename and renames every path
 *    segment that ends in `.app`.
 */
class PortfolioPlugin {
  /** Source rules added on top of whatever the caller already configured. */
  private static readonly EXTRA_SOURCES: ReadonlyArray<{
    selector: string;
    attr: string;
  }> = [
    // NOTE(review): in Next.js `data-nimg` appears to be a marker attribute
    // rather than a URL — confirm this rule is actually needed.
    { selector: "img", attr: "data-nimg" },
    { selector: "img", attr: "data-src" },
    { selector: "img", attr: "data-srcset" },
    { selector: "video", attr: "poster" },
    { selector: "source", attr: "data-srcset" },
    { selector: '[style*="background-image"]', attr: "style" },
    { selector: 'link[as="font"]', attr: "href" },
    { selector: 'link[as="image"]', attr: "href" },
    { selector: 'link[as="style"]', attr: "href" },
    { selector: 'link[as="script"]', attr: "href" },
  ];

  /**
   * Registers the plugin's actions.
   *
   * @param registerAction Callback taking an action name and a handler.
   */
  apply(registerAction: any) {
    // 1. Add more sources before starting
    registerAction("beforeStart", ({ options }: any) => {
      if (!options.sources) options.sources = [];
      for (const rule of PortfolioPlugin.EXTRA_SOURCES) {
        // Skip rules the caller already configured so that no selector/attr
        // pair ends up in the list twice.
        const alreadyPresent = options.sources.some(
          (existing: any) =>
            existing?.selector === rule.selector &&
            existing?.attr === rule.attr,
        );
        if (!alreadyPresent) {
          // Push a copy so the shared static list can never be mutated.
          options.sources.push({ ...rule });
        }
      }
    });

    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction("generateFilename", ({ resource, filename }: any) => {
      const optimized = PortfolioPlugin.optimizedImageFilename(
        resource.getUrl(),
      );
      return {
        filename: PortfolioPlugin.sanitizeAppSegments(optimized ?? filename),
      };
    });
  }

  /**
   * Builds `_next/optimized/<name>-<width><ext>` for a Next.js optimized image
   * URL (`/_next/image?url=…&w=…`).
   *
   * NOTE(review): only the basename of the original image is kept, so two
   * images with the same basename and width in different directories map to
   * the same filename.
   *
   * @param url Absolute URL of the resource.
   * @returns The generated filename, or `undefined` when `url` is not an
   *   optimized-image URL, cannot be parsed, or has no `url` parameter.
   */
  private static optimizedImageFilename(url: string): string | undefined {
    if (!url.includes("/_next/image")) return undefined;

    let params: URLSearchParams;
    try {
      params = new URL(url).searchParams;
    } catch {
      // Ignore invalid optimized image URLs
      return undefined;
    }

    const originalUrl = params.get("url");
    if (!originalUrl) return undefined;

    // Drop both the query string and any fragment before reading the extension.
    const cleanPath = originalUrl.split(/[?#]/)[0] ?? "";
    const ext = path.extname(cleanPath) || ".webp";
    const name = path.basename(cleanPath, ext) || "image";

    // The width comes from the scraped page; accept digits only so it can
    // never inject path separators or `..` into the filename.
    const requestedWidth = params.get("w") ?? "";
    const width = /^\d+$/.test(requestedWidth) ? requestedWidth : "auto";

    return `_next/optimized/${name}-${width}${ext}`;
  }

  /**
   * CRITICAL MAC FIX: replaces a trailing `.app` with `-app` in every
   * `/`-separated segment to prevent hidden Application Bundles. Splitting on
   * `/` ensures only the end of a directory name or filename is rewritten.
   *
   * @param filename Relative path using `/` separators.
   * @returns The path with each trailing `.app` rewritten.
   */
  private static sanitizeAppSegments(filename: string): string {
    return filename
      .split("/")
      .map((segment) => segment.replace(/\.app$/, "-app"))
      .join("/");
  }
}
/** File extensions (optionally followed by a query string) that mark a link as a static asset. */
const ASSET_LINK_PATTERN =
  /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i;

/**
 * Reports whether `link` points at `domain` or one of its subdomains.
 *
 * The link is parsed (relative links are resolved against `baseUrl`) and its
 * hostname is compared, so an external URL that merely mentions the domain
 * somewhere in its path or query string is not treated as same-site.
 */
function isSameSite(link: string, baseUrl: string, domain: string): boolean {
  try {
    const { hostname } = new URL(link, baseUrl);
    return hostname === domain || hostname.endsWith(`.${domain}`);
  } catch {
    return false;
  }
}

/**
 * Clones the website named on the command line into
 * `../cloned-websites/<name>` (relative to this script).
 *
 * CLI arguments:
 *  - `process.argv[2]`: absolute http(s) URL to clone (required).
 *  - `process.argv[3]`: output folder name (optional; defaults to the
 *    hostname with dots replaced by dashes).
 *
 * Any existing output folder is deleted first. The process exits with code 1
 * when the URL is missing or invalid, when the output folder would fall
 * outside `cloned-websites`, or when scraping fails.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error("Please provide a URL as an argument.");
    process.exit(1);
  }

  // Validate up front: an unparseable URL would otherwise throw outside the
  // try/catch below, and a non-http URL can have an empty hostname.
  let domain = "";
  try {
    const parsed = new URL(url);
    if (parsed.protocol === "http:" || parsed.protocol === "https:") {
      domain = parsed.hostname;
    }
  } catch {
    // Not parseable as an absolute URL; reported just below.
  }
  if (!domain) {
    console.error(`Invalid URL "${url}": expected an absolute http(s) URL.`);
    process.exit(1);
  }

  // Sanitize top-level folder name for Mac
  const outputDirName = (
    process.argv[3] || domain.replace(/\./g, "-")
  ).replace(/\.app$/, "-app");

  const cloneRoot = path.resolve(__dirname, "../cloned-websites");
  const outputDir = path.resolve(cloneRoot, outputDirName);

  // The output folder is deleted recursively below, so refuse anything that
  // resolves to the clone root itself or escapes it (".", "..", absolute paths).
  const relativeToRoot = path.relative(cloneRoot, outputDir);
  if (
    relativeToRoot === "" ||
    relativeToRoot === ".." ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot)
  ) {
    console.error(
      `Refusing to use an output directory outside ${cloneRoot}: ${outputDir}`,
    );
    process.exit(1);
  }

  // `force: true` makes this a no-op when the folder does not exist yet.
  fs.rmSync(outputDir, { recursive: true, force: true });

  console.log(`Cloning ${url} to ${outputDir}...`);
  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ["--no-sandbox"] },
          gotoOptions: { waitUntil: "networkidle0", timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: "img", attr: "src" },
        { selector: "img", attr: "srcset" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: 'link[rel="stylesheet"]', attr: "href" },
        { selector: 'link[rel*="icon"]', attr: "href" },
        { selector: "script", attr: "src" },
        { selector: 'link[rel="preload"]', attr: "href" },
        { selector: 'link[rel="prefetch"]', attr: "href" },
        { selector: 'link[rel="modulepreload"]', attr: "href" },
        { selector: 'link[rel="apple-touch-icon"]', attr: "href" },
        { selector: 'link[rel="mask-icon"]', attr: "href" },
        { selector: "source", attr: "src" },
        { selector: "source", attr: "srcset" },
        { selector: "video", attr: "src" },
        { selector: "video", attr: "poster" },
        { selector: "audio", attr: "src" },
        { selector: "iframe", attr: "src" },
        { selector: 'meta[property="og:image"]', attr: "content" },
        { selector: 'meta[name="twitter:image"]', attr: "content" },
        { selector: "[style]", attr: "style" },
      ],
      // Keep assets, Next.js files, same-site links and a few allow-listed
      // hosts; never follow data:, mailto: or tel: links.
      urlFilter: (link: string) => {
        if (
          link.startsWith("data:") ||
          link.startsWith("mailto:") ||
          link.startsWith("tel:")
        ) {
          return false;
        }
        return (
          ASSET_LINK_PATTERN.test(link) ||
          link.includes("/_next/") ||
          isSameSite(link, url, domain) ||
          link.includes("googletagmanager.com") ||
          link.includes("analytics.mintel.me") ||
          link.includes("vercel.app")
        );
      },
      filenameGenerator: "bySiteStructure",
      subdirectories: [
        {
          directory: "img",
          extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"],
        },
        { directory: "js", extensions: [".js"] },
        { directory: "css", extensions: [".css"] },
        {
          directory: "fonts",
          extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"],
        },
        { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] },
      ],
    });
    console.log("✅ Website cloned successfully!");
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}
// Script entry point. The promise is handled explicitly so that a failure
// thrown outside cloneWebsite's own try/catch is logged and ends the process
// with a non-zero exit code instead of surfacing as an unhandled rejection.
cloneWebsite().catch((error: unknown) => {
  console.error("❌ Error cloning website:", error);
  process.exit(1);
});