Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 1m27s
Build & Deploy / 🏗️ Build (push) Failing after 1m31s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
188 lines
6.5 KiB
TypeScript
188 lines
6.5 KiB
TypeScript
import scrape from "website-scraper";
|
|
import PuppeteerPlugin from "website-scraper-puppeteer";
|
|
import path from "path";
|
|
import { fileURLToPath } from "url";
|
|
import fs from "fs";
|
|
|
|
// Rebuild the CommonJS-style __filename / __dirname pair from this module's
// own URL so that paths below can be resolved relative to the script file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** A scraper source rule: which elements to inspect and which attribute holds the URL. */
interface SourceRule {
  selector: string;
  attr?: string;
}

/**
 * Custom plugin to handle Next.js and Mac-specific path issues.
 *
 * It registers two actions through the `registerAction` callback it is given:
 *  - `beforeStart`: appends extra source rules (lazy-loaded images, posters,
 *    inline background images, preload links) to `options.sources`.
 *  - `generateFilename`: maps Next.js optimized-image URLs to stable names
 *    under `_next/optimized/` and rewrites path segments ending in `.app`.
 */
class PortfolioPlugin {
  /**
   * Rules added on top of whatever the caller configured.
   *
   * `data-nimg` is intentionally NOT listed: Next.js uses it as a marker
   * attribute (values such as "1" or "fill"), not as a URL, so treating it as
   * a source would make the scraper resolve that marker value as a link.
   */
  private static readonly extraSources: readonly SourceRule[] = [
    { selector: "img", attr: "data-src" },
    { selector: "img", attr: "data-srcset" },
    { selector: "video", attr: "poster" },
    { selector: "source", attr: "data-srcset" },
    { selector: '[style*="background-image"]', attr: "style" },
    { selector: 'link[as="font"]', attr: "href" },
    { selector: 'link[as="image"]', attr: "href" },
    { selector: 'link[as="style"]', attr: "href" },
    { selector: 'link[as="script"]', attr: "href" },
  ];

  /**
   * Derives a local filename for a Next.js optimized image URL
   * (`/_next/image?url=...&w=...`).
   *
   * @param resourceUrl Absolute URL of the optimized-image resource.
   * @returns `_next/optimized/<basename>-<width><ext>`, or `null` when the URL
   *          cannot be parsed or carries no `url` parameter.
   *
   * NOTE(review): the name is built from the basename and width only, so two
   * source images with the same basename in different folders map to the same
   * file.
   */
  private static optimizedImageFilename(resourceUrl: string): string | null {
    try {
      const params = new URL(resourceUrl).searchParams;
      const originalUrl = params.get("url");
      if (!originalUrl) return null;

      // Drop both query string and fragment so neither ends up in the extension.
      const [cleanPath = ""] = originalUrl.split(/[?#]/, 1);
      const ext = path.extname(cleanPath) || ".webp";
      const name = path.basename(cleanPath, ext);
      const width = params.get("w") || "auto";
      return `_next/optimized/${name}-${width}${ext}`;
    } catch {
      // Ignore invalid optimized image URLs; the caller keeps the incoming filename.
      return null;
    }
  }

  /**
   * CRITICAL MAC FIX: replaces a trailing `.app` with `-app` in every
   * `/`-separated segment, so that no directory or file name ends in `.app`
   * (which macOS would present as an Application Bundle).
   */
  private static sanitizeAppSegments(filePath: string): string {
    return filePath
      .split("/")
      .map((segment) => segment.replace(/\.app$/, "-app"))
      .join("/");
  }

  /**
   * Registers this plugin's actions.
   *
   * @param registerAction Callback taking an action name and a handler. It is
   *        left as `any` because its exact signature comes from the scraper
   *        library, whose typings are not visible here.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  apply(registerAction: any) {
    // 1. Add more sources before starting
    registerAction(
      "beforeStart",
      ({ options }: { options: { sources?: SourceRule[] } }) => {
        if (!options.sources) options.sources = [];
        const sources = options.sources;

        for (const rule of PortfolioPlugin.extraSources) {
          // Skip rules the caller already configured so none is registered twice.
          const alreadyPresent = sources.some(
            (existing) =>
              existing.selector === rule.selector && existing.attr === rule.attr,
          );
          if (!alreadyPresent) sources.push({ ...rule });
        }
      },
    );

    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction(
      "generateFilename",
      ({
        resource,
        filename,
      }: {
        resource: { getUrl(): string };
        // Presumably the name produced by the configured filename generator
        // that ran before this plugin; confirm against the scraper library.
        filename: string;
      }) => {
        const url = resource.getUrl();
        const optimized = url.includes("/_next/image")
          ? PortfolioPlugin.optimizedImageFilename(url)
          : null;

        return {
          filename: PortfolioPlugin.sanitizeAppSegments(optimized ?? filename),
        };
      },
    );
  }
}

/** Matches links whose path ends in a known static-asset extension (optionally followed by a query string). */
const ASSET_URL_PATTERN =
  /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i;

/**
 * Clones the website given on the command line into
 * `../cloned-websites/<name>` (relative to this script).
 *
 * CLI arguments:
 *  - `process.argv[2]`: URL to clone (required).
 *  - `process.argv[3]`: output folder name (optional; defaults to the URL's
 *    hostname with dots replaced by dashes).
 *
 * Any existing output folder is deleted first. The process exits with code 1
 * when the URL is missing or invalid, when the output folder would fall
 * outside `cloned-websites`, or when scraping fails.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error("Please provide a URL as an argument.");
    process.exit(1);
  }

  // Parse up front so a malformed URL yields a clear message instead of an
  // unhandled promise rejection.
  let domain: string;
  try {
    domain = new URL(url).hostname;
  } catch {
    console.error(`Invalid URL: ${url}`);
    process.exit(1);
  }

  // Sanitize top-level folder name for Mac
  const outputDirName = (
    process.argv[3] || domain.replace(/\./g, "-")
  ).replace(/\.app$/, "-app");

  const baseDir = path.resolve(__dirname, "../cloned-websites");
  const outputDir = path.resolve(baseDir, outputDirName);

  // The folder is removed recursively below, so refuse anything that is the
  // base folder itself or resolves outside of it (e.g. "..", ".", "/tmp").
  const relativeToBase = path.relative(baseDir, outputDir);
  const escapesBase =
    relativeToBase === "" ||
    relativeToBase === ".." ||
    relativeToBase.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToBase);
  if (escapesBase) {
    console.error(
      `Refusing to use an output directory outside ${baseDir}: ${outputDir}`,
    );
    process.exit(1);
  }

  // `force: true` makes this a no-op when the folder does not exist yet.
  fs.rmSync(outputDir, { recursive: true, force: true });

  /**
   * True when `link` points at the cloned host or one of its subdomains.
   * Comparing hostnames (rather than searching the whole link for the domain
   * text) keeps out third-party URLs that merely mention the domain, such as
   * share links carrying the site URL in their query string.
   */
  const isSameSite = (link: string): boolean => {
    try {
      const host = new URL(link, url).hostname;
      return host === domain || host.endsWith(`.${domain}`);
    } catch {
      return false;
    }
  };

  console.log(`Cloning ${url} to ${outputDir}...`);

  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          launchOptions: { headless: true, args: ["--no-sandbox"] },
          gotoOptions: { waitUntil: "networkidle0", timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: "img", attr: "src" },
        { selector: "img", attr: "srcset" },
        { selector: "img", attr: "data-src" },
        { selector: "img", attr: "data-srcset" },
        { selector: 'link[rel="stylesheet"]', attr: "href" },
        { selector: 'link[rel*="icon"]', attr: "href" },
        { selector: "script", attr: "src" },
        { selector: 'link[rel="preload"]', attr: "href" },
        { selector: 'link[rel="prefetch"]', attr: "href" },
        { selector: 'link[rel="modulepreload"]', attr: "href" },
        { selector: 'link[rel="apple-touch-icon"]', attr: "href" },
        { selector: 'link[rel="mask-icon"]', attr: "href" },
        { selector: "source", attr: "src" },
        { selector: "source", attr: "srcset" },
        { selector: "video", attr: "src" },
        { selector: "video", attr: "poster" },
        { selector: "audio", attr: "src" },
        { selector: "iframe", attr: "src" },
        { selector: 'meta[property="og:image"]', attr: "content" },
        { selector: 'meta[name="twitter:image"]', attr: "content" },
        { selector: "[style]", attr: "style" },
      ],
      urlFilter: (link: string) => {
        // Links that cannot be downloaded are always rejected.
        if (
          link.startsWith("data:") ||
          link.startsWith("mailto:") ||
          link.startsWith("tel:")
        ) {
          return false;
        }

        return (
          ASSET_URL_PATTERN.test(link) ||
          link.includes("/_next/") ||
          isSameSite(link) ||
          link.includes("googletagmanager.com") ||
          link.includes("analytics.mintel.me") ||
          link.includes("vercel.app")
        );
      },
      filenameGenerator: "bySiteStructure",
      // NOTE(review): confirm that `subdirectories` has any effect together
      // with the "bySiteStructure" generator; it may only apply to "byType".
      subdirectories: [
        {
          directory: "img",
          extensions: [".jpg", ".png", ".svg", ".webp", ".gif", ".ico"],
        },
        { directory: "js", extensions: [".js"] },
        { directory: "css", extensions: [".css"] },
        {
          directory: "fonts",
          extensions: [".woff", ".woff2", ".ttf", ".eot", ".otf"],
        },
        { directory: "videos", extensions: [".mp4", ".webm", ".mov", ".ogg"] },
      ],
    });

    console.log("✅ Website cloned successfully!");
    console.log(`Location: ${outputDir}`);
  } catch (error: unknown) {
    console.error("❌ Error cloning website:", error);
    process.exit(1);
  }
}

// Entry point: run the clone and make sure a rejected promise is reported
// and turned into a non-zero exit code instead of being left unhandled.
cloneWebsite().catch((error: unknown) => {
  console.error("❌ Error cloning website:", error);
  process.exit(1);
});