Files
mintel.me/apps/web/scripts/clone-website.ts
Marc Mintel 103d71851c
Some checks failed
🧪 CI (QA) / 🧪 Quality Assurance (push) Failing after 1m3s
chore: overhaul infrastructure and integrate @mintel packages
- Restructure to pnpm monorepo (site moved to apps/web)
- Integrate @mintel/tsconfig, @mintel/eslint-config, @mintel/husky-config
- Implement Docker service architecture (Varnish, Directus, Gatekeeper)
- Setup environment-aware Gitea Actions deployment
2026-02-05 14:18:51 +01:00

151 lines
6.1 KiB
TypeScript

import scrape from 'website-scraper';
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'path';
import { fileURLToPath } from 'url';
import fs from 'fs';
// ES modules do not provide the CommonJS __filename/__dirname globals, so
// both are derived from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
// Directory containing this script; used as the anchor for resolving the
// clone output directory.
const __dirname = path.dirname(__filename);
/**
 * Custom website-scraper plugin to handle Next.js and Mac-specific path issues.
 *
 * It registers two actions:
 *  - `beforeStart`: appends extra `{ selector, attr }` source rules (lazy-load
 *    attributes, preload links, inline background images) to `options.sources`.
 *  - `generateFilename`: maps Next.js optimized-image URLs
 *    (`/_next/image?url=...&w=...`) to `_next/optimized/<name>-<width><ext>` and
 *    renames every path segment ending in `.app` to `-app`.
 */
class PortfolioPlugin {
  /**
   * Extra source rules added before the scrape starts.
   *
   * `img[data-nimg]` is intentionally NOT listed: Next.js uses that attribute as
   * a layout marker (e.g. "1" or "fill"), not as a URL, so treating it as a
   * source only produces bogus requests.
   */
  private static readonly extraSources: ReadonlyArray<{ selector: string; attr: string }> = [
    { selector: 'img', attr: 'data-src' },
    { selector: 'img', attr: 'data-srcset' },
    { selector: 'video', attr: 'poster' },
    { selector: 'source', attr: 'data-srcset' },
    { selector: '[style*="background-image"]', attr: 'style' },
    { selector: 'link[as="font"]', attr: 'href' },
    { selector: 'link[as="image"]', attr: 'href' },
    { selector: 'link[as="style"]', attr: 'href' },
    { selector: 'link[as="script"]', attr: 'href' },
  ];

  /**
   * Plugin entry point called by website-scraper.
   *
   * @param registerAction - Callback used to register a handler for a named action.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the scraper's action types are not visible here
  apply(registerAction: any) {
    // 1. Add more sources before starting
    registerAction('beforeStart', ({ options }: any) => {
      if (!options.sources) options.sources = [];
      for (const rule of PortfolioPlugin.extraSources) {
        // Skip rules the caller already configured so no element is processed twice.
        const alreadyPresent = options.sources.some(
          (existing: any) => existing.selector === rule.selector && existing.attr === rule.attr,
        );
        if (!alreadyPresent) options.sources.push({ ...rule });
      }
    });

    // 2. Sanitize filenames and handle Next.js optimized images
    registerAction('generateFilename', ({ resource, filename }: any) => {
      const resourceUrl: string = resource.getUrl();
      // NOTE(review): `filename` is presumably the name produced by the
      // previously registered filename generator — confirm against the scraper's
      // action chaining.
      let result: string = filename;

      // Handle Next.js optimized images: /_next/image?url=...&w=...
      if (resourceUrl.includes('/_next/image')) {
        try {
          const query = new URL(resourceUrl).searchParams;
          const originalUrl = query.get('url');
          if (originalUrl) {
            const [cleanPath = originalUrl] = originalUrl.split('?');
            const ext = path.extname(cleanPath) || '.webp';
            const name = path.basename(cleanPath, ext);
            // The width comes from the scraped page; accept digits only so a
            // crafted value such as "../../x" cannot inject path segments.
            const rawWidth = query.get('w') ?? '';
            const width = /^\d+$/.test(rawWidth) ? rawWidth : 'auto';
            result = `_next/optimized/${name}-${width}${ext}`;
          }
        } catch (error) {
          // Best effort: keep the default filename, but do not hide the failure.
          console.warn(
            `Could not parse Next.js image URL "${resourceUrl}"; keeping default filename.`,
            error,
          );
        }
      }

      // CRITICAL MAC FIX: Replace .app with -app in all paths to prevent hidden Application Bundles.
      // Splitting by / ensures only a trailing ".app" of a directory name or filename is replaced.
      // The match is case-insensitive (".APP" is renamed too) and keeps the original letter case.
      // NOTE(review): assumes macOS treats the bundle suffix case-insensitively — confirm.
      result = result
        .split('/')
        .map((segment: string) => segment.replace(/\.(app)$/i, '-$1'))
        .join('/');

      return { filename: result };
    });
  }
}
/**
 * CLI entry point: clones a website into `../cloned-websites/<name>` relative
 * to this script.
 *
 * Arguments (read from `process.argv`):
 *  - `argv[2]`: URL to clone (required).
 *  - `argv[3]`: output folder name (optional; defaults to the URL's hostname
 *    with dots replaced by dashes).
 *
 * An existing output folder is deleted before cloning. The process exits with
 * code 1 when the URL is missing or invalid, when the output folder would fall
 * outside the `cloned-websites` directory, or when scraping fails.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error('Please provide a URL as an argument.');
    return process.exit(1);
  }

  // Parse up front so a malformed URL yields a readable message instead of an
  // unhandled promise rejection.
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    console.error(`Invalid URL: ${url}`);
    return process.exit(1);
  }
  const domain = target.hostname;

  // Sanitize top-level folder name for Mac (a trailing ".app" would create an Application Bundle)
  const outputDirName = (process.argv[3] || domain.replace(/\./g, '-')).replace(/\.(app)$/i, '-$1');

  const outputRoot = path.resolve(__dirname, '../cloned-websites');
  const outputDir = path.resolve(outputRoot, outputDirName);

  // The directory is removed recursively below, so refuse anything that is not
  // strictly inside the clone root (e.g. "..", "../..", or an absolute path).
  const relativeToRoot = path.relative(outputRoot, outputDir);
  const escapesRoot =
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot);
  if (escapesRoot) {
    console.error(`Refusing to use an output directory outside ${outputRoot}: ${outputDir}`);
    return process.exit(1);
  }

  if (fs.existsSync(outputDir)) {
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`Cloning ${url} to ${outputDir}...`);

  const assetPattern =
    /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)(\?.*)?$/i;
  // Third-party hosts whose resources are always allowed through the filter.
  const extraAllowedHosts = ['googletagmanager.com', 'analytics.mintel.me', 'vercel.app'];
  // True when `host` is `suffix` itself or one of its subdomains.
  const hostMatches = (host: string, suffix: string): boolean =>
    host === suffix || host.endsWith(`.${suffix}`);

  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          // NOTE(review): '--no-sandbox' disables Chromium's sandbox; presumably
          // needed for container/CI runs — confirm before cloning untrusted sites.
          launchOptions: { headless: true, args: ['--no-sandbox'] },
          gotoOptions: { waitUntil: 'networkidle0', timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: 'img', attr: 'src' },
        { selector: 'img', attr: 'srcset' },
        { selector: 'img', attr: 'data-src' },
        { selector: 'img', attr: 'data-srcset' },
        { selector: 'link[rel="stylesheet"]', attr: 'href' },
        { selector: 'link[rel*="icon"]', attr: 'href' },
        { selector: 'script', attr: 'src' },
        { selector: 'link[rel="preload"]', attr: 'href' },
        { selector: 'link[rel="prefetch"]', attr: 'href' },
        { selector: 'link[rel="modulepreload"]', attr: 'href' },
        { selector: 'link[rel="apple-touch-icon"]', attr: 'href' },
        { selector: 'link[rel="mask-icon"]', attr: 'href' },
        { selector: 'source', attr: 'src' },
        { selector: 'source', attr: 'srcset' },
        { selector: 'video', attr: 'src' },
        { selector: 'video', attr: 'poster' },
        { selector: 'audio', attr: 'src' },
        { selector: 'iframe', attr: 'src' },
        { selector: 'meta[property="og:image"]', attr: 'content' },
        { selector: 'meta[name="twitter:image"]', attr: 'content' },
        { selector: '[style]', attr: 'style' },
      ],
      urlFilter: (link: string) => {
        // Resolve against the start URL so relative and protocol-relative links
        // are judged by the host they actually point to.
        let parsed: URL;
        try {
          parsed = new URL(link, target);
        } catch {
          return false;
        }
        // Only http(s) can be downloaded; this drops data:, mailto:, tel:, javascript:, ...
        if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
          return false;
        }

        const host = parsed.hostname;
        const isAsset = assetPattern.test(link);
        const isNextAsset = link.includes('/_next/');
        // Compare hostnames rather than substrings: a share link such as
        // https://twitter.com/share?url=https://<domain> must not count as same-site.
        const isSameDomain = hostMatches(host, domain);
        const isAllowedThirdParty = extraAllowedHosts.some((allowed) => hostMatches(host, allowed));

        return isAsset || isNextAsset || isSameDomain || isAllowedThirdParty;
      },
      filenameGenerator: 'bySiteStructure',
      // NOTE(review): `subdirectories` presumably only affects the 'byType'
      // filename generator — confirm it has any effect with 'bySiteStructure'.
      subdirectories: [
        { directory: 'img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
        { directory: 'js', extensions: ['.js'] },
        { directory: 'css', extensions: ['.css'] },
        { directory: 'fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot', '.otf'] },
        { directory: 'videos', extensions: ['.mp4', '.webm', '.mov', '.ogg'] },
      ],
    });
    console.log('✅ Website cloned successfully!');
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}
// Run the CLI. Safety net: report any rejection that escapes cloneWebsite and
// exit non-zero instead of leaving a floating promise.
cloneWebsite().catch((error: unknown) => {
  console.error('❌ Error cloning website:', error);
  process.exit(1);
});