crawler
This commit is contained in:
150
scripts/clone-website.ts
Normal file
150
scripts/clone-website.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
import scrape from 'website-scraper';
|
||||
import PuppeteerPlugin from 'website-scraper-puppeteer';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import fs from 'fs';
|
||||
|
||||
// This file is loaded as an ES module (it reads `import.meta.url`), so the
// CommonJS-style path constants are rebuilt by hand from the module URL.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = path.dirname(__filename);
|
||||
|
||||
/** A selector/attribute pair naming where asset URLs are found in scraped HTML. */
interface SourceRule {
  selector: string;
  attr: string;
}

/** The part of the `beforeStart` payload this plugin reads and mutates. */
interface BeforeStartParams {
  options: { sources?: SourceRule[] };
}

/** The part of the `generateFilename` payload this plugin reads. */
interface GenerateFilenameParams {
  resource: { getUrl(): string };
  /** Filename proposed so far; kept unless the resource is a Next.js optimized image. */
  filename: string;
}

/**
 * Source rules this plugin adds on top of whatever the caller configured.
 *
 * `img[data-nimg]` is deliberately not listed: Next.js uses that attribute as
 * a marker (values such as "1" or "fill"), not as a URL, so treating it as a
 * source makes the scraper request bogus relative URLs.
 */
const EXTRA_SOURCES: readonly SourceRule[] = [
  { selector: 'img', attr: 'data-src' },
  { selector: 'img', attr: 'data-srcset' },
  { selector: 'video', attr: 'poster' },
  { selector: 'source', attr: 'data-srcset' },
  { selector: '[style*="background-image"]', attr: 'style' },
  { selector: 'link[as="font"]', attr: 'href' },
  { selector: 'link[as="image"]', attr: 'href' },
  { selector: 'link[as="style"]', attr: 'href' },
  { selector: 'link[as="script"]', attr: 'href' },
];

/**
 * website-scraper plugin that
 *  1. registers additional asset sources before the crawl starts, and
 *  2. rewrites generated filenames: Next.js optimized-image URLs
 *     (`/_next/image?url=...&w=...`) get a readable name under
 *     `_next/optimized/`, and any path segment ending in `.app` is renamed so
 *     macOS does not present the folder as an application bundle.
 */
class PortfolioPlugin {
  /**
   * Registers this plugin's handlers with the scraper.
   *
   * @param registerAction Callback supplied by the scraper; called once per
   *   action name with the handler to run for that action. The handler
   *   parameter is typed `never` so handlers with any payload type are accepted.
   */
  apply(registerAction: (actionName: string, handler: (params: never) => unknown) => void): void {
    // 1. Add more sources before starting.
    // NOTE(review): confirm the scraper still reads `options.sources` after
    // `beforeStart` fires; otherwise these additions have no effect.
    registerAction('beforeStart', ({ options }: BeforeStartParams) => {
      if (!options.sources) options.sources = [];
      const sources = options.sources;
      for (const rule of EXTRA_SOURCES) {
        // Skip rules the caller already configured so no attribute is processed twice.
        const alreadyListed = sources.some(
          (existing) => existing.selector === rule.selector && existing.attr === rule.attr,
        );
        if (!alreadyListed) sources.push({ ...rule });
      }
    });

    // 2. Sanitize filenames and handle Next.js optimized images.
    registerAction('generateFilename', ({ resource, filename }: GenerateFilenameParams) => {
      const optimized = PortfolioPlugin.optimizedImageFilename(resource.getUrl());
      return { filename: PortfolioPlugin.neutralizeAppSuffix(optimized ?? filename) };
    });
  }

  /**
   * Builds `_next/optimized/<name>-<width><ext>` for a Next.js image-optimizer URL.
   *
   * NOTE(review): two source images sharing a basename and width map to the
   * same output name; include the source directory if that occurs in practice.
   *
   * @returns The replacement filename, or `undefined` when `url` is not an
   *   optimizer URL, cannot be parsed, or has no `url` query parameter.
   */
  private static optimizedImageFilename(url: string): string | undefined {
    if (!url.includes('/_next/image')) return undefined;

    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      // Best effort: report the problem and fall back to the default filename.
      console.warn(`Could not parse Next.js image URL; keeping the default filename: ${url}`);
      return undefined;
    }

    const originalUrl = parsed.searchParams.get('url');
    if (!originalUrl) return undefined;

    // Strip query string and fragment so neither leaks into the extension.
    const cleanPath = originalUrl.split(/[?#]/, 1)[0] ?? originalUrl;
    const ext = path.extname(cleanPath) || '.webp';
    const name = path.basename(cleanPath, ext);
    const width = parsed.searchParams.get('w') || 'auto';
    return `_next/optimized/${name}-${width}${ext}`;
  }

  /**
   * Replaces a trailing `.app` on every `/`-separated segment with `-app`.
   * macOS shows folders named `*.app` as application bundles, hiding their
   * contents. Matching is case-insensitive as a precaution and the original
   * letter case is preserved.
   */
  private static neutralizeAppSuffix(relativePath: string): string {
    return relativePath
      .split('/')
      .map((segment) => segment.replace(/\.(app)$/i, '-$1'))
      .join('/');
  }
}
|
||||
|
||||
/**
 * Command-line entry point: mirrors a website into
 * `../cloned-websites/<name>` (relative to this script).
 *
 * Arguments are read from `process.argv`:
 *  - `argv[2]` (required): the http(s) URL to clone.
 *  - `argv[3]` (optional): output folder name; defaults to the hostname with
 *    dots replaced by dashes.
 *
 * Any existing output folder is deleted first. The process exits with code 1
 * on a missing or invalid URL, on an output folder that would fall outside
 * `cloned-websites`, or when the scrape fails.
 */
async function cloneWebsite(): Promise<void> {
  const url = process.argv[2];
  if (!url) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }

  // Validate up front so a typo gives a clear message instead of an unhandled rejection.
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    console.error(`Invalid URL: ${url}`);
    process.exit(1);
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    console.error(`Only http(s) URLs can be cloned: ${url}`);
    process.exit(1);
  }
  const domain = target.hostname;

  // Sanitize folder names for Mac: a folder named `*.app` is shown as an application bundle.
  const outputDirName = (process.argv[3] || domain.replace(/\./g, '-'))
    .split('/')
    .map((segment) => segment.replace(/\.(app)$/i, '-$1'))
    .join('/');

  // The output folder is deleted recursively below, so refuse any name that
  // resolves to the clones root itself or outside it (`..`, absolute paths).
  const clonesRoot = path.resolve(__dirname, '../cloned-websites');
  const outputDir = path.resolve(clonesRoot, outputDirName);
  const relativeToRoot = path.relative(clonesRoot, outputDir);
  const escapesRoot =
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot);
  if (escapesRoot) {
    console.error(`Refusing to use an output directory outside ${clonesRoot}: ${outputDir}`);
    process.exit(1);
  }

  // `force: true` makes this a no-op when the folder does not exist yet.
  fs.rmSync(outputDir, { recursive: true, force: true });

  console.log(`Cloning ${url} to ${outputDir}...`);

  const assetPattern =
    /\.(js|css|jpg|jpeg|png|gif|svg|webp|woff|woff2|ttf|eot|otf|mp4|webm|mov|ogg|pdf|ico)$/i;
  const isHostOrSubdomain = (host: string, base: string): boolean =>
    host === base || host.endsWith(`.${base}`);

  try {
    await scrape({
      urls: [url],
      directory: outputDir,
      recursive: true,
      maxRecursiveDepth: 5,
      requestConcurrency: 10,
      plugins: [
        new PuppeteerPlugin({
          // NOTE(review): `--no-sandbox` turns off Chromium's sandbox; only clone sites you trust.
          launchOptions: { headless: true, args: ['--no-sandbox'] },
          gotoOptions: { waitUntil: 'networkidle0', timeout: 60000 },
          scrollToBottom: { timeout: 20000, viewportN: 20 },
        }),
        new PortfolioPlugin(),
      ],
      sources: [
        { selector: 'img', attr: 'src' },
        { selector: 'img', attr: 'srcset' },
        { selector: 'img', attr: 'data-src' },
        { selector: 'img', attr: 'data-srcset' },
        { selector: 'link[rel="stylesheet"]', attr: 'href' },
        { selector: 'link[rel*="icon"]', attr: 'href' },
        { selector: 'script', attr: 'src' },
        { selector: 'link[rel="preload"]', attr: 'href' },
        { selector: 'link[rel="prefetch"]', attr: 'href' },
        { selector: 'link[rel="modulepreload"]', attr: 'href' },
        { selector: 'link[rel="apple-touch-icon"]', attr: 'href' },
        { selector: 'link[rel="mask-icon"]', attr: 'href' },
        { selector: 'source', attr: 'src' },
        { selector: 'source', attr: 'srcset' },
        { selector: 'video', attr: 'src' },
        { selector: 'video', attr: 'poster' },
        { selector: 'audio', attr: 'src' },
        { selector: 'iframe', attr: 'src' },
        { selector: 'meta[property="og:image"]', attr: 'content' },
        { selector: 'meta[name="twitter:image"]', attr: 'content' },
        { selector: '[style]', attr: 'style' },
      ],
      // Decide per link whether to download it. Links are parsed (relative
      // ones resolve against the start URL) and compared by hostname and
      // pathname, so a foreign page that merely mentions our domain in its
      // query string is not crawled.
      urlFilter: (link: string): boolean => {
        let parsed: URL;
        try {
          parsed = new URL(link, target);
        } catch {
          return false; // Unparseable links cannot be fetched anyway.
        }
        // Drops data:, mailto:, tel:, javascript:, blob: and similar schemes.
        if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

        const host = parsed.hostname;
        const isAsset = assetPattern.test(parsed.pathname);
        const isNextAsset = parsed.pathname.includes('/_next/');
        const isSameDomain = isHostOrSubdomain(host, domain);
        const isGoogleTagManager = isHostOrSubdomain(host, 'googletagmanager.com');
        const isAnalytics = isHostOrSubdomain(host, 'analytics.mintel.me');
        const isVercelApp = isHostOrSubdomain(host, 'vercel.app');
        return isAsset || isNextAsset || isSameDomain || isGoogleTagManager || isAnalytics || isVercelApp;
      },
      filenameGenerator: 'bySiteStructure',
      subdirectories: [
        { directory: 'img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
        { directory: 'js', extensions: ['.js'] },
        { directory: 'css', extensions: ['.css'] },
        { directory: 'fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot', '.otf'] },
        { directory: 'videos', extensions: ['.mp4', '.webm', '.mov', '.ogg'] },
      ],
    });

    console.log('✅ Website cloned successfully!');
    console.log(`Location: ${outputDir}`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}
|
||||
|
||||
// Run the CLI. Any rejection that escapes cloneWebsite() is reported here and
// mapped to a failing exit code rather than left as an unhandled rejection.
cloneWebsite().catch((error: unknown) => {
  console.error('❌ Error cloning website:', error);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user