This commit is contained in:
2026-02-01 00:07:10 +01:00
parent 813fb070a7
commit 470854aad4
484 changed files with 45981 additions and 5 deletions

210
scripts/clone-recursive.ts Normal file
View File

@@ -0,0 +1,210 @@
// @ts-ignore
import scrape from 'website-scraper';
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// ESM modules have no CommonJS __filename/__dirname; derive them from
// import.meta.url so default output paths can be resolved relative to this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * website-scraper plugin that maps each downloaded resource URL to a clean
 * local filename derived from its path: directory URLs become index.html,
 * extensionless HTML pages gain a .html suffix, and characters that are
 * invalid in filenames are replaced with '_'.
 *
 * NOTE(review): this class is declared but not referenced by the options
 * below (an inline FilenamePlugin is used instead) — confirm it is still needed.
 */
class CustomFilenameGeneratorPlugin {
  apply(registerAction: any) {
    registerAction('generateFilename', ({ resource }: any) => {
      const parsed = new URL(resource.url);
      const extension = path.extname(parsed.pathname);
      let localName = parsed.pathname;

      if (localName.endsWith('/')) {
        // Directory-style URL: store it as the directory's index page.
        localName += 'index.html';
      } else if (!extension && resource.isHtml()) {
        // Extensionless HTML page (e.g. /about) gets an explicit suffix.
        localName += '.html';
      }
      // Extensionless non-HTML resources are intentionally left as-is.

      // Drop the leading slash, then neutralize characters that are invalid
      // in filenames on common filesystems.
      if (localName.startsWith('/')) {
        localName = localName.substring(1);
      }
      localName = localName.replace(/[:*?"<>|]/g, '_');

      return { filename: localName };
    });
  }
}
/**
 * Parses CLI arguments, configures website-scraper with a Puppeteer renderer
 * and a custom filename plugin, then recursively clones the target site to disk.
 *
 * Usage: npm run clone-website <URL> [output-dir]
 * Side effects: deletes and recreates the output directory, writes the cloned
 * site to disk, and exits the process with code 1 on error.
 */
async function run() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Usage: npm run clone-website <URL> [output-dir]');
    process.exit(1);
  }

  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  // Hostnames can contain dots/ports etc.; map anything unusual to '_' so the
  // default output directory name is filesystem-safe.
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const outputDir = process.argv[3]
    ? path.resolve(process.cwd(), process.argv[3])
    : path.resolve(__dirname, '../cloned-websites', safeDomain);

  // Start from a clean slate so stale files from a previous clone never mix in.
  if (fs.existsSync(outputDir)) {
    console.log(`Cleaning existing directory: ${outputDir}`);
    fs.rmSync(outputDir, { recursive: true, force: true });
  }

  console.log(`🚀 Starting recursive clone of ${targetUrl}`);
  console.log(`📂 Output: ${outputDir}`);

  const options = {
    urls: [targetUrl],
    directory: outputDir,
    recursive: true,
    maxDepth: 10,
    plugins: [
      // Render each page in a real browser so JS-generated markup is captured.
      new PuppeteerPlugin({
        launchOptions: {
          headless: true,
          args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        },
        // Scroll to the bottom first to trigger lazy-loaded content.
        scrollToBottom: { timeout: 10000, viewportN: 10 },
        blockNavigation: false
      }),
      // Custom filename generation so the on-disk layout is controlled exactly
      // (avoids "https:/" folders and other scraper-default artifacts).
      new class FilenamePlugin {
        apply(registerAction: any) {
          registerAction('generateFilename', ({ resource }: any) => {
            const u = new URL(resource.url);
            let filename = u.pathname;
            // Directory URLs become index.html; extensionless internal links
            // are assumed to be folders (e.g. /about -> about/index.html).
            if (filename.endsWith('/')) filename += 'index.html';
            else if (!path.extname(filename) && resource.url.includes(domain)) filename += '/index.html';
            // External assets go under _external/<host>/<path> so they cannot
            // collide with same-named local files.
            // (Bug fix: this previously appended the literal text "$(unknown)"
            // instead of the resource path, which collapsed every external
            // asset from a host into a single colliding filename.)
            if (u.hostname !== domain) {
              filename = `_external/${u.hostname}${u.pathname}`;
            }
            // Sanitize each path segment for the local filesystem.
            filename = filename.split('/').map(part => part.replace(/[^a-z0-9._-]/gi, '_')).join('/');
            if (filename.startsWith('/')) filename = filename.substring(1);
            // The site root (empty path) maps to index.html.
            if (!filename || filename === 'index.html') return { filename: 'index.html' };
            return { filename };
          });
        }
      }
    ],
    // Only follow/download: pages on the target domain, anything that looks
    // like a static asset, common CDN-style asset paths, and Google Fonts.
    urlFilter: (url: string) => {
      const u = new URL(url);
      const isTargetDomain = u.hostname === domain;
      const isGoogleFonts = u.hostname.includes('fonts.googleapis.com') || u.hostname.includes('fonts.gstatic.com');
      const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json)$/i.test(u.pathname);
      // Fonts/CSS from CDNs often lack a file extension; fall back to path heuristics.
      const isCommonAsset = u.pathname.includes('/css/') || u.pathname.includes('/js/') || u.pathname.includes('/static/') || u.pathname.includes('/assets/');
      return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
    },
    // Attribute sources to download and rewrite, beyond the scraper defaults.
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'link[rel*="icon"]', attr: 'href' },
      { selector: 'link[rel="manifest"]', attr: 'href' },
      { selector: 'meta[property="og:image"]', attr: 'content' }
    ],
    request: {
      headers: {
        // Some sites block obvious bots; present a normal desktop Chrome UA.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
      }
    }
  };

  try {
    // @ts-ignore — website-scraper's type definitions don't cover this options shape.
    const result = await scrape(options);
    console.log(`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`);
    // Strip Next.js hydration scripts so the static copy doesn't try to
    // re-hydrate (which breaks images/links once the app server is absent).
    console.log('🧹 Sanitizing HTML files...');
    sanitizeHtmlFiles(outputDir);
    console.log(`open "${path.join(outputDir, 'index.html')}"`);
  } catch (error) {
    console.error('❌ Error cloning website:', error);
    process.exit(1);
  }
}
/**
 * Recursively walks `dir` and rewrites every .html file in place: removes the
 * Next.js serialized page data and hydration chunk/manifest script tags, and
 * injects an Inter font fallback before </head> when no Google Fonts link is
 * already present.
 */
function sanitizeHtmlFiles(dir: string) {
  for (const entry of fs.readdirSync(dir)) {
    const entryPath = path.join(dir, entry);

    if (fs.statSync(entryPath).isDirectory()) {
      sanitizeHtmlFiles(entryPath);
      continue;
    }
    if (!entry.endsWith('.html')) continue;

    let html = fs.readFileSync(entryPath, 'utf8');
    // Drop the serialized Next.js page data payload.
    html = html.replace(/<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi, '');
    // Drop the hydration chunk and build-manifest script tags.
    html = html.replace(/<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi, '');
    html = html.replace(/<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi, '');
    // Inject Inter as a safe default font when none is referenced.
    if (!html.includes('fonts.googleapis.com')) {
      const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap">`;
      const styleBlock = `<style>.body-font{font-family:'Inter',sans-serif;}.title-font{font-family:'Inter',sans-serif;}</style>`;
      html = html.replace('</head>', `${fontLink}${styleBlock}</head>`);
    }
    fs.writeFileSync(entryPath, html);
  }
}
run();

View File

@@ -0,0 +1,130 @@
import { execFileSync, execSync } from 'node:child_process';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { URL, fileURLToPath } from 'node:url';

import { PlaywrightCrawler, RequestQueue } from 'crawlee';
// ESM modules have no CommonJS __filename/__dirname; derive them from
// import.meta.url so output paths can be resolved relative to this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * The Ultimate Website Cloner
 * Uses Crawlee for discovery and single-file-cli for perfect page capture.
 *
 * Usage: <script> <URL> [output-dir-name]
 * Side effects: wipes and recreates the output directory, shells out to
 * single-file-cli once per page, and rewrites internal links in the saved
 * HTML so navigation works offline.
 */
async function cloneWebsite() {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a URL as an argument.');
    process.exit(1);
  }

  const urlObj = new URL(targetUrl);
  const domain = urlObj.hostname;
  const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
  const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);

  // Always start from a clean output directory.
  if (fs.existsSync(baseOutputDir)) {
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
  }
  fs.mkdirSync(baseOutputDir, { recursive: true });

  console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
  console.log(`📂 Output: ${baseOutputDir}`);

  const requestQueue = await RequestQueue.open();
  await requestQueue.addRequest({ url: targetUrl });

  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxRequestsPerCrawl: 100,
    maxConcurrency: 3, // SingleFile is resource intensive
    async requestHandler({ request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Capturing ${url}...`);

      // 1. Map the URL to a local path (/ -> index.html, /about -> about/index.html).
      const u = new URL(url);
      let relPath = u.pathname;
      if (relPath === '/' || relPath === '') relPath = '/index.html';
      if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
      if (relPath.startsWith('/')) relPath = relPath.substring(1);
      const fullPath = path.join(baseOutputDir, relPath);
      fs.mkdirSync(path.dirname(fullPath), { recursive: true });

      // 2. Capture the fully-rendered page with single-file-cli.
      // Security fix: the previous execSync call interpolated the URL into a
      // shell command string, so a URL containing quotes or shell metachars
      // could break out of (or inject into) the command. execFileSync passes
      // each value as a separate argv entry with no shell parsing.
      try {
        execFileSync(
          'npx',
          ['single-file-cli', url, fullPath, '--browser-headless=true', '--browser-wait-until=networkidle0'],
          { stdio: 'inherit' }
        );
      } catch (e) {
        log.error(`Failed to capture ${url} with SingleFile`);
      }

      // 3. Enqueue same-domain subpages for discovery, skipping obvious non-pages.
      await enqueueLinks({
        strategy: 'same-domain',
        transformRequestFunction: (req) => {
          if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
          return req;
        }
      });
    },
  });

  await crawler.run();

  // 4. Post-processing: rewrite internal hrefs between the captured files.
  console.log('🔗 Rewriting internal links for offline navigation...');
  const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
  for (const file of allFiles) {
    let content = fs.readFileSync(file, 'utf8');
    const fileRelToRoot = path.relative(baseOutputDir, file);
    // Simple but effective regex for internal links.
    content = content.replace(/href="([^"]+)"/g, (match, href) => {
      if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
        try {
          const linkUrl = new URL(href, urlObj.href);
          if (linkUrl.hostname === domain) {
            // Apply the same URL -> local path mapping used during capture.
            let linkPath = linkUrl.pathname;
            if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
            if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
            if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);
            // Normalize separators to '/' so hrefs stay valid URLs on Windows.
            const relativeLink = path
              .relative(path.dirname(fileRelToRoot), linkPath)
              .split(path.sep)
              .join('/');
            return `href="${relativeLink}"`;
          }
        } catch (e) {
          // Malformed href — leave it untouched.
        }
      }
      return match;
    });
    fs.writeFileSync(file, content);
  }

  console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
}
/**
 * Recursively collects the absolute paths of all regular files under `dir`.
 * Matching files are appended to `fileList`, which is also returned so the
 * function can be called without a seed array.
 */
function getFiles(dir: string, fileList: string[] = []) {
  const entries = fs.readdirSync(dir);
  entries.forEach((entry) => {
    const entryPath = path.join(dir, entry);
    if (fs.statSync(entryPath).isDirectory()) {
      // Descend into subdirectories, accumulating into the same list.
      getFiles(entryPath, fileList);
      return;
    }
    fileList.push(entryPath);
  });
  return fileList;
}
// Top-level entry point: surface any unhandled failure and exit non-zero.
void cloneWebsite().catch((error: unknown) => {
  console.error('❌ Fatal error:', error);
  process.exit(1);
});