Some checks failed
🧪 CI (QA) / 🧪 Quality Assurance (push) Failing after 1m3s
- Restructure to pnpm monorepo (site moved to apps/web) - Integrate @mintel/tsconfig, @mintel/eslint-config, @mintel/husky-config - Implement Docker service architecture (Varnish, Directus, Gatekeeper) - Setup environment-aware Gitea Actions deployment
131 lines
4.9 KiB
TypeScript
131 lines
4.9 KiB
TypeScript
import { execFileSync, execSync } from 'node:child_process';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import { PlaywrightCrawler, RequestQueue } from 'crawlee';
|
|
|
|
// Recreate the CommonJS __filename/__dirname globals, which do not exist
// in ES modules (this file uses import.meta.url instead).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
|
|
|
|
/**
|
|
* The Ultimate Website Cloner
|
|
* Uses Crawlee for discovery and single-file-cli for perfect page capture.
|
|
*/
|
|
async function cloneWebsite() {
|
|
const targetUrl = process.argv[2];
|
|
if (!targetUrl) {
|
|
console.error('Please provide a URL as an argument.');
|
|
process.exit(1);
|
|
}
|
|
|
|
const urlObj = new URL(targetUrl);
|
|
const domain = urlObj.hostname;
|
|
const outputDirName = process.argv[3] || domain.replace(/\./g, '-');
|
|
const baseOutputDir = path.resolve(__dirname, '../cloned-websites', outputDirName);
|
|
|
|
if (fs.existsSync(baseOutputDir)) {
|
|
fs.rmSync(baseOutputDir, { recursive: true, force: true });
|
|
}
|
|
fs.mkdirSync(baseOutputDir, { recursive: true });
|
|
|
|
console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
|
|
console.log(`📂 Output: ${baseOutputDir}`);
|
|
|
|
const requestQueue = await RequestQueue.open();
|
|
await requestQueue.addRequest({ url: targetUrl });
|
|
|
|
const crawler = new PlaywrightCrawler({
|
|
requestQueue,
|
|
maxRequestsPerCrawl: 100,
|
|
maxConcurrency: 3, // SingleFile is resource intensive
|
|
|
|
async requestHandler({ request, enqueueLinks, log }) {
|
|
const url = request.url;
|
|
log.info(`Capturing ${url}...`);
|
|
|
|
// 1. Determine local path
|
|
const u = new URL(url);
|
|
let relPath = u.pathname;
|
|
if (relPath === '/' || relPath === '') relPath = '/index.html';
|
|
if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
|
|
if (relPath.startsWith('/')) relPath = relPath.substring(1);
|
|
|
|
const fullPath = path.join(baseOutputDir, relPath);
|
|
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
|
|
|
// 2. Use single-file-cli for perfect capture
|
|
// We use --back-links-rewrite=false because we handle link rewriting ourselves for better control
|
|
try {
|
|
execSync(`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`, {
|
|
stdio: 'inherit'
|
|
});
|
|
} catch (e) {
|
|
log.error(`Failed to capture ${url} with SingleFile`);
|
|
}
|
|
|
|
// 3. Enqueue subpages (discovery)
|
|
// We use a separate lightweight crawl for link discovery
|
|
await enqueueLinks({
|
|
strategy: 'same-domain',
|
|
transformRequestFunction: (req) => {
|
|
if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
|
|
return req;
|
|
}
|
|
});
|
|
},
|
|
});
|
|
|
|
await crawler.run();
|
|
|
|
// 4. Post-processing: Rewrite links between the captured files
|
|
console.log('🔗 Rewriting internal links for offline navigation...');
|
|
const allFiles = getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
|
|
|
|
for (const file of allFiles) {
|
|
let content = fs.readFileSync(file, 'utf8');
|
|
const fileRelToRoot = path.relative(baseOutputDir, file);
|
|
|
|
// Simple but effective regex for internal links
|
|
content = content.replace(/href="([^"]+)"/g, (match, href) => {
|
|
if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
|
|
try {
|
|
const linkUrl = new URL(href, urlObj.href);
|
|
if (linkUrl.hostname === domain) {
|
|
let linkPath = linkUrl.pathname;
|
|
if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
|
|
if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
|
|
if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);
|
|
|
|
const relativeLink = path.relative(path.dirname(fileRelToRoot), linkPath);
|
|
return `href="${relativeLink}"`;
|
|
}
|
|
} catch (e) {}
|
|
}
|
|
return match;
|
|
});
|
|
|
|
fs.writeFileSync(file, content);
|
|
}
|
|
|
|
console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
|
|
}
|
|
|
|
function getFiles(dir: string, fileList: string[] = []) {
|
|
const files = fs.readdirSync(dir);
|
|
for (const file of files) {
|
|
const name = path.join(dir, file);
|
|
if (fs.statSync(name).isDirectory()) {
|
|
getFiles(name, fileList);
|
|
} else {
|
|
fileList.push(name);
|
|
}
|
|
}
|
|
return fileList;
|
|
}
|
|
|
|
cloneWebsite().catch(err => {
|
|
console.error('❌ Fatal error:', err);
|
|
process.exit(1);
|
|
});
|