import { PlaywrightCrawler, RequestQueue } from "crawlee";
import * as path from "node:path";
import * as fs from "node:fs";
import { execFileSync, execSync } from "node:child_process";

/** Configuration for {@link WebsiteCloner}. */
export interface WebsiteClonerOptions {
  /** Directory under which each cloned site gets its own sub-directory. */
  baseOutputDir: string;
  /** Upper bound on the number of pages visited per clone (default: 100). */
  maxRequestsPerCrawl?: number;
  /** Maximum number of pages handled in parallel by the crawler (default: 3). */
  maxConcurrency?: number;
}

/** Links whose path ends in one of these extensions are never enqueued. */
const SKIPPED_LINK_EXTENSIONS =
  /\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i;

/** Flags passed to single-file-cli for every capture. */
const SINGLE_FILE_FLAGS = [
  "--browser-headless=true",
  "--browser-wait-until=networkidle0",
];

/**
 * Maps a URL pathname to the file (relative to the clone root, using `/`
 * separators) that stores the captured page.
 *
 * Paths that are empty or end in `/` map to `<path>index.html`; paths without
 * a file extension map to `<path>/index.html`; anything else is used as-is.
 */
function urlPathToFilePath(pathname: string): string {
  let filePath = pathname;
  if (filePath === "" || filePath.endsWith("/")) {
    // Check the trailing slash first: "/v1.2/" has an "extension" of ".2"
    // but is still a directory-style URL.
    filePath += "index.html";
  } else if (!path.posix.extname(filePath)) {
    filePath += "/index.html";
  }
  return filePath.startsWith("/") ? filePath.substring(1) : filePath;
}

/** Returns true when the link points at an asset type that must not be crawled. */
function isSkippedLink(url: string): boolean {
  try {
    // Test the pathname so a query string ("file.pdf?dl=1") cannot hide the extension.
    return SKIPPED_LINK_EXTENSIONS.test(new URL(url).pathname);
  } catch {
    return SKIPPED_LINK_EXTENSIONS.test(url);
  }
}

/**
 * Saves `url` as a self-contained HTML file at `outputPath` using single-file-cli.
 *
 * The call is synchronous and blocks until the capture finishes. It assumes
 * `npx` and single-file-cli are available in the environment.
 *
 * @throws when the child process cannot be started or exits with a non-zero code.
 */
function captureWithSingleFile(url: string, outputPath: string): void {
  if (process.platform === "win32") {
    // npx is a .cmd shim on Windows and has to be launched through a shell,
    // so the original quoted command line is kept on this platform only.
    execSync(
      `npx single-file-cli "${url}" "${outputPath}" ${SINGLE_FILE_FLAGS.join(" ")}`,
      { stdio: "inherit" },
    );
    return;
  }
  // No shell is involved here: crawled URLs can contain `$(...)` or backticks,
  // which a POSIX shell would execute even inside double quotes.
  execFileSync(
    "npx",
    ["single-file-cli", url, outputPath, ...SINGLE_FILE_FLAGS],
    { stdio: "inherit" },
  );
}

/**
 * Recursively mirrors a website into a local directory, one self-contained
 * HTML file per page, and rewrites same-host links so the copy can be
 * browsed offline.
 */
export class WebsiteCloner {
  private readonly options: WebsiteClonerOptions;

  /**
   * @param options Output location and crawl limits; omitted limits default
   *   to 100 requests per crawl and a concurrency of 3.
   */
  constructor(options: WebsiteClonerOptions) {
    this.options = {
      maxRequestsPerCrawl: 100,
      maxConcurrency: 3,
      ...options,
    };
  }

  /**
   * Crawls `targetUrl` (following same-domain links), captures every visited
   * page with single-file-cli and rewrites internal links to relative paths.
   *
   * Any existing directory at the output location is deleted first.
   *
   * @param targetUrl Absolute URL at which the crawl starts.
   * @param outputDirName Sub-directory of `baseOutputDir` to write into;
   *   defaults to the target hostname with dots replaced by dashes.
   * @returns The absolute path of the directory containing the clone.
   * @throws TypeError when `targetUrl` is not a valid absolute URL.
   */
  public async clone(
    targetUrl: string,
    outputDirName?: string,
  ): Promise<string> {
    const domain = new URL(targetUrl).hostname;
    // `||` on purpose: an empty name must fall back to the hostname as well.
    const finalOutputDirName = outputDirName || domain.replace(/\./g, "-");
    const baseOutputDir = path.resolve(
      this.options.baseOutputDir,
      finalOutputDirName,
    );

    // Always start from an empty directory so stale pages never survive.
    if (fs.existsSync(baseOutputDir)) {
      fs.rmSync(baseOutputDir, { recursive: true, force: true });
    }
    fs.mkdirSync(baseOutputDir, { recursive: true });

    console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
    console.log(`📂 Output: ${baseOutputDir}`);

    // Remembers which URL each output file was captured from, so relative
    // links can later be resolved against the page they appeared on.
    const pageUrlByFile = new Map<string, string>();

    // A uniquely named queue per run: the default queue is shared, so a
    // repeated or concurrent clone() would see its start URL as already handled.
    const queueName = `website-cloner-${Date.now().toString(36)}-${Math.random()
      .toString(36)
      .slice(2, 10)}`;
    const requestQueue = await RequestQueue.open(queueName);

    try {
      await requestQueue.addRequest({ url: targetUrl });

      const crawler = new PlaywrightCrawler({
        requestQueue,
        maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
        maxConcurrency: this.options.maxConcurrency,
        async requestHandler({ request, enqueueLinks, log }) {
          const url = request.url;
          log.info(`Capturing ${url}...`);

          const fullPath = path.join(
            baseOutputDir,
            urlPathToFilePath(new URL(url).pathname),
          );
          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
          pageUrlByFile.set(fullPath, request.loadedUrl ?? url);

          try {
            captureWithSingleFile(url, fullPath);
          } catch {
            // Best effort: a failed capture must not stop the crawl.
            log.error(`Failed to capture ${url} with SingleFile`);
          }

          await enqueueLinks({
            strategy: "same-domain",
            transformRequestFunction: (req) =>
              isSkippedLink(req.url) ? false : req,
          });
        },
      });

      await crawler.run();
    } finally {
      // Named queues are not cleaned up automatically; remove this run's queue.
      await requestQueue.drop();
    }

    console.log("🔗 Rewriting internal links for offline navigation...");
    this.rewriteInternalLinks(baseOutputDir, targetUrl, domain, pageUrlByFile);

    console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
    return baseOutputDir;
  }

  /**
   * Rewrites every double-quoted `href` in the captured HTML files that points
   * at `domain` so it references the corresponding local file relatively.
   *
   * @param baseOutputDir Root directory of the clone.
   * @param targetUrl URL the crawl started from; used as the fallback base URL.
   * @param domain Hostname whose links are rewritten.
   * @param pageUrlByFile Source URL of each captured file, keyed by file path.
   */
  private rewriteInternalLinks(
    baseOutputDir: string,
    targetUrl: string,
    domain: string,
    pageUrlByFile: ReadonlyMap<string, string>,
  ): void {
    const htmlFiles = this.getFiles(baseOutputDir).filter((f) =>
      f.endsWith(".html"),
    );

    for (const file of htmlFiles) {
      const pageUrl = pageUrlByFile.get(file) ?? targetUrl;
      // Work with "/" separators throughout: the result ends up in an href.
      const pageDir = path.posix.dirname(
        path.relative(baseOutputDir, file).split(path.sep).join("/"),
      );

      const original = fs.readFileSync(file, "utf8");
      const rewritten = original.replace(
        /href="([^"]+)"/g,
        (match: string, href: string) => {
          // In-page anchors already work offline; leave them untouched.
          if (href.startsWith("#")) return match;

          const isCandidate =
            href.startsWith(targetUrl) ||
            href.startsWith("/") ||
            (!href.includes("://") && !href.startsWith("data:"));
          if (!isCandidate) return match;

          let linkUrl: URL;
          try {
            linkUrl = new URL(href, pageUrl);
          } catch {
            // Unparseable href: keep the original attribute.
            return match;
          }
          if (linkUrl.hostname !== domain) return match;

          const relativeLink = path.posix
            .relative(pageDir, urlPathToFilePath(linkUrl.pathname))
            .split("/")
            // File names keep the URL's percent-escapes literally, so the "%"
            // itself has to be escaped for the browser to find the file.
            .map(encodeURIComponent)
            .join("/");
          return `href="${relativeLink}${linkUrl.hash}"`;
        },
      );
      fs.writeFileSync(file, rewritten);
    }
  }

  /**
   * Recursively collects the paths of all non-directory entries below `dir`.
   *
   * @param dir Directory to walk.
   * @param fileList Accumulator that found paths are appended to.
   * @returns `fileList`, containing every file found.
   */
  private getFiles(dir: string, fileList: string[] = []): string[] {
    for (const entry of fs.readdirSync(dir)) {
      const entryPath = path.join(dir, entry);
      if (fs.statSync(entryPath).isDirectory()) {
        this.getFiles(entryPath, fileList);
      } else {
        fileList.push(entryPath);
      }
    }
    return fileList;
  }
}