// Recursive website cloner: crawls a site with crawlee's PlaywrightCrawler,
// snapshots each page via single-file-cli, and rewrites internal links so the
// copy is browsable offline.
import { execFileSync, execSync } from "node:child_process";
import * as fs from "node:fs";
import * as path from "node:path";

import { PlaywrightCrawler, RequestQueue } from "crawlee";
/** Configuration for {@link WebsiteCloner}. */
export interface WebsiteClonerOptions {
  /** Root directory under which each clone's output folder is created. */
  baseOutputDir: string;
  /** Upper bound on pages crawled per run. Defaults to 100. */
  maxRequestsPerCrawl?: number;
  /** Maximum concurrent crawler sessions. Defaults to 3. */
  maxConcurrency?: number;
}
export class WebsiteCloner {
|
|
private options: WebsiteClonerOptions;
|
|
|
|
constructor(options: WebsiteClonerOptions) {
|
|
this.options = {
|
|
maxRequestsPerCrawl: 100,
|
|
maxConcurrency: 3,
|
|
...options,
|
|
};
|
|
}
|
|
|
|
public async clone(
|
|
targetUrl: string,
|
|
outputDirName?: string,
|
|
): Promise<string> {
|
|
const urlObj = new URL(targetUrl);
|
|
const domain = urlObj.hostname;
|
|
const finalOutputDirName = outputDirName || domain.replace(/\./g, "-");
|
|
const baseOutputDir = path.resolve(
|
|
this.options.baseOutputDir,
|
|
finalOutputDirName,
|
|
);
|
|
|
|
if (fs.existsSync(baseOutputDir)) {
|
|
fs.rmSync(baseOutputDir, { recursive: true, force: true });
|
|
}
|
|
fs.mkdirSync(baseOutputDir, { recursive: true });
|
|
|
|
console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
|
|
console.log(`📂 Output: ${baseOutputDir}`);
|
|
|
|
const requestQueue = await RequestQueue.open();
|
|
await requestQueue.addRequest({ url: targetUrl });
|
|
|
|
const crawler = new PlaywrightCrawler({
|
|
requestQueue,
|
|
maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
|
|
maxConcurrency: this.options.maxConcurrency,
|
|
|
|
async requestHandler({ request, enqueueLinks, log }) {
|
|
const url = request.url;
|
|
log.info(`Capturing ${url}...`);
|
|
|
|
const u = new URL(url);
|
|
let relPath = u.pathname;
|
|
if (relPath === "/" || relPath === "") relPath = "/index.html";
|
|
if (!relPath.endsWith(".html") && !path.extname(relPath))
|
|
relPath += "/index.html";
|
|
if (relPath.startsWith("/")) relPath = relPath.substring(1);
|
|
|
|
const fullPath = path.join(baseOutputDir, relPath);
|
|
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
|
|
|
try {
|
|
// Note: This assumes single-file-cli is available in the environment
|
|
execSync(
|
|
`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`,
|
|
{
|
|
stdio: "inherit",
|
|
},
|
|
);
|
|
} catch (_e) {
|
|
log.error(`Failed to capture ${url} with SingleFile`);
|
|
}
|
|
|
|
await enqueueLinks({
|
|
strategy: "same-domain",
|
|
transformRequestFunction: (req) => {
|
|
if (
|
|
/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(
|
|
req.url,
|
|
)
|
|
)
|
|
return false;
|
|
return req;
|
|
},
|
|
});
|
|
},
|
|
});
|
|
|
|
await crawler.run();
|
|
|
|
console.log("🔗 Rewriting internal links for offline navigation...");
|
|
const allFiles = this.getFiles(baseOutputDir).filter((f) =>
|
|
f.endsWith(".html"),
|
|
);
|
|
|
|
for (const file of allFiles) {
|
|
let content = fs.readFileSync(file, "utf8");
|
|
const fileRelToRoot = path.relative(baseOutputDir, file);
|
|
|
|
content = content.replace(/href="([^"]+)"/g, (match, href) => {
|
|
if (
|
|
href.startsWith(targetUrl) ||
|
|
href.startsWith("/") ||
|
|
(!href.includes("://") && !href.startsWith("data:"))
|
|
) {
|
|
try {
|
|
const linkUrl = new URL(href, targetUrl);
|
|
if (linkUrl.hostname === domain) {
|
|
let linkPath = linkUrl.pathname;
|
|
if (linkPath === "/" || linkPath === "") linkPath = "/index.html";
|
|
if (!linkPath.endsWith(".html") && !path.extname(linkPath))
|
|
linkPath += "/index.html";
|
|
if (linkPath.startsWith("/")) linkPath = linkPath.substring(1);
|
|
|
|
const relativeLink = path.relative(
|
|
path.dirname(fileRelToRoot),
|
|
linkPath,
|
|
);
|
|
return `href="${relativeLink}"`;
|
|
}
|
|
} catch (_e) {
|
|
// Ignore link rewriting failures
|
|
}
|
|
}
|
|
return match;
|
|
});
|
|
|
|
fs.writeFileSync(file, content);
|
|
}
|
|
|
|
console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
|
|
return baseOutputDir;
|
|
}
|
|
|
|
private getFiles(dir: string, fileList: string[] = []) {
|
|
const files = fs.readdirSync(dir);
|
|
for (const file of files) {
|
|
const name = path.join(dir, file);
|
|
if (fs.statSync(name).isDirectory()) {
|
|
this.getFiles(name, fileList);
|
|
} else {
|
|
fileList.push(name);
|
|
}
|
|
}
|
|
return fileList;
|
|
}
|
|
}
|