chore: optimize cms startup, refactor scripts and implement real-time dev mode

This commit is contained in:
2026-02-16 18:10:52 +01:00
parent 67750c886e
commit 8f32c80801
33 changed files with 1185 additions and 618 deletions

View File

@@ -1,123 +1,150 @@
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import * as path from 'node:path';
import * as fs from 'node:fs';
import { execSync } from 'node:child_process';
import { PlaywrightCrawler, RequestQueue } from "crawlee";
import * as path from "node:path";
import * as fs from "node:fs";
import { execSync } from "node:child_process";
export interface WebsiteClonerOptions {
baseOutputDir: string;
maxRequestsPerCrawl?: number;
maxConcurrency?: number;
baseOutputDir: string;
maxRequestsPerCrawl?: number;
maxConcurrency?: number;
}
export class WebsiteCloner {
private options: WebsiteClonerOptions;
private options: WebsiteClonerOptions;
constructor(options: WebsiteClonerOptions) {
this.options = {
maxRequestsPerCrawl: 100,
maxConcurrency: 3,
...options
};
constructor(options: WebsiteClonerOptions) {
this.options = {
maxRequestsPerCrawl: 100,
maxConcurrency: 3,
...options,
};
}
public async clone(
targetUrl: string,
outputDirName?: string,
): Promise<string> {
const urlObj = new URL(targetUrl);
const domain = urlObj.hostname;
const finalOutputDirName = outputDirName || domain.replace(/\./g, "-");
const baseOutputDir = path.resolve(
this.options.baseOutputDir,
finalOutputDirName,
);
if (fs.existsSync(baseOutputDir)) {
fs.rmSync(baseOutputDir, { recursive: true, force: true });
}
fs.mkdirSync(baseOutputDir, { recursive: true });
public async clone(targetUrl: string, outputDirName?: string): Promise<string> {
const urlObj = new URL(targetUrl);
const domain = urlObj.hostname;
const finalOutputDirName = outputDirName || domain.replace(/\./g, '-');
const baseOutputDir = path.resolve(this.options.baseOutputDir, finalOutputDirName);
console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
console.log(`📂 Output: ${baseOutputDir}`);
if (fs.existsSync(baseOutputDir)) {
fs.rmSync(baseOutputDir, { recursive: true, force: true });
}
fs.mkdirSync(baseOutputDir, { recursive: true });
const requestQueue = await RequestQueue.open();
await requestQueue.addRequest({ url: targetUrl });
console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
console.log(`📂 Output: ${baseOutputDir}`);
const crawler = new PlaywrightCrawler({
requestQueue,
maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
maxConcurrency: this.options.maxConcurrency,
const requestQueue = await RequestQueue.open();
await requestQueue.addRequest({ url: targetUrl });
async requestHandler({ request, enqueueLinks, log }) {
const url = request.url;
log.info(`Capturing ${url}...`);
const crawler = new PlaywrightCrawler({
requestQueue,
maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
maxConcurrency: this.options.maxConcurrency,
const u = new URL(url);
let relPath = u.pathname;
if (relPath === "/" || relPath === "") relPath = "/index.html";
if (!relPath.endsWith(".html") && !path.extname(relPath))
relPath += "/index.html";
if (relPath.startsWith("/")) relPath = relPath.substring(1);
async requestHandler({ request, enqueueLinks, log }) {
const url = request.url;
log.info(`Capturing ${url}...`);
const fullPath = path.join(baseOutputDir, relPath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
const u = new URL(url);
let relPath = u.pathname;
if (relPath === '/' || relPath === '') relPath = '/index.html';
if (!relPath.endsWith('.html') && !path.extname(relPath)) relPath += '/index.html';
if (relPath.startsWith('/')) relPath = relPath.substring(1);
const fullPath = path.join(baseOutputDir, relPath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
try {
// Note: This assumes single-file-cli is available in the environment
execSync(`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`, {
stdio: 'inherit'
});
} catch (e) {
log.error(`Failed to capture ${url} with SingleFile`);
}
await enqueueLinks({
strategy: 'same-domain',
transformRequestFunction: (req) => {
if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
return req;
}
});
try {
// Note: This assumes single-file-cli is available in the environment
execSync(
`npx single-file-cli "${url}" "${fullPath}" --browser-headless=true --browser-wait-until=networkidle0`,
{
stdio: "inherit",
},
);
} catch (_e) {
log.error(`Failed to capture ${url} with SingleFile`);
}
await enqueueLinks({
strategy: "same-domain",
transformRequestFunction: (req) => {
if (
/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(
req.url,
)
)
return false;
return req;
},
});
},
});
await crawler.run();
await crawler.run();
console.log('🔗 Rewriting internal links for offline navigation...');
const allFiles = this.getFiles(baseOutputDir).filter(f => f.endsWith('.html'));
console.log("🔗 Rewriting internal links for offline navigation...");
const allFiles = this.getFiles(baseOutputDir).filter((f) =>
f.endsWith(".html"),
);
for (const file of allFiles) {
let content = fs.readFileSync(file, 'utf8');
const fileRelToRoot = path.relative(baseOutputDir, file);
for (const file of allFiles) {
let content = fs.readFileSync(file, "utf8");
const fileRelToRoot = path.relative(baseOutputDir, file);
content = content.replace(/href="([^"]+)"/g, (match, href) => {
if (href.startsWith(targetUrl) || href.startsWith('/') || (!href.includes('://') && !href.startsWith('data:'))) {
try {
const linkUrl = new URL(href, targetUrl);
if (linkUrl.hostname === domain) {
let linkPath = linkUrl.pathname;
if (linkPath === '/' || linkPath === '') linkPath = '/index.html';
if (!linkPath.endsWith('.html') && !path.extname(linkPath)) linkPath += '/index.html';
if (linkPath.startsWith('/')) linkPath = linkPath.substring(1);
content = content.replace(/href="([^"]+)"/g, (match, href) => {
if (
href.startsWith(targetUrl) ||
href.startsWith("/") ||
(!href.includes("://") && !href.startsWith("data:"))
) {
try {
const linkUrl = new URL(href, targetUrl);
if (linkUrl.hostname === domain) {
let linkPath = linkUrl.pathname;
if (linkPath === "/" || linkPath === "") linkPath = "/index.html";
if (!linkPath.endsWith(".html") && !path.extname(linkPath))
linkPath += "/index.html";
if (linkPath.startsWith("/")) linkPath = linkPath.substring(1);
const relativeLink = path.relative(path.dirname(fileRelToRoot), linkPath);
return `href="${relativeLink}"`;
}
} catch (e) { }
}
return match;
});
fs.writeFileSync(file, content);
}
console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
return baseOutputDir;
}
private getFiles(dir: string, fileList: string[] = []) {
const files = fs.readdirSync(dir);
for (const file of files) {
const name = path.join(dir, file);
if (fs.statSync(name).isDirectory()) {
this.getFiles(name, fileList);
} else {
fileList.push(name);
const relativeLink = path.relative(
path.dirname(fileRelToRoot),
linkPath,
);
return `href="${relativeLink}"`;
}
} catch (_e) {
// Ignore link rewriting failures
}
}
return fileList;
return match;
});
fs.writeFileSync(file, content);
}
console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
return baseOutputDir;
}
private getFiles(dir: string, fileList: string[] = []) {
const files = fs.readdirSync(dir);
for (const file of files) {
const name = path.join(dir, file);
if (fs.statSync(name).isDirectory()) {
this.getFiles(name, fileList);
} else {
fileList.push(name);
}
}
return fileList;
}
}