feat(cloner): add cloner-library and finalize pdf-library rename
This commit is contained in:
93
packages/cloner-library/src/AssetManager.ts
Normal file
93
packages/cloner-library/src/AssetManager.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import axios from "axios";
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
/**
 * Lookup table keyed by the original (remote) URL of an asset.
 *
 * Within this package the values stored are the strings returned by
 * `AssetManager.downloadFile`, i.e. `./assets/...` references.
 */
export interface AssetMap {
  [originalUrl: string]: string;
}
|
||||
|
||||
/**
 * Downloads remote assets into a local directory tree (one sub-directory per
 * host name) and rewrites the references inside CSS text so they point at the
 * downloaded copies.
 */
export class AssetManager {
  private userAgent: string;

  /**
   * @param userAgent Value sent as the `User-Agent` header on every download.
   */
  constructor(userAgent: string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36") {
    this.userAgent = userAgent;
  }

  /**
   * Makes every `/`-separated segment of `rawPath` safe to use as a file name.
   *
   * Characters other than ASCII letters, digits, `.`, `_` and `-` become `_`.
   * Segments that are exactly `.` or `..` have their dots replaced as well so
   * that the result can never climb out of the directory it is joined onto.
   *
   * @param rawPath Path-like string, e.g. `hostname + pathname` of a URL.
   * @returns The sanitised path with the same number of segments.
   */
  public sanitizePath(rawPath: string): string {
    return rawPath
      .split("/")
      .map((segment) => {
        const cleaned = segment.replace(/[^a-z0-9._-]/gi, "_");
        return cleaned === "." || cleaned === ".." ? cleaned.replace(/\./g, "_") : cleaned;
      })
      .join("/");
  }

  /**
   * Downloads `url` into `assetsDir`, mirroring `<hostname>/<pathname>`.
   *
   * Protocol-relative URLs (`//host/...`) are fetched over https. The query
   * string is not part of the local file name. A file that already exists at
   * the destination is reused without a new request.
   *
   * @param url Absolute or protocol-relative http(s) URL.
   * @param assetsDir Directory that receives the downloaded file.
   * @returns `./assets/<relative path>` on success; `null` when the URL is not
   *   http(s), the response status is not 200, or any error occurs.
   */
  public async downloadFile(url: string, assetsDir: string): Promise<string | null> {
    const target = url.startsWith("//") ? `https:${url}` : url;
    if (!/^https?:\/\//i.test(target)) return null;

    try {
      const parsed = new URL(target);
      const relPath = this.sanitizePath(parsed.hostname + parsed.pathname);
      const dest = path.join(assetsDir, relPath);
      const localRef = `./assets/${relPath}`;

      if (fs.existsSync(dest)) {
        // Only a regular file is a usable cache hit; a directory at this path
        // (e.g. created for a deeper asset) can neither be reused nor replaced.
        return fs.statSync(dest).isFile() ? localRef : null;
      }

      const res = await axios.get(target, {
        responseType: "arraybuffer",
        headers: { "User-Agent": this.userAgent },
        timeout: 15000,
        // Never throw on HTTP status; the status is checked explicitly below.
        validateStatus: () => true,
      });

      if (res.status !== 200) return null;

      fs.mkdirSync(path.dirname(dest), { recursive: true });
      fs.writeFileSync(dest, Buffer.from(res.data));
      return localRef;
    } catch {
      // Best effort: an asset that cannot be fetched or stored is skipped.
      return null;
    }
  }

  /**
   * Downloads every `url(...)` / `@import "..."` target referenced by
   * `cssContent` and rewrites the references to paths relative to the local
   * copy of the stylesheet.
   *
   * `data:` and `blob:` references are left untouched, as are references whose
   * download fails. Despite its name this method does not itself descend into
   * imported stylesheets; `depth` only acts as a guard for callers that
   * recurse.
   *
   * @param cssContent Stylesheet text.
   * @param cssUrl Absolute URL the stylesheet was loaded from; used as the
   *   base for relative references.
   * @param assetsDir Directory passed through to {@link downloadFile}.
   * @param urlMap Receives `absolute URL -> local reference` for every asset
   *   that was downloaded (mutated in place).
   * @param depth Current nesting level; content is returned unchanged when it
   *   exceeds 5.
   * @returns The stylesheet text with rewritten references.
   */
  public async processCssRecursively(
    cssContent: string,
    cssUrl: string,
    assetsDir: string,
    urlMap: AssetMap,
    depth = 0,
  ): Promise<string> {
    if (depth > 5) return cssContent;

    // Group 1: opening token, group 2: the reference, group 3: closing token.
    const urlRegex = /(url\(["']?|@import\s+["'])([^"'\)]+)(["']?\)?)/gi;

    // Pass 1: resolve each distinct reference once.
    const replacements = new Map<string, string>();
    const visited = new Set<string>();
    let match: RegExpExecArray | null;

    while ((match = urlRegex.exec(cssContent)) !== null) {
      const originalUrl = match[2];
      if (visited.has(originalUrl)) continue;
      visited.add(originalUrl);

      if (originalUrl.startsWith("data:") || originalUrl.startsWith("blob:")) continue;

      try {
        const assetUrl = new URL(originalUrl, cssUrl);
        const local = await this.downloadFile(assetUrl.href, assetsDir);
        if (!local) continue;

        const sheetUrl = new URL(cssUrl);
        // CSS paths always use forward slashes, whatever the host OS.
        const rel = path.posix.relative(
          path.posix.dirname(this.sanitizePath(sheetUrl.hostname + sheetUrl.pathname)),
          this.sanitizePath(assetUrl.hostname + assetUrl.pathname),
        );

        replacements.set(originalUrl, rel);
        urlMap[assetUrl.href] = local;
      } catch {
        // Unparsable reference: leave it as written.
      }
    }

    if (replacements.size === 0) return cssContent;

    // Pass 2: rewrite in a single sweep so that a replacement is never
    // re-processed (a relative path may contain the original reference).
    return cssContent.replace(urlRegex, (whole: string, open: string, ref: string, close: string) => {
      const rel = replacements.get(ref);
      return rel === undefined ? whole : `${open}${rel}${close}`;
    });
  }
}
|
||||
184
packages/cloner-library/src/PageCloner.ts
Normal file
184
packages/cloner-library/src/PageCloner.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
import { chromium, Browser, BrowserContext, Page } from "playwright";
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import axios from "axios";
|
||||
import { AssetManager, AssetMap } from "./AssetManager.js";
|
||||
|
||||
/** Settings accepted by the `PageCloner` constructor. */
export interface PageClonerOptions {
  /** Root directory; each cloned site gets a sub-directory named after its host. */
  outputDir: string;
  /** `User-Agent` for the browser and asset downloads; a desktop Chrome string is used when omitted. */
  userAgent?: string;
}
|
||||
|
||||
/**
 * Clones a single rendered web page: loads it in headless Chromium, downloads
 * the assets it references, rewrites the HTML to use the local copies and
 * writes the result to disk.
 */
export class PageCloner {
  private options: PageClonerOptions;
  private assetManager: AssetManager;
  private userAgent: string;

  /**
   * @param options Output location and optional user agent.
   */
  constructor(options: PageClonerOptions) {
    this.options = options;
    this.userAgent = options.userAgent || "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
    this.assetManager = new AssetManager(this.userAgent);
  }

  /**
   * Clones `targetUrl` into `<outputDir>/<host without leading www.>/`.
   *
   * The HTML file is named after the URL path (segments joined with `-`, or
   * `index` for the root) and assets are stored below `assets/`.
   *
   * @param targetUrl Absolute URL of the page to clone.
   * @returns Absolute path of the written HTML file.
   * @throws If `targetUrl` is not a valid URL, or if the browser, navigation
   *   or final write fails. Individual asset failures are skipped.
   */
  public async clone(targetUrl: string): Promise<string> {
    const urlObj = new URL(targetUrl);
    // Strip only a leading "www." — not a "www." that appears mid-hostname.
    const domainSlug = urlObj.hostname.replace(/^www\./, "");
    const domainDir = path.resolve(this.options.outputDir, domainSlug);
    const assetsDir = path.join(domainDir, "assets");

    fs.mkdirSync(assetsDir, { recursive: true });

    let pageSlug = urlObj.pathname.split("/").filter(Boolean).join("-");
    if (!pageSlug) pageSlug = "index";
    const htmlFilename = `${pageSlug}.html`;

    console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

    const escapeRegExp = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    const browser = await chromium.launch({ headless: true });

    // Everything after launch runs inside try/finally so the browser process
    // is closed even when context or page creation fails.
    try {
      const context = await browser.newContext({
        userAgent: this.userAgent,
        viewport: { width: 1920, height: 1080 },
      });
      const page = await context.newPage();

      const urlMap: AssetMap = {};
      const foundAssets = new Set<string>();

      // Record every successfully loaded resource that looks like a static asset.
      page.on("response", (response) => {
        if (response.status() === 200) {
          const url = response.url();
          if (url.match(/\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i)) {
            foundAssets.add(url);
          }
        }
      });

      await page.goto(targetUrl, { waitUntil: "networkidle", timeout: 90000 });

      // Scroll Wave: step through the page to trigger lazy-loaded content.
      await page.evaluate(async () => {
        await new Promise<void>((resolve) => {
          const distance = 400;
          // Upper bound so pages that keep growing while scrolled
          // (infinite feeds) cannot stall the clone forever.
          const maxSteps = 500;
          let totalHeight = 0;
          let steps = 0;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;
            steps += 1;
            if (totalHeight >= scrollHeight || steps >= maxSteps) {
              clearInterval(timer);
              window.scrollTo(0, 0);
              resolve();
            }
          }, 100);
        });
      });

      // Grow the viewport to the full document height, then let the page settle.
      const fullHeight = await page.evaluate(() => document.body.scrollHeight);
      await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });
      await page.waitForTimeout(3000);

      // Sanitization: force hidden elements visible and promote asset-looking
      // attribute values into the attributes browsers actually load from.
      await page.evaluate(() => {
        const assetPattern = /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
        document.querySelectorAll("*").forEach((el) => {
          if (["META", "LINK", "HEAD", "SCRIPT", "STYLE", "SVG", "PATH"].includes(el.tagName)) return;
          const htmlEl = el as HTMLElement;
          const style = window.getComputedStyle(htmlEl);
          if (style.opacity === "0" || style.visibility === "hidden") {
            htmlEl.style.setProperty("opacity", "1", "important");
            htmlEl.style.setProperty("visibility", "visible", "important");
          }
          for (const attr of Array.from(el.attributes)) {
            const name = attr.name.toLowerCase();
            const val = attr.value;
            if (assetPattern.test(val) || name.includes("src") || name.includes("image")) {
              if (el.tagName === "IMG") {
                const img = el as HTMLImageElement;
                if (name.includes("srcset")) img.srcset = val;
                else if (!img.src || img.src.includes("data:")) img.src = val;
              }
              if (el.tagName === "SOURCE") (el as HTMLSourceElement).srcset = val;
              if (el.tagName === "VIDEO" || el.tagName === "AUDIO") (el as HTMLMediaElement).src = val;
              if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes("href")) {
                const bg = htmlEl.style.backgroundImage;
                if (!bg || bg === "none") htmlEl.style.backgroundImage = `url('${val}')`;
              }
            }
          }
        });
        if (document.body) {
          document.body.style.setProperty("opacity", "1", "important");
          document.body.style.setProperty("visibility", "visible", "important");
        }
      });

      await page.waitForLoadState("networkidle");
      await page.waitForTimeout(1000);

      const content = await page.content();

      // Collect asset references that appear in the serialised markup.
      const regexPatterns = [
        /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
        /url\(["']?([^"'\)]+)["']?\)/gi,
      ];

      for (const pattern of regexPatterns) {
        let match: RegExpExecArray | null;
        while ((match = pattern.exec(content)) !== null) {
          try {
            foundAssets.add(new URL(match[1], targetUrl).href);
          } catch {
            // Not resolvable to a URL: ignore this reference.
          }
        }
      }

      // Download every asset; stylesheets are additionally rewritten so the
      // resources they reference are local too.
      for (const url of foundAssets) {
        const local = await this.assetManager.downloadFile(url, assetsDir);
        if (!local) continue;

        urlMap[url] = local;
        const clean = url.split("?")[0];
        urlMap[clean] = local;

        if (clean.endsWith(".css")) {
          try {
            const { data } = await axios.get(url, {
              headers: { "User-Agent": this.userAgent },
              responseType: "text",
              // Same limit as asset downloads; without it a stalled server
              // would block the whole clone.
              timeout: 15000,
            });
            if (typeof data === "string") {
              const processedCss = await this.assetManager.processCssRecursively(data, url, assetsDir, urlMap);
              const cssUrl = new URL(url);
              const relPath = this.assetManager.sanitizePath(cssUrl.hostname + cssUrl.pathname);
              fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
            }
          } catch {
            // Best effort: keep the unprocessed stylesheet already on disk.
          }
        }
      }

      let finalContent = content;

      // Replace known URLs with local references, longest first so that a URL
      // with a query string wins over its query-less prefix.
      const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
      if (sortedUrls.length > 0) {
        const masterRegex = new RegExp(`(${sortedUrls.map(escapeRegExp).join("|")})`, "g");
        finalContent = finalContent.replace(masterRegex, (match) => urlMap[match] || match);
      }

      // Root-relative references into well-known asset directories.
      const commonDirs = ["/wp-content/", "/wp-includes/", "/assets/", "/static/", "/images/"];
      for (const dir of commonDirs) {
        const localDir = `./assets/${urlObj.hostname}${dir}`;
        finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`).split(`'${dir}`).join(`'${localDir}`).split(`(${dir}`).join(`(${localDir}`);
      }

      // Any remaining absolute URL on the cloned domain collapses to "./".
      const domainPattern = new RegExp(`https?://(www\\.)?${escapeRegExp(urlObj.hostname)}[^"']*`, "gi");
      finalContent = finalContent.replace(domainPattern, () => "./");

      // Drop inline scripts that mention analytics, tracking or lazy loading.
      finalContent = finalContent.replace(/<script\b[^>]*>([\s\S]*?)<\/script>/gi, (match, scriptContent) => {
        const lower = scriptContent.toLowerCase();
        return (lower.includes("google-analytics") || lower.includes("gtag") || lower.includes("fbq") || lower.includes("lazy") || lower.includes("tracker")) ? "" : match;
      });

      // Inject a stylesheet that disables animations, reveals lazy elements
      // and deactivates links in the offline copy.
      const headEnd = finalContent.indexOf("</head>");
      if (headEnd > -1) {
        const stabilityCss = `\n<style>* { transition: none !important; animation: none !important; scroll-behavior: auto !important; } [data-aos], .reveal, .lazypath, .lazy-load, [data-src] { opacity: 1 !important; visibility: visible !important; transform: none !important; clip-path: none !important; } img, video, iframe { max-width: 100%; display: block; } a { pointer-events: none; cursor: default; } </style>`;
        finalContent = finalContent.slice(0, headEnd) + stabilityCss + finalContent.slice(headEnd);
      }

      const finalPath = path.join(domainDir, htmlFilename);
      fs.writeFileSync(finalPath, finalContent);
      return finalPath;
    } finally {
      await browser.close();
    }
  }
}
|
||||
123
packages/cloner-library/src/WebsiteCloner.ts
Normal file
123
packages/cloner-library/src/WebsiteCloner.ts
Normal file
@@ -0,0 +1,123 @@
|
||||
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
|
||||
import * as path from 'node:path';
|
||||
import * as fs from 'node:fs';
|
||||
import { execSync } from 'node:child_process';
|
||||
|
||||
/** Settings accepted by the `WebsiteCloner` constructor. */
export interface WebsiteClonerOptions {
  /** Directory under which each cloned site gets its own sub-directory. */
  baseOutputDir: string;
  /** Upper bound on pages handled per crawl; the cloner defaults this to 100. */
  maxRequestsPerCrawl?: number;
  /** Maximum number of pages processed concurrently; the cloner defaults this to 3. */
  maxConcurrency?: number;
}
|
||||
|
||||
/**
 * Crawls a site with Crawlee's `PlaywrightCrawler`, saves every visited page
 * through the `single-file-cli` command line tool and finally rewrites
 * same-host links so the copy can be browsed offline.
 */
export class WebsiteCloner {
  private options: WebsiteClonerOptions;

  /**
   * @param options Crawl settings. `maxRequestsPerCrawl` defaults to 100 and
   *   `maxConcurrency` to 3 when omitted or `undefined`.
   */
  constructor(options: WebsiteClonerOptions) {
    this.options = {
      ...options,
      // `??` instead of spread-order defaults so an explicit `undefined`
      // does not wipe out the default.
      maxRequestsPerCrawl: options.maxRequestsPerCrawl ?? 100,
      maxConcurrency: options.maxConcurrency ?? 3,
    };
  }

  /**
   * Clones the site reachable from `targetUrl`.
   *
   * The output directory is removed first if it already exists.
   *
   * @param targetUrl Absolute URL the crawl starts from.
   * @param outputDirName Sub-directory of `baseOutputDir` to write into;
   *   defaults to the host name with dots replaced by dashes.
   * @returns Absolute path of the output directory.
   * @throws If `targetUrl` is not a valid URL or the crawl itself fails.
   *   A page that cannot be captured is logged and skipped.
   */
  public async clone(targetUrl: string, outputDirName?: string): Promise<string> {
    const urlObj = new URL(targetUrl);
    const domain = urlObj.hostname;
    const finalOutputDirName = outputDirName || domain.replace(/\./g, '-');
    const baseOutputDir = path.resolve(this.options.baseOutputDir, finalOutputDirName);

    if (fs.existsSync(baseOutputDir)) {
      fs.rmSync(baseOutputDir, { recursive: true, force: true });
    }
    fs.mkdirSync(baseOutputDir, { recursive: true });

    console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
    console.log(`📂 Output: ${baseOutputDir}`);

    const requestQueue = await RequestQueue.open();
    await requestQueue.addRequest({ url: targetUrl });

    const crawler = new PlaywrightCrawler({
      requestQueue,
      maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
      maxConcurrency: this.options.maxConcurrency,

      async requestHandler({ request, enqueueLinks, log }) {
        const url = request.url;
        log.info(`Capturing ${url}...`);

        const fullPath = path.join(baseOutputDir, WebsiteCloner.toLocalPath(new URL(url).pathname));
        fs.mkdirSync(path.dirname(fullPath), { recursive: true });

        try {
          // Note: This assumes single-file-cli is available in the environment.
          // Crawled URLs are untrusted, so both arguments are shell-quoted.
          // NOTE(review): execSync blocks the event loop while a page is captured.
          execSync(
            `npx single-file-cli ${WebsiteCloner.quoteShellArg(url)} ${WebsiteCloner.quoteShellArg(fullPath)} --browser-headless=true --browser-wait-until=networkidle0`,
            { stdio: 'inherit' },
          );
        } catch {
          log.error(`Failed to capture ${url} with SingleFile`);
        }

        await enqueueLinks({
          strategy: 'same-domain',
          // Skip links that point at downloads or static assets, not pages.
          transformRequestFunction: (req) => {
            if (/\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i.test(req.url)) return false;
            return req;
          },
        });
      },
    });

    await crawler.run();

    console.log('🔗 Rewriting internal links for offline navigation...');
    const htmlFiles = this.getFiles(baseOutputDir).filter((f) => f.endsWith('.html'));

    for (const file of htmlFiles) {
      const original = fs.readFileSync(file, 'utf8');
      const fileRelToRoot = path.relative(baseOutputDir, file);
      fs.writeFileSync(file, WebsiteCloner.rewriteHrefs(original, fileRelToRoot, targetUrl, domain));
    }

    console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
    return baseOutputDir;
  }

  /**
   * Maps a URL pathname to the relative (forward-slash) file path a page is
   * stored under: directory-like paths get an `index.html`, paths that already
   * have an extension are kept.
   */
  private static toLocalPath(pathname: string): string {
    let relPath = pathname;
    if (relPath === '' || relPath.endsWith('/')) {
      relPath += 'index.html';
    } else if (!relPath.endsWith('.html') && !path.posix.extname(relPath)) {
      relPath += '/index.html';
    }
    return path.posix.normalize(relPath).replace(/^\/+/, '');
  }

  /**
   * Quotes one argument for the shell `execSync` uses. POSIX shells get single
   * quotes (nothing is expanded inside them); Windows keeps double quotes.
   */
  private static quoteShellArg(arg: string): string {
    if (process.platform === 'win32') return `"${arg}"`;
    return `'${arg.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Rewrites `href="..."` attributes in `html` that point at `domain` into
   * paths relative to the file they appear in. In-page anchors and links to
   * other hosts or schemes are left untouched; fragments are preserved.
   */
  private static rewriteHrefs(html: string, fileRelToRoot: string, targetUrl: string, domain: string): string {
    // href values always use forward slashes, whatever the host OS.
    const fromDir = path.posix.dirname(fileRelToRoot.split(path.sep).join('/'));

    return html.replace(/href="([^"]+)"/g, (match: string, href: string) => {
      // A bare fragment refers to the current document, not to the start page.
      if (href.startsWith('#')) return match;

      const isCandidate =
        href.startsWith(targetUrl) ||
        href.startsWith('/') ||
        (!href.includes('://') && !href.startsWith('data:'));
      if (!isCandidate) return match;

      try {
        const linkUrl = new URL(href, targetUrl);
        if (linkUrl.hostname !== domain) return match;

        const relativeLink = path.posix.relative(fromDir, WebsiteCloner.toLocalPath(linkUrl.pathname));
        return `href="${relativeLink}${linkUrl.hash}"`;
      } catch {
        // Not parseable as a URL: keep the attribute as it was.
        return match;
      }
    });
  }

  /**
   * Recursively lists all files below `dir`.
   *
   * @param dir Directory to walk.
   * @param fileList Accumulator the found paths are appended to.
   * @returns `fileList`, containing the path of every non-directory entry.
   */
  private getFiles(dir: string, fileList: string[] = []): string[] {
    for (const entry of fs.readdirSync(dir)) {
      const name = path.join(dir, entry);
      if (fs.statSync(name).isDirectory()) {
        this.getFiles(name, fileList);
      } else {
        fileList.push(name);
      }
    }
    return fileList;
  }
}
|
||||
3
packages/cloner-library/src/index.ts
Normal file
3
packages/cloner-library/src/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export * from "./AssetManager.js";
|
||||
export * from "./PageCloner.js";
|
||||
export * from "./WebsiteCloner.js";
|
||||
Reference in New Issue
Block a user