This commit is contained in:
2026-02-02 16:31:08 +01:00
parent 821a35f0fc
commit badb7b6141
526 changed files with 14833 additions and 5713 deletions

View File

@@ -1,197 +1,320 @@
import { chromium } from 'playwright';
import { chromium, type Page } from 'playwright';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
import axios from 'axios';
// ES modules do not provide __filename/__dirname, so both are derived from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
// Directory containing this script.
const __dirname = path.dirname(__filename);
// Desktop-Chrome-style User-Agent string, sent as the `User-Agent` header on asset downloads.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36';
/**
 * Makes a slash-separated path safe to use on the local filesystem.
 *
 * The path is split on `/`; inside each segment every character other than
 * ASCII letters, digits, `.`, `_` and `-` is replaced with `_`. The `/`
 * separators (including leading, trailing and doubled ones) are preserved.
 *
 * @param rawPath Slash-separated path, e.g. `hostname + pathname` of a URL.
 * @returns The same path with unsafe characters replaced segment by segment.
 */
function sanitizePath(rawPath: string) {
  const unsafeChars = /[^a-z0-9._-]/gi;
  const cleanedSegments: string[] = [];
  for (const segment of rawPath.split('/')) {
    cleanedSegments.push(segment.replace(unsafeChars, '_'));
  }
  return cleanedSegments.join('/');
}
/**
 * Downloads a single remote asset into `assetsDir`, mirroring it under
 * `<hostname>/<pathname>` (sanitized with `sanitizePath`).
 *
 * The query string and fragment are not part of the local path, so URLs that
 * differ only in those share one local file. A file that already exists on
 * disk is reused without a new request.
 *
 * This is best-effort: every failure (invalid URL, network error, non-200
 * status, filesystem error) yields `null` instead of throwing, so callers can
 * keep the original URL.
 *
 * @param url       Absolute `http(s)` URL, or a protocol-relative `//host/...` URL
 *                  (treated as `https`). Anything else is rejected.
 * @param assetsDir Directory on disk that receives the mirrored files.
 * @returns The reference `./assets/<relative path>` on success, otherwise `null`.
 */
async function downloadFile(url: string, assetsDir: string): Promise<string | null> {
  if (url.startsWith('//')) url = `https:${url}`;
  if (!url.startsWith('http')) return null;
  try {
    const u = new URL(url);
    const relPath = sanitizePath(u.hostname + u.pathname);
    // A directory-style URL ("https://host/" or ".../dir/") has no file name to
    // store the body under, so it cannot be mirrored as an asset.
    if (relPath.endsWith('/')) return null;

    const dest = path.join(assetsDir, relPath);
    const localRef = `./assets/${relPath}`;

    if (fs.existsSync(dest)) {
      // existsSync is also true for directories. Only a regular file counts as
      // an already-downloaded asset; a directory at this path (created while
      // mirroring a deeper asset) must not be reported as a successful download.
      return fs.statSync(dest).isFile() ? localRef : null;
    }

    const res = await axios.get(url, {
      responseType: 'arraybuffer',
      headers: { 'User-Agent': USER_AGENT },
      timeout: 15000,
      // Never throw on HTTP status; the status is checked explicitly below.
      validateStatus: () => true
    });
    if (res.status !== 200) return null;

    // `recursive: true` creates missing parents and is a no-op when they exist.
    fs.mkdirSync(path.dirname(dest), { recursive: true });
    fs.writeFileSync(dest, Buffer.from(res.data));
    return localRef;
  } catch {
    // Deliberate best-effort: the caller falls back to the original URL.
    return null;
  }
}
/**
 * Downloads the assets a stylesheet references and rewrites those references
 * to paths relative to the stylesheet's own mirrored location.
 *
 * References are found in `url(...)` and quoted `@import "..."` constructs.
 * `data:` and `blob:` references are left alone. Each distinct reference is
 * resolved against `cssUrl` and downloaded at most once; references whose
 * download fails keep their original text.
 *
 * Despite the name, this function does not descend into imported stylesheets;
 * `depth` only acts as a guard (content is returned unchanged when `depth > 5`).
 *
 * @param cssContent Text of the stylesheet.
 * @param cssUrl     Absolute URL the stylesheet was loaded from; used as the base
 *                   for relative references and to locate the stylesheet in the mirror.
 * @param assetsDir  Directory on disk that receives downloaded assets.
 * @param urlMap     Mutated in place: absolute asset URL -> local `./assets/...` reference.
 * @param depth      Nesting level supplied by the caller.
 * @returns The stylesheet text with successfully downloaded references rewritten.
 */
async function processCssRecursively(cssContent: string, cssUrl: string, assetsDir: string, urlMap: Record<string, string>, depth = 0): Promise<string> {
  if (depth > 5) return cssContent;

  // Folder of the stylesheet inside the mirror. Without a valid base URL no
  // reference can be resolved, so the content is returned untouched.
  let cssDir: string;
  try {
    const cssLocation = new URL(cssUrl);
    cssDir = path.posix.dirname(sanitizePath(cssLocation.hostname + cssLocation.pathname));
  } catch {
    return cssContent;
  }

  // Groups: (1) opening `url(` / `@import "`, (2) the reference, (3) closing quote/paren.
  // Capturing the delimiters lets the rewrite swap only the reference itself.
  const urlRegex = /(url\(["']?|@import\s+["'])([^"'\)]+)(["']?\)?)/gi;

  const rewrites = new Map<string, string>();
  const attempted = new Set<string>();

  // Pass 1: resolve and download every distinct reference once.
  let match: RegExpExecArray | null;
  while ((match = urlRegex.exec(cssContent)) !== null) {
    const originalUrl = match[2];
    if (attempted.has(originalUrl)) continue;
    attempted.add(originalUrl);
    if (originalUrl.startsWith('data:') || originalUrl.startsWith('blob:')) continue;
    try {
      const assetLocation = new URL(originalUrl, cssUrl);
      const local = await downloadFile(assetLocation.href, assetsDir);
      if (!local) continue;
      // Route from the folder containing the CSS to the asset. CSS paths always
      // use forward slashes, hence path.posix regardless of the host OS.
      const assetPath = sanitizePath(assetLocation.hostname + assetLocation.pathname);
      rewrites.set(originalUrl, path.posix.relative(cssDir, assetPath));
      urlMap[assetLocation.href] = local;
    } catch {
      // Best-effort: an unresolvable reference is simply left as written.
    }
  }

  if (rewrites.size === 0) return cssContent;

  // Pass 2: one replacement pass over the ORIGINAL text. Each reference is
  // rewritten exactly once, so a result such as "../fonts/x.woff" can never be
  // re-matched by "/fonts/x.woff" and turned into "..../fonts/x.woff", and text
  // outside url()/@import is never touched.
  return cssContent.replace(urlRegex, (whole: string, opening: string, reference: string, closing: string) => {
    const relative = rewrites.get(reference);
    return relative === undefined ? whole : `${opening}${relative}${closing}`;
  });
}
async function run() {
let rawUrl = process.argv[2];
const rawUrl = process.argv[2];
if (!rawUrl) {
console.error('Usage: npm run clone-page <URL>');
console.error('Usage: npm run clone-page <url>');
process.exit(1);
}
const targetUrl = rawUrl.trim().replace(/[;'"]+$/, '');
const targetUrl = rawUrl.trim();
const urlObj = new URL(targetUrl);
const domain = urlObj.hostname;
const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
// Setup Output Directories
const domainSlug = urlObj.hostname.replace('www.', '');
const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
const assetsDir = path.join(domainDir, 'assets');
if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true });
let slug = urlObj.pathname.split('/').filter(Boolean).join('-');
if (!slug) slug = 'index';
const htmlFilename = `${slug}.html`;
let pageSlug = urlObj.pathname.split('/').filter(Boolean).join('-');
if (!pageSlug) pageSlug = 'index';
const htmlFilename = `${pageSlug}.html`;
console.log(`🚀 CLONING: ${targetUrl}`);
console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext({
viewport: { width: 1920, height: 1080 },
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
});
// Start with a standard viewport, we will resize widely later
const context = await browser.newContext({ userAgent: USER_AGENT, viewport: { width: 1920, height: 1080 } });
const page = await context.newPage();
// Asset capture logic
page.on('response', async (response) => {
const urlMap: Record<string, string> = {};
const foundAssets = new Set<string>();
// 1. Live Network Interception
page.on('response', response => {
const url = response.url();
if (response.status() !== 200) return;
const u = new URL(url);
const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf|xml)$/i.test(u.pathname) ||
url.includes('wp-content') || url.includes('wp-includes') || url.includes('fonts.googleapis.com');
if (isAsset && url !== targetUrl) {
try {
const buffer = await response.body();
const sanitizedPath = u.pathname.replace(/^\//, '').split('/').map(p => p.replace(/[^a-z0-9._-]/gi, '_')).join('/');
const fileDest = path.join(assetsDir, u.hostname, sanitizedPath);
if (!fs.existsSync(path.dirname(fileDest))) fs.mkdirSync(path.dirname(fileDest), { recursive: true });
// We overwrite for now to ensure freshness
fs.writeFileSync(fileDest, buffer);
// If it's a CSS file, we might want to rewrite it later, but let's do it on the fly or after
} catch (e) { }
if (response.status() === 200) {
// Capture anything that looks like a static asset
if (url.match(/\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i)) {
foundAssets.add(url);
}
}
});
try {
await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });
console.log('🌐 Loading page...');
await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
// Comprehensive scroll
await page.evaluate(async () => {
await new Promise((resolve) => {
let totalHeight = 0, distance = 400, timer = setInterval(() => {
let scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if (totalHeight >= scrollHeight || totalHeight > 30000) {
clearInterval(timer);
window.scrollTo(0, 0);
resolve(null);
console.log('<27> Expanding Viewport to "Giant Mode" to force-trigger lazy loaders...');
// Measure content height and resize viewport to display EVERYTHING at once
const fullHeight = await page.evaluate(() => document.body.scrollHeight);
await page.setViewportSize({ width: 1920, height: fullHeight + 2000 });
// Wait a moment for IntersectionObservers to fire
await page.waitForTimeout(2000);
console.log('💧 Hydrating attributes and cleaning DOM...');
await page.evaluate(() => {
// A. Deterministic Attribute Hydration
// Scours every element for attributes that look like asset URLs and promotes them
const assetPattern = /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;
document.querySelectorAll('*').forEach(el => {
// 1. Force Visibility (Anti-Flicker)
const style = window.getComputedStyle(el);
if (style.opacity === '0' || style.visibility === 'hidden') {
el.style.setProperty('opacity', '1', 'important');
el.style.setProperty('visibility', 'visible', 'important');
}
// 2. Promote Data Attributes
for (const attr of Array.from(el.attributes)) {
const name = attr.name.toLowerCase();
const val = attr.value;
if (assetPattern.test(val) || name.includes('src') || name.includes('image')) {
// Standard Image/Video/Source promotion
if (el.tagName === 'IMG') {
if (name.includes('srcset')) el.srcset = val;
else if (!el.src || el.src.includes('data:')) el.src = val;
}
if (el.tagName === 'SOURCE') {
if (name.includes('srcset')) el.srcset = val;
}
if (el.tagName === 'VIDEO' || el.tagName === 'AUDIO') {
if (!el.src) el.src = val;
}
// Background Image Promotion
// Common patterns: data-bg, data-image-src, style="...url(...)..."
if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes('href')) {
const bg = el.style.backgroundImage;
if (!bg || bg === 'none') {
el.style.backgroundImage = `url('${val}')`;
}
}
}
}, 200);
}
});
});
console.log('⏳ Waiting for network idle...');
await page.waitForLoadState('networkidle');
await page.waitForTimeout(5000); // 5 seconds extra for lazy scripts
// 2. Static Snapshot
let content = await page.content();
const rewriteUrl = (fullUrl: string) => {
try {
if (!fullUrl.startsWith('http') && !fullUrl.startsWith('//')) return fullUrl;
let actualUrl = fullUrl;
if (fullUrl.startsWith('//')) actualUrl = `https:${fullUrl}`;
// 3. Post-Snapshot Asset Discovery (Regex)
// Catches assets that never triggered a network request but exist in the markup
const regexPatterns = [
/(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
// Capture CSS url() inside style blocks
/url\(["']?([^"'\)]+)["']?\)/gi
];
const u = new URL(actualUrl);
const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf)$/i.test(u.pathname) ||
actualUrl.includes('wp-content') || actualUrl.includes('wp-includes') || actualUrl.includes('fonts.googleapis.com');
if (isAsset) {
const sanitizedPath = u.pathname.replace(/^\//, '').split('/').map(p => p.replace(/[^a-z0-9._-]/gi, '_')).join('/');
return `./assets/${u.hostname}/${sanitizedPath}`;
}
} catch (e) { }
return fullUrl;
};
// 1. Rewrite src, href, content, poster
content = content.replace(/(src|href|content|poster)=["']([^"']+)["']/gi, (match, attr, url) => {
if (attr === 'href' && !url.includes('.') && !url.includes('http')) return match; // Keep anchor links or paths
return `${attr}="${rewriteUrl(url)}"`;
});
// 2. Rewrite srcset
content = content.replace(/srcset=["']([^"']+)["']/gi, (match, srcset) => {
const parts = srcset.split(',').map(part => {
const trimmed = part.trim();
const lastSpaceIndex = trimmed.lastIndexOf(' ');
if (lastSpaceIndex === -1) return rewriteUrl(trimmed);
const url = trimmed.substring(0, lastSpaceIndex);
const size = trimmed.substring(lastSpaceIndex);
return `${rewriteUrl(url)}${size}`;
});
return `srcset="${parts.join(', ')}"`;
});
// 3. Rewrite inline styles
content = content.replace(/url\(["']?([^"'\)]+)["']?\)/gi, (match, url) => {
return `url("${rewriteUrl(url)}")`;
});
// 4. Salient/Industrial Overrides
const fixes = `
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700;800;900&display=swap" rel="stylesheet">
<style>
:root { --main-font: 'Inter'; --heading-font: 'Montserrat'; }
body, p, li, a, span { font-family: 'Inter', sans-serif !important; }
h1, h2, h3, h4, h5, h6, .title-font { font-family: 'Montserrat', sans-serif !important; font-weight: 700 !important; }
</style>`;
content = content.replace('</head>', `${fixes}</head>`);
// Link Nuker: only if it looks like an internal/external link, not assets
content = content.replace(/<a\b([^>]*)\bhref=["'](https?:\/\/[^"']+|(?![^"']*\.(css|js|png|jpg|jpeg|gif|svg|pdf))[./][^"']*)["']/gi, '<a$1href="#"');
// Fix Breeze dynamic scripts
content = content.replace(/<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi, (match, url) => {
const u = url.trim();
const cleanUrl = u.split('?')[0];
if (cleanUrl.endsWith('.css')) return `<link rel="stylesheet" href="${u}">`;
return `<script src="${u}"></script>`;
});
fs.writeFileSync(path.join(domainDir, htmlFilename), content);
// 5. CSS REWRITING: Fix absolute paths in all captured CSS files
const allFiles = (dir: string): string[] => {
let results: string[] = [];
fs.readdirSync(dir).forEach(f => {
const fullPath = path.join(dir, f);
if (fs.statSync(fullPath).isDirectory()) {
results = results.concat(allFiles(fullPath));
} else if (f.endsWith('.css')) {
results.push(fullPath);
}
});
return results;
};
const cssFiles = allFiles(assetsDir);
for (const cssFile of cssFiles) {
let cssContent = fs.readFileSync(cssFile, 'utf8');
// Replace absolute domain references with local folder structure
// This is tricky because we need relative paths, but we can use absolute-ish paths relative to root?
// Actually, we can just point them back to the same assets folder structure.
// But since they are inside assets/host/path, they need to go up levels.
// A simpler way: replace domain urls with a full site-root relative path if possible,
// but CSS relative paths are hard.
// Let's just try to flatten them or use absolute paths for the clone.
// Actually, the easiest is to replace https://klz-cables.com/ with /assets/klz-cables.com/
// But the clone is viewed locally.
cssContent = cssContent.replace(/url\(["']?https?:\/\/([^\/"']+\/[^"'\)]+)["']?\)/gi, (match, pathAndHost) => {
const parts = pathAndHost.split('/');
const host = parts[0];
const rest = parts.slice(1).join('/').split('?')[0];
const sanitizedRest = rest.split('/').map(p => p.replace(/[^a-z0-9._-]/gi, '_')).join('/');
// This is still just a guess at where the asset is.
// But it's better than pointing to a dead live site.
return `url("/assets/${host}/${sanitizedRest}")`;
});
fs.writeFileSync(cssFile, cssContent);
for (const pattern of regexPatterns) {
let match;
while ((match = pattern.exec(content)) !== null) {
try { foundAssets.add(new URL(match[1], targetUrl).href); } catch { }
}
}
console.log(`\n✅ CLONED: ${htmlFilename}`);
// Specific srcset parsing
const srcsetRegex = /[a-z0-9-]+srcset=["']([^"']+)["']/gi;
let match;
while ((match = srcsetRegex.exec(content)) !== null) {
match[1].split(',').forEach(rule => {
const parts = rule.trim().split(/\s+/);
if (parts[0] && !parts[0].startsWith('data:')) {
try { foundAssets.add(new URL(parts[0], targetUrl).href); } catch { }
}
});
}
console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);
// 4. Download & Map
for (const url of foundAssets) {
const local = await downloadFile(url, assetsDir);
if (local) {
urlMap[url] = local;
const clean = url.split('?')[0];
urlMap[clean] = local;
// Handle CSS recursively
if (clean.endsWith('.css')) {
try {
const { data } = await axios.get(url, { headers: { 'User-Agent': USER_AGENT } });
// Process CSS and save it
const processedCss = await processCssRecursively(data, url, assetsDir, urlMap);
const relPath = sanitizePath(new URL(url).hostname + new URL(url).pathname);
fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
} catch { }
}
}
}
console.log('🛠️ Finalizing Static Mirror...');
let finalContent = content;
// A. Apply URL Map Replacements
// Longer paths first to prevent partial replacement errors
const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
if (sortedUrls.length > 0) {
const escaped = sortedUrls.map(u => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
// Create a massive regex for single-pass replacement
const masterRegex = new RegExp(`(${escaped.join('|')})`, 'g');
finalContent = finalContent.replace(masterRegex, (match) => urlMap[match] || match);
}
// B. Global Root-Relative Path Cleanup
// Catches things like /wp-content/ that weren't distinct assets or were missed
const commonDirs = ['/wp-content/', '/wp-includes/', '/assets/', '/static/', '/images/'];
for (const dir of commonDirs) {
const localDir = `./assets/${urlObj.hostname}${dir}`;
finalContent = finalContent.split(`"${dir}`).join(`"${localDir}`);
finalContent = finalContent.split(`'${dir}`).join(`'${localDir}`);
finalContent = finalContent.split(`(${dir}`).join(`(${localDir}`);
}
// C. Domain Nuke
// Replace absolute links to the original domain with relative or #
const domainPattern = new RegExp(`https?://(www\\.)?${urlObj.hostname.replace(/\./g, '\\.')}[^"']*`, 'gi');
// We carefully only replace if it looks like a resource link, or neutralize if it's a navigation link
// For simplicity and "solidness", we'll rely on the specific replacements above first.
// This catch-all nuke ensures we don't leak requests.
// Convert remaining absolute domain links to relative .
finalContent = finalContent.replace(domainPattern, (match) => {
// If we have a map for it, it should have been replaced.
// If not, it's likely a navigation link or an uncaptured asset.
// Safe fallback:
return './';
});
// D. Static Stability & Cleanup
// Remove tracking/analytics/lazy-load scripts that ruins stability
finalContent = finalContent.replace(/<script\b[^>]*>([\s\S]*?)<\/script>/gi, (match, content) => {
const lower = content.toLowerCase();
if (lower.includes('google-analytics') ||
lower.includes('gtag') ||
lower.includes('fbq') ||
lower.includes('lazy') ||
lower.includes('tracker')) {
return '';
}
return match;
});
// E. CSS Injections for Stability
const headEnd = finalContent.indexOf('</head>');
if (headEnd > -1) {
const stabilityCss = `
<style>
/* INDUSTRIAL CLONE STABILIZATION */
* {
transition: none !important;
animation: none !important;
scroll-behavior: auto !important;
}
[data-aos], .reveal, .lazypath, .lazy-load, [data-src] {
opacity: 1 !important;
visibility: visible !important;
transform: none !important;
clip-path: none !important;
}
img, video, iframe {
max-width: 100%;
display: block;
}
a {
pointer-events: none;
cursor: default;
}
</style>`;
finalContent = finalContent.slice(0, headEnd) + stabilityCss + finalContent.slice(headEnd);
}
// Save
const finalPath = path.join(domainDir, htmlFilename);
fs.writeFileSync(finalPath, finalContent);
console.log(`✅ SUCCESS: Cloned to ${finalPath}`);
} catch (err) {
console.error('❌ FAILED:', err);
console.error('❌ FATAL ERROR:', err);
} finally {
await browser.close();
}