Some checks failed
🧪 CI (QA) / 🧪 Quality Assurance (push) Failing after 1m3s
- Restructure to pnpm monorepo (site moved to apps/web)
- Integrate @mintel/tsconfig, @mintel/eslint-config, @mintel/husky-config
- Implement Docker service architecture (Varnish, Directus, Gatekeeper)
- Set up environment-aware Gitea Actions deployment
359 lines
15 KiB
TypeScript
359 lines
15 KiB
TypeScript
import { chromium, type Page } from 'playwright';
|
|
import path from 'node:path';
|
|
import { fileURLToPath } from 'node:url';
|
|
import fs from 'node:fs';
|
|
import axios from 'axios';
|
|
|
|
/** User-Agent header value sent by the Playwright browser context and by the axios downloads. */
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36';

/** Filesystem path of this module, derived from `import.meta.url`. */
const __filename = fileURLToPath(import.meta.url);

/** Directory containing this module; the showcase output directory is resolved relative to it. */
const __dirname = path.dirname(__filename);
|
|
|
|
/**
 * Converts a `/`-separated path into one that is safe to use below the assets directory.
 *
 * Every character outside `[a-z0-9._-]` (case-insensitive) is replaced with `_`, segment by
 * segment, so the `/` separators themselves are preserved.
 *
 * Segments that end up as exactly `.` or `..` are rewritten to `_` / `__`. Both consist only of
 * allowed characters, so without this step they would survive and change the resulting
 * location once the value is handed to `path.join`.
 *
 * @param rawPath - Path to sanitize, e.g. a URL hostname concatenated with its pathname.
 * @returns The sanitized path with the same number of segments as the input.
 */
function sanitizePath(rawPath: string): string {
  return rawPath
    .split('/')
    .map((segment) => {
      const cleaned = segment.replace(/[^a-z0-9._-]/gi, '_');
      return cleaned === '.' || cleaned === '..' ? cleaned.replace(/\./g, '_') : cleaned;
    })
    .join('/');
}
|
|
|
|
/**
 * Downloads a remote asset into the local mirror and returns the path the page should use for it.
 *
 * The on-disk location is `<assetsDir>/<sanitized hostname + pathname>`; the query string and
 * fragment are not part of the file name. A file that is already present is reused without
 * issuing a new request.
 *
 * @param url - Absolute `http(s)` URL, or a protocol-relative `//host/path` URL (fetched via https).
 * @param assetsDir - Directory that receives the mirrored files.
 * @returns `./assets/<relative path>` on success; `null` when the URL is not http(s), the
 *   response status is not 200, or any error occurs (the caller then keeps the original URL).
 */
async function downloadFile(url: string, assetsDir: string): Promise<string | null> {
  const absoluteUrl = url.startsWith('//') ? `https:${url}` : url;
  if (!absoluteUrl.startsWith('http')) return null;

  try {
    const parsed = new URL(absoluteUrl);
    // Create a collision-resistant local path (host is part of the path).
    const relPath = sanitizePath(parsed.hostname + parsed.pathname);
    const dest = path.join(assetsDir, relPath);
    const localRef = `./assets/${relPath}`;

    // Only a regular file counts as "already downloaded". A bare existence check is also true
    // for directories created for other assets, which would report a folder as a mirrored file.
    if (fs.existsSync(dest) && fs.statSync(dest).isFile()) return localRef;

    const res = await axios.get(absoluteUrl, {
      responseType: 'arraybuffer',
      headers: { 'User-Agent': USER_AGENT },
      timeout: 15000,
      // Never throw on HTTP status; non-200 responses are handled explicitly below.
      validateStatus: () => true
    });

    if (res.status !== 200) return null;

    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(dest), { recursive: true });
    fs.writeFileSync(dest, Buffer.from(res.data));
    return localRef;
  } catch {
    return null; // Fail silently, proceed with original URL
  }
}
|
|
|
|
/**
 * Mirrors every asset referenced from a stylesheet and rewrites the references so they point
 * at the local copies, relative to the stylesheet's own location inside the assets directory.
 *
 * Both `url(...)` references and quoted `@import "..."` rules are handled; `data:` and `blob:`
 * URLs are left untouched. References whose download fails keep their original value.
 *
 * Rewriting is done in one positional pass over the original text. Replacing by plain substring
 * would also touch unrelated text containing the same characters and would re-rewrite a URL
 * that occurs more than once (corrupting it whenever the new path contains the old one).
 *
 * Note: the function does not call itself; imported stylesheets are downloaded but their own
 * contents are not rewritten here. `depth` only acts as a guard value.
 *
 * @param cssContent - Stylesheet source text.
 * @param cssUrl - Absolute URL the stylesheet was loaded from; base for relative references.
 * @param assetsDir - Directory that receives the mirrored files.
 * @param urlMap - Mutated in place: absolute asset URL -> local `./assets/...` path.
 * @param depth - Nesting level; the content is returned unchanged when it exceeds 5.
 * @returns The stylesheet text with successfully mirrored references rewritten.
 */
async function processCssRecursively(cssContent: string, cssUrl: string, assetsDir: string, urlMap: Record<string, string>, depth = 0): Promise<string> {
  if (depth > 5) return cssContent;

  // Capture both standard url(...) and @import "..." — groups: prefix, URL, suffix.
  const urlRegex = /(url\(["']?|@import\s+["'])([^"'\)]+)(["']?\)?)/gi;

  // Folder of the stylesheet inside the mirror; all rewritten paths are relative to it.
  let cssDir: string;
  try {
    const cssLocation = new URL(cssUrl);
    cssDir = path.posix.dirname(sanitizePath(cssLocation.hostname + cssLocation.pathname));
  } catch {
    // Without a valid base URL no reference can be resolved.
    return cssContent;
  }

  // originalUrl -> relative replacement, or null when the asset could not be mirrored.
  const rewrites = new Map<string, string | null>();

  for (const match of cssContent.matchAll(urlRegex)) {
    const originalUrl = match[2];
    if (originalUrl === undefined || rewrites.has(originalUrl)) continue;
    if (originalUrl.startsWith('data:') || originalUrl.startsWith('blob:')) continue;

    try {
      const assetLocation = new URL(originalUrl, cssUrl);
      const local = await downloadFile(assetLocation.href, assetsDir);

      if (local) {
        // Route from the folder containing the CSS to the asset. The posix flavour keeps
        // forward slashes in the emitted CSS regardless of the host platform.
        const assetPath = sanitizePath(assetLocation.hostname + assetLocation.pathname);
        rewrites.set(originalUrl, path.posix.relative(cssDir, assetPath));
        urlMap[assetLocation.href] = local;
      } else {
        rewrites.set(originalUrl, null);
      }
    } catch {
      // Best effort: an unresolvable reference is left as it is.
      rewrites.set(originalUrl, null);
    }
  }

  // Replace strictly the URL part of each matched reference.
  return cssContent.replace(urlRegex, (whole: string, prefix: string, target: string, suffix: string) => {
    const rel = rewrites.get(target);
    return rel == null ? whole : `${prefix}${rel}${suffix}`;
  });
}
|
|
|
|
/**
 * CLI entry point: clones the page given as the first command-line argument into a static
 * HTML file with locally mirrored assets.
 *
 * Output goes to `../public/showcase/<domain>/` relative to this module: the HTML file is named
 * after the URL path (`index.html` for the root) and assets live in the `assets/` subfolder.
 *
 * Steps:
 *  1. Load the page in headless Chromium, recording asset-like responses.
 *  2. Scroll through the page and enlarge the viewport to trigger lazy loaders.
 *  3. Force hidden elements visible and promote lazy-load attributes in the DOM.
 *  4. Snapshot the HTML and scan it for further asset URLs.
 *  5. Download all assets (rewriting stylesheets) and replace their URLs in the snapshot.
 *  6. Neutralize remaining links to the origin, strip tracking/lazy scripts, inject
 *     stabilizing CSS and write the result.
 *
 * Exits with status 1 when the URL argument is missing or invalid. Errors during cloning are
 * logged and reported through a non-zero `process.exitCode`; the browser is always closed.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error('Usage: npm run clone-page <url>');
    process.exit(1);
  }
  const targetUrl = rawUrl.trim();

  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    // An unparsable argument would otherwise surface as an unhandled promise rejection.
    console.error(`Invalid URL: ${targetUrl}`);
    console.error('Usage: npm run clone-page <url>');
    process.exit(1);
  }

  // Setup Output Directories
  // Only a leading "www." is stripped; an unanchored replace would also hit it mid-hostname.
  const domainSlug = urlObj.hostname.replace(/^www\./, '');
  const domainDir = path.resolve(__dirname, `../public/showcase/${domainSlug}`);
  const assetsDir = path.join(domainDir, 'assets');
  fs.mkdirSync(assetsDir, { recursive: true });

  const pageSlug = urlObj.pathname.split('/').filter(Boolean).join('-') || 'index';
  const htmlFilename = `${pageSlug}.html`;

  console.log(`🚀 INDUSTRIAL CLONE: ${targetUrl}`);

  const browser = await chromium.launch({ headless: true });
  // Start with a standard viewport, it is resized to the full page height later
  const context = await browser.newContext({ userAgent: USER_AGENT, viewport: { width: 1920, height: 1080 } });
  const page = await context.newPage();

  // Absolute asset URL -> local "./assets/..." path
  const urlMap: Record<string, string> = {};
  // Absolute URLs of everything that should be mirrored
  const foundAssets = new Set<string>();

  // 1. Live Network Interception
  page.on('response', response => {
    const url = response.url();
    // Capture anything that looks like a static asset
    if (response.status() === 200 && /\.(css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)/i.test(url)) {
      foundAssets.add(url);
    }
  });

  try {
    console.log('🌐 Loading page (Waiting for Network Idle)...');
    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });

    console.log('🌊 Executing "Scroll Wave" to trigger all lazy loaders naturally...');
    // Runs inside the page: scroll down 400px every 100ms until the accumulated distance
    // reaches the (possibly growing) document height, then jump back to the top.
    await page.evaluate(async () => {
      await new Promise<void>((resolve) => {
        let totalHeight = 0;
        const distance = 400;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;

          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Reset to top
            resolve();
          }
        }, 100);
      });
    });

    console.log('📐 Expanding Viewport to "Giant Mode" for final asset capture...');
    const fullHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewportSize({ width: 1920, height: fullHeight + 1000 });

    // Final settlement wait
    await page.waitForTimeout(3000);

    console.log('💧 Final DOM Hydration & Sanitization...');
    await page.evaluate(() => {
      // A. Deterministic Attribute Hydration (Generic)
      // Scours every element for attributes that look like asset URLs and promotes them
      const assetPattern = /\.(jpg|jpeg|png|gif|svg|webp|mp4|webm|woff2?|ttf|otf)/i;

      document.querySelectorAll('*').forEach(el => {
        // tagName is upper-case only for HTML elements; SVG elements report "svg"/"path",
        // so normalize before comparing against the skip list.
        const tag = el.tagName.toUpperCase();

        // 0. Skip Meta/Head/Script/Style/SVG tags entirely
        if (['META', 'LINK', 'HEAD', 'SCRIPT', 'STYLE', 'SVG', 'PATH'].includes(tag)) return;

        // 1. Force Visibility (Anti-Flicker)
        const htmlEl = el as HTMLElement;
        const style = window.getComputedStyle(htmlEl);
        if (style.opacity === '0' || style.visibility === 'hidden') {
          htmlEl.style.setProperty('opacity', '1', 'important');
          htmlEl.style.setProperty('visibility', 'visible', 'important');
        }

        // 2. Promote Data Attributes
        for (const attr of Array.from(el.attributes)) {
          const name = attr.name.toLowerCase();
          const val = attr.value;

          if (assetPattern.test(val) || name.includes('src') || name.includes('image')) {
            // Standard Image/Video/Source promotion
            if (tag === 'IMG') {
              const img = el as HTMLImageElement;
              if (name.includes('srcset')) img.srcset = val;
              else if (!img.src || img.src.includes('data:')) img.src = val;
            }
            if (tag === 'SOURCE') {
              const source = el as HTMLSourceElement;
              if (name.includes('srcset')) source.srcset = val;
            }
            if (tag === 'VIDEO' || tag === 'AUDIO') {
              const media = el as HTMLMediaElement;
              if (!media.src) media.src = val;
            }

            // Background Image Promotion: URL-like values (except href attributes) become an
            // inline background when the element has none set inline.
            if (val.match(/^(https?:\/\/|\/\/|\/)/) && !name.includes('href')) {
              const bg = htmlEl.style.backgroundImage;
              if (!bg || bg === 'none') {
                htmlEl.style.backgroundImage = `url('${val}')`;
              }
            }
          }
        }
      });

      // B. Ensure basic structural elements are visible post-scroll
      const body = document.body;
      if (body) {
        body.style.setProperty('opacity', '1', 'important');
        body.style.setProperty('visibility', 'visible', 'important');
      }
    });

    console.log('⏳ Waiting for network idle...');
    await page.waitForLoadState('networkidle');

    // 1.5 FINAL SETTLEMENT: Let any scroll-triggered JS finish
    await page.waitForTimeout(1000);

    // 2. Static Snapshot
    const content = await page.content();

    // 3. Post-Snapshot Asset Discovery (Regex)
    // Catches assets that never triggered a network request but exist in the markup
    const regexPatterns = [
      /(?:src|href|url|data-[a-z-]+|srcset)=["']([^"'<>\s]+?\.(?:css|js|png|jpg|jpeg|gif|svg|woff2?|ttf|otf|mp4|webm|webp|ico)(?:\?[^"']*)?)["']/gi,
      // Capture CSS url() inside style blocks
      /url\(["']?([^"'\)]+)["']?\)/gi
    ];

    for (const pattern of regexPatterns) {
      for (const found of content.matchAll(pattern)) {
        const candidate = found[1];
        if (!candidate) continue;
        try { foundAssets.add(new URL(candidate, targetUrl).href); } catch { /* not a resolvable URL */ }
      }
    }

    // Specific srcset parsing. The prefix is optional so that plain `srcset` is parsed as well
    // as prefixed variants such as `data-srcset`.
    const srcsetRegex = /[a-z0-9-]*srcset=["']([^"']+)["']/gi;
    for (const found of content.matchAll(srcsetRegex)) {
      const candidates = found[1];
      if (!candidates) continue;
      for (const rule of candidates.split(',')) {
        // Each rule is "<url> <descriptor>"; only the URL part is of interest.
        const candidateUrl = rule.trim().split(/\s+/)[0];
        if (candidateUrl && !candidateUrl.startsWith('data:')) {
          try { foundAssets.add(new URL(candidateUrl, targetUrl).href); } catch { /* not a resolvable URL */ }
        }
      }
    }

    console.log(`🔍 Processing ${foundAssets.size} discovered assets...`);

    // 4. Download & Map
    for (const url of foundAssets) {
      const local = await downloadFile(url, assetsDir);
      if (!local) continue;

      // Register the URL both with and without its query string.
      urlMap[url] = local;
      const clean = url.split('?')[0] ?? url;
      urlMap[clean] = local;

      // Stylesheets are fetched again as text, rewritten, and saved over the raw download.
      if (clean.endsWith('.css')) {
        try {
          const { data } = await axios.get(url, { headers: { 'User-Agent': USER_AGENT }, timeout: 15000 });
          if (typeof data === 'string') {
            const processedCss = await processCssRecursively(data, url, assetsDir, urlMap);
            const cssLocation = new URL(url);
            const relPath = sanitizePath(cssLocation.hostname + cssLocation.pathname);
            fs.writeFileSync(path.join(assetsDir, relPath), processedCss);
          }
        } catch { /* best effort: keep the unprocessed stylesheet */ }
      }
    }

    console.log('🛠️ Finalizing Static Mirror...');
    let finalContent = content;

    // A. Apply URL Map Replacements
    // Longer paths first to prevent partial replacement errors
    const sortedUrls = Object.keys(urlMap).sort((a, b) => b.length - a.length);
    if (sortedUrls.length > 0) {
      const escaped = sortedUrls.map(u => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
      // Create a massive regex for single-pass replacement
      const masterRegex = new RegExp(`(${escaped.join('|')})`, 'g');
      finalContent = finalContent.replace(masterRegex, (hit) => urlMap[hit] ?? hit);
    }

    // B. Global Root-Relative Path Cleanup
    // Catches things like /wp-content/ that weren't distinct assets or were missed
    const commonDirs = ['/wp-content/', '/wp-includes/', '/assets/', '/static/', '/images/'];
    for (const dir of commonDirs) {
      const localDir = `./assets/${urlObj.hostname}${dir}`;
      // Only rewrite where the directory starts a quoted value or a url( argument.
      for (const opener of ['"', "'", '(']) {
        finalContent = finalContent.split(`${opener}${dir}`).join(`${opener}${localDir}`);
      }
    }

    // C. Domain Nuke
    // Any absolute link to the original domain that survived the replacements above is a
    // navigation link or an uncaptured asset; point it at "./" so the mirror never calls home.
    // - The bare domain is used with an optional "www." so both spellings are covered.
    // - The match ends at quotes, whitespace, angle brackets or ")" so an unquoted URL
    //   (e.g. inside url(...) or text content) cannot swallow the markup that follows it.
    const escapedHost = domainSlug.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const domainPattern = new RegExp(`https?://(www\\.)?${escapedHost}[^"'\\s<>)]*`, 'gi');
    finalContent = finalContent.replace(domainPattern, './');

    // D. Static Stability & Cleanup
    // Remove inline tracking/analytics/lazy-load scripts that ruin stability
    const unwantedMarkers = ['google-analytics', 'gtag', 'fbq', 'lazy', 'tracker'];
    finalContent = finalContent.replace(/<script\b[^>]*>([\s\S]*?)<\/script>/gi, (scriptTag: string, scriptBody: string) => {
      const lower = scriptBody.toLowerCase();
      return unwantedMarkers.some(marker => lower.includes(marker)) ? '' : scriptTag;
    });

    // E. CSS Injections for Stability
    const headEnd = finalContent.indexOf('</head>');
    if (headEnd > -1) {
      const stabilityCss = `
<style>
  /* UNIVERSAL CLONE STABILIZATION */
  * {
    transition: none !important;
    animation: none !important;
    scroll-behavior: auto !important;
  }
  [data-aos], .reveal, .lazypath, .lazy-load, [data-src] {
    opacity: 1 !important;
    visibility: visible !important;
    transform: none !important;
    clip-path: none !important;
  }

  img, video, iframe {
    max-width: 100%;
    display: block;
  }
  a {
    pointer-events: none;
    cursor: default;
  }
</style>`;
      finalContent = finalContent.slice(0, headEnd) + stabilityCss + finalContent.slice(headEnd);
    }

    // Save
    const finalPath = path.join(domainDir, htmlFilename);
    fs.writeFileSync(finalPath, finalContent);
    console.log(`✅ SUCCESS: Cloned to ${finalPath}`);

  } catch (err) {
    console.error('❌ FATAL ERROR:', err);
    // Make the failure visible to callers (CI, npm scripts) without skipping the cleanup below.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
|
|
|
|
// Start the clone. `run` guards the crawl itself with its own try/catch, but work done before
// that block (for example launching the browser) can still reject; report such failures and
// signal them through the exit code instead of leaving the promise unhandled.
run().catch((err: unknown) => {
  console.error('❌ FATAL ERROR:', err);
  process.exitCode = 1;
});
|