klz case study
Some checks failed
Build & Deploy Mintel Blog / build-and-deploy (push) Failing after 2m14s
Some checks failed
Build & Deploy Mintel Blog / build-and-deploy (push) Failing after 2m14s
This commit is contained in:
@@ -1,6 +1,4 @@
|
||||
import scrape from 'website-scraper';
|
||||
// @ts-ignore
|
||||
import PuppeteerPlugin from 'website-scraper-puppeteer';
|
||||
import { chromium } from 'playwright';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import fs from 'node:fs';
|
||||
@@ -15,160 +13,187 @@ async function run() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// CLEANUP: Aggressively strip shell noise like ; or trailing quotes
|
||||
const targetUrl = rawUrl.trim().replace(/[;'"]+$/, '');
|
||||
const urlObj = new URL(targetUrl);
|
||||
const domain = urlObj.hostname;
|
||||
const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
|
||||
|
||||
const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
|
||||
if (!fs.existsSync(domainDir)) fs.mkdirSync(domainDir, { recursive: true });
|
||||
const assetsDir = path.join(domainDir, 'assets');
|
||||
if (!fs.existsSync(assetsDir)) fs.mkdirSync(assetsDir, { recursive: true });
|
||||
|
||||
// Determine slug for filename
|
||||
let slug = urlObj.pathname.replace(/^\/|\/$/g, '').replace(/\//g, '-');
|
||||
let slug = urlObj.pathname.split('/').filter(Boolean).join('-');
|
||||
if (!slug) slug = 'index';
|
||||
const htmlFilename = `${slug}.html`;
|
||||
|
||||
console.log(`🚀 CLONING PAGE: ${targetUrl}`);
|
||||
console.log(`📂 SAVING AS: ${htmlFilename} in ${domainDir}`);
|
||||
console.log(`🚀 CLONING: ${targetUrl}`);
|
||||
|
||||
// website-scraper needs an empty directory for each 'scrape' call if we use its defaults,
|
||||
// but we want to MERGE assets. So we scrape to a temp dir and then move.
|
||||
const tempDir = path.join(domainDir, `_temp_${Date.now()}`);
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
const context = await browser.newContext({
|
||||
viewport: { width: 1920, height: 1080 },
|
||||
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
|
||||
});
|
||||
|
||||
const options = {
|
||||
urls: [targetUrl],
|
||||
directory: tempDir,
|
||||
recursive: false,
|
||||
plugins: [
|
||||
new PuppeteerPlugin({
|
||||
launchOptions: { headless: true, args: ['--no-sandbox'] },
|
||||
scrollToBottom: { timeout: 15000, viewportN: 10 },
|
||||
blockNavigation: false
|
||||
})
|
||||
],
|
||||
// Sources list covering Salient/WooCommerce lazy assets
|
||||
sources: [
|
||||
{ selector: 'img', attr: 'src' },
|
||||
{ selector: 'img', attr: 'srcset' },
|
||||
{ selector: 'img', attr: 'data-src' },
|
||||
{ selector: 'img', attr: 'data-lazy-src' },
|
||||
{ selector: 'link[rel="stylesheet"]', attr: 'href' },
|
||||
{ selector: 'link[rel="preload"]', attr: 'href' },
|
||||
{ selector: 'script', attr: 'src' },
|
||||
{ selector: 'video', attr: 'src' },
|
||||
{ selector: 'video', attr: 'poster' },
|
||||
{ selector: 'source', attr: 'src' },
|
||||
{ selector: 'source', attr: 'srcset' },
|
||||
{ selector: 'iframe', attr: 'src' },
|
||||
{ selector: 'meta[property="og:image"]', attr: 'content' },
|
||||
{ selector: '[style*="background-image"]', attr: 'style' }
|
||||
],
|
||||
// Shared directory for assets
|
||||
subdirectories: [
|
||||
{ directory: 'assets/img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
|
||||
{ directory: 'assets/js', extensions: ['.js'] },
|
||||
{ directory: 'assets/css', extensions: ['.css'] },
|
||||
{ directory: 'assets/fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot'] },
|
||||
{ directory: 'assets/media', extensions: ['.mp4', '.webm'] }
|
||||
],
|
||||
request: {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
|
||||
}
|
||||
},
|
||||
urlFilter: (url: string) => {
|
||||
const u = new URL(url);
|
||||
// Allow domain assets, google fonts, and common cdn/upload patterns
|
||||
return u.hostname === domain ||
|
||||
u.hostname.includes('fonts.googleapis.com') ||
|
||||
u.hostname.includes('fonts.gstatic.com') ||
|
||||
url.includes('wp-content') ||
|
||||
url.includes('wp-includes');
|
||||
const page = await context.newPage();
|
||||
|
||||
// Asset capture logic
|
||||
page.on('response', async (response) => {
|
||||
const url = response.url();
|
||||
if (response.status() !== 200) return;
|
||||
const u = new URL(url);
|
||||
const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf|xml)$/i.test(u.pathname) ||
|
||||
url.includes('wp-content') || url.includes('wp-includes') || url.includes('fonts.googleapis.com');
|
||||
|
||||
if (isAsset && url !== targetUrl) {
|
||||
try {
|
||||
const buffer = await response.body();
|
||||
const sanitizedPath = u.pathname.replace(/^\//, '').split('/').map(p => p.replace(/[^a-z0-9._-]/gi, '_')).join('/');
|
||||
const fileDest = path.join(assetsDir, u.hostname, sanitizedPath);
|
||||
if (!fs.existsSync(path.dirname(fileDest))) fs.mkdirSync(path.dirname(fileDest), { recursive: true });
|
||||
// We overwrite for now to ensure freshness
|
||||
fs.writeFileSync(fileDest, buffer);
|
||||
|
||||
// If it's a CSS file, we might want to rewrite it later, but let's do it on the fly or after
|
||||
} catch (e) { }
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
try {
|
||||
await scrape(options);
|
||||
await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 90000 });
|
||||
|
||||
// Rename the downloaded index.html to our slug.html
|
||||
const downloadedHtml = path.join(tempDir, 'index.html');
|
||||
const targetHtmlPath = path.join(tempDir, htmlFilename);
|
||||
if (fs.existsSync(downloadedHtml)) {
|
||||
fs.renameSync(downloadedHtml, targetHtmlPath);
|
||||
}
|
||||
|
||||
// POST-PROCESS: Inject Fonts and fix paths in the HTML
|
||||
if (fs.existsSync(targetHtmlPath)) {
|
||||
let content = fs.readFileSync(targetHtmlPath, 'utf8');
|
||||
|
||||
// NUKE TYPOGRAPHY: Strong overrides for Salient theme
|
||||
const fontInjection = `
|
||||
<!-- INDUSTRIAL TYPOGRAPHY OVERRIDE -->
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700;800;900&display=swap" rel="stylesheet">
|
||||
<style>
|
||||
:root {
|
||||
--main-font: 'Inter', sans-serif !important;
|
||||
--heading-font: 'Montserrat', sans-serif !important;
|
||||
--font-family-body: 'Inter', sans-serif !important;
|
||||
--font-family-heading: 'Montserrat', sans-serif !important;
|
||||
}
|
||||
body, p, li, a, span, label, input, textarea, .body-font {
|
||||
font-family: 'Inter', sans-serif !important;
|
||||
}
|
||||
h1, h2, h3, h4, h5, h6, .title-font, .heading-font, [class*="heading"] {
|
||||
font-family: 'Montserrat', sans-serif !important;
|
||||
font-weight: 700 !important;
|
||||
}
|
||||
/* Salient Specific Heading Classes */
|
||||
.nectar-milestone .number, .nectar-milestone .subject, .nectar-heading {
|
||||
font-family: 'Montserrat' !important;
|
||||
}
|
||||
</style>
|
||||
`;
|
||||
content = content.replace('</head>', `${fontInjection}</head>`);
|
||||
|
||||
// Link fix: Replace all anchor hrefs with # to prevent unintentional navigation/leaks
|
||||
content = content.replace(/<a\b([^>]*)\bhref=["'][^"']*["']/gi, '<a$1href="#"');
|
||||
|
||||
// Fix Breeze dynamic scripts (Salient optimization)
|
||||
content = content.replace(/<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi, (match, url) => {
|
||||
const u = url.trim();
|
||||
const cleanUrl = u.split('?')[0];
|
||||
if (cleanUrl.endsWith('.css')) return `<link rel="stylesheet" href="${u}">`;
|
||||
return `<script src="${u}"></script>`;
|
||||
});
|
||||
|
||||
fs.writeFileSync(targetHtmlPath, content);
|
||||
}
|
||||
|
||||
// MOVE AND MERGE into domainDir
|
||||
const merge = (src: string, dest: string) => {
|
||||
if (!fs.existsSync(dest)) fs.mkdirSync(dest, { recursive: true });
|
||||
fs.readdirSync(src).forEach(item => {
|
||||
const s = path.join(src, item);
|
||||
const d = path.join(dest, item);
|
||||
if (fs.statSync(s).isDirectory()) {
|
||||
merge(s, d);
|
||||
} else {
|
||||
// Copy file (overwrite if HTML, skip assets to avoid duplicate downloads)
|
||||
if (item.endsWith('.html') || !fs.existsSync(d)) {
|
||||
fs.copyFileSync(s, d);
|
||||
// Comprehensive scroll
|
||||
await page.evaluate(async () => {
|
||||
await new Promise((resolve) => {
|
||||
let totalHeight = 0, distance = 400, timer = setInterval(() => {
|
||||
let scrollHeight = document.body.scrollHeight;
|
||||
window.scrollBy(0, distance);
|
||||
totalHeight += distance;
|
||||
if (totalHeight >= scrollHeight || totalHeight > 30000) {
|
||||
clearInterval(timer);
|
||||
window.scrollTo(0, 0);
|
||||
resolve(null);
|
||||
}
|
||||
}
|
||||
}, 200);
|
||||
});
|
||||
});
|
||||
|
||||
await page.waitForLoadState('networkidle');
|
||||
await page.waitForTimeout(5000); // 5 seconds extra for lazy scripts
|
||||
|
||||
let content = await page.content();
|
||||
|
||||
/**
 * Maps an absolute (`http…`) or protocol-relative (`//…`) asset URL to a
 * path under `./assets/<hostname>/…`.
 *
 * A URL counts as an asset when its pathname ends in a known static-file
 * extension, or when the URL mentions `wp-content`, `wp-includes` or
 * `fonts.googleapis.com`. The query string and fragment are dropped, and every
 * path segment has characters outside `[A-Za-z0-9._-]` replaced by `_`.
 *
 * @param fullUrl - URL taken from an attribute, `srcset` entry or CSS `url()`.
 * @returns The local `./assets/...` path for recognised assets; otherwise the
 *          input string unchanged (relative URLs, non-assets, unparseable URLs).
 */
const rewriteUrl = (fullUrl: string): string => {
  try {
    // Relative paths, fragments, data: URIs etc. are passed through untouched.
    if (!fullUrl.startsWith('http') && !fullUrl.startsWith('//')) return fullUrl;

    // Protocol-relative URLs are resolved as https so they can be parsed.
    const actualUrl = fullUrl.startsWith('//') ? `https:${fullUrl}` : fullUrl;
    const u = new URL(actualUrl);

    // `xml` is included so this list matches the extensions the response
    // capture handler in this file accepts.
    const hasAssetExtension =
      /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp|pdf|xml)$/i.test(u.pathname);
    const isAsset =
      hasAssetExtension ||
      actualUrl.includes('wp-content') ||
      actualUrl.includes('wp-includes') ||
      actualUrl.includes('fonts.googleapis.com');

    if (isAsset) {
      // Only the pathname is used, so `?ver=…` cache-busters do not end up in the file name.
      const sanitizedPath = u.pathname
        .replace(/^\//, '')
        .split('/')
        .map(segment => segment.replace(/[^a-z0-9._-]/gi, '_'))
        .join('/');
      return `./assets/${u.hostname}/${sanitizedPath}`;
    }
  } catch {
    // Best effort: a malformed URL (e.g. `new URL` throwing) is left as-is below.
  }
  return fullUrl;
};
|
||||
|
||||
merge(tempDir, domainDir);
|
||||
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||
// 1. Rewrite src, href, content, poster
|
||||
content = content.replace(/(src|href|content|poster)=["']([^"']+)["']/gi, (match, attr, url) => {
|
||||
if (attr === 'href' && !url.includes('.') && !url.includes('http')) return match; // Keep anchor links or paths
|
||||
return `${attr}="${rewriteUrl(url)}"`;
|
||||
});
|
||||
|
||||
console.log(`\n✅ SUCCESS: ${path.join(domainDir, htmlFilename)}`);
|
||||
// 2. Rewrite srcset
|
||||
content = content.replace(/srcset=["']([^"']+)["']/gi, (match, srcset) => {
|
||||
const parts = srcset.split(',').map(part => {
|
||||
const trimmed = part.trim();
|
||||
const lastSpaceIndex = trimmed.lastIndexOf(' ');
|
||||
if (lastSpaceIndex === -1) return rewriteUrl(trimmed);
|
||||
const url = trimmed.substring(0, lastSpaceIndex);
|
||||
const size = trimmed.substring(lastSpaceIndex);
|
||||
return `${rewriteUrl(url)}${size}`;
|
||||
});
|
||||
return `srcset="${parts.join(', ')}"`;
|
||||
});
|
||||
|
||||
} catch (e) {
|
||||
console.error('❌ CLONE FAILED:', e);
|
||||
if (fs.existsSync(tempDir)) fs.rmSync(tempDir, { recursive: true, force: true });
|
||||
// 3. Rewrite inline styles
|
||||
content = content.replace(/url\(["']?([^"'\)]+)["']?\)/gi, (match, url) => {
|
||||
return `url("${rewriteUrl(url)}")`;
|
||||
});
|
||||
|
||||
// 4. Salient/Industrial Overrides
|
||||
const fixes = `
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700;800;900&display=swap" rel="stylesheet">
|
||||
<style>
|
||||
:root { --main-font: 'Inter'; --heading-font: 'Montserrat'; }
|
||||
body, p, li, a, span { font-family: 'Inter', sans-serif !important; }
|
||||
h1, h2, h3, h4, h5, h6, .title-font { font-family: 'Montserrat', sans-serif !important; font-weight: 700 !important; }
|
||||
</style>`;
|
||||
|
||||
content = content.replace('</head>', `${fixes}</head>`);
|
||||
|
||||
// Link Nuker: only if it looks like an internal/external link, not assets
|
||||
content = content.replace(/<a\b([^>]*)\bhref=["'](https?:\/\/[^"']+|(?![^"']*\.(css|js|png|jpg|jpeg|gif|svg|pdf))[./][^"']*)["']/gi, '<a$1href="#"');
|
||||
|
||||
// Fix Breeze dynamic scripts
|
||||
content = content.replace(/<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi, (match, url) => {
|
||||
const u = url.trim();
|
||||
const cleanUrl = u.split('?')[0];
|
||||
if (cleanUrl.endsWith('.css')) return `<link rel="stylesheet" href="${u}">`;
|
||||
return `<script src="${u}"></script>`;
|
||||
});
|
||||
|
||||
fs.writeFileSync(path.join(domainDir, htmlFilename), content);
|
||||
|
||||
// 5. CSS REWRITING: Fix absolute paths in all captured CSS files
|
||||
/**
 * Recursively collects the paths of all CSS files below `dir`.
 *
 * Despite its name, only regular files whose name ends in `.css`
 * (case-insensitive) are returned. Symbolic links are neither followed nor
 * returned, so a link cycle cannot cause endless recursion and a dangling
 * link cannot make the walk throw.
 *
 * @param dir - Directory to walk; must exist and be readable.
 * @returns Paths (each `dir` joined with the relative location) in directory-listing order.
 * @throws Whatever `fs.readdirSync` throws when a directory cannot be read.
 */
const allFiles = (dir: string): string[] => {
  const results: string[] = [];
  // `withFileTypes` yields the entry type directly, avoiding one statSync per entry.
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Append in place rather than re-allocating the array with concat on every level.
      for (const nested of allFiles(fullPath)) results.push(nested);
    } else if (entry.isFile() && entry.name.toLowerCase().endsWith('.css')) {
      results.push(fullPath);
    }
  }
  return results;
};
|
||||
|
||||
const cssFiles = allFiles(assetsDir);
|
||||
for (const cssFile of cssFiles) {
|
||||
let cssContent = fs.readFileSync(cssFile, 'utf8');
|
||||
// Replace absolute domain references with local folder structure
|
||||
// This is tricky because we need relative paths, but we can use absolute-ish paths relative to root?
|
||||
// Actually, we can just point them back to the same assets folder structure.
|
||||
// But since they are inside assets/host/path, they need to go up levels.
|
||||
// A simpler way: replace domain urls with a full site-root relative path if possible,
|
||||
// but CSS relative paths are hard.
|
||||
// Let's just try to flatten them or use absolute paths for the clone.
|
||||
// Actually, the easiest is to replace https://klz-cables.com/ with /assets/klz-cables.com/
|
||||
// But the clone is viewed locally.
|
||||
|
||||
cssContent = cssContent.replace(/url\(["']?https?:\/\/([^\/"']+\/[^"'\)]+)["']?\)/gi, (match, pathAndHost) => {
|
||||
const parts = pathAndHost.split('/');
|
||||
const host = parts[0];
|
||||
const rest = parts.slice(1).join('/').split('?')[0];
|
||||
const sanitizedRest = rest.split('/').map(p => p.replace(/[^a-z0-9._-]/gi, '_')).join('/');
|
||||
// This is still just a guess at where the asset is.
|
||||
// But it's better than pointing to a dead live site.
|
||||
return `url("/assets/${host}/${sanitizedRest}")`;
|
||||
});
|
||||
fs.writeFileSync(cssFile, cssContent);
|
||||
}
|
||||
|
||||
console.log(`\n✅ CLONED: ${htmlFilename}`);
|
||||
|
||||
} catch (err) {
|
||||
console.error('❌ FAILED:', err);
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user