/*
 * Files
 * mintel.me/scripts/clone-page.ts
 * 2026-02-01 19:56:53 +01:00
 *
 * 176 lines
 * 7.5 KiB
 * TypeScript
 */

import scrape from 'website-scraper';
// @ts-ignore
import PuppeteerPlugin from 'website-scraper-puppeteer';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// ES modules do not provide __filename / __dirname, so both are derived
// from this module's own URL.

/** Absolute filesystem path of this module file. */
const __filename = fileURLToPath(new URL(import.meta.url));

/** Directory that contains this module file; output paths are resolved against it. */
const __dirname = path.dirname(__filename);
/** Hosts that serve Google Fonts stylesheets and font files (matched exactly, not by substring). */
const GOOGLE_FONT_HOSTS = new Set(['fonts.googleapis.com', 'fonts.gstatic.com']);

/** Markup inserted right before `</head>`: loads Inter/Montserrat and force-overrides the page's typography. */
const FONT_INJECTION = `
<!-- INDUSTRIAL TYPOGRAPHY OVERRIDE -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700;800;900&display=swap" rel="stylesheet">
<style>
:root {
--main-font: 'Inter', sans-serif !important;
--heading-font: 'Montserrat', sans-serif !important;
--font-family-body: 'Inter', sans-serif !important;
--font-family-heading: 'Montserrat', sans-serif !important;
}
body, p, li, a, span, label, input, textarea, .body-font {
font-family: 'Inter', sans-serif !important;
}
h1, h2, h3, h4, h5, h6, .title-font, .heading-font, [class*="heading"] {
font-family: 'Montserrat', sans-serif !important;
font-weight: 700 !important;
}
/* Salient Specific Heading Classes */
.nectar-milestone .number, .nectar-milestone .subject, .nectar-heading {
font-family: 'Montserrat' !important;
}
</style>
`;

/**
 * Post-processes the scraped page markup.
 *
 * Three rewrites are applied, in this order:
 *  1. the font override block is inserted before the first `</head>` (any letter case);
 *  2. every `<a ... href=...>` is pointed at `#` so the clone cannot navigate away;
 *  3. `<div class="breeze-scripts-load">URL</div>` placeholders are turned into a
 *     real `<link rel="stylesheet">` (URL path ends in `.css`) or `<script src>`.
 *
 * @param html Raw HTML as written by the scraper.
 * @returns The rewritten HTML.
 */
function postProcessHtml(html: string): string {
  // Function replacers are used throughout so that "$" sequences in inserted
  // text are never interpreted as replacement patterns.
  let content = html.replace(/<\/head>/i, (closingTag) => `${FONT_INJECTION}${closingTag}`);

  // The value alternatives are matched per quote style, so an href such as
  // "javascript:f('x')" is consumed as a whole instead of being cut at the
  // inner quote. Requiring whitespace before `href` keeps `data-href` intact.
  content = content.replace(
    /<a\b([^>]*?\s)href\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]+)/gi,
    (_match, beforeHref: string) => `<a${beforeHref}href="#"`
  );

  // Fix Breeze dynamic scripts (Salient optimization)
  content = content.replace(
    /<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
    (_match, rawAssetUrl: string) => {
      // Escape double quotes: the URL is written into a double-quoted attribute.
      const assetUrl = rawAssetUrl.trim().replace(/"/g, '&quot;');
      const pathPart = assetUrl.split('?')[0] ?? '';
      return pathPart.endsWith('.css')
        ? `<link rel="stylesheet" href="${assetUrl}">`
        : `<script src="${assetUrl}"></script>`;
    }
  );

  return content;
}

/**
 * Recursively copies `src` into `dest`, creating directories as needed.
 *
 * `.html` files always overwrite the destination; any other file is copied
 * only when it does not exist yet, so assets from earlier runs are kept.
 *
 * @param src Directory to copy from.
 * @param dest Directory to copy into.
 */
function mergeInto(src: string, dest: string): void {
  fs.mkdirSync(dest, { recursive: true });
  for (const item of fs.readdirSync(src)) {
    const from = path.join(src, item);
    const to = path.join(dest, item);
    if (fs.statSync(from).isDirectory()) {
      mergeInto(from, to);
    } else if (item.endsWith('.html') || !fs.existsSync(to)) {
      fs.copyFileSync(from, to);
    }
  }
}

/**
 * CLI entry point: clones the single page given as the first command-line
 * argument into `../cloned-websites/<sanitized-hostname>/<slug>.html`, with its
 * assets merged into that directory's shared `assets/*` folders.
 *
 * The page is scraped into a temporary directory, post-processed, and then
 * merged into the per-domain directory. The temporary directory is always
 * removed afterwards.
 *
 * Exits with code 1 when the URL argument is missing or cannot be parsed, and
 * sets `process.exitCode = 1` when the clone itself fails.
 */
async function run(): Promise<void> {
  const rawUrl = process.argv[2];
  if (!rawUrl) {
    console.error('Usage: npm run clone-page <URL>');
    process.exit(1);
  }

  // CLEANUP: Aggressively strip shell noise like ; or stray quotes on either end
  const targetUrl = rawUrl
    .trim()
    .replace(/^['"]+/, '')
    .replace(/[;'"]+$/, '');

  let urlObj: URL;
  try {
    urlObj = new URL(targetUrl);
  } catch {
    console.error(`❌ INVALID URL: ${targetUrl}`);
    process.exit(1);
  }

  const domain = urlObj.hostname;
  const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
  const domainDir = path.resolve(__dirname, '../cloned-websites', safeDomain);
  fs.mkdirSync(domainDir, { recursive: true });

  // Determine slug for filename: "/a/b/" -> "a-b", root path -> "index"
  const slug = urlObj.pathname.replace(/^\/|\/$/g, '').replace(/\//g, '-') || 'index';
  const htmlFilename = `${slug}.html`;

  console.log(`🚀 CLONING PAGE: ${targetUrl}`);
  console.log(`📂 SAVING AS: ${htmlFilename} in ${domainDir}`);

  // website-scraper needs an empty directory for each 'scrape' call if we use its defaults,
  // but we want to MERGE assets. So we scrape to a temp dir and then move.
  const tempDir = path.join(domainDir, `_temp_${Date.now()}`);

  const options = {
    urls: [targetUrl],
    directory: tempDir,
    recursive: false,
    plugins: [
      new PuppeteerPlugin({
        launchOptions: { headless: true, args: ['--no-sandbox'] },
        scrollToBottom: { timeout: 15000, viewportN: 10 },
        blockNavigation: false
      })
    ],
    // Sources list covering Salient/WooCommerce lazy assets
    sources: [
      { selector: 'img', attr: 'src' },
      { selector: 'img', attr: 'srcset' },
      { selector: 'img', attr: 'data-src' },
      { selector: 'img', attr: 'data-lazy-src' },
      { selector: 'link[rel="stylesheet"]', attr: 'href' },
      { selector: 'link[rel="preload"]', attr: 'href' },
      { selector: 'script', attr: 'src' },
      { selector: 'video', attr: 'src' },
      { selector: 'video', attr: 'poster' },
      { selector: 'source', attr: 'src' },
      { selector: 'source', attr: 'srcset' },
      { selector: 'iframe', attr: 'src' },
      { selector: 'meta[property="og:image"]', attr: 'content' },
      { selector: '[style*="background-image"]', attr: 'style' }
    ],
    // Shared directory for assets
    subdirectories: [
      { directory: 'assets/img', extensions: ['.jpg', '.png', '.svg', '.webp', '.gif', '.ico'] },
      { directory: 'assets/js', extensions: ['.js'] },
      { directory: 'assets/css', extensions: ['.css'] },
      { directory: 'assets/fonts', extensions: ['.woff', '.woff2', '.ttf', '.eot'] },
      { directory: 'assets/media', extensions: ['.mp4', '.webm'] }
    ],
    request: {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
      }
    },
    urlFilter: (url: string) => {
      let host: string;
      try {
        host = new URL(url).hostname;
      } catch {
        // Defensive: an unparsable URL is skipped instead of aborting the scrape.
        return false;
      }
      // Allow domain assets, google fonts, and common cdn/upload patterns
      return host === domain ||
        GOOGLE_FONT_HOSTS.has(host) ||
        url.includes('wp-content') ||
        url.includes('wp-includes');
    }
  };

  try {
    await scrape(options);

    // Rename the downloaded index.html to our slug.html
    const downloadedHtml = path.join(tempDir, 'index.html');
    const targetHtmlPath = path.join(tempDir, htmlFilename);
    if (fs.existsSync(downloadedHtml)) {
      fs.renameSync(downloadedHtml, targetHtmlPath);
    }

    // POST-PROCESS: Inject Fonts and fix paths in the HTML
    const pageSaved = fs.existsSync(targetHtmlPath);
    if (pageSaved) {
      const original = fs.readFileSync(targetHtmlPath, 'utf8');
      fs.writeFileSync(targetHtmlPath, postProcessHtml(original));
    }

    // MOVE AND MERGE into domainDir (still done when the page is missing, so
    // downloaded assets are kept)
    mergeInto(tempDir, domainDir);

    if (!pageSaved) {
      // Never report success for a page file that does not exist.
      throw new Error(`Scraper did not produce ${htmlFilename}`);
    }
    console.log(`\n✅ SUCCESS: ${path.join(domainDir, htmlFilename)}`);
  } catch (e) {
    console.error('❌ CLONE FAILED:', e);
    // Let shells and CI detect the failure.
    process.exitCode = 1;
  } finally {
    // `force: true` makes this a no-op when the scraper never created the directory.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
// Script entry point. `run` returns a promise; any rejection that escapes it
// is reported here and turned into a non-zero exit code instead of being left
// as an unhandled rejection.
run().catch((err: unknown) => {
  console.error('❌ CLONE FAILED:', err);
  process.exitCode = 1;
});