chore: stabilize apps/web (lint, build, typecheck fixes)
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 1m27s
Build & Deploy / 🏗️ Build (push) Failing after 1m31s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 1m27s
Build & Deploy / 🏗️ Build (push) Failing after 1m31s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🩺 Health Check (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
This commit is contained in:
@@ -1,228 +1,223 @@
|
||||
// @ts-ignore
|
||||
import scrape from 'website-scraper';
|
||||
import scrape from "website-scraper";
|
||||
// @ts-ignore
|
||||
import PuppeteerPlugin from 'website-scraper-puppeteer';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import fs from 'node:fs';
|
||||
import PuppeteerPlugin from "website-scraper-puppeteer";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import fs from "node:fs";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
class CustomFilenameGeneratorPlugin {
|
||||
apply(registerAction: any) {
|
||||
registerAction('generateFilename', ({ resource }: any) => {
|
||||
const url = new URL(resource.url);
|
||||
const ext = path.extname(url.pathname);
|
||||
async function run() {
|
||||
const targetUrl = process.argv[2];
|
||||
if (!targetUrl) {
|
||||
console.error("Usage: npm run clone-website <URL> [output-dir]");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Clean the path
|
||||
let safePath = url.pathname;
|
||||
if (safePath.endsWith('/')) {
|
||||
safePath += 'index.html';
|
||||
} else if (!ext && !resource.isHtml()) {
|
||||
// If no extension and not HTML, guess based on content type?
|
||||
// But usually safe to leave as is or add extension if known.
|
||||
} else if (!ext && resource.isHtml()) {
|
||||
safePath += '.html';
|
||||
const urlObj = new URL(targetUrl);
|
||||
const domain = urlObj.hostname;
|
||||
const safeDomain = domain.replace(/[^a-z0-9-]/gi, "_");
|
||||
const outputDir = process.argv[3]
|
||||
? path.resolve(process.cwd(), process.argv[3])
|
||||
: path.resolve(__dirname, "../cloned-websites", safeDomain);
|
||||
|
||||
if (fs.existsSync(outputDir)) {
|
||||
console.log(`Cleaning existing directory: ${outputDir}`);
|
||||
fs.rmSync(outputDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
console.log(`🚀 Starting recursive clone of ${targetUrl}`);
|
||||
console.log(`📂 Output: ${outputDir}`);
|
||||
|
||||
const options = {
|
||||
urls: [targetUrl],
|
||||
directory: outputDir,
|
||||
recursive: true,
|
||||
maxDepth: 5,
|
||||
// Custom filename generation to avoid "https:/" folders
|
||||
plugins: [
|
||||
new PuppeteerPlugin({
|
||||
launchOptions: {
|
||||
headless: true,
|
||||
args: [
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
],
|
||||
},
|
||||
scrollToBottom: { timeout: 10000, viewportN: 10 },
|
||||
blockNavigation: false,
|
||||
}),
|
||||
new (class LoggerPlugin {
|
||||
apply(registerAction: any) {
|
||||
registerAction("onResourceSaved", ({ resource }: any) => {
|
||||
console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
|
||||
});
|
||||
registerAction("onResourceError", ({ resource, error }: any) => {
|
||||
console.error(` ❌ Error: ${resource.url} - ${error.message}`);
|
||||
});
|
||||
}
|
||||
})(),
|
||||
new (class FilenamePlugin {
|
||||
apply(registerAction: any) {
|
||||
registerAction("generateFilename", ({ resource }: any) => {
|
||||
const u = new URL(resource.url);
|
||||
let filename = u.pathname;
|
||||
|
||||
// normalize
|
||||
if (filename.endsWith("/")) filename += "index.html";
|
||||
else if (!path.extname(filename) && resource.url.includes(domain))
|
||||
filename += "/index.html"; // Assume folder if internal link without ext
|
||||
|
||||
// If it's an external asset, put it in a separate folder
|
||||
if (u.hostname !== domain) {
|
||||
filename = `_external/${u.hostname}${filename}`;
|
||||
}
|
||||
|
||||
// Handle query strings if needed (ignoring them is usually the better simplification for static local files,
|
||||
// unless they determine content. For a clean clone, we usually ignore unique query params)
|
||||
// But if the site relies on routing via query params (e.g. ?page=2), we might want to encode them.
|
||||
// For now, let's keep it simple and clean.
|
||||
// Sanitize filename
|
||||
filename = filename
|
||||
.split("/")
|
||||
.map((part) => part.replace(/[^a-z0-9._-]/gi, "_"))
|
||||
.join("/");
|
||||
|
||||
// Remove leading slash
|
||||
if (safePath.startsWith('/')) safePath = safePath.substring(1);
|
||||
if (filename.startsWith("/")) filename = filename.substring(1);
|
||||
|
||||
// Sanitization
|
||||
safePath = safePath.replace(/[:*?"<>|]/g, '_');
|
||||
// Handle "Unnamed page" by checking if empty
|
||||
if (!filename || filename === "index.html")
|
||||
return { filename: "index.html" };
|
||||
|
||||
// External assets go to a separate folder to avoid collision
|
||||
// Could we detect external resources by checking whether the resource's parent is different?
|
||||
// Actually, simply using the hostname mapping is safer.
|
||||
|
||||
// However, the USER wants "local cloned pages".
|
||||
// If we just use the path, we merge everything into one root.
|
||||
// If there are collisions (e.g. the same path on different domains), this is bad.
|
||||
// But typically we clone ONE site.
|
||||
|
||||
return { filename: safePath };
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async function run() {
|
||||
const targetUrl = process.argv[2];
|
||||
if (!targetUrl) {
|
||||
console.error('Usage: npm run clone-website <URL> [output-dir]');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const urlObj = new URL(targetUrl);
|
||||
const domain = urlObj.hostname;
|
||||
const safeDomain = domain.replace(/[^a-z0-9-]/gi, '_');
|
||||
const outputDir = process.argv[3]
|
||||
? path.resolve(process.cwd(), process.argv[3])
|
||||
: path.resolve(__dirname, '../cloned-websites', safeDomain);
|
||||
|
||||
if (fs.existsSync(outputDir)) {
|
||||
console.log(`Cleaning existing directory: ${outputDir}`);
|
||||
fs.rmSync(outputDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
console.log(`🚀 Starting recursive clone of ${targetUrl}`);
|
||||
console.log(`📂 Output: ${outputDir}`);
|
||||
|
||||
const options = {
|
||||
urls: [targetUrl],
|
||||
directory: outputDir,
|
||||
recursive: true,
|
||||
maxDepth: 5,
|
||||
// Custom filename generation to avoid "https:/" folders
|
||||
plugins: [
|
||||
new PuppeteerPlugin({
|
||||
launchOptions: {
|
||||
headless: true,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
|
||||
},
|
||||
scrollToBottom: { timeout: 10000, viewportN: 10 },
|
||||
blockNavigation: false
|
||||
}),
|
||||
new class LoggerPlugin {
|
||||
apply(registerAction: any) {
|
||||
registerAction('onResourceSaved', ({ resource }: any) => {
|
||||
console.log(` 💾 Saved: ${resource.url} -> ${resource.filename}`);
|
||||
});
|
||||
registerAction('onResourceError', ({ resource, error }: any) => {
|
||||
console.error(` ❌ Error: ${resource.url} - ${error.message}`);
|
||||
});
|
||||
}
|
||||
},
|
||||
new class FilenamePlugin {
|
||||
apply(registerAction: any) {
|
||||
registerAction('generateFilename', ({ resource }: any) => {
|
||||
const u = new URL(resource.url);
|
||||
let filename = u.pathname;
|
||||
|
||||
// normalize
|
||||
if (filename.endsWith('/')) filename += 'index.html';
|
||||
else if (!path.extname(filename) && resource.url.includes(domain)) filename += '/index.html'; // Assume folder if internal link without ext
|
||||
|
||||
// If it's an external asset, put it in a separate folder
|
||||
if (u.hostname !== domain) {
|
||||
filename = `_external/${u.hostname}${filename}`;
|
||||
}
|
||||
|
||||
// Sanitize filename
|
||||
filename = filename.split('/').map(part => part.replace(/[^a-z0-9._-]/gi, '_')).join('/');
|
||||
|
||||
// Remove leading slash
|
||||
if (filename.startsWith('/')) filename = filename.substring(1);
|
||||
|
||||
// Handle "Unnamed page" by checking if empty
|
||||
if (!filename || filename === 'index.html') return { filename: 'index.html' };
|
||||
|
||||
return { filename };
|
||||
});
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
urlFilter: (url: string) => {
|
||||
const u = new URL(url);
|
||||
const isTargetDomain = u.hostname === domain;
|
||||
const isGoogleFonts = u.hostname.includes('fonts.googleapis.com') || u.hostname.includes('fonts.gstatic.com');
|
||||
// Allow assets from anywhere
|
||||
const isAsset = /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(u.pathname);
|
||||
// Allow fonts/css from common CDNs if standard extension check fails
|
||||
const isCommonAsset = u.pathname.includes('/css/') || u.pathname.includes('/js/') || u.pathname.includes('/static/') || u.pathname.includes('/assets/') || u.pathname.includes('/uploads/');
|
||||
|
||||
return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
|
||||
},
|
||||
|
||||
|
||||
sources: [
|
||||
{ selector: 'img', attr: 'src' },
|
||||
{ selector: 'img', attr: 'srcset' },
|
||||
{ selector: 'source', attr: 'src' },
|
||||
{ selector: 'source', attr: 'srcset' },
|
||||
{ selector: 'link[rel="stylesheet"]', attr: 'href' },
|
||||
{ selector: 'link[rel="preload"]', attr: 'href' },
|
||||
{ selector: 'link[rel="prefetch"]', attr: 'href' },
|
||||
{ selector: 'script', attr: 'src' },
|
||||
{ selector: 'video', attr: 'src' },
|
||||
{ selector: 'video', attr: 'poster' },
|
||||
{ selector: 'iframe', attr: 'src' },
|
||||
{ selector: 'link[rel*="icon"]', attr: 'href' },
|
||||
{ selector: 'link[rel="manifest"]', attr: 'href' },
|
||||
{ selector: 'meta[property="og:image"]', attr: 'content' }
|
||||
],
|
||||
|
||||
request: {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
|
||||
}
|
||||
return { filename };
|
||||
});
|
||||
}
|
||||
};
|
||||
})(),
|
||||
],
|
||||
|
||||
try {
|
||||
// @ts-ignore
|
||||
const result = await scrape(options);
|
||||
console.log(`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`);
|
||||
urlFilter: (url: string) => {
|
||||
const u = new URL(url);
|
||||
const isTargetDomain = u.hostname === domain;
|
||||
const isGoogleFonts =
|
||||
u.hostname.includes("fonts.googleapis.com") ||
|
||||
u.hostname.includes("fonts.gstatic.com");
|
||||
// Allow assets from anywhere
|
||||
const isAsset =
|
||||
/\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|eot|mp4|webm|ico|json|webp)$/i.test(
|
||||
u.pathname,
|
||||
);
|
||||
// Allow fonts/css from common CDNs if standard extension check fails
|
||||
const isCommonAsset =
|
||||
u.pathname.includes("/css/") ||
|
||||
u.pathname.includes("/js/") ||
|
||||
u.pathname.includes("/static/") ||
|
||||
u.pathname.includes("/assets/") ||
|
||||
u.pathname.includes("/uploads/");
|
||||
|
||||
// Post-processing: Sanitize HTML to remove Next.js hydration scripts
|
||||
// This prevents the static site from trying to "hydrate" and breaking images/links
|
||||
console.log('🧹 Sanitizing HTML files...');
|
||||
sanitizeHtmlFiles(outputDir);
|
||||
return isTargetDomain || isAsset || isCommonAsset || isGoogleFonts;
|
||||
},
|
||||
|
||||
console.log(`open "${path.join(outputDir, 'index.html')}"`);
|
||||
} catch (error) {
|
||||
console.error('❌ Error cloning website:', error);
|
||||
process.exit(1);
|
||||
}
|
||||
sources: [
|
||||
{ selector: "img", attr: "src" },
|
||||
{ selector: "img", attr: "srcset" },
|
||||
{ selector: "source", attr: "src" },
|
||||
{ selector: "source", attr: "srcset" },
|
||||
{ selector: 'link[rel="stylesheet"]', attr: "href" },
|
||||
{ selector: 'link[rel="preload"]', attr: "href" },
|
||||
{ selector: 'link[rel="prefetch"]', attr: "href" },
|
||||
{ selector: "script", attr: "src" },
|
||||
{ selector: "video", attr: "src" },
|
||||
{ selector: "video", attr: "poster" },
|
||||
{ selector: "iframe", attr: "src" },
|
||||
{ selector: 'link[rel*="icon"]', attr: "href" },
|
||||
{ selector: 'link[rel="manifest"]', attr: "href" },
|
||||
{ selector: 'meta[property="og:image"]', attr: "content" },
|
||||
],
|
||||
|
||||
request: {
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
try {
|
||||
// @ts-ignore
|
||||
const result = await scrape(options);
|
||||
console.log(
|
||||
`\n✅ Successfully cloned ${result.length} resources to ${outputDir}`,
|
||||
);
|
||||
|
||||
// Post-processing: Sanitize HTML to remove Next.js hydration scripts
|
||||
// This prevents the static site from trying to "hydrate" and breaking images/links
|
||||
console.log("🧹 Sanitizing HTML files...");
|
||||
sanitizeHtmlFiles(outputDir);
|
||||
|
||||
console.log(`open "${path.join(outputDir, "index.html")}"`);
|
||||
} catch (error) {
|
||||
console.error("❌ Error cloning website:", error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
function sanitizeHtmlFiles(dir: string) {
|
||||
const files = fs.readdirSync(dir);
|
||||
for (const file of files) {
|
||||
const fullPath = path.join(dir, file);
|
||||
if (fs.statSync(fullPath).isDirectory()) {
|
||||
sanitizeHtmlFiles(fullPath);
|
||||
} else if (file.endsWith('.html')) {
|
||||
let content = fs.readFileSync(fullPath, 'utf8');
|
||||
const files = fs.readdirSync(dir);
|
||||
for (const file of files) {
|
||||
const fullPath = path.join(dir, file);
|
||||
if (fs.statSync(fullPath).isDirectory()) {
|
||||
sanitizeHtmlFiles(fullPath);
|
||||
} else if (file.endsWith(".html")) {
|
||||
let content = fs.readFileSync(fullPath, "utf8");
|
||||
|
||||
// Remove Next.js data script
|
||||
content = content.replace(/<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi, '');
|
||||
// Remove Next.js data script
|
||||
content = content.replace(
|
||||
/<script id="__NEXT_DATA__"[\s\S]*?<\/script>/gi,
|
||||
"",
|
||||
);
|
||||
|
||||
// Remove Next.js chunk scripts (hydration)
|
||||
// match <script src="..._next/static/chunks..." ...
|
||||
content = content.replace(/<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi, '');
|
||||
content = content.replace(/<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi, '');
|
||||
// Remove Next.js chunk scripts (hydration)
|
||||
// match <script src="..._next/static/chunks..." ...
|
||||
content = content.replace(
|
||||
/<script[^>]+src="[^"]*\/_next\/static\/chunks\/[^"]*"[^>]*><\/script>/gi,
|
||||
"",
|
||||
);
|
||||
content = content.replace(
|
||||
/<script[^>]+src="[^"]*\/_next\/static\/[^"]*Manifest\.js"[^>]*><\/script>/gi,
|
||||
"",
|
||||
);
|
||||
|
||||
// Convert Breeze dynamic script/styles into actual tags if possible
|
||||
// match <div class="breeze-scripts-load" ...>URL</div>
|
||||
content = content.replace(/<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi, (match, url) => {
|
||||
if (url.endsWith('.css')) return `<link rel="stylesheet" href="${url}">`;
|
||||
return `<script src="${url}"></script>`;
|
||||
});
|
||||
// Convert Breeze dynamic script/styles into actual tags if possible
|
||||
// match <div class="breeze-scripts-load" ...>URL</div>
|
||||
content = content.replace(
|
||||
/<div[^>]+class="breeze-scripts-load"[^>]*>([^<]+)<\/div>/gi,
|
||||
(match, url) => {
|
||||
if (url.endsWith(".css"))
|
||||
return `<link rel="stylesheet" href="${url}">`;
|
||||
return `<script src="${url}"></script>`;
|
||||
},
|
||||
);
|
||||
|
||||
// Inject Fonts (Fix for missing dynamic fonts)
|
||||
// We inject Inter and Montserrat as safe defaults for industrial/modern sites
|
||||
// Check specifically for a stylesheet link to google fonts
|
||||
const hasGoogleFontStylesheet = /<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test(content);
|
||||
if (!hasGoogleFontStylesheet) {
|
||||
const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap">`;
|
||||
const styleBlock = `<style>
|
||||
// Inject Fonts (Fix for missing dynamic fonts)
|
||||
// We inject Inter and Montserrat as safe defaults for industrial/modern sites
|
||||
// Check specifically for a stylesheet link to google fonts
|
||||
const hasGoogleFontStylesheet =
|
||||
/<link[^>]+rel="stylesheet"[^>]+href="[^"]*fonts\.googleapis\.com/i.test(
|
||||
content,
|
||||
);
|
||||
if (!hasGoogleFontStylesheet) {
|
||||
const fontLink = `<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Montserrat:wght@300;400;500;600;700&display=swap">`;
|
||||
const styleBlock = `<style>
|
||||
:root { --main-font: 'Inter', sans-serif; --heading-font: 'Montserrat', sans-serif; }
|
||||
body, .body-font, p, span, li, a { font-family: var(--main-font) !important; }
|
||||
h1, h2, h3, h4, h5, h6, .title-font, .heading-font { font-family: var(--heading-font) !important; }
|
||||
</style>`;
|
||||
content = content.replace('</head>', `${fontLink}${styleBlock}</head>`);
|
||||
}
|
||||
content = content.replace("</head>", `${fontLink}${styleBlock}</head>`);
|
||||
}
|
||||
|
||||
// Force column layout on product pages
|
||||
if (content.includes('class="products')) {
|
||||
const layoutScript = `
|
||||
// Force column layout on product pages
|
||||
if (content.includes('class="products')) {
|
||||
const layoutScript = `
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const products = document.querySelector('.products');
|
||||
@@ -233,12 +228,12 @@ function sanitizeHtmlFiles(dir: string) {
|
||||
}
|
||||
});
|
||||
</script>`;
|
||||
content = content.replace('</body>', `${layoutScript}</body>`);
|
||||
}
|
||||
content = content.replace("</body>", `${layoutScript}</body>`);
|
||||
}
|
||||
|
||||
fs.writeFileSync(fullPath, content);
|
||||
}
|
||||
fs.writeFileSync(fullPath, content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
run();
|
||||
|
||||
Reference in New Issue
Block a user