// ============================================================================
// Scraper — Zyte API + Local Persistence
// Crawls all pages of a website, stores them locally for reuse.
// ============================================================================

import * as cheerio from "cheerio";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import type { CrawledPage, PageType } from "./types.js";

/** Options accepted by the crawler. */
interface ScraperConfig {
  /** Zyte API key; when absent, pages are fetched with a plain HTTP GET. */
  zyteApiKey?: string;
  /** Root directory under which one folder per crawled domain is created. */
  crawlDir: string;
  /** Upper bound on pages visited per crawl (crawlSite falls back to 30). */
  maxPages?: number;
}

/**
 * Lower-case keyword fragments that identify each page type.
 * Entries are checked in order, so earlier types win when several match.
 */
const PAGE_TYPE_KEYWORDS: ReadonlyArray<
  readonly [PageType, readonly string[]]
> = [
  ["service", ["service", "leistung", "kompetenz"]],
  ["about", ["about", "ueber", "über", "unternehmen"]],
  ["contact", ["contact", "kontakt"]],
  ["career", ["job", "karriere", "career", "human-resources"]],
  ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
  ["blog", ["blog", "news", "aktuelles", "magazin"]],
  ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
];

/**
 * Classify a URL pathname into a page type.
 *
 * Matching is a case-insensitive substring search against the keyword table
 * above. The pathname is percent-decoded first because `URL.pathname` is
 * always percent-encoded ("/über-uns" arrives as "/%C3%BCber-uns"), which
 * would otherwise make non-ASCII keywords such as "über" unmatchable.
 *
 * @param pathname - Path component of a URL, encoded or not.
 * @returns The first matching page type, "home" for the root, else "other".
 */
function classifyPage(pathname: string): PageType {
  let decoded = pathname;
  try {
    decoded = decodeURI(pathname);
  } catch {
    // Malformed percent-escape: classify the raw pathname instead of throwing.
  }

  // NFC so a decomposed "u" + combining diaeresis still matches "ü".
  const p = decoded.normalize("NFC").toLowerCase();

  if (p === "/" || p === "" || p === "/index.html") return "home";

  for (const [type, keywords] of PAGE_TYPE_KEYWORDS) {
    if (keywords.some((keyword) => p.includes(keyword))) return type;
  }
  return "other";
}
/**
 * Detect interactive features present on a page.
 *
 * Each feature is recognised purely by CSS selectors (no scripts are run).
 *
 * @param $ - Cheerio handle for the parsed document.
 * @returns Feature names in a fixed order: "search", "forms", "maps",
 *          "video", "calendar", "cookie-consent" (only those detected).
 */
function detectFeatures($: cheerio.CheerioAPI): string[] {
  const features: string[] = [];
  const has = (selector: string): boolean => $(selector).length > 0;

  // Search
  if (
    has('input[type="search"]') ||
    has('form[role="search"]') ||
    has(".search-form, .search-box, #search, .searchbar") ||
    has('input[name="q"], input[name="s"], input[name="search"]')
  ) {
    features.push("search");
  }

  // Forms (beyond search). Only <form> elements may be subtracted here:
  // counting a non-form ".search-form" wrapper would cancel out an unrelated
  // form elsewhere on the page. A form nested in such a wrapper is still
  // treated as a search form.
  const formCount = $("form").length;
  const searchFormCount = $(
    'form[role="search"], form.search-form, .search-form form',
  ).length;
  if (formCount > searchFormCount) {
    features.push("forms");
  }

  // Maps
  if (
    has(
      'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
    )
  ) {
    features.push("maps");
  }

  // Video
  if (
    has("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")
  ) {
    features.push("video");
  }

  // Calendar / Events
  if (has(".calendar, .event, [data-calendar]")) {
    features.push("calendar");
  }

  // Cookie consent
  if (has(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]")) {
    features.push("cookie-consent");
  }

  return features;
}

/** File extensions treated as static assets rather than crawlable pages. */
const ASSET_EXTENSION_RE =
  /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;

/**
 * Extract all internal links from a page.
 *
 * Every `href` is resolved against `origin` (not against the page's own URL),
 * and only links whose resolved origin equals `origin` are kept. Query strings
 * and fragments are dropped; asset URLs and pure "#anchor" links are skipped.
 *
 * @param $ - Cheerio handle for the parsed document.
 * @param origin - Site origin, e.g. "https://example.com" (no trailing slash).
 * @returns Unique pathnames in order of first appearance.
 */
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
  const paths = new Set<string>();

  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;

    let url: URL;
    try {
      url = new URL(href, origin);
    } catch {
      // Invalid URL, skip
      return;
    }

    if (url.origin !== origin) return;
    // Skip assets
    if (ASSET_EXTENSION_RE.test(url.pathname)) return;
    // Skip anchors-only ("#top" resolves to "<origin>/#top")
    if (url.pathname === "/" && url.hash) return;

    paths.add(url.pathname);
  });

  return [...paths];
}
/**
 * Extract all images from a page.
 *
 * Collects `<img src>` values and `url(...)` references from inline
 * `background-image` styles, resolves them against `origin`, and drops
 * inline `data:image` URIs as well as SVGs whose path does not contain
 * "logo". The SVG check is case-insensitive so "Logo.svg" is kept and
 * "ICON.SVG" is dropped.
 *
 * @param $ - Cheerio handle for the parsed document.
 * @param origin - Base used to resolve relative image URLs.
 * @returns Unique absolute image URLs in order of first appearance.
 */
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
  const candidates: string[] = [];

  // Regular img tags
  $("img[src]").each((_, el) => {
    const src = $(el).attr("src");
    if (src) candidates.push(src);
  });

  // CSS background images (inline styles)
  $("[style*='background-image']").each((_, el) => {
    const match = $(el)
      .attr("style")
      ?.match(/url\(['"]?(.*?)['"]?\)/);
    if (match && match[1]) candidates.push(match[1]);
  });

  // Resolve URLs to absolute
  const absolute = new Set<string>();
  for (const candidate of candidates) {
    if (candidate.startsWith("data:image")) continue; // Skip inline base64
    try {
      const url = new URL(candidate, origin);
      const lowerPath = url.pathname.toLowerCase();
      // Ignore generic vectors (icons etc.) but keep anything named like a logo
      if (lowerPath.endsWith(".svg") && !lowerPath.includes("logo")) continue;
      absolute.add(url.href);
    } catch {
      // Invalid URL
    }
  }
  return [...absolute];
}

/**
 * Fetch a page via Zyte API with browser rendering.
 *
 * @param url - Page to render.
 * @param apiKey - Zyte API key, sent as the HTTP Basic username.
 * @param retriesLeft - How many more times a 429 response may be retried.
 *        Defaults to 1 ("retry once"); without this bound a persistently
 *        rate-limited key would recurse forever.
 * @returns The rendered HTML, or "" when Zyte returned no `browserHtml`.
 * @throws Error `HTTP <status>: <body>` for any non-OK response that is not
 *         retried; fetch/timeout errors propagate unchanged.
 */
async function fetchWithZyte(
  url: string,
  apiKey: string,
  retriesLeft = 1,
): Promise<string> {
  const auth = Buffer.from(`${apiKey}:`).toString("base64");

  const resp = await fetch("https://api.zyte.com/v1/extract", {
    method: "POST",
    headers: {
      Authorization: `Basic ${auth}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url,
      browserHtml: true,
    }),
    signal: AbortSignal.timeout(60000),
  });

  if (!resp.ok) {
    const errorText = await resp.text();
    console.error(
      ` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`,
    );

    // Rate limited — wait and retry while the retry budget lasts
    if (resp.status === 429 && retriesLeft > 0) {
      console.log(" ⏳ Rate limited, waiting 5s and retrying...");
      await new Promise((r) => setTimeout(r, 5000));
      return fetchWithZyte(url, apiKey, retriesLeft - 1);
    }

    throw new Error(`HTTP ${resp.status}: ${errorText}`);
  }

  const data = (await resp.json()) as { browserHtml?: unknown };
  const html = typeof data.browserHtml === "string" ? data.browserHtml : "";
  if (!html) {
    console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
  }
  return html;
}
/**
 * Fetch a page via simple HTTP GET (fallback).
 *
 * Best-effort: network errors, timeouts (30 s) and non-2xx responses all
 * yield an empty string rather than an exception.
 *
 * @param url - Page to download.
 * @returns The response body, or "" when the request failed.
 */
async function fetchDirect(url: string): Promise<string> {
  let resp: Response;
  try {
    resp = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      },
      signal: AbortSignal.timeout(30000),
    });
  } catch {
    // Deliberately swallowed: the caller treats "" as "page unavailable".
    return "";
  }
  if (!resp.ok) return "";
  return await resp.text();
}

/**
 * Parse an HTML string into a CrawledPage.
 *
 * @param html - Raw (or browser-rendered) document markup.
 * @param url - Absolute URL the markup was fetched from; must be parseable
 *        by `new URL`.
 * @returns Structured page data: title, h1–h3 headings, nav link labels,
 *          whitespace-collapsed body text (capped at 50,000 characters),
 *          detected features, internal links, images and meta tags.
 */
function parsePage(html: string, url: string): CrawledPage {
  const $ = cheerio.load(html);
  const { origin, pathname } = new URL(url);

  const trimmedTexts = (selector: string): string[] =>
    $(selector)
      .map((_, el) => $(el).text().trim())
      .get();
  const metaContent = (selector: string): string | undefined =>
    $(selector).attr("content") || undefined;

  const title = $("title").text().trim();
  const headings = trimmedTexts("h1, h2, h3").filter((h) => h.length > 0);
  // Very long "labels" are almost certainly not navigation entries.
  const navItems = trimmedTexts("nav a").filter(
    (t) => t.length > 0 && t.length < 100,
  );
  const bodyText = $("body")
    .text()
    .replace(/\s+/g, " ")
    .substring(0, 50000)
    .trim();

  return {
    url,
    pathname,
    title,
    html,
    text: bodyText,
    headings,
    navItems,
    features: detectFeatures($),
    type: classifyPage(pathname),
    links: extractInternalLinks($, origin),
    images: extractImages($, origin),
    meta: {
      description: metaContent('meta[name="description"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogImage: metaContent('meta[property="og:image"]'),
    },
  };
}

/**
 * Crawl a website and persist all pages locally.
 *
 * If `<crawlDir>/<domain-with-dashes>/_crawl_meta.json` already exists the
 * cached crawl is loaded from disk and nothing is fetched. Otherwise the site
 * is crawled breadth-first from `targetUrl`, following same-origin links,
 * until `maxPages` pathnames have been attempted. Each page is written as
 * `<name>.html` + `<name>.meta.json`; failed or near-empty pages are skipped
 * but still count towards the limit.
 *
 * The crawl marker `_crawl_meta.json` is only written when at least one page
 * was stored, so a crawl that failed completely is retried on the next call
 * instead of being cached as an empty result.
 *
 * @param targetUrl - Absolute start URL.
 * @param config - API key, output directory and page limit.
 * @returns An array of CrawledPage objects (possibly empty).
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const urlObj = new URL(targetUrl);
  const origin = urlObj.origin;
  const domain = urlObj.hostname;
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));

  // Check for existing crawl
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }

  console.log(
    `🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`,
  );

  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });

  // `||` on purpose: a limit of 0 is treated as "not set".
  const maxPages = config.maxPages || 30;
  const visited = new Set<string>();
  // Pathnames already placed on the queue, so a link that appears on many
  // pages is enqueued once instead of once per page.
  const enqueued = new Set<string>([urlObj.pathname]);
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];

  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift();
    if (url === undefined) break;

    const urlPath = new URL(url).pathname;
    if (visited.has(urlPath)) continue;
    visited.add(urlPath);

    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);

      const html = config.zyteApiKey
        ? await fetchWithZyte(url, config.zyteApiKey)
        : await fetchDirect(url);

      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }

      const page = parsePage(html, url);
      pages.push(page);

      // Save HTML + metadata to disk
      const safeName =
        urlPath === "/"
          ? "index"
          : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );

      // Discover new links
      for (const link of page.links) {
        if (visited.has(link) || enqueued.has(link)) continue;
        enqueued.add(link);
        queue.push(`${origin}${link}`);
      }
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` ⚠️ Failed to fetch ${url}: ${reason}`);
    }
  }

  if (pages.length === 0) {
    // Writing the marker now would make every later call load an empty cache.
    console.warn(
      ` ⚠️ No pages could be crawled for ${domain}; not caching an empty crawl.`,
    );
    return pages;
  }

  // Save crawl metadata
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );

  console.log(
    `✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`,
  );
  return pages;
}
/**
 * Load a previously crawled site from disk.
 *
 * Reads every `<name>.meta.json` in `domainDir`, pairs it with
 * `<name>.html` when that file exists, and recomputes the body text from the
 * HTML (text is not stored in the metadata file). Pages come back in
 * directory-listing order, which is not necessarily the crawl order.
 *
 * @param domainDir - Per-domain folder written by the crawler.
 * @returns One CrawledPage per metadata file; list fields missing from the
 *          metadata default to empty values and `type` to "other".
 * @throws If the directory or a file cannot be read, or a metadata file is
 *         not valid JSON.
 */
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
  const META_SUFFIX = ".meta.json";

  const files = await fs.readdir(domainDir);
  const available = new Set(files);
  const metaFiles = files.filter(
    // The crawl marker does not carry the ".meta.json" suffix, but keep the
    // explicit exclusion as a guard.
    (f) => f.endsWith(META_SUFFIX) && f !== "_crawl_meta.json",
  );

  const pages: CrawledPage[] = [];
  for (const metaFile of metaFiles) {
    // Strip the suffix only; `replace` would remove the first occurrence,
    // which is wrong for names that contain ".meta.json" earlier on.
    const baseName = metaFile.slice(0, -META_SUFFIX.length);
    const htmlFile = `${baseName}.html`;

    const meta = JSON.parse(
      await fs.readFile(path.join(domainDir, metaFile), "utf8"),
    );

    const html = available.has(htmlFile)
      ? await fs.readFile(path.join(domainDir, htmlFile), "utf8")
      : "";

    const text = html
      ? cheerio
          .load(html)("body")
          .text()
          .replace(/\s+/g, " ")
          .substring(0, 50000)
          .trim()
      : "";

    pages.push({
      url: meta.url,
      pathname: meta.pathname,
      title: meta.title,
      html,
      text,
      headings: meta.headings || [],
      navItems: meta.navItems || [],
      features: meta.features || [],
      type: meta.type || "other",
      links: meta.links || [],
      images: meta.images || [],
      meta: meta.meta || {},
    });
  }

  console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
  return pages;
}

/**
 * Delete a cached crawl to force re-crawl.
 *
 * Removes `<crawlDir>/<domain with dots replaced by dashes>` recursively.
 * Does nothing (and logs nothing) when that folder does not exist.
 *
 * @param crawlDir - Root crawl directory.
 * @param domain - Hostname of the site, e.g. "example.com".
 */
export async function clearCrawlCache(
  crawlDir: string,
  domain: string,
): Promise<void> {
  const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
  if (!existsSync(domainDir)) return;

  await fs.rm(domainDir, { recursive: true, force: true });
  console.log(`🧹 Cleared crawl cache for ${domain}`);
}