Files
at-mintel/packages/concept-engine/src/scraper.ts
Marc Mintel 7702310a9c
All checks were successful
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 3s
Monorepo Pipeline / 🧹 Lint (push) Successful in 1m19s
Monorepo Pipeline / 🧪 Test (push) Successful in 1m5s
Monorepo Pipeline / 🏗️ Build (push) Successful in 1m26s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
chore: remove Directus CMS and related dependencies
2026-02-27 19:06:06 +01:00

479 lines
12 KiB
TypeScript

// ============================================================================
// Scraper — Zyte API + Local Persistence
// Crawls all pages of a website, stores them locally for reuse.
// ============================================================================
import * as cheerio from "cheerio";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import type { CrawledPage, PageType } from "./types.js";
/** Configuration for {@link crawlSite}. */
interface ScraperConfig {
  /** Zyte API key; when absent, pages are fetched via plain HTTP GET instead. */
  zyteApiKey?: string;
  /** Base directory where crawled HTML + metadata are persisted, one subfolder per domain. */
  crawlDir: string;
  /** Maximum number of pages to visit per crawl (defaults to 30). */
  maxPages?: number;
}
/**
 * Classify a URL pathname into a coarse page type.
 *
 * Matching is case-insensitive substring search; the first matching rule
 * wins, so the rule order below is significant.
 */
function classifyPage(pathname: string): PageType {
  const lower = pathname.toLowerCase();

  // The homepage is an exact match, not a keyword match.
  if (lower === "/" || lower === "" || lower === "/index.html") return "home";

  // Ordered keyword rules (German + English variants).
  const rules: Array<[PageType, string[]]> = [
    ["service", ["service", "leistung", "kompetenz"]],
    ["about", ["about", "ueber", "über", "unternehmen"]],
    ["contact", ["contact", "kontakt"]],
    ["career", ["job", "karriere", "career", "human-resources"]],
    ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
    ["blog", ["blog", "news", "aktuelles", "magazin"]],
    ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
  ];

  for (const [type, keywords] of rules) {
    if (keywords.some((kw) => lower.includes(kw))) return type;
  }
  return "other";
}
/**
 * Detect interactive features present on a page.
 *
 * Returns feature identifiers in a fixed order:
 * "search", "forms", "maps", "video", "calendar", "cookie-consent".
 */
function detectFeatures($: cheerio.CheerioAPI): string[] {
  const found: string[] = [];
  const present = (selector: string): boolean => $(selector).length > 0;

  // Search: native search inputs, search-role forms, or common class names.
  const hasSearch =
    present('input[type="search"]') ||
    present('form[role="search"]') ||
    present(".search-form, .search-box, #search, .searchbar") ||
    present('input[name="q"], input[name="s"], input[name="search"]');
  if (hasSearch) found.push("search");

  // Forms beyond search: any form that is not a search form.
  const totalForms = $("form").length;
  const searchForms = $('form[role="search"], .search-form').length;
  if (totalForms > searchForms) found.push("forms");

  // Embedded maps (Google Maps / OpenStreetMap iframes or map containers).
  if (
    present(
      'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
    )
  ) {
    found.push("maps");
  }

  // Native video elements or YouTube/Vimeo embeds.
  if (
    present("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")
  ) {
    found.push("video");
  }

  // Calendar / event widgets.
  if (present(".calendar, .event, [data-calendar]")) {
    found.push("calendar");
  }

  // Cookie consent banners.
  if (present(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]")) {
    found.push("cookie-consent");
  }

  return found;
}
/**
 * Extract all internal links (same-origin pathnames) from a page.
 *
 * Static assets and root-only anchor links are excluded; the result is
 * deduplicated while preserving first-seen order.
 */
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
  const assetPattern =
    /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;
  const found = new Set<string>();

  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;

    let url: URL;
    try {
      url = new URL(href, origin);
    } catch {
      return; // malformed href — skip
    }

    if (url.origin !== origin) return; // external link
    if (assetPattern.test(url.pathname)) return; // static asset
    if (url.pathname === "/" && url.hash) return; // in-page anchor only

    found.add(url.pathname);
  });

  return [...found];
}
/**
 * Extract all images from a page as absolute URLs.
 *
 * Collects <img src> values and inline-style background images, resolves
 * them against the page origin, and skips data: URIs and non-logo SVGs.
 * Result is deduplicated.
 */
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
  const candidates: string[] = [];

  // Regular <img> tags.
  $("img[src]").each((_, el) => {
    const src = $(el).attr("src");
    if (src) candidates.push(src);
  });

  // Inline-style CSS background images.
  $("[style*='background-image']").each((_, el) => {
    const match = $(el).attr("style")?.match(/url\(['"]?(.*?)['"]?\)/);
    if (match?.[1]) candidates.push(match[1]);
  });

  // Resolve to absolute URLs, dropping unusable entries.
  const resolved = new Set<string>();
  for (const candidate of candidates) {
    if (candidate.startsWith("data:image")) continue; // inline base64
    try {
      const url = new URL(candidate, origin);
      // Ignore small tracking pixels or generic vectors.
      if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) {
        continue;
      }
      resolved.add(url.href);
    } catch {
      // Invalid URL — skip.
    }
  }

  return [...resolved];
}
/**
 * Fetch a page via the Zyte API with browser rendering.
 *
 * @param url - Page URL to render.
 * @param apiKey - Zyte API key (sent as the Basic-auth username).
 * @param retriesLeft - Remaining retries for HTTP 429 responses. Defaults to 1
 *   so a rate-limited request is retried exactly once; the previous version
 *   recursed unconditionally and could retry forever under sustained 429s.
 * @returns The rendered HTML, or "" when Zyte returns an empty browserHtml.
 * @throws Error on any non-OK status (after 429 retries are exhausted).
 */
async function fetchWithZyte(
  url: string,
  apiKey: string,
  retriesLeft = 1,
): Promise<string> {
  // Zyte uses HTTP Basic auth: API key as username, empty password.
  const auth = Buffer.from(`${apiKey}:`).toString("base64");
  const resp = await fetch("https://api.zyte.com/v1/extract", {
    method: "POST",
    headers: {
      Authorization: `Basic ${auth}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url,
      browserHtml: true, // render with Zyte's headless browser
    }),
    signal: AbortSignal.timeout(60000),
  });
  if (!resp.ok) {
    const errorText = await resp.text();
    console.error(
      ` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`,
    );
    // Rate limited — wait and retry, but only while retries remain.
    if (resp.status === 429 && retriesLeft > 0) {
      console.log(" ⏳ Rate limited, waiting 5s and retrying...");
      await new Promise((r) => setTimeout(r, 5000));
      return fetchWithZyte(url, apiKey, retriesLeft - 1);
    }
    throw new Error(`HTTP ${resp.status}: ${errorText}`);
  }
  const data = await resp.json();
  const html = data.browserHtml || "";
  if (!html) {
    console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
  }
  return html;
}
/**
 * Fetch a page via plain HTTP GET (fallback when no Zyte key is configured).
 *
 * Network failures and non-OK statuses resolve to "" rather than throwing;
 * the desktop User-Agent avoids trivial bot blocking.
 */
async function fetchDirect(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    },
    signal: AbortSignal.timeout(30000),
  }).catch(() => null);

  if (response?.ok) {
    return await response.text();
  }
  return "";
}
/**
 * Parse an HTML string into a CrawledPage.
 *
 * Extracts title, headings (h1–h3), nav labels, collapsed body text (capped
 * at 50k chars), detected features, same-origin links, images, and common
 * meta/OpenGraph tags.
 */
function parsePage(html: string, url: string): CrawledPage {
  const $ = cheerio.load(html);
  const parsed = new URL(url);

  // Meta / OpenGraph tags — undefined when absent or empty.
  const description =
    $('meta[name="description"]').attr("content") || undefined;
  const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
  const ogImage = $('meta[property="og:image"]').attr("content") || undefined;

  // Visible text with whitespace collapsed, truncated to 50k chars.
  const text = $("body")
    .text()
    .replace(/\s+/g, " ")
    .substring(0, 50000)
    .trim();

  return {
    url,
    pathname: parsed.pathname,
    title: $("title").text().trim(),
    html,
    text,
    headings: $("h1, h2, h3")
      .map((_, el) => $(el).text().trim())
      .get()
      .filter((h) => h.length > 0),
    navItems: $("nav a")
      .map((_, el) => $(el).text().trim())
      .get()
      .filter((t) => t.length > 0 && t.length < 100),
    features: detectFeatures($),
    type: classifyPage(parsed.pathname),
    links: extractInternalLinks($, parsed.origin),
    images: extractImages($, parsed.origin),
    meta: { description, ogTitle, ogImage },
  };
}
/**
 * Crawl a website and persist all pages locally.
 *
 * Performs a same-origin breadth-first crawl starting at targetUrl, fetching
 * each page via Zyte (when an API key is configured) or direct HTTP, parsing
 * it, and writing the raw HTML plus a `<name>.meta.json` summary into a
 * per-domain folder under config.crawlDir. If a `_crawl_meta.json` marker
 * already exists for the domain, the cached crawl is loaded from disk and no
 * network requests are made.
 *
 * Returns an array of CrawledPage objects.
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const urlObj = new URL(targetUrl);
  const origin = urlObj.origin;
  const domain = urlObj.hostname;
  // Dots become dashes so the folder name is filesystem-friendly.
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
  // Check for existing crawl — the marker file is only written after a
  // completed crawl (see bottom of this function), so its presence means a
  // full cached copy is available.
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }
  console.log(
    `🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`,
  );
  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });
  const maxPages = config.maxPages || 30;
  // `visited` is keyed by pathname, so the same path with different query
  // strings or hashes is fetched only once.
  const visited = new Set<string>();
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];
  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift()!;
    const urlPath = new URL(url).pathname;
    if (visited.has(urlPath)) continue;
    // NOTE: marked visited before fetching, so pages that fail or return an
    // empty body still consume one of the maxPages slots.
    visited.add(urlPath);
    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
      let html: string;
      if (config.zyteApiKey) {
        html = await fetchWithZyte(url, config.zyteApiKey);
      } else {
        html = await fetchDirect(url);
      }
      // Guard against empty fetches and trivially small error/placeholder pages.
      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }
      const page = parsePage(html, url);
      pages.push(page);
      // Save HTML + metadata to disk. "/" maps to "index"; other paths have
      // slashes replaced with underscores (e.g. "/a/b" -> "a_b").
      const safeName =
        urlPath === "/"
          ? "index"
          : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      // The .meta.json mirrors CrawledPage minus the bulky `html`/`text`
      // fields; loadCrawlFromDisk rebuilds those from the saved HTML.
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );
      // Discover new links (same-origin pathnames) and enqueue unseen ones.
      for (const link of page.links) {
        if (!visited.has(link)) {
          const fullUrl = `${origin}${link}`;
          queue.push(fullUrl);
        }
      }
    } catch (err) {
      // A single failed page should not abort the whole crawl.
      console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
    }
  }
  // Save crawl metadata — this marker makes future calls use the disk cache.
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );
  console.log(
    `✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`,
  );
  return pages;
}
/**
 * Load a previously crawled site from disk.
 *
 * Reads every `<name>.meta.json` in the domain folder (except the
 * `_crawl_meta.json` marker), pairs it with its saved `<name>.html`, and
 * rebuilds the body text from the HTML.
 */
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
  const entries = await fs.readdir(domainDir);
  const pages: CrawledPage[] = [];

  for (const entry of entries) {
    // Only per-page metadata files; skip the crawl-level marker.
    if (!entry.endsWith(".meta.json") || entry === "_crawl_meta.json") continue;

    const meta = JSON.parse(
      await fs.readFile(path.join(domainDir, entry), "utf8"),
    );

    // The HTML sibling may be missing; fall back to an empty document.
    const htmlName = `${entry.replace(".meta.json", "")}.html`;
    let html = "";
    if (entries.includes(htmlName)) {
      html = await fs.readFile(path.join(domainDir, htmlName), "utf8");
    }

    // Rebuild collapsed body text (capped at 50k chars) from the saved HTML.
    let text = "";
    if (html) {
      text = cheerio
        .load(html)("body")
        .text()
        .replace(/\s+/g, " ")
        .substring(0, 50000)
        .trim();
    }

    pages.push({
      url: meta.url,
      pathname: meta.pathname,
      title: meta.title,
      html,
      text,
      headings: meta.headings || [],
      navItems: meta.navItems || [],
      features: meta.features || [],
      type: meta.type || "other",
      links: meta.links || [],
      images: meta.images || [],
      meta: meta.meta || {},
    });
  }

  console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
  return pages;
}
/**
 * Delete a cached crawl to force re-crawl.
 *
 * Removes the per-domain folder (dots in the domain mapped to dashes, the
 * same scheme crawlSite uses) and logs only when something was deleted.
 */
export async function clearCrawlCache(
  crawlDir: string,
  domain: string,
): Promise<void> {
  const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
  if (!existsSync(domainDir)) return;
  await fs.rm(domainDir, { recursive: true, force: true });
  console.log(`🧹 Cleared crawl cache for ${domain}`);
}