feat: migrate npm registry from Verdaccio to Gitea Packages

2026-02-27 00:12:00 +01:00
parent efd1341762
commit 5da88356a8
69 changed files with 5397 additions and 114 deletions
--- a/packages/concept-engine/src/scraper.ts
+++ b/packages/concept-engine/src/scraper.ts
@@ -0,0 +1,432 @@
+// ============================================================================
+// Scraper — Zyte API + Local Persistence
+// Crawls all pages of a website, stores them locally for reuse.
+// ============================================================================
+
+import axios from "axios";
+import * as cheerio from "cheerio";
+import * as fs from "node:fs/promises";
+import * as path from "node:path";
+import { existsSync } from "node:fs";
+import type { CrawledPage, PageType } from "./types.js";
+
+/**
+ * Options accepted by `crawlSite`.
+ */
+interface ScraperConfig {
+    /** Zyte API key. When set, pages are fetched through the Zyte API; otherwise a plain HTTP GET is used. */
+    zyteApiKey?: string;
+    /** Base directory; each crawled domain gets its own sub-directory below it. */
+    crawlDir: string;
+    /** Upper bound on the number of paths visited in one crawl; `crawlSite` uses 30 when this is unset or 0. */
+    maxPages?: number;
+}
+
+/**
+ * Classify a URL pathname into a page type.
+ *
+ * The pathname is percent-decoded, Unicode-normalised (NFC) and lower-cased
+ * before matching, so keywords containing non-ASCII characters ("über") are
+ * recognised even though `URL#pathname` reports them as "%C3%BCber".
+ *
+ * Rules are evaluated in order and the first match wins; a pathname that
+ * matches nothing is classified as "other".
+ *
+ * @param pathname - Path component of a URL, e.g. "/leistungen/web".
+ * @returns The page type of the first matching rule.
+ */
+function classifyPage(pathname: string): PageType {
+    let decoded = pathname;
+    try {
+        decoded = decodeURIComponent(pathname);
+    } catch {
+        // Malformed escape sequence (e.g. a lone "%") — match against the raw pathname.
+    }
+    const p = decoded.normalize("NFC").toLowerCase();
+
+    if (p === "/" || p === "" || p === "/index.html") return "home";
+
+    // Order matters: earlier rules take precedence over later ones.
+    const rules: Array<[PageType, string[]]> = [
+        ["service", ["service", "leistung", "kompetenz"]],
+        ["about", ["about", "ueber", "über", "unternehmen"]],
+        ["contact", ["contact", "kontakt"]],
+        ["career", ["job", "karriere", "career", "human-resources"]],
+        ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
+        ["blog", ["blog", "news", "aktuelles", "magazin"]],
+        ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
+    ];
+    for (const [type, keywords] of rules) {
+        if (keywords.some((keyword) => p.includes(keyword))) return type;
+    }
+    return "other";
+}
+
+/**
+ * Detect interactive features present on a page.
+ *
+ * Each feature is recognised by a fixed set of CSS selectors; the returned
+ * list contains at most one entry per feature, in the order
+ * search, forms, maps, video, calendar, cookie-consent.
+ *
+ * @param $ - Cheerio handle for the parsed page.
+ * @returns Names of the features whose selectors matched at least one element.
+ */
+function detectFeatures($: cheerio.CheerioAPI): string[] {
+    const features: string[] = [];
+    const has = (selector: string): boolean => $(selector).length > 0;
+
+    // Search
+    if (
+        has('input[type="search"]') ||
+        has('form[role="search"]') ||
+        has(".search-form, .search-box, #search, .searchbar") ||
+        has('input[name="q"], input[name="s"], input[name="search"]')
+    ) {
+        features.push("search");
+    }
+
+    // Forms (beyond search): count only <form> elements that are neither a
+    // search form themselves nor nested in a ".search-form" wrapper. Comparing
+    // raw element counts would let a non-form ".search-form" container cancel
+    // out an unrelated contact form.
+    const nonSearchForms = $("form").not('[role="search"], .search-form, .search-form form');
+    if (nonSearchForms.length > 0) {
+        features.push("forms");
+    }
+
+    // Maps
+    if (has('iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]')) {
+        features.push("maps");
+    }
+
+    // Video
+    if (has("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")) {
+        features.push("video");
+    }
+
+    // Calendar / Events
+    if (has(".calendar, .event, [data-calendar]")) {
+        features.push("calendar");
+    }
+
+    // Cookie consent
+    if (has(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]")) {
+        features.push("cookie-consent");
+    }
+
+    return features;
+}
+
+/**
+ * Collect the pathnames of all same-origin links on a page.
+ *
+ * Every `href` is resolved against `origin`. Links to other origins, links
+ * to static assets (by file extension) and hash links on the root path are
+ * dropped. The result is de-duplicated and keeps first-occurrence order.
+ *
+ * @param $ - Cheerio handle for the parsed page.
+ * @param origin - Origin of the crawled site, e.g. "https://example.com".
+ * @returns Unique pathnames (no query string, no hash).
+ */
+function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
+    const assetExtension = /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;
+    const paths = new Set<string>();
+
+    for (const anchor of $("a[href]").toArray()) {
+        const href = $(anchor).attr("href");
+        if (!href) continue;
+
+        let target: URL;
+        try {
+            target = new URL(href, origin);
+        } catch {
+            // Unparseable href — ignore it.
+            continue;
+        }
+
+        if (target.origin !== origin) continue;
+        // Static assets are not pages.
+        if (assetExtension.test(target.pathname)) continue;
+        // A bare "#fragment" resolves to the root path plus a hash.
+        if (target.pathname === "/" && target.hash) continue;
+
+        paths.add(target.pathname);
+    }
+
+    return [...paths];
+}
+
+/**
+ * Collect absolute URLs of the images referenced by a page.
+ *
+ * Sources are `<img src>` attributes and the first `url(...)` found in any
+ * inline style that mentions `background-image`. Inline `data:image` URIs
+ * are skipped, as are SVG files whose path does not contain "logo".
+ *
+ * @param $ - Cheerio handle for the parsed page.
+ * @param origin - Base used to resolve relative image URLs.
+ * @returns Unique absolute image URLs in first-occurrence order.
+ */
+function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
+    const candidates: string[] = [];
+
+    // <img> elements
+    for (const imgEl of $("img[src]").toArray()) {
+        const src = $(imgEl).attr("src");
+        if (src) candidates.push(src);
+    }
+
+    // Inline-style background images
+    for (const styledEl of $("[style*='background-image']").toArray()) {
+        const found = $(styledEl).attr("style")?.match(/url\(['"]?(.*?)['"]?\)/);
+        if (found && found[1]) candidates.push(found[1]);
+    }
+
+    const resolved = new Set<string>();
+    for (const candidate of candidates) {
+        // Inline base64 payloads are not fetchable assets.
+        if (candidate.startsWith("data:image")) continue;
+
+        let parsed: URL;
+        try {
+            parsed = new URL(candidate, origin);
+        } catch {
+            // Unparseable URL — ignore it.
+            continue;
+        }
+
+        // Keep SVGs only when they look like a logo.
+        const isSvg = parsed.pathname.endsWith(".svg");
+        if (isSvg && !parsed.pathname.includes("logo")) continue;
+
+        resolved.add(parsed.href);
+    }
+
+    return [...resolved];
+}
+
+/**
+ * Extract candidate service/competency names from text content.
+ *
+ * The text is split into lines; a trimmed line is kept when it is 4–99
+ * characters long and is neither legal/cookie boilerplate, a contact line
+ * ("Tel: …", "E-Mail: …", "Web: …") nor a bare URL.
+ *
+ * Contact lines are recognised by a label that ends at a word boundary and
+ * is followed by a contact-like value. Service names that merely start with
+ * the same letters ("Webdesign", "Mobile Apps", "Telekommunikation") are
+ * therefore kept.
+ *
+ * @param text - Newline-separated text, e.g. the text of a service list.
+ * @returns The surviving lines in input order (duplicates are preserved).
+ */
+function extractServices(text: string): string[] {
+    const boilerplate = /cookie|datenschutz|impressum|copyright|©/i;
+    // Label, optional ".", then ":" or something that looks like a phone number, URL or e-mail address.
+    const contactLine = /^(telefon|tel|fax|e-mail|mobil|web)\b\.?\s*(:|[+(\d]|www\.|https?:|\S+@)/i;
+    const bareUrl = /^(https?:\/\/|www\.)/i;
+
+    return text
+        .split(/\n/)
+        .map((line) => line.trim())
+        .filter((line) => line.length > 3 && line.length < 100)
+        .filter((line) => !boilerplate.test(line) && !contactLine.test(line) && !bareUrl.test(line));
+}
+
+/**
+ * Fetch a page via Zyte API with browser rendering.
+ *
+ * HTTP error responses are logged. A 429 (rate limited) response is retried
+ * after a 5 second pause for as long as `retriesLeft` allows; every other
+ * failure, and a 429 once the budget is used up, is re-thrown to the caller.
+ *
+ * @param url - Page to render.
+ * @param apiKey - Zyte API key, sent as the basic-auth user name.
+ * @param retriesLeft - Number of additional attempts allowed after a 429 (default 1).
+ * @returns The rendered HTML, or "" when Zyte returned no `browserHtml`.
+ * @throws The original axios error when the request fails.
+ */
+async function fetchWithZyte(url: string, apiKey: string, retriesLeft = 1): Promise<string> {
+    try {
+        const resp = await axios.post(
+            "https://api.zyte.com/v1/extract",
+            {
+                url,
+                browserHtml: true,
+            },
+            {
+                auth: { username: apiKey, password: "" },
+                timeout: 60000,
+            },
+        );
+        const html = resp.data.browserHtml || "";
+        if (!html) {
+            console.warn(`  ⚠️ Zyte returned empty browserHtml for ${url}`);
+        }
+        return html;
+    } catch (err: unknown) {
+        if (axios.isAxiosError(err) && err.response) {
+            console.error(`  ❌ Zyte API error ${err.response.status} for ${url}: ${err.response.data?.detail || err.response.statusText}`);
+            // Rate limited — wait and retry, but only within the retry budget
+            // so a persistent 429 cannot recurse forever.
+            if (err.response.status === 429 && retriesLeft > 0) {
+                console.log("  ⏳ Rate limited, waiting 5s and retrying...");
+                await new Promise((r) => setTimeout(r, 5000));
+                return fetchWithZyte(url, apiKey, retriesLeft - 1);
+            }
+        }
+        throw err;
+    }
+}
+
+/**
+ * Fallback fetcher: plain HTTP GET with a browser-like User-Agent.
+ *
+ * @param url - Page to download.
+ * @returns The response body when axios delivers it as a string, otherwise "".
+ */
+async function fetchDirect(url: string): Promise<string> {
+    const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36";
+    const { data } = await axios.get(url, {
+        timeout: 30000,
+        headers: { "User-Agent": userAgent },
+    });
+
+    // Non-string bodies (e.g. parsed JSON) are not HTML.
+    if (typeof data !== "string") return "";
+    return data;
+}
+
+/**
+ * Parse an HTML string into a CrawledPage.
+ *
+ * @param html - Raw HTML of the page; stored unchanged in the result.
+ * @param url - Absolute URL the HTML was fetched from.
+ * @returns Structured page data: title, headings, navigation labels, body
+ *          text (whitespace-collapsed, capped at 50,000 characters), detected
+ *          features, internal links, images and selected meta tags.
+ * @throws TypeError when `url` is not a valid absolute URL.
+ */
+function parsePage(html: string, url: string): CrawledPage {
+    const $ = cheerio.load(html);
+    const urlObj = new URL(url);
+
+    // Use the document title only. `$("title").text()` would concatenate the
+    // text of every <title> element, including those inside inline SVGs.
+    const title =
+        $("head > title").first().text().trim() || $("title").first().text().trim();
+
+    const headings = $("h1, h2, h3")
+        .map((_, el) => $(el).text().trim())
+        .get()
+        .filter((h) => h.length > 0);
+
+    const navItems = $("nav a")
+        .map((_, el) => $(el).text().trim())
+        .get()
+        .filter((t) => t.length > 0 && t.length < 100);
+
+    const bodyText = $("body")
+        .text()
+        .replace(/\s+/g, " ")
+        .substring(0, 50000)
+        .trim();
+
+    const features = detectFeatures($);
+    const links = extractInternalLinks($, urlObj.origin);
+    const images = extractImages($, urlObj.origin);
+
+    // Empty or missing meta attributes are reported as undefined.
+    const description = $('meta[name="description"]').attr("content") || undefined;
+    const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
+    const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
+
+    return {
+        url,
+        pathname: urlObj.pathname,
+        title,
+        html,
+        text: bodyText,
+        headings,
+        navItems,
+        features,
+        type: classifyPage(urlObj.pathname),
+        links,
+        images,
+        meta: { description, ogTitle, ogImage },
+    };
+}
+
+/**
+ * Crawl a website and persist all pages locally.
+ *
+ * If `<crawlDir>/<domain-with-dashes>/_crawl_meta.json` already exists, the
+ * cached crawl is loaded from disk and nothing is fetched. Otherwise pages
+ * are fetched breadth-first starting at `targetUrl`, following same-origin
+ * links until the queue is empty or `maxPages` paths have been visited.
+ * Each page is written as `<name>.html` plus `<name>.meta.json`.
+ *
+ * The crawl-level meta file is written only when at least one page was
+ * fetched, so a completely failed crawl is not cached and the next call
+ * tries again.
+ *
+ * Errors while fetching, parsing or saving a single page are logged and do
+ * not abort the crawl.
+ *
+ * @param targetUrl - Absolute URL to start crawling from.
+ * @param config - Crawl directory, optional Zyte API key and page limit.
+ * @returns The crawled (or cached) pages.
+ * @throws TypeError when `targetUrl` is not a valid absolute URL; file-system
+ *         errors from creating the output directory or writing the crawl meta file.
+ */
+export async function crawlSite(
+    targetUrl: string,
+    config: ScraperConfig,
+): Promise<CrawledPage[]> {
+    const urlObj = new URL(targetUrl);
+    const origin = urlObj.origin;
+    const domain = urlObj.hostname;
+    const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
+
+    // Check for existing crawl
+    const metaFile = path.join(domainDir, "_crawl_meta.json");
+    if (existsSync(metaFile)) {
+        console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
+        return loadCrawlFromDisk(domainDir);
+    }
+
+    console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);
+
+    // Ensure output dir
+    await fs.mkdir(domainDir, { recursive: true });
+
+    const maxPages = config.maxPages || 30;
+    // Visited paths are tracked by pathname, so query-string variants of a page count once.
+    const visited = new Set<string>();
+    const queue: string[] = [targetUrl];
+    const pages: CrawledPage[] = [];
+
+    while (queue.length > 0 && visited.size < maxPages) {
+        const url = queue.shift()!;
+        const urlPath = new URL(url).pathname;
+
+        if (visited.has(urlPath)) continue;
+        visited.add(urlPath);
+
+        try {
+            console.log(`  ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
+
+            let html: string;
+            if (config.zyteApiKey) {
+                html = await fetchWithZyte(url, config.zyteApiKey);
+            } else {
+                html = await fetchDirect(url);
+            }
+
+            if (!html || html.length < 100) {
+                console.warn(`  ⚠️ Empty/tiny response for ${url}, skipping.`);
+                continue;
+            }
+
+            const page = parsePage(html, url);
+            pages.push(page);
+
+            // Save HTML + metadata to disk. The file name is the pathname with
+            // "/" replaced by "_" (leading one dropped); the root becomes "index".
+            const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
+            await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
+            await fs.writeFile(
+                path.join(domainDir, `${safeName}.meta.json`),
+                JSON.stringify(
+                    {
+                        url: page.url,
+                        pathname: page.pathname,
+                        title: page.title,
+                        type: page.type,
+                        headings: page.headings,
+                        navItems: page.navItems,
+                        features: page.features,
+                        links: page.links,
+                        images: page.images,
+                        meta: page.meta,
+                    },
+                    null,
+                    2,
+                ),
+            );
+
+            // Discover new links
+            for (const link of page.links) {
+                if (!visited.has(link)) {
+                    const fullUrl = `${origin}${link}`;
+                    queue.push(fullUrl);
+                }
+            }
+        } catch (err: unknown) {
+            const reason = err instanceof Error ? err.message : String(err);
+            console.warn(`  ⚠️ Failed to fetch ${url}: ${reason}`);
+        }
+    }
+
+    // Writing the meta file marks the crawl as cached. Skip it for an empty
+    // crawl: otherwise one failed run (network down, bad API key) would make
+    // every later call return zero pages from the "cache".
+    if (pages.length === 0) {
+        console.warn(`⚠️ No pages could be crawled for ${domain}; not caching the empty result.`);
+        return pages;
+    }
+
+    // Save crawl metadata
+    await fs.writeFile(
+        metaFile,
+        JSON.stringify(
+            {
+                domain,
+                crawledAt: new Date().toISOString(),
+                totalPages: pages.length,
+                urls: pages.map((p) => p.url),
+            },
+            null,
+            2,
+        ),
+    );
+
+    console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
+    return pages;
+}
+
+/**
+ * Rebuild the pages of an earlier crawl from its on-disk files.
+ *
+ * Every `*.meta.json` file in `domainDir` yields one page. The sibling
+ * `.html` file, when present, supplies the raw HTML, and the body text is
+ * re-derived from it (whitespace-collapsed, capped at 50,000 characters).
+ * Fields missing from the meta file fall back to empty defaults.
+ *
+ * @param domainDir - Directory holding the cached crawl of one domain.
+ * @returns The cached pages, in directory-listing order.
+ */
+async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
+    const entries = await fs.readdir(domainDir);
+    const loaded: CrawledPage[] = [];
+
+    for (const entry of entries) {
+        // Only per-page meta files; the crawl-level meta file is not a page.
+        if (!entry.endsWith(".meta.json") || entry === "_crawl_meta.json") continue;
+
+        const htmlName = `${entry.replace(".meta.json", "")}.html`;
+
+        const meta = JSON.parse(await fs.readFile(path.join(domainDir, entry), "utf8"));
+        const html = entries.includes(htmlName)
+            ? await fs.readFile(path.join(domainDir, htmlName), "utf8")
+            : "";
+
+        let text = "";
+        if (html) {
+            const $ = cheerio.load(html);
+            text = $("body").text().replace(/\s+/g, " ").substring(0, 50000).trim();
+        }
+
+        loaded.push({
+            url: meta.url,
+            pathname: meta.pathname,
+            title: meta.title,
+            html,
+            text,
+            headings: meta.headings || [],
+            navItems: meta.navItems || [],
+            features: meta.features || [],
+            type: meta.type || "other",
+            links: meta.links || [],
+            images: meta.images || [],
+            meta: meta.meta || {},
+        });
+    }
+
+    console.log(`  📂 Loaded ${loaded.length} cached pages from disk.`);
+    return loaded;
+}
+
+/**
+ * Remove the cached crawl of a domain so the next crawl fetches it afresh.
+ *
+ * Does nothing when no cache directory exists for the domain.
+ *
+ * @param crawlDir - Base crawl directory.
+ * @param domain - Host name whose cache should be removed, e.g. "example.com".
+ */
+export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
+    const cacheDir = path.join(crawlDir, domain.replace(/\./g, "-"));
+    if (!existsSync(cacheDir)) return;
+
+    await fs.rm(cacheDir, { recursive: true, force: true });
+    console.log(`🧹 Cleared crawl cache for ${domain}`);
+}