feat: migrate npm registry from Verdaccio to Gitea Packages

2026-02-27 00:12:00 +01:00
parent efd1341762
commit 5da88356a8
69 changed files with 5397 additions and 114 deletions
--- a/packages/concept-engine/src/analyzer.ts
+++ b/packages/concept-engine/src/analyzer.ts
@@ -0,0 +1,334 @@
+// ============================================================================
+// Analyzer — Deterministic Site Analysis (NO LLM!)
+// Builds a SiteProfile from crawled pages using pure code logic.
+// This is the core fix against hallucinated page structures.
+// ============================================================================
+
+import type {
+    CrawledPage,
+    SiteProfile,
+    NavItem,
+    CompanyInfo,
+    PageInventoryItem,
+} from "./types.js";
+
+/**
+ * Assemble the SiteProfile for one crawl.
+ *
+ * Every field comes from a plain-code extractor in this module; no LLM call
+ * is made here. `totalPages` counts only pages whose type is not "legal",
+ * while `pageInventory` is built from every page that was passed in.
+ *
+ * @param pages  Crawled pages to analyze.
+ * @param domain Domain of the crawled site; copied into the profile and
+ *               handed to the external-domain extractor as the reference.
+ * @returns The populated profile, stamped with the current time (ISO-8601).
+ */
+export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
+    const nonLegalPages = pages.filter((page) => page.type !== "legal");
+
+    // The extractors are independent of each other; the property order below
+    // mirrors the shape the profile has always been emitted in.
+    const profile: SiteProfile = {
+        domain,
+        crawledAt: new Date().toISOString(),
+        totalPages: nonLegalPages.length,
+        navigation: extractNavigation(pages),
+        existingFeatures: extractExistingFeatures(pages),
+        services: extractAllServices(pages),
+        companyInfo: extractCompanyInfo(pages),
+        pageInventory: buildPageInventory(pages),
+        colors: extractColors(pages),
+        socialLinks: extractSocialLinks(pages),
+        externalDomains: extractExternalDomains(pages, domain),
+        images: extractAllImages(pages),
+        employeeCount: extractEmployeeCount(pages),
+    };
+
+    return profile;
+}
+
+/**
+ * Derive the main navigation from the nav labels recorded on a single page.
+ *
+ * The page typed "home" is the preferred source; without one, the first
+ * crawled page is used. Labels are de-duplicated case-insensitively (after
+ * trimming) and labels shorter than two characters are dropped. The original
+ * label text is kept, and `href` is always the empty string.
+ *
+ * @param pages Crawled pages.
+ * @returns Navigation entries in first-seen order; empty when no page exists.
+ */
+function extractNavigation(pages: CrawledPage[]): NavItem[] {
+    const source = pages.find((page) => page.type === "home") || pages[0];
+    if (!source) return [];
+
+    const seenKeys = new Set<string>();
+
+    return source.navItems
+        .filter((label) => {
+            const key = label.toLowerCase().trim();
+            if (key.length < 2 || seenKeys.has(key)) return false;
+            seenKeys.add(key);
+            return true;
+        })
+        .map((label) => ({ label, href: "" }));
+}
+
+/**
+ * Union of the feature flags recorded on every page.
+ *
+ * @param pages Crawled pages.
+ * @returns Each distinct feature once, in first-seen order.
+ */
+function extractExistingFeatures(pages: CrawledPage[]): string[] {
+    const everyFeature = pages.flatMap((page) => [...page.features]);
+    return Array.from(new Set<string>(everyFeature));
+}
+
+/**
+ * Union of the image entries recorded on every page.
+ *
+ * Pages without an `images` value are skipped.
+ *
+ * @param pages Crawled pages.
+ * @returns Each distinct image entry once, in first-seen order.
+ */
+function extractAllImages(pages: CrawledPage[]): string[] {
+    const everyImage = pages.flatMap((page) => (page.images ? [...page.images] : []));
+    return Array.from(new Set<string>(everyImage));
+}
+
+/**
+ * Find a head-count statement in the combined text of all pages.
+ *
+ * Recognizes German phrasings such as "über 50 Mitarbeitern",
+ * "120 Beschäftigte", "+200 MA" and English ones such as "50+ employees".
+ * The German pattern is tried first; within a pattern the first occurrence
+ * in the text wins.
+ *
+ * @param pages Crawled pages; their `text` fields are joined with spaces.
+ * @returns The number as a string, preceded by the qualifier found in the
+ *          text ("über", "ca.", "rund", "mehr als") when there is one, or
+ *          `null` when no statement is found. A "+" qualifier is matched
+ *          but not included in the result.
+ */
+function extractEmployeeCount(pages: CrawledPage[]): string | null {
+    const allText = pages.map((p) => p.text).join(" ");
+
+    // Notes on the patterns:
+    //  * (?<!\d[.,]?) stops the count from starting in the middle of a longer
+    //    number: "12000 Mitarbeiter" or "1.200 Mitarbeiter" yield no match
+    //    instead of a truncated "2000" / "200".
+    //  * The nouns accept their inflected forms ("Mitarbeitern",
+    //    "Beschäftigten", "Fachkräften"); a bare \b after the stem would
+    //    reject them.
+    const patterns = [
+        /(?<prefix>über|ca\.?|rund|mehr als|\+)?\s*(?<!\d[.,]?)(?<count>\d{1,4})\s*(?:Mitarbeiter(?:innen|n)?|Beschäftigten?|MA|Fachkräften?)\b/i,
+        /(?<!\d[.,]?)(?<count>\d{1,4})\+?\s*(?:employees|team members)/i,
+    ];
+
+    for (const pattern of patterns) {
+        const groups = pattern.exec(allText)?.groups;
+        if (!groups?.count) continue;
+
+        // A bare "+" is accepted by the pattern but never echoed back.
+        const qualifier = groups.prefix && groups.prefix !== "+" ? groups.prefix : undefined;
+        return qualifier ? `${qualifier} ${groups.count}` : groups.count;
+    }
+
+    return null;
+}
+
+/**
+ * Collect service names from the headings of service pages.
+ *
+ * A page counts as a service page when its type is "service" or its pathname
+ * contains "kompetenz". Trimmed headings of 4 to 99 characters are kept,
+ * unless they start with a generic site word (home, kontakt, impressum, ...).
+ * When that yields nothing, the home page's trimmed headings of 4 to 79
+ * characters are used instead, without the generic-word filter.
+ *
+ * @param pages Crawled pages.
+ * @returns Distinct headings in first-seen order.
+ */
+function extractAllServices(pages: CrawledPage[]): string[] {
+    const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
+    const found = new Set<string>();
+
+    pages
+        .filter((page) => page.type === "service" || page.pathname.includes("kompetenz"))
+        .forEach((page) => {
+            for (const rawHeading of page.headings) {
+                const text = rawHeading.trim();
+                const lengthOk = text.length > 3 && text.length < 100;
+                if (lengthOk && !genericHeading.test(text)) {
+                    found.add(text);
+                }
+            }
+        });
+
+    if (found.size > 0) {
+        return Array.from(found);
+    }
+
+    // Fallback: nothing usable on service pages, so read the home page.
+    const home = pages.find((page) => page.type === "home");
+    if (home) {
+        for (const rawHeading of home.headings) {
+            const text = rawHeading.trim();
+            if (text.length > 3 && text.length < 80) {
+                found.add(text);
+            }
+        }
+    }
+
+    return Array.from(found);
+}
+
+/**
+ * Pull company details out of the Impressum text using regular expressions.
+ *
+ * The text of the first page typed "legal" whose pathname or title mentions
+ * "impressum" is used; when there is none (or its text is empty) the home
+ * page's text is used instead. Every field is optional and only set when its
+ * pattern matches:
+ *
+ *  - `taxId`            VAT id after "USt-ID-Nr." / "USt-IdNr."
+ *  - `registerNumber`   "HRB <digits>" with an optional 1-2 letter suffix
+ *  - `phone`            number following "Tel", "Telefon" or "Fon"
+ *  - `email`            first e-mail address in the text
+ *  - `address`          optional street + house number, then a five-digit
+ *                       postal code and the word that follows it
+ *  - `managingDirector` two to four capitalized words after
+ *                       "Geschäftsführer" / "Geschäftsführung"
+ *
+ * These are heuristics: the street part of the address pattern is greedy and
+ * may include words that precede the street name.
+ *
+ * @param pages Crawled pages.
+ * @returns The fields that could be extracted; possibly an empty object.
+ */
+function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
+    const info: CompanyInfo = {};
+
+    // Find Impressum or legal page
+    const legalPage = pages.find(
+        (p) =>
+            p.type === "legal" &&
+            (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
+    );
+
+    const sourceText = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";
+
+    // USt-ID
+    const taxMatch = sourceText.match(/USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i);
+    if (taxMatch) info.taxId = taxMatch[1];
+
+    // HRB number. Only the "HRB" label is matched case-insensitively; the
+    // optional court suffix must be 1-2 upper-case letters that are not the
+    // start of a longer word, so "HRB 12345 Amtsgericht" yields "HRB 12345"
+    // rather than "HRB 12345 Amtsgericht".
+    const hrbMatch = sourceText.match(/[Hh][Rr][Bb][:\s]*(\d+(?:\s*[A-Z]{1,2}(?!\p{L}))?)/u);
+    if (hrbMatch) info.registerNumber = `HRB ${hrbMatch[1].trim()}`;
+
+    // Phone
+    const phoneMatch = sourceText.match(/(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i);
+    if (phoneMatch) info.phone = phoneMatch[1].trim();
+
+    // Email
+    const emailMatch = sourceText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
+    if (emailMatch) info.email = emailMatch[0];
+
+    // Address (German postal code pattern). Unicode letter classes are used
+    // instead of \w, which is ASCII-only and would cut "München" down to "M"
+    // and "Königstraße" down to "nigstraße".
+    const addressMatch = sourceText.match(
+        /(?:[\p{L}\p{N}_\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+[\p{L}\p{N}_]+/iu,
+    );
+    if (addressMatch) info.address = addressMatch[0].trim();
+
+    // GF / Geschäftsführer
+    const gfMatch = sourceText.match(
+        /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/,
+    );
+    if (gfMatch) info.managingDirector = gfMatch[1].trim();
+
+    return info;
+}
+
+/**
+ * Collect hex colour codes that appear in the home page's HTML.
+ *
+ * Matches 3- and 6-digit codes ("#abc", "#A1B2C3") anywhere in the markup,
+ * lower-cases them and keeps the first eight distinct values. Numeric
+ * character references such as "&#160;" are not colours and are skipped.
+ *
+ * @param pages Crawled pages; only the page typed "home" is inspected.
+ * @returns Up to eight distinct lower-case hex codes in first-seen order;
+ *          empty when there is no home page.
+ */
+function extractColors(pages: CrawledPage[]): string[] {
+    const MAX_COLORS = 8;
+
+    const homePage = pages.find((p) => p.type === "home");
+    if (!homePage) return [];
+
+    const colors = new Set<string>();
+    // (?<!&) rejects "#160" when it is part of an entity like "&#160;".
+    const hexMatches = homePage.html.match(/(?<!&)#(?:[0-9a-fA-F]{3}){1,2}\b/g) || [];
+    for (const hex of hexMatches) {
+        colors.add(hex.toLowerCase());
+        if (colors.size >= MAX_COLORS) break;
+    }
+
+    return [...colors];
+}
+
+/**
+ * Find social-media profile links in the home page's HTML.
+ *
+ * Every http(s) URL in the markup is parsed and its hostname compared with
+ * the known platform hosts. A URL belongs to a platform when its hostname is
+ * the platform host itself or a subdomain of it. Comparing hostnames rather
+ * than searching the whole URL keeps "wix.com" from being taken for "x.com"
+ * and ignores platform names that only occur in a path or query string.
+ *
+ * @param pages Crawled pages; only the page typed "home" is inspected.
+ * @returns Map from platform key (linkedin, instagram, facebook, youtube,
+ *          twitter, xing) to the first matching URL as it appears in the
+ *          HTML. Platforms without a link are absent.
+ */
+function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
+    const socials: Record<string, string> = {};
+    const platforms = [
+        { key: "linkedin", hosts: ["linkedin.com"] },
+        { key: "instagram", hosts: ["instagram.com"] },
+        { key: "facebook", hosts: ["facebook.com", "fb.com"] },
+        { key: "youtube", hosts: ["youtube.com", "youtu.be"] },
+        { key: "twitter", hosts: ["twitter.com", "x.com"] },
+        { key: "xing", hosts: ["xing.com"] },
+    ];
+
+    const homePage = pages.find((p) => p.type === "home");
+    if (!homePage) return socials;
+
+    const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) || [];
+    for (const url of urlMatches) {
+        let hostname: string;
+        try {
+            hostname = new URL(url).hostname;
+        } catch {
+            // Not a parseable URL — nothing to classify.
+            continue;
+        }
+
+        for (const platform of platforms) {
+            // The first link found for a platform wins.
+            if (socials[platform.key]) continue;
+
+            const belongsToPlatform = platform.hosts.some(
+                (host) => hostname === host || hostname.endsWith(`.${host}`),
+            );
+            if (belongsToPlatform) {
+                socials[platform.key] = url;
+            }
+        }
+    }
+
+    return socials;
+}
+
+/**
+ * Find linked domains that look related to the main domain.
+ * Meant for detecting sister companies with their own websites
+ * (e.g. etib-ing.com next to e-tib.com).
+ *
+ * Every http(s) host found in any page's HTML is considered. The main domain
+ * itself and hosts containing a well-known third-party marker (google, cdn,
+ * fonts, ...) are skipped. Of the rest, a host is reported when its first
+ * label is similar to the main domain's first label:
+ *
+ *  - one label, with "-" and "_" removed, contains the other, or
+ *  - the host's label contains one of the main label's "-"/"_"-separated parts.
+ *
+ * Strings shorter than three characters never take part in these
+ * comparisons. Without that limit an empty or one-letter string is contained
+ * in almost everything, so hosts such as "t.co" would be reported as related.
+ *
+ * @param pages      Crawled pages whose `html` is scanned for links.
+ * @param mainDomain Domain of the crawled site, with or without "www.".
+ * @returns Distinct related hostnames (without "www."), in first-seen order.
+ */
+function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
+    const MIN_FUZZY_LENGTH = 3;
+    const THIRD_PARTY_MARKERS = [
+        "google",
+        "facebook",
+        "twitter",
+        "linkedin",
+        "instagram",
+        "youtube",
+        "cookie",
+        "analytics",
+        "cdn",
+        "cloudflare",
+        "fonts",
+        "jquery",
+        "bootstrap",
+        "wordpress",
+        "jimdo",
+        "wix",
+    ];
+
+    const externalDomains = new Set<string>();
+    // URL hostnames come back lower-cased, so normalize the reference too.
+    const cleanMain = mainDomain.replace(/^www\./, "").toLowerCase();
+
+    // "e-tib.com" → mainLabel "e-tib", mainParts ["tib"], mainJoined "etib"
+    const [mainLabel = ""] = cleanMain.split(".");
+    const mainParts = mainLabel.split(/[-_]/).filter((part) => part.length >= MIN_FUZZY_LENGTH);
+    const mainJoined = mainLabel.replace(/[-_]/g, "");
+
+    for (const page of pages) {
+        const linkMatches = page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
+        for (const url of linkMatches) {
+            let domain: string;
+            try {
+                domain = new URL(url).hostname.replace(/^www\./, "");
+            } catch {
+                // Invalid URL
+                continue;
+            }
+
+            // Skip same domain
+            if (domain === cleanMain) continue;
+            // Skip common third-party services
+            if (THIRD_PARTY_MARKERS.some((marker) => domain.includes(marker))) continue;
+
+            // "etib-ing.com" → domainBase "etib-ing", domainJoined "etibing"
+            const [domainBase = ""] = domain.toLowerCase().split(".");
+            const domainJoined = domainBase.replace(/[-_]/g, "");
+
+            const isRelated =
+                (mainJoined.length >= MIN_FUZZY_LENGTH && domainJoined.includes(mainJoined)) ||
+                (domainJoined.length >= MIN_FUZZY_LENGTH && mainJoined.includes(domainJoined)) ||
+                mainParts.some((part) => domainBase.includes(part));
+
+            if (isRelated) {
+                externalDomains.add(domain);
+            }
+        }
+    }
+
+    return [...externalDomains];
+}
+
+/**
+ * Turn every crawled page into one inventory entry.
+ *
+ * Each entry carries the page's identity (url, pathname, title, type), its
+ * first ten headings, feature flags derived from the page's feature list,
+ * and the first 500 characters of its text. `services` holds the headings of
+ * 4 to 79 characters for pages typed "service" and is empty for all others.
+ *
+ * @param pages Crawled pages.
+ * @returns One entry per page, in input order.
+ */
+function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
+    const inventory: PageInventoryItem[] = [];
+
+    for (const page of pages) {
+        const { headings, features } = page;
+
+        const serviceHeadings =
+            page.type === "service"
+                ? headings.filter((heading) => heading.length > 3 && heading.length < 80)
+                : [];
+
+        inventory.push({
+            url: page.url,
+            pathname: page.pathname,
+            title: page.title,
+            type: page.type,
+            headings: headings.slice(0, 10),
+            services: serviceHeadings,
+            hasSearch: features.includes("search"),
+            hasForms: features.includes("forms"),
+            hasMap: features.includes("maps"),
+            hasVideo: features.includes("video"),
+            contentSummary: page.text.substring(0, 500),
+        });
+    }
+
+    return inventory;
+}