Files
at-mintel/packages/concept-engine/src/analyzer.ts
Marc Mintel 5da88356a8
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
feat: migrate npm registry from Verdaccio to Gitea Packages
2026-02-27 00:12:00 +01:00

335 lines
12 KiB
TypeScript

// ============================================================================
// Analyzer — Deterministic Site Analysis (NO LLM!)
// Builds a SiteProfile from crawled pages using pure code logic.
// This is the core fix against hallucinated page structures.
// ============================================================================
import type {
CrawledPage,
SiteProfile,
NavItem,
CompanyInfo,
PageInventoryItem,
} from "./types.js";
/**
* Build a complete SiteProfile from an array of crawled pages.
* This is 100% deterministic — no LLM calls involved.
*/
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
const navigation = extractNavigation(pages);
const existingFeatures = extractExistingFeatures(pages);
const services = extractAllServices(pages);
const companyInfo = extractCompanyInfo(pages);
const colors = extractColors(pages);
const socialLinks = extractSocialLinks(pages);
const externalDomains = extractExternalDomains(pages, domain);
const images = extractAllImages(pages);
const employeeCount = extractEmployeeCount(pages);
const pageInventory = buildPageInventory(pages);
return {
domain,
crawledAt: new Date().toISOString(),
totalPages: pages.filter((p) => p.type !== "legal").length,
navigation,
existingFeatures,
services,
companyInfo,
pageInventory,
colors,
socialLinks,
externalDomains,
images,
employeeCount,
};
}
/**
 * Extract the site's main navigation structure.
 * The home page's nav is treated as canonical; falls back to the first
 * crawled page when no home page exists.
 */
function extractNavigation(pages: CrawledPage[]): NavItem[] {
  const source = pages.find((p) => p.type === "home") ?? pages[0];
  if (!source) return [];
  const seen = new Set<string>();
  const result: NavItem[] = [];
  for (const label of source.navItems) {
    const key = label.toLowerCase().trim();
    // Drop duplicates (case/whitespace-insensitive) and one-character labels.
    if (seen.has(key) || key.length < 2) continue;
    seen.add(key);
    result.push({ label, href: "" });
  }
  return result;
}
/**
 * Aggregate all detected interactive features across all pages,
 * deduplicated, in first-seen order.
 */
function extractExistingFeatures(pages: CrawledPage[]): string[] {
  return [...new Set<string>(pages.flatMap((page) => page.features))];
}
/**
 * Aggregate all images found across all pages, deduplicated,
 * in first-seen order. Pages without an images array are skipped.
 */
function extractAllImages(pages: CrawledPage[]): string[] {
  const unique = new Set<string>();
  for (const { images } of pages) {
    images?.forEach((src) => unique.add(src));
  }
  return [...unique];
}
/**
 * Extract employee count from page text.
 * Looks for patterns like "über 50 Mitarbeitern", "200 Mitarbeiter", "50+ employees".
 * Returns the count together with its qualifier when present
 * (e.g. "über 200"), or null when no pattern matches.
 */
function extractEmployeeCount(pages: CrawledPage[]): string | null {
  const corpus = pages.map((p) => p.text).join(" ");
  // German patterns: 'über 50 Mitarbeitern', '120 Beschäftigte', '+200 MA'
  const patterns = [
    /(über|ca\.?|rund|mehr als|\+)?\s*(\d{1,4})\s*(Mitarbeiter(?:innen)?|Beschäftigte|MA|Fachkräfte)\b/gi,
    /(\d{1,4})\+?\s*(employees|team members)/gi,
  ];
  for (const pattern of patterns) {
    const hit = corpus.match(pattern)?.[0];
    if (!hit) continue;
    const count = hit.match(/(\d{1,4})/)?.[1];
    if (!count) continue;
    // Keep the German qualifier ("über", "ca.", …) if one was captured.
    const qualifier = hit.match(/über|ca\.?|rund|mehr als/i)?.[0];
    return qualifier ? `${qualifier} ${count}` : count;
  }
  return null;
}
/**
 * Extract services/competencies from service-type pages.
 * Uses headings on service pages (and pages whose path mentions
 * "kompetenz"); falls back to the home page's headings when no
 * service pages were found.
 */
function extractAllServices(pages: CrawledPage[]): string[] {
  const services = new Set<string>();
  // Headings that are navigation chrome rather than actual services.
  const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
  const servicePages = pages.filter(
    (p) => p.type === "service" || p.pathname.includes("kompetenz"),
  );
  for (const page of servicePages) {
    for (const raw of page.headings) {
      const heading = raw.trim();
      const usable =
        heading.length > 3 && heading.length < 100 && !genericHeading.test(heading);
      if (usable) services.add(heading);
    }
  }
  // Fallback: no usable service headings → mine the home page instead.
  if (services.size === 0) {
    const home = pages.find((p) => p.type === "home");
    for (const raw of home?.headings ?? []) {
      const heading = raw.trim();
      if (heading.length > 3 && heading.length < 80) services.add(heading);
    }
  }
  return [...services];
}
/**
 * Extract company information (tax id, register number, phone, email,
 * address, managing director) from the Impressum page's text, falling
 * back to the home page's text. Fields that don't match stay unset.
 */
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
  const legalPage = pages.find(
    (p) =>
      p.type === "legal" &&
      (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
  );
  const text = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";
  const info: CompanyInfo = {};

  // VAT id, e.g. "USt-IdNr.: DE123456789"
  const tax = /USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i.exec(text);
  if (tax) info.taxId = tax[1];

  // Commercial register number ("HRB 12345 B")
  const hrb = /HRB[:\s]*(\d+\s*[A-Z]*)/i.exec(text);
  if (hrb) info.registerNumber = `HRB ${hrb[1].trim()}`;

  // Phone number labelled "Tel"/"Telefon"/"Fon"
  const phone = /(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i.exec(text);
  if (phone) info.phone = phone[1].trim();

  // First email address appearing anywhere in the text
  const email = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.exec(text);
  if (email) info.email = email[0];

  // Optional street + German 5-digit postal code + city
  const address =
    /(?:[\w\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+\w+/i.exec(
      text,
    );
  if (address) info.address = address[0].trim();

  // Managing director ("Geschäftsführer: Max Mustermann")
  const gf = /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/.exec(
    text,
  );
  if (gf) info.managingDirector = gf[1].trim();

  return info;
}
/**
 * Extract up to 8 distinct hex color codes from the home page's HTML
 * (inline styles, CSS variables), lowercased, in first-seen order.
 */
function extractColors(pages: CrawledPage[]): string[] {
  const home = pages.find((p) => p.type === "home");
  if (!home) return [];
  const colors = new Set<string>();
  for (const hex of home.html.match(/#(?:[0-9a-fA-F]{3}){1,2}\b/g) ?? []) {
    colors.add(hex.toLowerCase());
    if (colors.size >= 8) break; // cap the palette at 8 colors
  }
  return [...colors];
}
/**
 * Extract social media profile links from the home page's HTML.
 *
 * Matching is done on the parsed URL's hostname (exact match or
 * subdomain, e.g. "www.linkedin.com"), not on a raw substring of the
 * whole URL. The previous substring check misclassified hosts like
 * "netflix.com"/"xbox.com" as "x.com"/Twitter and matched platform
 * names appearing only in URL paths or query strings.
 *
 * The first link found per platform wins.
 */
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
  const socials: Record<string, string> = {};
  const platforms = [
    { key: "linkedin", patterns: ["linkedin.com"] },
    { key: "instagram", patterns: ["instagram.com"] },
    { key: "facebook", patterns: ["facebook.com", "fb.com"] },
    { key: "youtube", patterns: ["youtube.com", "youtu.be"] },
    { key: "twitter", patterns: ["twitter.com", "x.com"] },
    { key: "xing", patterns: ["xing.com"] },
  ];
  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return socials;
  const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) || [];
  for (const url of urlMatches) {
    let hostname: string;
    try {
      hostname = new URL(url).hostname.toLowerCase();
    } catch {
      continue; // malformed URL — skip
    }
    for (const platform of platforms) {
      const isMatch = platform.patterns.some(
        (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
      );
      if (isMatch && !socials[platform.key]) {
        socials[platform.key] = url;
      }
    }
  }
  return socials;
}
/**
 * Find domains that are linked but separate from the main domain.
 * Critical for detecting sister companies with own websites (e.g. etib-ing.com).
 *
 * A linked domain counts as related when its hyphen/underscore-stripped
 * base name overlaps with the main domain's base name parts.
 */
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
  // Well-known third-party services that must never count as sister sites.
  const thirdPartyMarkers = [
    "google", "facebook", "twitter", "linkedin", "instagram", "youtube",
    "cookie", "analytics", "cdn", "cloudflare", "fonts", "jquery",
    "bootstrap", "wordpress", "jimdo", "wix",
  ];
  const cleanMain = mainDomain.replace(/^www\./, "");
  // Base-name parts longer than one character: "e-tib.com" → ["tib"]
  const mainParts = cleanMain
    .split(".")[0]
    .toLowerCase()
    .split(/[-_]/)
    .filter((part) => part.length > 1);
  const mainJoined = mainParts.join("");
  const found = new Set<string>();
  for (const page of pages) {
    for (const url of page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) ?? []) {
      try {
        const domain = new URL(url).hostname.replace(/^www\./, "");
        // Skip the site itself and common third-party services.
        if (domain === cleanMain) continue;
        if (thirdPartyMarkers.some((marker) => domain.includes(marker))) continue;
        // Fuzzy relatedness: compare hyphen/underscore-stripped base names.
        // e.g. main="e-tib.com" → mainJoined="tib";
        //      target="etib-ing.com" → joined="etibing" → related.
        const base = domain.split(".")[0].toLowerCase();
        const joined = base.replace(/[-_]/g, "");
        const related =
          joined.includes(mainJoined) ||
          mainJoined.includes(joined) ||
          mainParts.some((part) => part.length > 2 && base.includes(part));
        if (related) found.add(domain);
      } catch {
        // Invalid URL — ignore
      }
    }
  }
  return [...found];
}
/**
 * Build a structured inventory of all pages (one item per crawled page),
 * with headings capped at 10 and the content summary at 500 characters.
 */
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
  // Service candidates are headings of plausible label length.
  const isServiceHeading = (h: string) => h.length > 3 && h.length < 80;
  return pages.map((page) => {
    const has = (feature: string) => page.features.includes(feature);
    return {
      url: page.url,
      pathname: page.pathname,
      title: page.title,
      type: page.type,
      headings: page.headings.slice(0, 10),
      services: page.type === "service" ? page.headings.filter(isServiceHeading) : [],
      hasSearch: has("search"),
      hasForms: has("forms"),
      hasMap: has("maps"),
      hasVideo: has("video"),
      contentSummary: page.text.substring(0, 500),
    };
  });
}