Files
at-mintel/packages/concept-engine/src/analyzer.ts
Marc Mintel 5da88356a8
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
feat: migrate npm registry from Verdaccio to Gitea Packages
2026-02-27 00:12:00 +01:00

335 lines
12 KiB
TypeScript

// ============================================================================
// Analyzer — Deterministic Site Analysis (NO LLM!)
// Builds a SiteProfile from crawled pages using pure code logic.
// This is the core fix against hallucinated page structures.
// ============================================================================
import type {
CrawledPage,
SiteProfile,
NavItem,
CompanyInfo,
PageInventoryItem,
} from "./types.js";
/**
* Build a complete SiteProfile from an array of crawled pages.
* This is 100% deterministic — no LLM calls involved.
*/
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
const navigation = extractNavigation(pages);
const existingFeatures = extractExistingFeatures(pages);
const services = extractAllServices(pages);
const companyInfo = extractCompanyInfo(pages);
const colors = extractColors(pages);
const socialLinks = extractSocialLinks(pages);
const externalDomains = extractExternalDomains(pages, domain);
const images = extractAllImages(pages);
const employeeCount = extractEmployeeCount(pages);
const pageInventory = buildPageInventory(pages);
return {
domain,
crawledAt: new Date().toISOString(),
totalPages: pages.filter((p) => p.type !== "legal").length,
navigation,
existingFeatures,
services,
companyInfo,
pageInventory,
colors,
socialLinks,
externalDomains,
images,
employeeCount,
};
}
/**
 * Extract the site's main navigation structure.
 * The home page's nav is treated as canonical; falls back to the first
 * crawled page when no home page exists.
 */
function extractNavigation(pages: CrawledPage[]): NavItem[] {
  const source = pages.find((p) => p.type === "home") ?? pages[0];
  if (!source) return [];
  const seen = new Set<string>();
  const result: NavItem[] = [];
  for (const label of source.navItems) {
    const key = label.toLowerCase().trim();
    // Drop duplicates (case/whitespace-insensitive) and one-character labels.
    if (seen.has(key) || key.length < 2) continue;
    seen.add(key);
    result.push({ label, href: "" });
  }
  return result;
}
/**
 * Aggregate all detected interactive features across all pages,
 * deduplicated, in first-seen order.
 */
function extractExistingFeatures(pages: CrawledPage[]): string[] {
  return [...new Set<string>(pages.flatMap((page) => page.features))];
}
/**
 * Aggregate all images found across all pages, deduplicated,
 * in first-seen order. Pages without an images array are skipped.
 */
function extractAllImages(pages: CrawledPage[]): string[] {
  const unique = new Set<string>();
  for (const { images } of pages) {
    images?.forEach((src) => unique.add(src));
  }
  return [...unique];
}
/**
 * Extract employee count from page text.
 * Looks for patterns like "über 50 Mitarbeitern", "200 Mitarbeiter", "50+ employees".
 * Returns the count together with its qualifier when present
 * (e.g. "über 200"), or null when no pattern matches.
 */
function extractEmployeeCount(pages: CrawledPage[]): string | null {
  const corpus = pages.map((p) => p.text).join(" ");
  // German patterns: 'über 50 Mitarbeitern', '120 Beschäftigte', '+200 MA'
  const patterns = [
    /(über|ca\.?|rund|mehr als|\+)?\s*(\d{1,4})\s*(Mitarbeiter(?:innen)?|Beschäftigte|MA|Fachkräfte)\b/gi,
    /(\d{1,4})\+?\s*(employees|team members)/gi,
  ];
  for (const pattern of patterns) {
    const hit = corpus.match(pattern)?.[0];
    if (!hit) continue;
    const count = hit.match(/(\d{1,4})/)?.[1];
    if (!count) continue;
    // Keep the German qualifier ("über", "ca.", …) if one was captured.
    const qualifier = hit.match(/über|ca\.?|rund|mehr als/i)?.[0];
    return qualifier ? `${qualifier} ${count}` : count;
  }
  return null;
}
/**
 * Extract services/competencies from service-type pages.
 * Uses headings on service pages (and pages whose path mentions
 * "kompetenz"); falls back to the home page's headings when no
 * service pages were found.
 */
function extractAllServices(pages: CrawledPage[]): string[] {
  const services = new Set<string>();
  // Headings that are navigation chrome rather than actual services.
  const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
  const servicePages = pages.filter(
    (p) => p.type === "service" || p.pathname.includes("kompetenz"),
  );
  for (const page of servicePages) {
    for (const raw of page.headings) {
      const heading = raw.trim();
      const usable =
        heading.length > 3 && heading.length < 100 && !genericHeading.test(heading);
      if (usable) services.add(heading);
    }
  }
  // Fallback: no usable service headings → mine the home page instead.
  if (services.size === 0) {
    const home = pages.find((p) => p.type === "home");
    for (const raw of home?.headings ?? []) {
      const heading = raw.trim();
      if (heading.length > 3 && heading.length < 80) services.add(heading);
    }
  }
  return [...services];
}
/**
 * Extract company information (tax id, register number, phone, email,
 * address, managing director) from the Impressum page's text, falling
 * back to the home page's text. Fields that don't match stay unset.
 */
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
  const legalPage = pages.find(
    (p) =>
      p.type === "legal" &&
      (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
  );
  const text = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";
  const info: CompanyInfo = {};

  // VAT id, e.g. "USt-IdNr.: DE123456789"
  const tax = /USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i.exec(text);
  if (tax) info.taxId = tax[1];

  // Commercial register number ("HRB 12345 B")
  const hrb = /HRB[:\s]*(\d+\s*[A-Z]*)/i.exec(text);
  if (hrb) info.registerNumber = `HRB ${hrb[1].trim()}`;

  // Phone number labelled "Tel"/"Telefon"/"Fon"
  const phone = /(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i.exec(text);
  if (phone) info.phone = phone[1].trim();

  // First email address appearing anywhere in the text
  const email = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.exec(text);
  if (email) info.email = email[0];

  // Optional street + German 5-digit postal code + city
  const address =
    /(?:[\w\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+\w+/i.exec(
      text,
    );
  if (address) info.address = address[0].trim();

  // Managing director ("Geschäftsführer: Max Mustermann")
  const gf = /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/.exec(
    text,
  );
  if (gf) info.managingDirector = gf[1].trim();

  return info;
}
/**
 * Extract up to 8 distinct hex color codes from the home page's HTML
 * (inline styles, CSS variables), lowercased, in first-seen order.
 */
function extractColors(pages: CrawledPage[]): string[] {
  const home = pages.find((p) => p.type === "home");
  if (!home) return [];
  const colors = new Set<string>();
  for (const hex of home.html.match(/#(?:[0-9a-fA-F]{3}){1,2}\b/g) ?? []) {
    colors.add(hex.toLowerCase());
    if (colors.size >= 8) break; // cap the palette at 8 colors
  }
  return [...colors];
}
/**
 * Extract social media profile links from the home page's HTML.
 *
 * Matching is done on the parsed URL's hostname (exact match or
 * subdomain, e.g. "www.linkedin.com"), not on a raw substring of the
 * whole URL. The previous substring check misclassified hosts like
 * "netflix.com"/"xbox.com" as "x.com"/Twitter and matched platform
 * names appearing only in URL paths or query strings.
 *
 * The first link found per platform wins.
 */
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
  const socials: Record<string, string> = {};
  const platforms = [
    { key: "linkedin", patterns: ["linkedin.com"] },
    { key: "instagram", patterns: ["instagram.com"] },
    { key: "facebook", patterns: ["facebook.com", "fb.com"] },
    { key: "youtube", patterns: ["youtube.com", "youtu.be"] },
    { key: "twitter", patterns: ["twitter.com", "x.com"] },
    { key: "xing", patterns: ["xing.com"] },
  ];
  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return socials;
  const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) || [];
  for (const url of urlMatches) {
    let hostname: string;
    try {
      hostname = new URL(url).hostname.toLowerCase();
    } catch {
      continue; // malformed URL — skip
    }
    for (const platform of platforms) {
      const isMatch = platform.patterns.some(
        (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
      );
      if (isMatch && !socials[platform.key]) {
        socials[platform.key] = url;
      }
    }
  }
  return socials;
}
/**
 * Find domains that are linked but separate from the main domain.
 * Critical for detecting sister companies with own websites (e.g. etib-ing.com).
 *
 * A linked domain counts as related when its hyphen/underscore-stripped
 * base name overlaps with the main domain's base name parts.
 */
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
  // Well-known third-party services that must never count as sister sites.
  const thirdPartyMarkers = [
    "google", "facebook", "twitter", "linkedin", "instagram", "youtube",
    "cookie", "analytics", "cdn", "cloudflare", "fonts", "jquery",
    "bootstrap", "wordpress", "jimdo", "wix",
  ];
  const cleanMain = mainDomain.replace(/^www\./, "");
  // Base-name parts longer than one character: "e-tib.com" → ["tib"]
  const mainParts = cleanMain
    .split(".")[0]
    .toLowerCase()
    .split(/[-_]/)
    .filter((part) => part.length > 1);
  const mainJoined = mainParts.join("");
  const found = new Set<string>();
  for (const page of pages) {
    for (const url of page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) ?? []) {
      try {
        const domain = new URL(url).hostname.replace(/^www\./, "");
        // Skip the site itself and common third-party services.
        if (domain === cleanMain) continue;
        if (thirdPartyMarkers.some((marker) => domain.includes(marker))) continue;
        // Fuzzy relatedness: compare hyphen/underscore-stripped base names.
        // e.g. main="e-tib.com" → mainJoined="tib";
        //      target="etib-ing.com" → joined="etibing" → related.
        const base = domain.split(".")[0].toLowerCase();
        const joined = base.replace(/[-_]/g, "");
        const related =
          joined.includes(mainJoined) ||
          mainJoined.includes(joined) ||
          mainParts.some((part) => part.length > 2 && base.includes(part));
        if (related) found.add(domain);
      } catch {
        // Invalid URL — ignore
      }
    }
  }
  return [...found];
}
/**
 * Build a structured inventory of all pages (one item per crawled page),
 * with headings capped at 10 and the content summary at 500 characters.
 */
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
  // Service candidates are headings of plausible label length.
  const isServiceHeading = (h: string) => h.length > 3 && h.length < 80;
  return pages.map((page) => {
    const has = (feature: string) => page.features.includes(feature);
    return {
      url: page.url,
      pathname: page.pathname,
      title: page.title,
      type: page.type,
      headings: page.headings.slice(0, 10),
      services: page.type === "service" ? page.headings.filter(isServiceHeading) : [],
      hasSearch: has("search"),
      hasForms: has("forms"),
      hasMap: has("maps"),
      hasVideo: has("video"),
      contentSummary: page.text.substring(0, 500),
    };
  });
}