Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
335 lines
12 KiB
TypeScript
335 lines
12 KiB
TypeScript
// ============================================================================
// Analyzer — Deterministic Site Analysis (NO LLM!)
// Builds a SiteProfile from crawled pages using pure code logic.
// This is the core fix against hallucinated page structures.
// ============================================================================
|
|
|
|
import type {
|
|
CrawledPage,
|
|
SiteProfile,
|
|
NavItem,
|
|
CompanyInfo,
|
|
PageInventoryItem,
|
|
} from "./types.js";
|
|
|
|
/**
 * Assemble the complete SiteProfile for one crawl.
 *
 * Every field is produced by the code-only extractors in this module; no LLM
 * call is made anywhere on this path.
 *
 * @param pages  All pages returned by the crawler.
 * @param domain The domain that was crawled; copied into the profile and used
 *               to tell internal links from external ones.
 * @returns The aggregated profile, stamped with the current time.
 */
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
  // Legal pages are left out of the headline page count.
  const nonLegalPageCount = pages.filter((page) => page.type !== "legal").length;

  const profile: SiteProfile = {
    domain,
    crawledAt: new Date().toISOString(),
    totalPages: nonLegalPageCount,
    navigation: extractNavigation(pages),
    existingFeatures: extractExistingFeatures(pages),
    services: extractAllServices(pages),
    companyInfo: extractCompanyInfo(pages),
    pageInventory: buildPageInventory(pages),
    colors: extractColors(pages),
    socialLinks: extractSocialLinks(pages),
    externalDomains: extractExternalDomains(pages, domain),
    images: extractAllImages(pages),
    employeeCount: extractEmployeeCount(pages),
  };

  return profile;
}
|
|
|
|
/**
 * Extract the site's main navigation labels.
 *
 * The page typed "home" is the canonical source; if there is none, the first
 * crawled page is used instead. Labels are trimmed, de-duplicated
 * case-insensitively (first occurrence wins) and dropped when shorter than two
 * characters. `href` is always emitted as an empty string because only the
 * labels are read here.
 *
 * @param pages All crawled pages.
 * @returns Unique navigation items in their original order; empty when there
 *          are no pages or the source page has no nav items.
 */
function extractNavigation(pages: CrawledPage[]): NavItem[] {
  const sourcePage = pages.find((p) => p.type === "home") ?? pages[0];
  if (!sourcePage) return [];

  const seen = new Set<string>();
  const navItems: NavItem[] = [];

  // Tolerate a page without nav data instead of throwing.
  for (const rawLabel of sourcePage.navItems ?? []) {
    // Emit the trimmed label so the output matches the key used for de-duplication.
    const label = rawLabel.trim();
    const key = label.toLowerCase();
    if (key.length < 2 || seen.has(key)) continue;

    seen.add(key);
    navItems.push({ label, href: "" });
  }

  return navItems;
}
|
|
|
|
/**
 * Collect the distinct feature flags detected on any page.
 *
 * @param pages All crawled pages.
 * @returns Each feature once, in order of first appearance.
 */
function extractExistingFeatures(pages: CrawledPage[]): string[] {
  const distinct = new Set<string>(pages.flatMap((page) => [...page.features]));
  return Array.from(distinct);
}
|
|
|
|
/**
 * Collect the distinct image references found on any page.
 *
 * Pages without an `images` value are skipped.
 *
 * @param pages All crawled pages.
 * @returns Each image once, in order of first appearance.
 */
function extractAllImages(pages: CrawledPage[]): string[] {
  const distinct = new Set<string>(
    pages.flatMap((page) => (page.images ? [...page.images] : [])),
  );
  return Array.from(distinct);
}
|
|
|
|
/**
 * Extract an employee count from the combined text of all pages.
 *
 * Recognises German phrases such as "über 50 Mitarbeitern", "200 Mitarbeiter",
 * "ca. 120 Beschäftigte", "+200 MA" and English phrases such as
 * "50+ employees". German patterns are tried first.
 *
 * @param pages All crawled pages; their `text` is joined with spaces.
 * @returns The number as a string, preceded by the worded qualifier when one
 *          was found (e.g. "über 50"); a bare "+" is matched but not included.
 *          `null` when nothing matches.
 */
function extractEmployeeCount(pages: CrawledPage[]): string | null {
  const allText = pages.map((p) => p.text).join(" ");

  // `(?<!\d[.,]?)` stops the 1-4 digit count from matching the tail of a longer
  // number ("12000", "1.200"), which would otherwise report a wrong figure.
  // Inflected forms (Mitarbeitern, Beschäftigten, Fachkräften) are listed
  // explicitly because `\b` cannot match in the middle of a word.
  const patterns: RegExp[] = [
    /(?<prefix>über|ca\.?|rund|mehr als)?\s*\+?\s*(?<!\d[.,]?)(?<count>\d{1,4})\s*(?:Mitarbeiter(?:n|innen)?|Beschäftigten?|MA|Fachkräften?)\b/i,
    /(?<!\d[.,]?)(?<count>\d{1,4})\+?\s*(?:employees|team members)/i,
  ];

  for (const pattern of patterns) {
    const groups = pattern.exec(allText)?.groups;
    const count = groups?.count;
    if (!count) continue;

    const prefix = groups?.prefix;
    return prefix ? `${prefix} ${count}` : count;
  }

  return null;
}
|
|
|
|
/**
 * Extract services/competencies from the headings of service pages.
 *
 * A page counts as a service page when its type is "service" or its pathname
 * contains "kompetenz" (case-insensitive). Headings are trimmed,
 * de-duplicated, and skipped when they start with a generic site label
 * (home, kontakt, impressum, ...). Service-page headings must be 4-99
 * characters long. If that yields nothing, the home page's headings are used
 * as a fallback with a 4-79 character window and the same generic-label filter.
 *
 * @param pages All crawled pages.
 * @returns Unique service names in order of first appearance.
 */
function extractAllServices(pages: CrawledPage[]): string[] {
  const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
  const services = new Set<string>();

  // Shared by both passes so the fallback cannot drift from the primary filter.
  const collect = (headings: string[], maxLengthExclusive: number): void => {
    for (const heading of headings) {
      const clean = heading.trim();
      if (clean.length <= 3 || clean.length >= maxLengthExclusive) continue;
      if (genericHeading.test(clean)) continue;
      services.add(clean);
    }
  };

  for (const page of pages) {
    const isServicePage =
      page.type === "service" || page.pathname.toLowerCase().includes("kompetenz");
    if (isServicePage) collect(page.headings, 100);
  }

  // Fallback: nothing usable on service pages, so read the home page headings.
  if (services.size === 0) {
    const homePage = pages.find((p) => p.type === "home");
    if (homePage) collect(homePage.headings, 80);
  }

  return [...services];
}
|
|
|
|
/**
 * Extract company details (tax ID, register number, phone, email, address,
 * managing director) by pattern-matching page text.
 *
 * The text of the Impressum page is used when one exists and is non-empty
 * (type "legal" with "impressum" in the pathname or title); otherwise the
 * home page's text. Only fields whose pattern matches are set.
 *
 * @param pages All crawled pages.
 * @returns A CompanyInfo holding whichever fields could be found.
 */
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
  const info: CompanyInfo = {};

  const legalPage = pages.find(
    (p) =>
      p.type === "legal" &&
      (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
  );

  // `||` on purpose: an empty Impressum text should fall through to the home page.
  const sourceText = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";

  // USt-ID (German VAT identification number)
  const taxMatch = sourceText.match(
    /USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i,
  );
  if (taxMatch) info.taxId = taxMatch[1];

  // HRB number with an optional 1-2 letter suffix ("HRB 12345 B"). The suffix
  // must be upper-case and stand alone, so the next word ("Geschäftsführer",
  // "Amtsgericht", ...) is not absorbed into the register number.
  const hrbMatch = sourceText.match(
    /\b[Hh][Rr][Bb][:\s]*(\d+(?:\s*[A-Z]{1,2}(?![\p{L}\p{N}]))?)/u,
  );
  if (hrbMatch) info.registerNumber = `HRB ${hrbMatch[1].trim()}`;

  // Phone
  const phoneMatch = sourceText.match(/(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i);
  if (phoneMatch) info.phone = phoneMatch[1].trim();

  // Email
  const emailMatch = sourceText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailMatch) info.email = emailMatch[0];

  // Address: optional street + house number, then a 5-digit German postal code
  // and the city. Unicode letter classes are required here: with `\w`, names
  // containing umlauts or ß were cut off ("Köln" -> "K"). The look-behind keeps
  // the postal code from being carved out of a longer digit run.
  const addressMatch = sourceText.match(
    /(?:[\p{L}\p{N}_\s.\-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?<!\d)(?:D-)?(\d{5})\s+\p{L}+(?:-\p{L}+)*/iu,
  );
  if (addressMatch) info.address = addressMatch[0].trim();

  // GF / Geschäftsführer (managing director): two to four capitalised words.
  const gfMatch = sourceText.match(
    /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/,
  );
  if (gfMatch) info.managingDirector = gfMatch[1].trim();

  return info;
}
|
|
|
|
/**
 * Extract candidate brand colors from the home page's HTML.
 *
 * Scans the raw markup for 3- or 6-digit hex color literals, lower-cases them
 * and returns the first eight distinct values in document order. Numeric HTML
 * entities such as `&#160;` are ignored because they are character references,
 * not colors.
 *
 * @param pages All crawled pages; only the page typed "home" is read.
 * @returns Up to eight distinct lower-case hex colors; empty without a home page.
 */
function extractColors(pages: CrawledPage[]): string[] {
  const maxColors = 8;

  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return [];

  // `(?<!&)` rejects a "#" that belongs to a numeric character reference.
  const hexPattern = /(?<!&)#(?:[0-9a-fA-F]{3}){1,2}\b/g;

  const colors = new Set<string>();
  for (const hex of homePage.html.match(hexPattern) ?? []) {
    colors.add(hex.toLowerCase());
    if (colors.size >= maxColors) break;
  }

  return [...colors];
}
|
|
|
|
/**
 * Extract social media profile links from the home page's HTML.
 *
 * Every absolute http(s) URL in the markup is parsed and its hostname compared
 * against the known platform hosts. A URL belongs to a platform only when the
 * hostname equals the platform host or is a subdomain of it, so "wix.com" or
 * "netflix.com" is not mistaken for "x.com". The first matching URL per
 * platform wins.
 *
 * @param pages All crawled pages; only the page typed "home" is read.
 * @returns Platform key ("linkedin", "instagram", "facebook", "youtube",
 *          "twitter", "xing") mapped to the first matching URL.
 */
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
  const platforms: { key: string; hosts: string[] }[] = [
    { key: "linkedin", hosts: ["linkedin.com"] },
    { key: "instagram", hosts: ["instagram.com"] },
    { key: "facebook", hosts: ["facebook.com", "fb.com"] },
    { key: "youtube", hosts: ["youtube.com", "youtu.be"] },
    { key: "twitter", hosts: ["twitter.com", "x.com"] },
    { key: "xing", hosts: ["xing.com"] },
  ];

  const socials: Record<string, string> = {};

  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return socials;

  const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) ?? [];
  for (const url of urlMatches) {
    let host: string;
    try {
      host = new URL(url).hostname.toLowerCase();
    } catch {
      // Not a parseable URL, so it cannot be a profile link.
      continue;
    }

    for (const platform of platforms) {
      if (socials[platform.key]) continue;

      const belongsToPlatform = platform.hosts.some(
        (platformHost) => host === platformHost || host.endsWith(`.${platformHost}`),
      );
      if (belongsToPlatform) socials[platform.key] = url;
    }
  }

  return socials;
}
|
|
|
|
/**
 * Find linked domains that are separate from the main domain but look related
 * to it. Critical for detecting sister companies with their own websites
 * (e.g. etib-ing.com for e-tib.com).
 *
 * Hostnames are taken from absolute http(s) URLs in every page's HTML. The
 * main domain itself and hosts containing a known third-party marker (google,
 * cdn, fonts, ...) are skipped. The rest are compared by their first hostname
 * label with "-" and "_" removed. A domain is related when
 *   - the joined labels are identical, or
 *   - one joined label contains the other and the contained one has at least
 *     three characters, or
 *   - the candidate's first label contains a main-domain part longer than two
 *     characters.
 * The length floors matter: without them an empty or one-letter label matches
 * everything (main "x.com"), and short hosts such as "t.co" match any main
 * domain containing that letter.
 *
 * @param pages      All crawled pages.
 * @param mainDomain The crawled domain, with or without a leading "www.".
 * @returns Distinct related hostnames (without "www.") in order of first appearance.
 */
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
  const thirdPartyMarkers = [
    "google",
    "facebook",
    "twitter",
    "linkedin",
    "instagram",
    "youtube",
    "cookie",
    "analytics",
    "cdn",
    "cloudflare",
    "fonts",
    "jquery",
    "bootstrap",
    "wordpress",
    "jimdo",
    "wix",
  ];
  const minFuzzyLength = 3;

  // URL hostnames are lower-case, so normalise the main domain the same way.
  const cleanMain = mainDomain.toLowerCase().replace(/^www\./, "");

  // "e-tib.com" → labels ["e", "tib"], parts ["tib"], joined "etib"
  const mainLabels = (cleanMain.split(".")[0] ?? "").split(/[-_]/);
  const mainParts = mainLabels.filter((part) => part.length > 1);
  const mainJoined = mainLabels.join("");

  const externalDomains = new Set<string>();

  for (const page of pages) {
    const linkMatches = page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) ?? [];

    for (const url of linkMatches) {
      let domain: string;
      try {
        domain = new URL(url).hostname.replace(/^www\./, "");
      } catch {
        // Invalid URL
        continue;
      }

      // Skip same domain
      if (domain === cleanMain) continue;

      // Skip common third-party services
      if (thirdPartyMarkers.some((marker) => domain.includes(marker))) continue;

      // e.g. target "etib-ing.com" → domainBase "etib-ing", domainJoined "etibing"
      const domainBase = (domain.split(".")[0] ?? "").toLowerCase();
      const domainJoined = domainBase.replace(/[-_]/g, "");
      if (domainJoined.length === 0) continue;

      const isRelated =
        domainJoined === mainJoined ||
        (mainJoined.length >= minFuzzyLength && domainJoined.includes(mainJoined)) ||
        (domainJoined.length >= minFuzzyLength && mainJoined.includes(domainJoined)) ||
        mainParts.some((part) => part.length > 2 && domainBase.includes(part));

      if (isRelated) externalDomains.add(domain);
    }
  }

  return [...externalDomains];
}
|
|
|
|
/**
 * Produce one structured inventory entry per crawled page.
 *
 * Each entry carries the page's identity fields, its first ten headings,
 * feature flags derived from `features`, and the first 500 characters of text.
 * `services` is filled only for pages typed "service", with headings that are
 * 4-79 characters long.
 *
 * @param pages All crawled pages.
 * @returns Inventory entries in the same order as `pages`.
 */
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
  const inventory: PageInventoryItem[] = [];

  for (const page of pages) {
    const { url, pathname, title, type, headings, features, text } = page;
    const hasFeature = (name: string): boolean => features.includes(name);

    let services: string[] = [];
    if (type === "service") {
      services = headings.filter((h: string) => h.length > 3 && h.length < 80);
    }

    inventory.push({
      url,
      pathname,
      title,
      type,
      headings: headings.slice(0, 10),
      services,
      hasSearch: hasFeature("search"),
      hasForms: hasFeature("forms"),
      hasMap: hasFeature("maps"),
      hasVideo: hasFeature("video"),
      contentSummary: text.substring(0, 500),
    });
  }

  return inventory;
}
|