feat: migrate npm registry from Verdaccio to Gitea Packages
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
This commit is contained in:
334
packages/concept-engine/src/analyzer.ts
Normal file
334
packages/concept-engine/src/analyzer.ts
Normal file
@@ -0,0 +1,334 @@
|
||||
// ============================================================================
|
||||
// Analyzer — Deterministic Site Analysis (NO LLM!)
|
||||
// Builds a SiteProfile from crawled pages using pure code logic.
|
||||
// This is the core fix against hallucinated page structures.
|
||||
// ============================================================================
|
||||
|
||||
import type {
|
||||
CrawledPage,
|
||||
SiteProfile,
|
||||
NavItem,
|
||||
CompanyInfo,
|
||||
PageInventoryItem,
|
||||
} from "./types.js";
|
||||
|
||||
/**
 * Assemble the full SiteProfile for a crawl result.
 *
 * Every field is derived by plain code from the crawled pages; no LLM call
 * is made anywhere in this pipeline.
 *
 * @param pages - Pages collected by the crawler.
 * @param domain - Domain the crawl was run against. It is copied into the
 *   profile and used to tell related external domains apart from the site.
 * @returns The aggregated profile. `totalPages` counts every page whose type
 *   is not "legal"; `crawledAt` is the ISO timestamp taken during this call.
 */
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
  // Run every extractor first; each one only reads `pages`.
  const derived = {
    navigation: extractNavigation(pages),
    existingFeatures: extractExistingFeatures(pages),
    services: extractAllServices(pages),
    companyInfo: extractCompanyInfo(pages),
    pageInventory: buildPageInventory(pages),
    colors: extractColors(pages),
    socialLinks: extractSocialLinks(pages),
    externalDomains: extractExternalDomains(pages, domain),
    images: extractAllImages(pages),
    employeeCount: extractEmployeeCount(pages),
  };

  const nonLegalPages = pages.filter((page) => page.type !== "legal");

  return {
    domain,
    crawledAt: new Date().toISOString(),
    totalPages: nonLegalPages.length,
    ...derived,
  };
}
|
||||
|
||||
/**
 * Build the site's navigation list from one page's recorded nav labels.
 *
 * The page typed "home" is the preferred source; if there is none, the first
 * crawled page is used. Labels are de-duplicated case-insensitively (after
 * trimming) and labels shorter than two characters are dropped. The original
 * label text is kept as-is, and `href` is always an empty string.
 *
 * @param pages - Crawled pages.
 * @returns Navigation items in first-seen order, or an empty array when
 *   `pages` is empty.
 */
function extractNavigation(pages: CrawledPage[]): NavItem[] {
  const sourcePage = pages.find((page) => page.type === "home") || pages[0];
  if (!sourcePage) return [];

  const seenKeys = new Set<string>();

  return sourcePage.navItems
    .filter((label) => {
      const key = label.toLowerCase().trim();
      if (key.length < 2 || seenKeys.has(key)) return false;
      seenKeys.add(key);
      return true;
    })
    .map((label) => ({ label, href: "" }));
}
|
||||
|
||||
/**
 * Collect the distinct feature identifiers recorded on any page.
 *
 * @param pages - Crawled pages; each page's `features` list is read.
 * @returns Every feature exactly once, in order of first appearance.
 */
function extractExistingFeatures(pages: CrawledPage[]): string[] {
  const uniqueFeatures = new Set<string>(pages.flatMap((page) => page.features));
  return Array.from(uniqueFeatures);
}
|
||||
|
||||
/**
 * Collect the distinct image references recorded on any page.
 *
 * Pages without an `images` list are skipped.
 *
 * @param pages - Crawled pages.
 * @returns Every image entry exactly once, in order of first appearance.
 */
function extractAllImages(pages: CrawledPage[]): string[] {
  const uniqueImages = pages.reduce((collected, page) => {
    (page.images || []).forEach((src) => collected.add(src));
    return collected;
  }, new Set<string>());

  return Array.from(uniqueImages);
}
|
||||
|
||||
/**
 * Extract an employee count from the combined text of all pages.
 *
 * Recognised phrasings include "über 50 Mitarbeitern", "200 Mitarbeiter",
 * "120 Beschäftigte", "+200 MA" and "50+ employees". German inflected forms
 * (Mitarbeitern, Beschäftigten, Fachkräften) are accepted; previously the
 * trailing word boundary rejected them, so the most common phrasing
 * ("über 50 Mitarbeitern") was never found.
 *
 * @param pages - Crawled pages; their `text` fields are joined with spaces.
 * @returns The count as a string, preceded by the qualifier found in the text
 *   ("über", "ca.", "rund", "mehr als") when there is one, e.g. "über 50".
 *   A leading or trailing "+" is not reproduced. Returns `null` when no
 *   pattern matches.
 */
function extractEmployeeCount(pages: CrawledPage[]): string | null {
  const allText = pages.map((p) => p.text).join(" ");

  // German patterns are tried first, English ones second. The first match of
  // the first pattern that matches anywhere in the text wins.
  const patterns = [
    /(?:(?<prefix>über|ca\.?|rund|mehr als)|\+)?\s*(?<count>\d{1,4})\s*(?:Mitarbeiter(?:innen|n)?|Beschäftigten?|MA|Fachkräften?)\b/i,
    /(?<count>\d{1,4})\+?\s*(?:employees|team members)/i,
  ];

  for (const pattern of patterns) {
    const groups = pattern.exec(allText)?.groups;
    const count = groups?.count;
    if (!count) continue;

    const prefix = groups?.prefix;
    return prefix ? `${prefix} ${count}` : count;
  }

  return null;
}
|
||||
|
||||
/**
 * Extract service/competency names from page headings.
 *
 * Headings are taken from every page typed "service" or whose pathname
 * contains "kompetenz". If that yields nothing, the home page's headings are
 * used as a fallback. Only headings are inspected.
 *
 * In both cases headings are trimmed, must be longer than 3 characters, and
 * must not start with a generic navigation word (home, kontakt, impressum,
 * datenschutz, menü, navigation, suche). The fallback previously skipped that
 * generic-word filter, so headings such as "Kontakt" were reported as
 * services. The upper length limit is 100 characters for service pages and
 * 80 for the home-page fallback.
 *
 * @param pages - Crawled pages.
 * @returns Distinct service names in order of first appearance.
 */
function extractAllServices(pages: CrawledPage[]): string[] {
  const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
  const services = new Set<string>();

  // Shared acceptance rule for both the service pages and the home fallback.
  const addHeadings = (headings: string[], maxLengthExclusive: number): void => {
    for (const heading of headings) {
      const clean = heading.trim();
      if (clean.length <= 3 || clean.length >= maxLengthExclusive) continue;
      if (genericHeading.test(clean)) continue;
      services.add(clean);
    }
  };

  for (const page of pages) {
    if (page.type === "service" || page.pathname.includes("kompetenz")) {
      addHeadings(page.headings, 100);
    }
  }

  // Nothing usable on service pages: fall back to the home page's headings.
  if (services.size === 0) {
    const homePage = pages.find((p) => p.type === "home");
    if (homePage) addHeadings(homePage.headings, 80);
  }

  return [...services];
}
|
||||
|
||||
/**
 * Extract company details from the Impressum page, falling back to the home
 * page.
 *
 * The source text is the `text` of the first page typed "legal" whose
 * pathname or title mentions "impressum"; if there is none (or its text is
 * empty) the home page's text is used. Each field is set only when its
 * pattern matches, so the result may be an empty object.
 *
 * Fixes over the previous version:
 *  - The register number no longer swallows the word following it
 *    ("HRB 12345 Geschäftsführer" used to yield "HRB 12345 Gesch"). Only a
 *    short upper-case court suffix such as "B" is kept.
 *  - Street and city names containing umlauts or "ß" are matched in full
 *    ("80331 München" used to be cut to "80331 M").
 *
 * @param pages - Crawled pages.
 * @returns The fields that could be recognised: `taxId`, `registerNumber`,
 *   `phone`, `email`, `address`, `managingDirector`.
 */
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
  const info: CompanyInfo = {};

  const legalPage = pages.find(
    (p) =>
      p.type === "legal" &&
      (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
  );

  const sourceText = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";

  // VAT ID (USt-IdNr.): two country letters followed by 9-11 digits.
  const taxMatch = sourceText.match(/USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i);
  if (taxMatch) info.taxId = taxMatch[1];

  // Commercial register number. "HRB" itself is matched case-insensitively,
  // but the optional suffix must be 1-3 upper-case letters that are not the
  // start of a longer word.
  const hrbMatch = sourceText.match(
    /[Hh][Rr][Bb][:\s]*(\d+(?:\s*[A-Z]{1,3}(?![A-Za-zÄÖÜäöüß]))?)/,
  );
  if (hrbMatch) info.registerNumber = `HRB ${hrbMatch[1].trim()}`;

  // Phone: 10-20 characters of digits, spaces and common separators.
  const phoneMatch = sourceText.match(/(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i);
  if (phoneMatch) info.phone = phoneMatch[1].trim();

  // Email: first address-like token in the text.
  const emailMatch = sourceText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailMatch) info.email = emailMatch[0];

  // Address: optional "<street> <number>," followed by a five-digit German
  // postal code and the first word of the city. German letters are listed
  // explicitly because \w is ASCII-only.
  const addressMatch = sourceText.match(
    /(?:[\wÄÖÜäöüß\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+[\wÄÖÜäöüß-]+/i,
  );
  if (addressMatch) info.address = addressMatch[0].trim();

  // Managing director: two to four capitalised name parts after the title.
  const gfMatch = sourceText.match(
    /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/,
  );
  if (gfMatch) info.managingDirector = gfMatch[1].trim();

  return info;
}
|
||||
|
||||
/**
 * Extract candidate brand colors from the home page's HTML.
 *
 * Scans the raw HTML for 3- or 6-digit hex color literals, lower-cases them
 * and returns the first eight distinct values in document order.
 *
 * A "#" directly preceded by "&" is ignored: that is an HTML numeric
 * character reference (e.g. the non-breaking space `&#160;`), which was
 * previously reported as the color "#160".
 *
 * @param pages - Crawled pages; only the page typed "home" is inspected.
 * @returns Up to eight distinct lower-case hex colors, or an empty array when
 *   there is no home page.
 */
function extractColors(pages: CrawledPage[]): string[] {
  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return [];

  const maxColors = 8;
  const colors = new Set<string>();

  const hexMatches = homePage.html.match(/(?<!&)#(?:[0-9a-fA-F]{3}){1,2}\b/g) || [];
  for (const hex of hexMatches) {
    colors.add(hex.toLowerCase());
    if (colors.size >= maxColors) break;
  }

  return [...colors];
}
|
||||
|
||||
/**
 * Extract social media profile links from the home page's HTML.
 *
 * Every absolute http(s) URL found in the HTML is parsed and its host name
 * is compared against the known platform hosts. A URL counts for a platform
 * only when its host equals a platform host or is a subdomain of it.
 * Matching on the host rather than on the whole URL string avoids false
 * positives such as "wix.com" or "netflix.com" being taken for "x.com", or a
 * platform name appearing in another site's query string.
 *
 * @param pages - Crawled pages; only the page typed "home" is inspected.
 * @returns A map from platform key (linkedin, instagram, facebook, youtube,
 *   twitter, xing) to the first matching URL. Platforms without a match are
 *   absent; the map is empty when there is no home page.
 */
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
  const socials: Record<string, string> = {};
  const platforms = [
    { key: "linkedin", hosts: ["linkedin.com"] },
    { key: "instagram", hosts: ["instagram.com"] },
    { key: "facebook", hosts: ["facebook.com", "fb.com"] },
    { key: "youtube", hosts: ["youtube.com", "youtu.be"] },
    { key: "twitter", hosts: ["twitter.com", "x.com"] },
    { key: "xing", hosts: ["xing.com"] },
  ];

  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return socials;

  const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) || [];
  for (const url of urlMatches) {
    let host: string;
    try {
      host = new URL(url).hostname.toLowerCase();
    } catch {
      // Not a parseable URL (e.g. truncated markup): nothing to classify.
      continue;
    }

    for (const platform of platforms) {
      // First URL per platform wins.
      if (socials[platform.key]) continue;
      const matchesHost = platform.hosts.some(
        (platformHost) => host === platformHost || host.endsWith(`.${platformHost}`),
      );
      if (matchesHost) socials[platform.key] = url;
    }
  }

  return socials;
}
|
||||
|
||||
/**
 * Find linked domains that are separate from the main domain but look
 * related to it by name (e.g. a sister company such as "etib-ing.com" for
 * "e-tib.com").
 *
 * Every absolute http(s) URL in every page's HTML is reduced to its host
 * (without a leading "www."). The main domain itself and hosts containing a
 * well-known third-party marker (google, cdn, fonts, ...) are skipped. A
 * remaining host is reported when its first label resembles the main
 * domain's first label:
 *  - one name (hyphens/underscores removed) contains the other, or
 *  - it contains any hyphen/underscore-separated part of the main name.
 *
 * Only names and parts of at least three characters take part in these
 * substring checks. Previously a short main name produced an empty
 * comparison string, which made every external domain count as related, and
 * very short foreign hosts such as "t.co" matched by accident. The main
 * domain is also compared case-insensitively now, so the site is never
 * reported as external to itself.
 *
 * @param pages - Crawled pages; their `html` is scanned.
 * @param mainDomain - The crawled site's domain, with or without "www.".
 * @returns Distinct related hosts in order of first appearance.
 */
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
  const thirdPartyMarkers = [
    "google",
    "facebook",
    "twitter",
    "linkedin",
    "instagram",
    "youtube",
    "cookie",
    "analytics",
    "cdn",
    "cloudflare",
    "fonts",
    "jquery",
    "bootstrap",
    "wordpress",
    "jimdo",
    "wix",
  ];
  const minMatchLength = 3;
  const linkPattern = /https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

  const externalDomains = new Set<string>();
  const cleanMain = mainDomain.toLowerCase().replace(/^www\./, "");

  // "e-tib.com" -> segments ["e", "tib"], mainJoined "etib", mainParts ["tib"].
  const mainSegments = (cleanMain.split(".")[0] ?? "").split(/[-_]/);
  const mainJoined = mainSegments.join("");
  const mainParts = mainSegments.filter((part) => part.length >= minMatchLength);

  for (const page of pages) {
    const linkMatches = page.html.match(linkPattern) || [];
    for (const url of linkMatches) {
      let domain: string;
      try {
        domain = new URL(url).hostname.replace(/^www\./, "");
      } catch {
        // Invalid URL: ignore this link.
        continue;
      }

      // Skip the site itself and common third-party services.
      if (domain === cleanMain) continue;
      if (thirdPartyMarkers.some((marker) => domain.includes(marker))) continue;

      // "etib-ing.com" -> domainBase "etib-ing", domainJoined "etibing".
      const domainBase = (domain.split(".")[0] ?? "").toLowerCase();
      const domainJoined = domainBase.replace(/[-_]/g, "");

      const isRelated =
        (mainJoined.length >= minMatchLength && domainJoined.includes(mainJoined)) ||
        (domainJoined.length >= minMatchLength && mainJoined.includes(domainJoined)) ||
        mainParts.some((part) => domainBase.includes(part));

      if (isRelated) externalDomains.add(domain);
    }
  }

  return [...externalDomains];
}
|
||||
|
||||
/**
 * Produce one inventory record per crawled page.
 *
 * Each record copies the page's url, pathname, title and type, keeps at most
 * the first ten headings, derives boolean flags from the page's feature list
 * ("search", "forms", "maps", "video") and stores the first 500 characters of
 * the page text as a summary. `services` is filled only for pages typed
 * "service", with the headings whose length is between 4 and 79 characters.
 *
 * @param pages - Crawled pages.
 * @returns Inventory items in the same order as `pages`.
 */
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
  const inventory: PageInventoryItem[] = [];

  for (const page of pages) {
    const hasFeature = (feature: string): boolean => page.features.includes(feature);

    const serviceHeadings =
      page.type === "service"
        ? page.headings.filter((heading) => heading.length > 3 && heading.length < 80)
        : [];

    inventory.push({
      url: page.url,
      pathname: page.pathname,
      title: page.title,
      type: page.type,
      headings: page.headings.slice(0, 10),
      services: serviceHeadings,
      hasSearch: hasFeature("search"),
      hasForms: hasFeature("forms"),
      hasMap: hasFeature("maps"),
      hasVideo: hasFeature("video"),
      contentSummary: page.text.substring(0, 500),
    });
  }

  return inventory;
}
|
||||
Reference in New Issue
Block a user