feat: migrate npm registry from Verdaccio to Gitea Packages
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped

This commit is contained in:
2026-02-27 00:12:00 +01:00
parent efd1341762
commit 5da88356a8
69 changed files with 5397 additions and 114 deletions

View File

@@ -0,0 +1,40 @@
import { config as dotenvConfig } from 'dotenv';
import * as path from 'node:path';
import * as fs from 'node:fs/promises';
import { EstimationPipeline } from './pipeline.js';
// Load environment variables from the monorepo root .env file.
dotenvConfig({ path: path.resolve(process.cwd(), '../../.env') });

// Read the briefing text that drives the estimation pipeline.
const briefing = await fs.readFile(
  path.resolve(process.cwd(), '../../data/briefings/etib.txt'),
  'utf8',
);
console.log(`Briefing loaded: ${briefing.length} chars`);

// Wire up the pipeline with credentials from the environment and
// simple console-logging step callbacks.
const pipeline = new EstimationPipeline(
  {
    openrouterKey: process.env.OPENROUTER_API_KEY || '',
    zyteApiKey: process.env.ZYTE_API_KEY,
    outputDir: path.resolve(process.cwd(), '../../out/estimations'),
    crawlDir: path.resolve(process.cwd(), '../../data/crawls'),
  },
  {
    onStepStart: (id, name) => console.log(`[CB] Starting: ${id}`),
    onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
    onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
  },
);

try {
  const result = await pipeline.run({
    briefing,
    url: 'https://www.e-tib.com',
  });
  console.log('\n✨ Pipeline complete!');
  console.log('Validation:', result.validationResult?.passed ? 'PASSED' : 'FAILED');
} catch (err) {
  // Narrow `unknown` instead of `any` so non-Error throws are also handled.
  const e = err instanceof Error ? err : new Error(String(err));
  console.error('\n❌ Pipeline failed:', e.message);
  console.error(e.stack);
}

View File

@@ -0,0 +1,334 @@
// ============================================================================
// Analyzer — Deterministic Site Analysis (NO LLM!)
// Builds a SiteProfile from crawled pages using pure code logic.
// This is the core fix against hallucinated page structures.
// ============================================================================
import type {
CrawledPage,
SiteProfile,
NavItem,
CompanyInfo,
PageInventoryItem,
} from "./types.js";
/**
* Build a complete SiteProfile from an array of crawled pages.
* This is 100% deterministic — no LLM calls involved.
*/
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
const navigation = extractNavigation(pages);
const existingFeatures = extractExistingFeatures(pages);
const services = extractAllServices(pages);
const companyInfo = extractCompanyInfo(pages);
const colors = extractColors(pages);
const socialLinks = extractSocialLinks(pages);
const externalDomains = extractExternalDomains(pages, domain);
const images = extractAllImages(pages);
const employeeCount = extractEmployeeCount(pages);
const pageInventory = buildPageInventory(pages);
return {
domain,
crawledAt: new Date().toISOString(),
totalPages: pages.filter((p) => p.type !== "legal").length,
navigation,
existingFeatures,
services,
companyInfo,
pageInventory,
colors,
socialLinks,
externalDomains,
images,
employeeCount,
};
}
/**
 * Derive the canonical navigation from the home page's nav items
 * (falling back to the first crawled page), deduplicated
 * case-insensitively and skipping one-character labels.
 */
function extractNavigation(pages: CrawledPage[]): NavItem[] {
  const source = pages.find((p) => p.type === "home") ?? pages[0];
  if (!source) return [];
  const byKey = new Map<string, NavItem>();
  for (const label of source.navItems) {
    const key = label.toLowerCase().trim();
    if (key.length < 2 || byKey.has(key)) continue;
    byKey.set(key, { label, href: "" });
  }
  return [...byKey.values()];
}
/**
 * Aggregate all detected interactive features across all pages,
 * de-duplicated but preserving first-seen order.
 */
function extractExistingFeatures(pages: CrawledPage[]): string[] {
  return [...new Set(pages.flatMap((page) => page.features))];
}
/**
 * Aggregate all images found across all pages (pages without an
 * `images` array are skipped), de-duplicated in first-seen order.
 */
function extractAllImages(pages: CrawledPage[]): string[] {
  const unique = new Set<string>();
  for (const page of pages) {
    for (const img of page.images ?? []) {
      unique.add(img);
    }
  }
  return [...unique];
}
/**
 * Extract an employee count from concatenated page text.
 * Matches German phrasings ("über 50 Mitarbeitern", "120 Beschäftigte")
 * and English ones ("50+ employees"); returns "<qualifier> <number>" or
 * the bare number, null when nothing matches.
 */
function extractEmployeeCount(pages: CrawledPage[]): string | null {
  const corpus = pages.map((p) => p.text).join(" ");
  const patterns = [
    /(über|ca\.?|rund|mehr als|\+)?\s*(\d{1,4})\s*(Mitarbeiter(?:innen)?|Beschäftigte|MA|Fachkräfte)\b/gi,
    /(\d{1,4})\+?\s*(employees|team members)/gi,
  ];
  for (const pattern of patterns) {
    const hit = corpus.match(pattern)?.[0];
    if (!hit) continue;
    const num = hit.match(/(\d{1,4})/)?.[1];
    if (!num) continue;
    const qualifier = hit.match(/über|ca\.?|rund|mehr als/i)?.[0];
    return qualifier ? `${qualifier} ${num}` : num;
  }
  return null;
}
/**
 * Extract services/competencies from service-type pages.
 * Uses headings as the primary service indicators; falls back to the
 * home page's headings when no dedicated service pages were crawled.
 */
function extractAllServices(pages: CrawledPage[]): string[] {
  // Headings like "Home"/"Kontakt" are navigation boilerplate, not services.
  const genericHeading = /^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i;
  const servicePages = pages.filter(
    (p) => p.type === "service" || p.pathname.includes("kompetenz"),
  );
  const services = new Set<string>();
  for (const page of servicePages) {
    for (const heading of page.headings) {
      const clean = heading.trim();
      // Keep plausibly service-sized headings only.
      if (clean.length > 3 && clean.length < 100 && !genericHeading.test(clean)) {
        services.add(clean);
      }
    }
  }
  // Fallback: no service pages found — use home page headings.
  // FIX: the generic-heading filter now applies here too (previously
  // "Impressum"/"Kontakt" headings leaked into the service list).
  if (services.size === 0) {
    const homePage = pages.find((p) => p.type === "home");
    if (homePage) {
      for (const heading of homePage.headings) {
        const clean = heading.trim();
        if (clean.length > 3 && clean.length < 80 && !genericHeading.test(clean)) {
          services.add(clean);
        }
      }
    }
  }
  return [...services];
}
/**
 * Extract company information (tax ID, register number, phone, email,
 * address, managing director) from the Impressum / legal page, falling
 * back to the home page's text when no Impressum page was crawled.
 * All fields are optional; only regex hits are set.
 */
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
  const info: CompanyInfo = {};
  // Find the Impressum page by type + pathname or title.
  const legalPage = pages.find(
    (p) =>
      p.type === "legal" &&
      (p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
  );
  const sourceText = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";
  // USt-ID (German VAT ID), e.g. "USt-IdNr.: DE123456789" — country prefix + 9-11 digits.
  const taxMatch = sourceText.match(/USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i);
  if (taxMatch) info.taxId = taxMatch[1];
  // HRB number (commercial register), e.g. "HRB 12345 B".
  const hrbMatch = sourceText.match(/HRB[:\s]*(\d+\s*[A-Z]*)/i);
  if (hrbMatch) info.registerNumber = `HRB ${hrbMatch[1].trim()}`;
  // Phone number following a "Tel"/"Telefon"/"Fon" label (10-20 phone-ish chars).
  const phoneMatch = sourceText.match(/(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i);
  if (phoneMatch) info.phone = phoneMatch[1].trim();
  // First email-looking token anywhere in the text.
  const emailMatch = sourceText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailMatch) info.email = emailMatch[0];
  // Address: optional German street part ("...straße 12,") followed by a
  // 5-digit postal code and a city word; optional "D-" country prefix.
  const addressMatch = sourceText.match(
    /(?:[\w\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+\w+/i,
  );
  if (addressMatch) info.address = addressMatch[0].trim();
  // Managing director: "Geschäftsführer: First Last" — 2-4 capitalized name parts.
  const gfMatch = sourceText.match(
    /Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/,
  );
  if (gfMatch) info.managingDirector = gfMatch[1].trim();
  return info;
}
/**
 * Collect up to 8 distinct hex colors from the home page's raw HTML
 * (inline styles / CSS values embedded in the markup), lowercased.
 */
function extractColors(pages: CrawledPage[]): string[] {
  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return [];
  const found = new Set<string>();
  for (const hex of homePage.html.match(/#(?:[0-9a-fA-F]{3}){1,2}\b/g) ?? []) {
    found.add(hex.toLowerCase());
    if (found.size >= 8) break;
  }
  return [...found];
}
/**
 * Extract social media links from the home page's HTML.
 * The first URL matching each platform wins; later matches are ignored.
 */
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
  const socials: Record<string, string> = {};
  const homePage = pages.find((p) => p.type === "home");
  if (!homePage) return socials;
  const platformPatterns: Record<string, string[]> = {
    linkedin: ["linkedin.com"],
    instagram: ["instagram.com"],
    facebook: ["facebook.com", "fb.com"],
    youtube: ["youtube.com", "youtu.be"],
    twitter: ["twitter.com", "x.com"],
    xing: ["xing.com"],
  };
  for (const url of homePage.html.match(/https?:\/\/[^\s"'<>]+/g) ?? []) {
    for (const [key, patterns] of Object.entries(platformPatterns)) {
      if (!socials[key] && patterns.some((p) => url.includes(p))) {
        socials[key] = url;
      }
    }
  }
  return socials;
}
/**
 * Find domains that are linked but separate from the main domain.
 * Critical for detecting sister companies with their own websites
 * (e.g. etib-ing.com next to e-tib.com).
 */
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
  // Well-known third-party services that are never "sister company" sites.
  const thirdPartyMarkers = [
    "google", "facebook", "twitter", "linkedin", "instagram", "youtube",
    "cookie", "analytics", "cdn", "cloudflare", "fonts", "jquery",
    "bootstrap", "wordpress", "jimdo", "wix",
  ];
  const cleanMain = mainDomain.replace(/^www\./, "");
  // Base parts of the main domain, split on -/_ and keeping only parts
  // longer than one character (e.g. "e-tib.com" → ["tib"], joined "tib").
  const mainParts = cleanMain.split(".")[0].toLowerCase().split(/[-_]/).filter((p) => p.length > 1);
  const mainJoined = mainParts.join("");
  const related = new Set<string>();
  for (const page of pages) {
    for (const link of page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) ?? []) {
      let hostname: string;
      try {
        hostname = new URL(link).hostname;
      } catch {
        continue; // malformed URL
      }
      const domain = hostname.replace(/^www\./, "");
      if (domain === cleanMain) continue;
      if (thirdPartyMarkers.some((marker) => domain.includes(marker))) continue;
      // Fuzzy relatedness: either joined base contains the other, or a
      // meaningful (>2 chars) main part appears in the target's base.
      const domainBase = domain.split(".")[0].toLowerCase();
      const domainJoined = domainBase.replace(/[-_]/g, "");
      const isRelated =
        domainJoined.includes(mainJoined) ||
        mainJoined.includes(domainJoined) ||
        mainParts.some((part) => part.length > 2 && domainBase.includes(part));
      if (isRelated) related.add(domain);
    }
  }
  return [...related];
}
/**
 * Build a structured inventory entry for every crawled page:
 * truncated headings/text plus boolean feature flags.
 */
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
  const inventory: PageInventoryItem[] = [];
  for (const page of pages) {
    // Only service pages carry a filtered heading list as "services".
    const serviceHeadings =
      page.type === "service"
        ? page.headings.filter((h) => h.length > 3 && h.length < 80)
        : [];
    inventory.push({
      url: page.url,
      pathname: page.pathname,
      title: page.title,
      type: page.type,
      headings: page.headings.slice(0, 10),
      services: serviceHeadings,
      hasSearch: page.features.includes("search"),
      hasForms: page.features.includes("forms"),
      hasMap: page.features.includes("maps"),
      hasVideo: page.features.includes("video"),
      contentSummary: page.text.substring(0, 500),
    });
  }
  return inventory;
}

View File

@@ -0,0 +1,149 @@
#!/usr/bin/env node
// ============================================================================
// @mintel/concept-engine — CLI Entry Point
// Simple commander-based CLI for concept generation.
// ============================================================================
import { Command } from "commander";
import * as path from "node:path";
import * as fs from "node:fs/promises";
import { existsSync } from "node:fs";
import { config as dotenvConfig } from "dotenv";
import { ConceptPipeline } from "./pipeline.js";
// Load .env from monorepo root
dotenvConfig({ path: path.resolve(process.cwd(), "../../.env") });
dotenvConfig({ path: path.resolve(process.cwd(), ".env") });
const program = new Command();

program
  .name("concept")
  .description("AI-powered project concept generator")
  .version("1.0.0");

// `concept run [briefing]` — crawl, analyze, and run all LLM concept steps.
program
  .command("run")
  .description("Run the full concept pipeline")
  .argument("[briefing]", "Briefing text or @path/to/file.txt")
  .option("--url <url>", "Target website URL")
  .option("--comments <comments>", "Additional notes")
  .option("--clear-cache", "Clear crawl cache and re-crawl")
  .option("--output <dir>", "Output directory", "../../out/concepts")
  .option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
  .action(async (briefingArg: string | undefined, options: any) => {
    // Accept either env var name for backwards compatibility.
    const openrouterKey = process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
    if (!openrouterKey) {
      console.error("❌ OPENROUTER_API_KEY not found in environment.");
      process.exit(1);
    }
    let briefing = briefingArg || "";
    // "@path" syntax loads the briefing from a file (relative to cwd unless absolute).
    if (briefing.startsWith("@")) {
      const rawPath = briefing.substring(1);
      const filePath = rawPath.startsWith("/")
        ? rawPath
        : path.resolve(process.cwd(), rawPath);
      if (!existsSync(filePath)) {
        console.error(`❌ Briefing file not found: ${filePath}`);
        process.exit(1);
      }
      briefing = await fs.readFile(filePath, "utf8");
      console.log(`📄 Loaded briefing from: ${filePath}`);
    }
    // If --url was omitted, use the first URL mentioned in the briefing text.
    let url = options.url;
    if (!url && briefing) {
      const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
      if (urlMatch) {
        url = urlMatch[0];
        console.log(`🔗 Discovered URL in briefing: ${url}`);
      }
    }
    if (!briefing && !url) {
      console.error("❌ Provide a briefing text or --url");
      process.exit(1);
    }
    const pipeline = new ConceptPipeline(
      {
        openrouterKey,
        zyteApiKey: process.env.ZYTE_API_KEY,
        outputDir: path.resolve(process.cwd(), options.output),
        crawlDir: path.resolve(process.cwd(), options.crawlDir),
      },
      {
        onStepStart: (id, name) => {
          // Will be enhanced with Ink spinner later
        },
        onStepComplete: (id, result) => {
          // Will be enhanced with Ink UI later
        },
      },
    );
    try {
      await pipeline.run({
        briefing,
        url,
        comments: options.comments,
        clearCache: options.clearCache,
      });
      console.log("\n✨ Concept generation complete!");
    } catch (err) {
      console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
      process.exit(1);
    }
  });

// `concept analyze <url>` — deterministic crawl + analysis only (no LLM cost).
program
  .command("analyze")
  .description("Only crawl and analyze a website (no LLM)")
  .argument("<url>", "Website URL to analyze")
  .option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
  .option("--clear-cache", "Clear existing crawl cache")
  .action(async (url: string, options: any) => {
    const { crawlSite, clearCrawlCache } = await import("./scraper.js");
    const { analyzeSite } = await import("./analyzer.js");
    // Compute domain/dir once (was previously derived twice).
    const domain = new URL(url).hostname;
    const crawlDir = path.resolve(process.cwd(), options.crawlDir);
    if (options.clearCache) {
      await clearCrawlCache(crawlDir, domain);
    }
    const pages = await crawlSite(url, {
      zyteApiKey: process.env.ZYTE_API_KEY,
      crawlDir,
    });
    const profile = analyzeSite(pages, domain);
    console.log("\n📊 Site Profile:");
    console.log(`  Domain: ${profile.domain}`);
    console.log(`  Total Pages: ${profile.totalPages}`);
    console.log(`  Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`);
    console.log(`  Features: ${profile.existingFeatures.join(", ") || "none"}`);
    console.log(`  Services: ${profile.services.join(", ") || "none"}`);
    console.log(`  External Domains: ${profile.externalDomains.join(", ") || "none"}`);
    console.log(`  Company: ${profile.companyInfo.name || "unbekannt"}`);
    console.log(`  Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
    console.log(`  Colors: ${profile.colors.join(", ")}`);
    console.log(`  Images Found: ${profile.images.length}`);
    console.log(`  Social: ${Object.keys(profile.socialLinks).join(", ") || "none"}`);
    const outputPath = path.join(
      crawlDir,
      domain.replace(/\./g, "-"),
      "_site_profile.json",
    );
    // FIX: the profile was previously only *announced* as saved but never
    // written by this command — persist it before printing the path.
    await fs.mkdir(path.dirname(outputPath), { recursive: true });
    await fs.writeFile(outputPath, JSON.stringify(profile, null, 2));
    console.log(`\n📦 Full profile saved to: ${outputPath}`);
  });

program.parse();

View File

@@ -0,0 +1,10 @@
// ============================================================================
// @mintel/concept-engine — Public API
// ============================================================================
// Pipeline orchestrator and its callback contract.
export { ConceptPipeline } from "./pipeline.js";
export type { PipelineCallbacks } from "./pipeline.js";
// Deterministic crawling & site analysis (no LLM involved).
export { crawlSite, clearCrawlCache } from "./scraper.js";
export { analyzeSite } from "./analyzer.js";
// OpenRouter LLM client helpers.
export { llmRequest, llmJsonRequest, cleanJson } from "./llm-client.js";
// Shared type definitions.
export * from "./types.js";

View File

@@ -0,0 +1,133 @@
// ============================================================================
// LLM Client — Unified interface with model routing via OpenRouter
// ============================================================================
import axios from "axios";
/** Options for a single LLM chat-completion request (see llmRequest). */
interface LLMRequestOptions {
  // Model identifier forwarded verbatim to OpenRouter.
  model: string;
  systemPrompt: string;
  userPrompt: string;
  // When true (llmRequest's default), asks for JSON-object output.
  jsonMode?: boolean;
  // OpenRouter API key, sent as a Bearer token.
  apiKey: string;
}

/** Result of an LLM call: raw content plus token/cost accounting. */
interface LLMResponse {
  content: string;
  usage: {
    promptTokens: number;
    completionTokens: number;
    // Cost as reported by the API, or a rough token-based estimate when absent.
    cost: number;
  };
}
/**
 * Clean raw LLM output into parseable JSON: strips markdown code fences,
 * replaces ASCII/Unicode control characters with spaces, and removes
 * trailing commas before `]` or `}`.
 */
export function cleanJson(str: string): string {
  return str
    .replace(/```json\n?|```/g, "")
    .trim()
    .replace(/[\u0000-\u0009\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, " ")
    .replace(/,\s*([\]}])/g, "$1");
}
/**
 * Send a chat-completion request to an LLM via OpenRouter.
 *
 * @param options - Model, prompts, JSON-mode flag, and API key.
 * @returns Response content plus token counts and cost (API-reported,
 *          or estimated from token counts when the API omits it).
 * @throws When the HTTP call fails or the model returns no content.
 */
export async function llmRequest(options: LLMRequestOptions): Promise<LLMResponse> {
  const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
  // NOTE: a previously-unused `startTime` local was removed; durations are
  // tracked by the pipeline, not here.
  const resp = await axios
    .post(
      "https://openrouter.ai/api/v1/chat/completions",
      {
        model,
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt },
        ],
        // Ask the provider for a JSON object when jsonMode is on.
        ...(jsonMode ? { response_format: { type: "json_object" } } : {}),
      },
      {
        headers: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        timeout: 120000,
      },
    )
    .catch((err) => {
      // Surface the provider's error payload before rethrowing.
      if (err.response) {
        console.error("OpenRouter API Error:", JSON.stringify(err.response.data, null, 2));
      }
      throw err;
    });
  const content = resp.data.choices?.[0]?.message?.content;
  if (!content) {
    throw new Error(`LLM returned no content. Model: ${model}`);
  }
  const usage = resp.data.usage || {};
  let cost = 0;
  if (usage.cost !== undefined) {
    cost = usage.cost;
  } else {
    // Fallback estimate from token counts (rough per-million-token rates).
    cost =
      (usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
      (usage.completion_tokens || 0) * (0.4 / 1_000_000);
  }
  return {
    content,
    usage: {
      promptTokens: usage.prompt_tokens || 0,
      completionTokens: usage.completion_tokens || 0,
      cost,
    },
  };
}
/**
 * Send an LLM request (forcing JSON mode) and parse the response as JSON.
 * Throws with a truncated raw-payload excerpt when parsing fails; common
 * single-key wrapper objects are stripped from the parsed result.
 */
export async function llmJsonRequest<T = any>(
  options: LLMRequestOptions,
): Promise<{ data: T; usage: LLMResponse["usage"] }> {
  const response = await llmRequest({ ...options, jsonMode: true });
  const cleaned = cleanJson(response.content);
  let parsed: T;
  try {
    parsed = JSON.parse(cleaned);
  } catch (e) {
    throw new Error(
      `Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
    );
  }
  // Strip wrappers like {"0": {...}} or {"result": {...}} that models emit.
  return { data: unwrapResponse(parsed) as T, usage: response.usage };
}
/**
 * Recursively strip single-key wrapper objects ({"0":…}, {"state":…},
 * {"facts":…}, {"result":…}, {"data":…}) that LLMs sometimes emit around
 * the real payload. Non-objects, arrays, and multi-key objects pass through.
 */
function unwrapResponse(obj: any): any {
  if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
  const keys = Object.keys(obj);
  if (keys.length !== 1) return obj;
  const wrapperKeys = ["0", "state", "facts", "result", "data"];
  return wrapperKeys.includes(keys[0]) ? unwrapResponse(obj[keys[0]]) : obj;
}

View File

@@ -0,0 +1,257 @@
// ============================================================================
// Pipeline Orchestrator
// Runs all steps sequentially, tracks state, supports re-running individual steps.
// ============================================================================
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import { crawlSite, clearCrawlCache } from "./scraper.js";
import { analyzeSite } from "./analyzer.js";
import { executeResearch } from "./steps/00b-research.js";
import { executeExtract } from "./steps/01-extract.js";
import { executeSiteAudit } from "./steps/00a-site-audit.js";
import { executeAudit } from "./steps/02-audit.js";
import { executeStrategize } from "./steps/03-strategize.js";
import { executeArchitect } from "./steps/04-architect.js";
import type {
PipelineConfig,
PipelineInput,
ConceptState,
ProjectConcept,
StepResult,
StepUsage,
} from "./types.js";
/**
 * Optional hooks invoked by ConceptPipeline around each step,
 * e.g. for CLI spinners or progress UIs.
 */
export interface PipelineCallbacks {
  // Fired just before a step begins executing.
  onStepStart?: (stepId: string, stepName: string) => void;
  // Fired after a step finishes successfully, with its StepResult.
  onStepComplete?: (stepId: string, result: StepResult) => void;
  // Fired when a step fails; receives the error message.
  onStepError?: (stepId: string, error: string) => void;
}
/**
 * The main concept pipeline orchestrator.
 *
 * Runs the concept steps sequentially (scrape → site audit → research →
 * extract → audit → strategize → architect), accumulates token/cost usage
 * per step, and writes the final ProjectConcept plus a debug trace to disk.
 */
export class ConceptPipeline {
  private config: PipelineConfig;
  private state: ConceptState;
  private callbacks: PipelineCallbacks;

  constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
    this.config = config;
    this.callbacks = callbacks;
    this.state = this.createInitialState();
  }

  /** Fresh state with zeroed usage counters. */
  private createInitialState(): ConceptState {
    return {
      briefing: "",
      usage: {
        totalPromptTokens: 0,
        totalCompletionTokens: 0,
        totalCost: 0,
        perStep: [],
      },
    };
  }

  /**
   * Run the full concept pipeline from scratch.
   *
   * @param input - Briefing text plus optional URL, comments, and cache flag.
   * @returns The assembled ProjectConcept (also persisted to outputDir).
   * @throws When any step fails; the failing step's error propagates.
   */
  async run(input: PipelineInput): Promise<ProjectConcept> {
    this.state.briefing = input.briefing;
    this.state.url = input.url;
    this.state.comments = input.comments;

    // Ensure output directories exist before any step writes to them.
    await fs.mkdir(this.config.outputDir, { recursive: true });
    await fs.mkdir(this.config.crawlDir, { recursive: true });

    // Step 0: Scrape & analyze (deterministic, no LLM involved).
    if (input.url) {
      if (input.clearCache) {
        const domain = new URL(input.url).hostname;
        await clearCrawlCache(this.config.crawlDir, domain);
      }
      await this.runStep("00-scrape", "Scraping & Analyzing Website", async () => {
        const pages = await crawlSite(input.url!, {
          zyteApiKey: this.config.zyteApiKey,
          crawlDir: this.config.crawlDir,
        });
        const domain = new URL(input.url!).hostname;
        const siteProfile = analyzeSite(pages, domain);
        this.state.siteProfile = siteProfile;
        this.state.crawlDir = path.join(this.config.crawlDir, domain.replace(/\./g, "-"));
        // Persist the profile next to the crawl data for later inspection.
        await fs.writeFile(
          path.join(this.state.crawlDir!, "_site_profile.json"),
          JSON.stringify(siteProfile, null, 2),
        );
        return {
          success: true,
          data: siteProfile,
          usage: { step: "00-scrape", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
        };
      });
    }

    // Step 00a: IST analysis via DataForSEO.
    await this.runStep("00a-site-audit", "IST-Analysis (DataForSEO)", async () => {
      const result = await executeSiteAudit(this.state, this.config);
      if (result.success && result.data) {
        this.state.siteAudit = result.data;
      }
      return result;
    });

    // Step 00b: Research with real web data.
    await this.runStep("00b-research", "Industry & Company Research", async () => {
      const result = await executeResearch(this.state);
      if (result.success && result.data) {
        this.state.researchData = result.data;
      }
      return result;
    });

    // Step 1: Extract structured facts from the briefing.
    await this.runStep("01-extract", "Extracting Facts from Briefing", async () => {
      const result = await executeExtract(this.state, this.config);
      if (result.success) this.state.facts = result.data;
      return result;
    });

    // Step 2: Skeptical audit of the extracted facts/features.
    await this.runStep("02-audit", "Auditing Features (Skeptical Review)", async () => {
      const result = await executeAudit(this.state, this.config);
      if (result.success) this.state.auditedFacts = result.data;
      return result;
    });

    // Step 3: Strategy (briefing summary + design vision).
    await this.runStep("03-strategize", "Strategic Analysis", async () => {
      const result = await executeStrategize(this.state, this.config);
      if (result.success) {
        this.state.briefingSummary = result.data.briefingSummary;
        this.state.designVision = result.data.designVision;
      }
      return result;
    });

    // Step 4: Information architecture (sitemap + topic).
    await this.runStep("04-architect", "Information Architecture", async () => {
      const result = await executeArchitect(this.state, this.config);
      if (result.success) {
        this.state.sitemap = result.data.sitemap;
        this.state.websiteTopic = result.data.websiteTopic;
      }
      return result;
    });

    const projectConcept = this.buildProjectConcept();
    await this.saveState(projectConcept);
    return projectConcept;
  }

  /**
   * Run a single step: fire callbacks, accumulate usage, and fail fast.
   * Usage is recorded even for failed steps so partial runs are costed.
   */
  private async runStep(
    stepId: string,
    stepName: string,
    executor: () => Promise<StepResult>,
  ): Promise<void> {
    this.callbacks.onStepStart?.(stepId, stepName);
    console.log(`\n📍 ${stepName}...`);
    try {
      const result = await executor();
      if (result.usage) {
        this.state.usage.perStep.push(result.usage);
        this.state.usage.totalPromptTokens += result.usage.promptTokens;
        this.state.usage.totalCompletionTokens += result.usage.completionTokens;
        this.state.usage.totalCost += result.usage.cost;
      }
      if (result.success) {
        const cost = result.usage?.cost ? ` ($${result.usage.cost.toFixed(4)})` : "";
        const duration = result.usage?.durationMs ? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]` : "";
        console.log(`${stepName} complete${cost}${duration}`);
        this.callbacks.onStepComplete?.(stepId, result);
      } else {
        console.error(`${stepName} failed: ${result.error}`);
        this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
        throw new Error(result.error);
      }
    } catch (err) {
      // NOTE(review): onStepError fires twice when a step returns
      // success:false (once above, once here) — confirm this is intended.
      this.callbacks.onStepError?.(stepId, (err as Error).message);
      throw err;
    }
  }

  /** Assemble the final ProjectConcept from accumulated state. */
  private buildProjectConcept(): ProjectConcept {
    return {
      domain: this.state.siteProfile?.domain || "unknown",
      timestamp: new Date().toISOString(),
      briefing: this.state.briefing,
      auditedFacts: this.state.auditedFacts || {},
      siteProfile: this.state.siteProfile,
      siteAudit: this.state.siteAudit,
      researchData: this.state.researchData,
      strategy: {
        briefingSummary: this.state.briefingSummary || "",
        designVision: this.state.designVision || "",
      },
      architecture: {
        websiteTopic: this.state.websiteTopic || "",
        sitemap: this.state.sitemap || [],
      },
      usage: this.state.usage,
    };
  }

  /**
   * Persist the concept and a full debug trace, then print a usage summary.
   * File names derive from the (sanitized) company name plus a timestamp.
   */
  private async saveState(concept: ProjectConcept): Promise<void> {
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    // FIX: sanitize the company name — it can contain spaces/slashes that
    // would break or redirect the output path.
    const rawName = this.state.auditedFacts?.companyName || "unknown";
    const companyName = String(rawName).replace(/[^\w.-]+/g, "_");
    const stateDir = path.join(this.config.outputDir, "concepts");
    await fs.mkdir(stateDir, { recursive: true });
    const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
    await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
    console.log(`\n📦 Saved Project Concept to: ${statePath}`);
    // Full internal state for debugging failed or odd runs.
    const debugPath = path.join(stateDir, `${companyName}_${timestamp}_debug.json`);
    await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));
    // Print usage summary.
    console.log("\n──────────────────────────────────────────────");
    console.log("📊 PIPELINE USAGE SUMMARY");
    console.log("──────────────────────────────────────────────");
    for (const step of this.state.usage.perStep) {
      if (step.cost > 0) {
        // FIX: a separator between model name and cost was missing
        // (previously printed "model$0.000123").
        console.log(`  ${step.step}: ${step.model} $${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`);
      }
    }
    console.log("──────────────────────────────────────────────");
    console.log(`  TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
    console.log(`  Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`);
    console.log("──────────────────────────────────────────────\n");
  }

  /** Get the current internal state (for CLI inspection). */
  getState(): ConceptState {
    return this.state;
  }
}

View File

@@ -0,0 +1,432 @@
// ============================================================================
// Scraper — Zyte API + Local Persistence
// Crawls all pages of a website, stores them locally for reuse.
// ============================================================================
import axios from "axios";
import * as cheerio from "cheerio";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import type { CrawledPage, PageType } from "./types.js";
/** Configuration for crawling a site (see crawlSite). */
interface ScraperConfig {
  // Zyte API key; used for browser-rendered fetches when present.
  zyteApiKey?: string;
  // Directory where crawled pages are persisted for reuse.
  crawlDir: string;
  // Optional cap on pages to crawl — NOTE(review): enforcement is not
  // visible in this chunk; confirm it is honored by the crawl loop.
  maxPages?: number;
}
/**
 * Classify a URL pathname into a coarse page type using keyword matching
 * (German and English path segments). First matching category wins, in
 * the priority order listed below; unknown paths become "other".
 * NOTE(review): non-ASCII keywords like "über" assume the pathname is
 * not percent-encoded — confirm against the crawler's URL handling.
 */
function classifyPage(pathname: string): PageType {
  const p = pathname.toLowerCase();
  if (p === "/" || p === "" || p === "/index.html") return "home";
  const typeKeywords: [PageType, string[]][] = [
    ["service", ["service", "leistung", "kompetenz"]],
    ["about", ["about", "ueber", "über", "unternehmen"]],
    ["contact", ["contact", "kontakt"]],
    ["career", ["job", "karriere", "career", "human-resources"]],
    ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
    ["blog", ["blog", "news", "aktuelles", "magazin"]],
    ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
  ];
  for (const [type, keywords] of typeKeywords) {
    if (keywords.some((kw) => p.includes(kw))) return type;
  }
  return "other";
}
/**
 * Detect interactive features present on a page (search, forms, maps,
 * video, calendar, cookie banner) via DOM selector probes.
 */
function detectFeatures($: cheerio.CheerioAPI): string[] {
  const features: string[] = [];
  const hasAny = (selector: string): boolean => $(selector).length > 0;

  // Search: native search inputs, search forms, or common search classnames.
  if (
    hasAny('input[type="search"]') ||
    hasAny('form[role="search"]') ||
    hasAny(".search-form, .search-box, #search, .searchbar") ||
    hasAny('input[name="q"], input[name="s"], input[name="search"]')
  ) {
    features.push("search");
  }
  // Forms: any form beyond dedicated search forms.
  if ($("form").length > $('form[role="search"], .search-form').length) {
    features.push("forms");
  }
  if (
    hasAny('iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]')
  ) {
    features.push("maps");
  }
  if (hasAny("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")) {
    features.push("video");
  }
  if (hasAny(".calendar, .event, [data-calendar]")) {
    features.push("calendar");
  }
  if (hasAny(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]")) {
    features.push("cookie-consent");
  }
  return features;
}
/**
 * Collect unique same-origin link pathnames from all anchors,
 * skipping asset files and hash-only links to the home page.
 */
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
  const assetPattern = /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;
  const paths = new Set<string>();
  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;
    let url: URL;
    try {
      url = new URL(href, origin);
    } catch {
      return; // malformed href — skip
    }
    if (url.origin !== origin) return;
    if (assetPattern.test(url.pathname)) return;
    // "/#section" is an in-page anchor, not a separate page.
    if (url.pathname === "/" && url.hash) return;
    paths.add(url.pathname);
  });
  return [...paths];
}
/**
 * Collect unique absolute image URLs from <img> tags and inline
 * background-image styles. Data URIs are skipped, and SVGs are kept
 * only when their path suggests a logo.
 */
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
  const candidates: string[] = [];
  // <img src="…">
  $("img[src]").each((_, el) => {
    const src = $(el).attr("src");
    if (src) candidates.push(src);
  });
  // Inline style background images: url('…')
  $("[style*='background-image']").each((_, el) => {
    const match = $(el).attr("style")?.match(/url\(['"]?(.*?)['"]?\)/);
    if (match?.[1]) candidates.push(match[1]);
  });
  const resolved = new Set<string>();
  for (const candidate of candidates) {
    if (candidate.startsWith("data:image")) continue; // inline base64
    try {
      const url = new URL(candidate, origin);
      // Drop decorative vectors; keep SVGs only when they look like logos.
      if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) continue;
      resolved.add(url.href);
    } catch {
      // unresolvable URL — skip
    }
  }
  return [...resolved];
}
/**
 * Extract services/competencies from text content.
 *
 * Treats every short-ish trimmed line (4-99 chars) as a candidate service,
 * discarding legal boilerplate and contact-detail lines.
 */
function extractServices(text: string): string[] {
  // Legal/boilerplate phrases that never describe a service.
  const BOILERPLATE_RE = /cookie|datenschutz|impressum|copyright|©/i;
  // Contact-detail prefixes (phone, fax, email, URLs...).
  const CONTACT_RE = /^(tel|fax|e-mail|mobil|web|http)/i;

  const result: string[] = [];
  for (const rawLine of text.split(/\n/)) {
    const line = rawLine.trim();
    if (line.length <= 3 || line.length >= 100) continue;
    if (BOILERPLATE_RE.test(line)) continue;
    if (CONTACT_RE.test(line)) continue;
    result.push(line);
  }
  return result;
}
/**
 * Fetch a page via Zyte API with browser rendering.
 *
 * @param url         Page to fetch.
 * @param apiKey      Zyte API key (sent as the basic-auth username).
 * @param retriesLeft Remaining retries after an HTTP 429 response. Defaults
 *                    to 1 so a persistently rate-limited endpoint cannot
 *                    cause unbounded recursion (the original retried forever).
 * @returns Rendered HTML, or "" when Zyte returns no browserHtml.
 * @throws The axios error for non-retryable failures (or once retries run out).
 */
async function fetchWithZyte(url: string, apiKey: string, retriesLeft = 1): Promise<string> {
  try {
    const resp = await axios.post(
      "https://api.zyte.com/v1/extract",
      {
        url,
        browserHtml: true,
      },
      {
        auth: { username: apiKey, password: "" },
        timeout: 60000,
      },
    );
    const html = resp.data.browserHtml || "";
    if (!html) {
      console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
    }
    return html;
  } catch (err: any) {
    if (err.response) {
      console.error(` ❌ Zyte API error ${err.response.status} for ${url}: ${err.response.data?.detail || err.response.statusText}`);
      // Rate limited — wait and retry, but only a bounded number of times.
      if (err.response.status === 429 && retriesLeft > 0) {
        console.log(" ⏳ Rate limited, waiting 5s and retrying...");
        await new Promise((r) => setTimeout(r, 5000));
        return fetchWithZyte(url, apiKey, retriesLeft - 1);
      }
    }
    throw err;
  }
}
/**
 * Fetch a page via simple HTTP GET (fallback when no Zyte key is configured).
 * Returns "" when the response body is not a string.
 */
async function fetchDirect(url: string): Promise<string> {
  const response = await axios.get(url, {
    timeout: 30000,
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    },
  });
  const body = response.data;
  return typeof body === "string" ? body : "";
}
/**
 * Parse an HTML string into a CrawledPage.
 *
 * Extracts title, headings, nav labels, truncated body text, detected
 * features, same-origin links, images and OpenGraph/description meta tags.
 */
function parsePage(html: string, url: string): CrawledPage {
  const $ = cheerio.load(html);
  const { origin, pathname } = new URL(url);

  // Collect trimmed, non-empty text of all elements matching `selector`,
  // optionally dropping entries at or above `maxLen` characters.
  const collectText = (selector: string, maxLen = Infinity): string[] =>
    $(selector)
      .map((_, el) => $(el).text().trim())
      .get()
      .filter((t) => t.length > 0 && t.length < maxLen);

  const metaContent = (selector: string): string | undefined =>
    $(selector).attr("content") || undefined;

  // Whitespace-normalized body text, truncated to keep downstream prompts bounded.
  const bodyText = $("body").text().replace(/\s+/g, " ").substring(0, 50000).trim();

  return {
    url,
    pathname,
    title: $("title").text().trim(),
    html,
    text: bodyText,
    headings: collectText("h1, h2, h3"),
    navItems: collectText("nav a", 100),
    features: detectFeatures($),
    type: classifyPage(pathname),
    links: extractInternalLinks($, origin),
    images: extractImages($, origin),
    meta: {
      description: metaContent('meta[name="description"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogImage: metaContent('meta[property="og:image"]'),
    },
  };
}
/**
 * Crawl a website and persist all pages locally.
 *
 * Performs a breadth-first, same-origin crawl starting at `targetUrl`, capped
 * at `config.maxPages` (default 30). Each fetched page is parsed, kept in
 * memory and written to disk as `<name>.html` plus a `<name>.meta.json`
 * sidecar so later runs can reuse the crawl; a `_crawl_meta.json` file marks
 * a completed crawl and triggers the cache path on subsequent runs.
 *
 * Returns an array of CrawledPage objects (cached or freshly crawled).
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const urlObj = new URL(targetUrl);
  const origin = urlObj.origin;
  const domain = urlObj.hostname;
  // Dots replaced so the domain is a filesystem-safe directory name.
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
  // Check for existing crawl — `_crawl_meta.json` is only written after a
  // crawl finishes, so its presence means the cache is complete and usable.
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }
  console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);
  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });
  const maxPages = config.maxPages || 30;
  // `visited` tracks pathnames (not full URLs), which also deduplicates
  // queue entries that resolve to the same path.
  const visited = new Set<string>();
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];
  // NOTE(review): the cap counts *attempted* paths (visited.size), so failed
  // fetches also consume crawl budget — confirm this is intended.
  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift()!;
    const urlPath = new URL(url).pathname;
    if (visited.has(urlPath)) continue;
    visited.add(urlPath);
    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
      let html: string;
      if (config.zyteApiKey) {
        html = await fetchWithZyte(url, config.zyteApiKey);
      } else {
        html = await fetchDirect(url);
      }
      // Responses under 100 chars are treated as empty/error pages.
      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }
      const page = parsePage(html, url);
      pages.push(page);
      // Save HTML + metadata to disk. "/" becomes "index"; other paths use
      // "_" in place of slashes with the leading one stripped (/a/b -> "a_b").
      const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );
      // Discover new links (page.links holds same-origin pathnames).
      for (const link of page.links) {
        if (!visited.has(link)) {
          const fullUrl = `${origin}${link}`;
          queue.push(fullUrl);
        }
      }
    } catch (err) {
      console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
    }
  }
  // Save crawl metadata — marks the crawl as complete for future runs.
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );
  console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
  return pages;
}
/**
 * Load a previously crawled site from disk.
 *
 * Reads every `*.meta.json` sidecar (except the crawl-level
 * `_crawl_meta.json`), pairs it with its `*.html` file when present, and
 * rebuilds the body text from the cached HTML.
 */
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
  const entries = await fs.readdir(domainDir);
  const pages: CrawledPage[] = [];

  for (const entry of entries) {
    if (!entry.endsWith(".meta.json") || entry === "_crawl_meta.json") continue;

    const base = entry.replace(".meta.json", "");
    const meta = JSON.parse(await fs.readFile(path.join(domainDir, entry), "utf8"));

    const htmlName = `${base}.html`;
    const html = entries.includes(htmlName)
      ? await fs.readFile(path.join(domainDir, htmlName), "utf8")
      : "";

    // Re-derive the truncated, whitespace-normalized body text from the HTML.
    let text = "";
    if (html) {
      text = cheerio.load(html)("body").text().replace(/\s+/g, " ").substring(0, 50000).trim();
    }

    pages.push({
      url: meta.url,
      pathname: meta.pathname,
      title: meta.title,
      html,
      text,
      headings: meta.headings || [],
      navItems: meta.navItems || [],
      features: meta.features || [],
      type: meta.type || "other",
      links: meta.links || [],
      images: meta.images || [],
      meta: meta.meta || {},
    });
  }

  console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
  return pages;
}
/**
 * Delete a cached crawl to force a re-crawl on the next run.
 * No-op when no cache directory exists for the domain.
 */
export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
  const dir = path.join(crawlDir, domain.replace(/\./g, "-"));
  if (!existsSync(dir)) return;
  await fs.rm(dir, { recursive: true, force: true });
  console.log(`🧹 Cleared crawl cache for ${domain}`);
}

View File

@@ -0,0 +1,65 @@
// ============================================================================
// Step 00a: Site Audit (DataForSEO + AI)
// ============================================================================
import { PageAuditor } from "@mintel/page-audit";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
/**
 * Step 00a: run a technical site audit (DataForSEO + AI) for the target URL.
 *
 * Always resolves successfully: it skips (success with null data) when no
 * URL is set, when DataForSEO credentials are missing, or when the audit
 * itself throws — this optional step must never fail the pipeline.
 */
export async function executeSiteAudit(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const startTime = Date.now();

  // Shared "no result" shape for every skip/failure path.
  const skipResult = (): StepResult => ({
    success: true,
    data: null,
    usage: { step: "00a-site-audit", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
  });

  if (!state.url) {
    return skipResult();
  }

  try {
    // Credentials: either dedicated login/password env vars, or a combined
    // "login:password" API key split on the first colon.
    const combinedKey = process.env.DATA_FOR_SEO_API_KEY;
    const login = process.env.DATA_FOR_SEO_LOGIN || combinedKey?.split(":")?.[0];
    const password = process.env.DATA_FOR_SEO_PASSWORD || combinedKey?.split(":")?.slice(1)?.join(":");
    if (!login || !password) {
      console.warn(" ⚠️ Site Audit skipped: DataForSEO credentials missing from environment.");
      return skipResult();
    }

    const auditor = new PageAuditor({
      dataForSeoLogin: login,
      dataForSeoPassword: password,
      openrouterKey: config.openrouterKey,
      outputDir: config.outputDir ? `${config.outputDir}/audits` : undefined,
    });

    // Run audit (max 20 pages for the estimation phase to keep it fast)
    const auditReport = await auditor.audit(state.url, { maxPages: 20 });

    return {
      success: true,
      data: auditReport,
      usage: {
        step: "00a-site-audit",
        model: "dataforseo",
        cost: 0, // DataForSEO cost tracking could be added later
        promptTokens: 0,
        completionTokens: 0,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err: any) {
    console.warn(` ⚠️ Site Audit failed, skipping: ${err.message}`);
    return skipResult();
  }
}

View File

@@ -0,0 +1,121 @@
// ============================================================================
// Step 00b: Research — Industry Research via @mintel/journaling (No LLM hallus)
// Uses Serper API for real web search results about the industry/company.
// ============================================================================
import type { ConceptState, StepResult } from "../types.js";
/** Grounded research findings collected by the research step. */
interface ResearchResult {
  // Facts about the company itself (max 5, from web search).
  companyContext: string[];
  // Facts/insights about the wider industry (max 5).
  industryInsights: string[];
  // Competitor information (currently never populated by executeResearch).
  competitorInfo: string[];
}
/**
 * Research the company and industry using real web search data.
 * Uses @mintel/journaling's ResearchAgent — results are grounded in real sources.
 *
 * NOTE: The journaling package can cause unhandled rejections that crash the process.
 * We wrap each call in an additional safety layer.
 *
 * Always resolves successfully; missing inputs or failed research calls yield
 * empty result arrays rather than a pipeline failure.
 */
export async function executeResearch(
  state: ConceptState,
): Promise<StepResult<ResearchResult>> {
  const startTime = Date.now();
  // Pull identifiers from the crawled site profile; all are optional.
  const companyName = state.siteProfile?.companyInfo?.name || "";
  const websiteTopic = state.siteProfile?.services?.slice(0, 3).join(", ") || "";
  const domain = state.siteProfile?.domain || "";
  // Nothing to research without at least one identifier — succeed with empty data.
  if (!companyName && !websiteTopic && !domain) {
    return {
      success: true,
      data: { companyContext: [], industryInsights: [], competitorInfo: [] },
      usage: { step: "00b-research", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
    };
  }
  // Safety wrapper: catch ANY unhandled rejections during this step.
  // Installs a temporary process-level "unhandledRejection" listener that
  // resolves with `fallback` instead of letting the process crash; the
  // listener is removed on every exit path.
  // NOTE(review): while installed, the listener also intercepts unrelated
  // rejections from elsewhere in the process — confirm this trade-off is
  // acceptable.
  const safeCall = <T>(fn: () => Promise<T>, fallback: T): Promise<T> => {
    return new Promise<T>((resolve) => {
      const handler = (err: any) => {
        console.warn(` ⚠️ Unhandled rejection caught in research: ${err?.message || err}`);
        process.removeListener("unhandledRejection", handler);
        resolve(fallback);
      };
      process.on("unhandledRejection", handler);
      fn()
        .then((result) => {
          process.removeListener("unhandledRejection", handler);
          resolve(result);
        })
        .catch((err) => {
          process.removeListener("unhandledRejection", handler);
          console.warn(` ⚠️ Research call failed: ${err?.message || err}`);
          resolve(fallback);
        });
    });
  };
  try {
    // Dynamic import so a missing/broken package only skips this step.
    const { ResearchAgent } = await import("@mintel/journaling");
    const agent = new ResearchAgent(process.env.OPENROUTER_API_KEY || "");
    const results: ResearchResult = {
      companyContext: [],
      industryInsights: [],
      competitorInfo: [],
    };
    // 1. Research the company itself
    if (companyName || domain) {
      const searchQuery = companyName
        ? `${companyName} ${websiteTopic} Unternehmen`
        : `site:${domain}`;
      console.log(` 🔍 Researching: "${searchQuery}"...`);
      const facts = await safeCall(
        () => agent.researchTopic(searchQuery),
        [] as any[],
      );
      // Fact objects come back in varying shapes; accept any of the known
      // text-bearing fields and keep at most 5 entries.
      results.companyContext = (facts || [])
        .filter((f: any) => f?.fact || f?.value || f?.text || f?.statement)
        .map((f: any) => f.fact || f.value || f.text || f.statement)
        .slice(0, 5);
    }
    // 2. Industry research
    if (websiteTopic) {
      console.log(` 🔍 Researching industry: "${websiteTopic}"...`);
      const insights = await safeCall(
        () => agent.researchCompetitors(websiteTopic),
        [] as any[],
      );
      results.industryInsights = (insights || []).slice(0, 5);
    }
    const totalFacts = results.companyContext.length + results.industryInsights.length + results.competitorInfo.length;
    console.log(` 📊 Research found ${totalFacts} data points.`);
    return {
      success: true,
      data: results,
      usage: {
        step: "00b-research",
        model: "serper/datacommons",
        promptTokens: 0,
        completionTokens: 0,
        cost: 0,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    // Any other failure (e.g. import error) degrades to an empty result.
    console.warn(` ⚠️ Research step skipped: ${(err as Error).message}`);
    return {
      success: true,
      data: { companyContext: [], industryInsights: [], competitorInfo: [] },
      usage: { step: "00b-research", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
    };
  }
}

View File

@@ -0,0 +1,108 @@
// ============================================================================
// Step 01: Extract — Briefing Fact Extraction (Gemini Flash)
// ============================================================================
import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";
/**
 * Step 01: extract verifiable facts from the briefing via the flash LLM.
 *
 * Combines the briefing text (and optional comments) with the deterministic
 * site analysis from the crawler and returns the model's flat JSON facts
 * object unchanged. Fails (success: false) when the LLM call throws.
 */
export async function executeExtract(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();
  // Build site context from the deterministic analyzer (crawled, not guessed).
  const siteContext = state.siteProfile
    ? `
EXISTING WEBSITE ANALYSIS (FACTS — verifiably crawled, NOT guessed):
- Domain: ${state.siteProfile.domain}
- Total pages crawled: ${state.siteProfile.totalPages}
- Navigation items: ${state.siteProfile.navigation.map((n) => n.label).join(", ") || "nicht erkannt"}
- Existing features: ${state.siteProfile.existingFeatures.join(", ") || "keine"}
- Services / Kompetenzen: ${state.siteProfile.services.join(" | ") || "keine"}
- Employee count (from website text): ${(state.siteProfile as any).employeeCount || "nicht genannt"}
- Company name: ${state.siteProfile.companyInfo.name || "unbekannt"}
- Address: ${state.siteProfile.companyInfo.address || "unbekannt"}
- Tax ID (USt-ID): ${state.siteProfile.companyInfo.taxId || "unbekannt"}
- HRB: ${state.siteProfile.companyInfo.registerNumber || "unbekannt"}
- Managing Director: ${state.siteProfile.companyInfo.managingDirector || "unbekannt"}
- External related domains (HAVE OWN WEBSITES — DO NOT include as sub-pages!): ${state.siteProfile.externalDomains.join(", ") || "keine"}
- Social links: ${Object.entries(state.siteProfile.socialLinks).map(([k, v]) => `${k}: ${v}`).join(", ") || "keine"}
`
    : "No existing website data available.";
  const systemPrompt = `
You are a precision fact extractor. Your only job: extract verifiable facts from the BRIEFING.
Output language: GERMAN (strict).
Output format: flat JSON at root level. No nesting except arrays.
### CRITICAL RULES:
1. "employeeCount": take from SITE ANALYSIS if available. Only override if briefing states something more specific.
2. External domains (e.g. "etib-ing.com") have their OWN website. NEVER include them as sub-pages.
3. Videos (Messefilm, Imagefilm) are CONTENT ASSETS, not pages.
4. If existing site already has search, include "search" in functions.
5. DO NOT invent pages not mentioned in briefing or existing navigation.
### CONSERVATIVE RULE:
- simple lists (Jobs, Referenzen, Messen) = pages, NOT features
- Assume "page" as default. Only add "feature" for complex interactive systems.
### OUTPUT FORMAT:
{
"companyName": string,
"companyAddress": string,
"personName": string,
"email": string,
"existingWebsite": string,
"websiteTopic": string, // MAX 3 words
"isRelaunch": boolean,
"employeeCount": string, // from site analysis, e.g. "über 50"
"pages": string[], // ALL pages: ["Startseite", "Über Uns", "Leistungen", ...]
"functions": string[], // search, forms, maps, video, cookie_consent, etc.
"assets": string[], // existing_website, logo, media, photos, videos
"deadline": string,
"targetAudience": string,
"cmsSetup": boolean,
"multilang": boolean
}
BANNED OUTPUT KEYS: "selectedPages", "otherPages", "features", "apiSystems" — use pages[] and functions[] ONLY.
`;
  const userPrompt = `BRIEFING (TRUTH SOURCE):
${state.briefing}
COMMENTS:
${state.comments || "keine"}
${siteContext}`;
  try {
    const { data, usage } = await llmJsonRequest({
      model: models.flash,
      systemPrompt,
      userPrompt,
      apiKey: config.openrouterKey,
    });
    // The model's JSON is passed through as-is; downstream steps audit it.
    return {
      success: true,
      data,
      usage: {
        step: "01-extract",
        model: models.flash,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    return {
      success: false,
      error: `Extract step failed: ${(err as Error).message}`,
    };
  }
}

View File

@@ -0,0 +1,110 @@
// ============================================================================
// Step 02: Audit — Feature Auditor + Skeptical Review (Gemini Flash)
// ============================================================================
import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";
/**
 * Step 02: audit the extracted facts with a skeptical "cost controller" LLM
 * pass (flash model) to downgrade over-scoped features to plain pages and
 * drop out-of-scope items (external domains, videos-as-pages).
 *
 * Merges the model's corrections into a copy of state.facts and returns it.
 * Requires Step 01 output; fails (success: false) otherwise or on LLM error.
 */
export async function executeAudit(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();
  if (!state.facts) {
    return { success: false, error: "No facts from Step 01 available." };
  }
  // NOTE(review): this prompt asks for "features"/"otherPages" keys, which
  // Step 01 explicitly bans in its own output — confirm the two schemas are
  // intentionally different.
  const systemPrompt = `
You are a "Strict Cost Controller". Your mission is to prevent over-billing.
Review the extracted FEATURES against the BRIEFING and the EXISTING SITE ANALYSIS.
### RULE OF THUMB:
- A "Feature" (1.500 €) is ONLY justified for complex, dynamic systems (logic, database, CMS-driven management, advanced filtering).
- Simple lists, information sections, or static descriptions (e.g., "Messen", "Team", "Historie", "Jobs" as mere text) are ALWAYS "Pages" (600 €).
- If the briefing doesn't explicitly mention "Management System", "Filterable Database", or "Client Login", it is a PAGE.
### ADDITIONAL CHECKS:
1. If any feature maps to an entity that has its own external website (listed in EXTERNAL_DOMAINS), remove it entirely — it's out of scope.
2. Videos are ASSETS not pages. Remove any video-related entries from pages.
3. If the existing site has features (search, forms, etc.), ensure they are in the functions list.
### MISSION:
Return the corrected 'features', 'otherPages', and 'functions' arrays.
### OUTPUT FORMAT:
{
"features": string[],
"otherPages": string[],
"functions": string[],
"removedItems": [{ "item": string, "reason": string }],
"addedItems": [{ "item": string, "reason": string }]
}
`;
  const userPrompt = `
EXTRACTED FACTS:
${JSON.stringify(state.facts, null, 2)}
BRIEFING:
${state.briefing}
EXTERNAL DOMAINS (have own websites, OUT OF SCOPE):
${state.siteProfile?.externalDomains?.join(", ") || "none"}
EXISTING FEATURES ON CURRENT SITE:
${state.siteProfile?.existingFeatures?.join(", ") || "none"}
`;
  try {
    const { data, usage } = await llmJsonRequest({
      model: models.flash,
      systemPrompt,
      userPrompt,
      apiKey: config.openrouterKey,
    });
    // Apply audit results to a copy of the facts: features are replaced
    // wholesale, otherPages and functions are merged (deduplicated).
    const auditedFacts = { ...state.facts };
    auditedFacts.features = data.features || [];
    auditedFacts.otherPages = [
      ...new Set([...(auditedFacts.otherPages || []), ...(data.otherPages || [])]),
    ];
    if (data.functions) {
      auditedFacts.functions = [
        ...new Set([...(auditedFacts.functions || []), ...data.functions]),
      ];
    }
    // Log changes for operator visibility.
    if (data.removedItems?.length) {
      console.log(" 📉 Audit removed:");
      for (const item of data.removedItems) {
        console.log(` - ${item.item}: ${item.reason}`);
      }
    }
    if (data.addedItems?.length) {
      console.log(" 📈 Audit added:");
      for (const item of data.addedItems) {
        console.log(` + ${item.item}: ${item.reason}`);
      }
    }
    return {
      success: true,
      data: auditedFacts,
      usage: {
        step: "02-audit",
        model: models.flash,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    return { success: false, error: `Audit step failed: ${(err as Error).message}` };
  }
}

View File

@@ -0,0 +1,99 @@
// ============================================================================
// Step 03: Strategize — Briefing Summary + Design Vision (Gemini Pro)
// ============================================================================
import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";
/**
 * Step 03: generate the briefing summary and design vision (pro model).
 *
 * Feeds the briefing, crawled site data, audited facts and — when present —
 * the technical site-audit report into the LLM and returns its
 * { briefingSummary, designVision } JSON. Requires Step 02 output; fails
 * (success: false) otherwise or on LLM error.
 */
export async function executeStrategize(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();
  if (!state.auditedFacts) {
    return { success: false, error: "No audited facts from Step 02 available." };
  }
  const systemPrompt = `
You are a high-end Digital Architect. Your goal is to make the CUSTOMER feel 100% understood.
Analyze the BRIEFING and the EXISTING WEBSITE context.
### OBJECTIVE:
1. **briefingSummary**: Ein sachlicher, tiefgehender Überblick der Unternehmenslage.
- STIL: Keine Ich-Form. Keine Marketing-Floskeln. Nutze präzise Fachbegriffe. Sei prägnant.
- FORM: EXAKT ZWEI ABSÄTZE. Insgesamt ca. 6 Sätze.
- INHALT: Status Quo, was der Kunde will, welcher Sprung notwendig ist.
- ABSOLUTE REGEL: Keine Halluzinationen. Keine namentlichen Nennungen von Personen.
- RELAUNCH-REGEL: Wenn isRelaunch=true, NICHT sagen "keine digitale Präsenz". Es GIBT eine Seite.
- SORGLOS BETRIEB: MUSS erwähnt werden als Teil des Gesamtpakets.
2. **designVision**: Ein abstraktes, strategisches Konzept.
- STIL: Rein konzeptionell. Keine Umsetzungsschritte. Keine Ich-Form. Sei prägnant.
- FORM: EXAKT ZWEI ABSÄTZE. Insgesamt ca. 4 Sätze.
- DATENSCHUTZ: KEINERLEI namentliche Nennungen.
- FOKUS: Welche strategische Wirkung soll erzielt werden?
### RULES:
- NO "wir/unser". NO "Ich/Mein". Objective, fact-oriented narrative.
- NO marketing lingo. NO "innovativ", "revolutionär", "state-of-the-art".
- NO hallucinations about features not in the briefing.
- NO "SEO-Standards zur Fachkräftesicherung" or "B2B-Nutzerströme" — das ist Schwachsinn.
Use specific industry terms from the briefing (e.g. "Kabeltiefbau", "HDD-Bohrverfahren").
- LANGUAGE: Professional German. Simple but expert-level.
### OUTPUT FORMAT:
{
"briefingSummary": string,
"designVision": string
}
`;
  // The site-audit block is only appended when a report exists.
  // NOTE(review): state.siteAudit is typed `any`; the fields below
  // (overallHealth, seoScore, ...) are assumed from the PageAuditor report
  // shape — confirm against @mintel/page-audit.
  const userPrompt = `
BRIEFING (TRUTH SOURCE):
${state.briefing}
EXISTING WEBSITE DATA:
- Services: ${state.siteProfile?.services?.join(", ") || "unbekannt"}
- Navigation: ${state.siteProfile?.navigation?.map((n) => n.label).join(", ") || "unbekannt"}
- Company: ${state.auditedFacts.companyName || "unbekannt"}
EXTRACTED & AUDITED FACTS:
${JSON.stringify(state.auditedFacts, null, 2)}
${state.siteAudit?.report ? `
TECHNICAL SITE AUDIT (IST-Analyse):
Health: ${state.siteAudit.report.overallHealth} (SEO: ${state.siteAudit.report.seoScore}, UX: ${state.siteAudit.report.uxScore}, Perf: ${state.siteAudit.report.performanceScore})
- Executive Summary: ${state.siteAudit.report.executiveSummary}
- Strengths: ${state.siteAudit.report.strengths.join(", ")}
- Critical Issues: ${state.siteAudit.report.criticalIssues.join(", ")}
- Quick Wins: ${state.siteAudit.report.quickWins.join(", ")}
` : ""}
`;
  try {
    const { data, usage } = await llmJsonRequest({
      model: models.pro,
      systemPrompt,
      userPrompt,
      apiKey: config.openrouterKey,
    });
    return {
      success: true,
      data,
      usage: {
        step: "03-strategize",
        model: models.pro,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    return { success: false, error: `Strategize step failed: ${(err as Error).message}` };
  }
}

View File

@@ -0,0 +1,133 @@
// ============================================================================
// Step 04: Architect — Sitemap & Information Architecture (Gemini Pro)
// ============================================================================
import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";
/**
 * Step 04: generate the sitemap / information architecture (pro model).
 *
 * Constrains the LLM with the real navigation, services and external domains
 * from the crawl, then normalizes the returned sitemap into a consistent
 * { category, pages: [{ title, desc }] }[] shape regardless of the key names
 * or nesting the model chose. Requires Step 02 output.
 */
export async function executeArchitect(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();
  if (!state.auditedFacts) {
    return { success: false, error: "No audited facts available." };
  }
  // Build navigation constraint from the real site
  const existingNav = state.siteProfile?.navigation?.map((n) => n.label).join(", ") || "unbekannt";
  const existingServices = state.siteProfile?.services?.join(", ") || "unbekannt";
  const externalDomains = state.siteProfile?.externalDomains?.join(", ") || "keine";
  const systemPrompt = `
Du bist ein Senior UX Architekt. Erstelle einen ECHTEN SEITENBAUM für die neue Website.
Regelwerk für den Output:
### SEITENBAUM-REGELN:
1. KEIN MARKETINGSPRECH als Kategoriename. Gültige Kategorien sind nur die echten Navigationspunkte der Website.
ERLAUBT: "Startseite", "Leistungen", "Über uns", "Karriere", "Referenzen", "Kontakt", "Rechtliches"
VERBOTEN: "Kern-Präsenz", "Vertrauen", "Business Areas", "Digitaler Auftritt"
2. LEISTUNGEN muss in ECHTE UNTERSEITEN aufgeteilt werden — nicht eine einzige "Leistungen"-Seite.
Jede Kompetenz aus dem existierenden Leistungsspektrum = eine eigene Seite.
Beispiel statt:
{ category: "Leistungen", pages: [{ title: "Leistungen", desc: "..." }] }
So:
{ category: "Leistungen", pages: [
{ title: "Kabeltiefbau", desc: "Mittelspannung, Niederspannung, Kabelpflugarbeiten..." },
{ title: "Horizontalspülbohrungen", desc: "HDD in allen Bodenklassen..." },
{ title: "Elektromontagen", desc: "Bis 110 kV, Glasfaserkabelmontagen..." },
{ title: "Planung & Dokumentation", desc: "Genehmigungs- und Ausführungsplanung, Vermessung..." }
]}
3. SEITENTITEL: Kurz, klar, faktisch. Kein Werbejargon.
ERLAUBT: "Kabeltiefbau", "Über uns", "Karriere"
VERBOTEN: "Unsere Expertise", "Kompetenzspektrum", "Community"
4. Gruppe die Leistungen nach dem ECHTEN Kompetenzkatalog der bestehenden Site — nicht erfinden.
5. Keine doppelten Seiten. Keine Phantomseiten.
6. Videos = Content-Assets, keine eigene Seite.
7. Entitäten mit eigener Domain (${externalDomains}) = NICHT als Seite. Nur als Teaser/Link wenn nötig.
### KONTEXT:
Bestehende Navigation: ${existingNav}
Bestehende Services: ${existingServices}
Externe Domains (haben eigene Website): ${externalDomains}
Angeforderte zusätzliche Seiten aus Briefing: ${(state.auditedFacts as any)?.pages?.join(", ") || "keine spezifischen"}
### OUTPUT FORMAT (JSON):
{
"websiteTopic": string, // MAX 3 Wörter, beschreibend
"sitemap": [
{
"category": string, // Echter Nav-Eintrag. KEIN Marketingsprech.
"pages": [
{ "title": string, "desc": string } // Echte Unterseite, 1-2 Sätze Zweck
]
}
]
}
`;
  const userPrompt = `
BRIEFING:
${state.briefing}
FAKTEN (aus Extraktion):
${JSON.stringify({ facts: state.auditedFacts, strategy: { briefingSummary: state.briefingSummary } }, null, 2)}
Erstelle den Seitenbaum. Baue die Leistungen DETAILLIERT aus — echte Unterseiten pro Kompetenzbereich.
`;
  try {
    const { data, usage } = await llmJsonRequest({
      model: models.pro,
      systemPrompt,
      userPrompt,
      apiKey: config.openrouterKey,
    });
    // Normalize sitemap structure. The model may return an array, an object
    // with a "categories" key, or an object keyed by category name whose
    // values are page arrays — convert all of them to an array.
    let sitemap = data.sitemap;
    if (sitemap && !Array.isArray(sitemap)) {
      if (sitemap.categories) sitemap = sitemap.categories;
      else {
        const entries = Object.entries(sitemap);
        if (entries.every(([, v]) => Array.isArray(v))) {
          sitemap = entries.map(([category, pages]) => ({ category, pages }));
        }
      }
    }
    // Normalize key names (the model sometimes answers with German keys).
    if (Array.isArray(sitemap)) {
      sitemap = sitemap.map((cat: any) => ({
        category: cat.category || cat.kategorie || cat.Kategorie || "Allgemein",
        pages: (cat.pages || cat.seiten || []).map((page: any) => ({
          title: page.title || page.titel || "Seite",
          desc: page.desc || page.beschreibung || page.description || "",
        })),
      }));
    }
    return {
      success: true,
      data: { websiteTopic: data.websiteTopic, sitemap },
      usage: {
        step: "04-architect",
        model: models.pro,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    return { success: false, error: `Architect step failed: ${(err as Error).message}` };
  }
}

View File

@@ -0,0 +1,233 @@
// ============================================================================
// @mintel/concept-engine — Core Type Definitions
// ============================================================================
/** Page types recognized during crawling */
export type PageType =
  | "home"
  | "service"
  | "about"
  | "contact"
  | "career"
  | "portfolio"
  | "blog"
  | "legal"
  | "other";
/** A single crawled page with extracted metadata */
export interface CrawledPage {
  url: string;
  pathname: string;
  title: string;
  // Raw HTML as fetched (may be "" when loaded from a cache without HTML).
  html: string;
  // Whitespace-normalized body text, truncated by the crawler.
  text: string;
  // Text of h1-h3 elements.
  headings: string[];
  // Link labels found inside <nav> elements.
  navItems: string[];
  // Detected interactive features, e.g. "search", "forms", "maps".
  features: string[];
  type: PageType;
  // Same-origin link pathnames found on the page.
  links: string[];
  // Absolute image URLs found on the page.
  images: string[];
  meta: {
    description?: string;
    ogTitle?: string;
    ogImage?: string;
  };
}
/** Navigation item extracted from <nav> elements */
export interface NavItem {
  label: string;
  href: string;
  children?: NavItem[];
}
/** Company info extracted from Impressum / footer */
export interface CompanyInfo {
  name?: string;
  address?: string;
  phone?: string;
  email?: string;
  // USt-ID (German VAT ID).
  taxId?: string;
  // HRB / commercial register number.
  registerNumber?: string;
  managingDirector?: string;
}
/** A page in the site inventory */
export interface PageInventoryItem {
  url: string;
  pathname: string;
  title: string;
  type: PageType;
  headings: string[];
  services: string[];
  hasSearch: boolean;
  hasForms: boolean;
  hasMap: boolean;
  hasVideo: boolean;
  contentSummary: string;
}
/** Full site profile — deterministic, no LLM involved */
export interface SiteProfile {
  domain: string;
  // ISO timestamp of the crawl.
  crawledAt: string;
  totalPages: number;
  navigation: NavItem[];
  // Features detected anywhere on the site (search, forms, maps, ...).
  existingFeatures: string[];
  services: string[];
  companyInfo: CompanyInfo;
  pageInventory: PageInventoryItem[];
  colors: string[];
  // Platform name -> profile URL.
  socialLinks: Record<string, string>;
  // Related domains with their own websites (out of scope for sub-pages).
  externalDomains: string[];
  images: string[];
  // Employee count as stated on the site, or null if not mentioned.
  employeeCount: string | null;
}
/** Configuration for the estimation pipeline */
export interface PipelineConfig {
  openrouterKey: string;
  // Optional; when absent, pages are fetched via direct HTTP.
  zyteApiKey?: string;
  outputDir: string;
  // Directory where per-domain crawl caches are stored.
  crawlDir: string;
  modelsOverride?: Partial<ModelConfig>;
}
/** Model routing configuration (OpenRouter model identifiers) */
export interface ModelConfig {
  // Cheap/fast model for extraction and auditing.
  flash: string;
  // Stronger model for strategy and architecture.
  pro: string;
  // Highest-quality model.
  opus: string;
}
/** Default model routing; override per run via PipelineConfig.modelsOverride. */
export const DEFAULT_MODELS: ModelConfig = {
  flash: "google/gemini-3-flash-preview",
  pro: "google/gemini-3.1-pro-preview",
  opus: "anthropic/claude-opus-4-6",
};
/** Input for a pipeline run */
export interface PipelineInput {
  briefing: string;
  url?: string;
  budget?: string;
  comments?: string;
  // Force a fresh crawl instead of using the on-disk cache.
  clearCache?: boolean;
}
/** State that flows through all concept pipeline steps */
export interface ConceptState {
  // Input
  briefing: string;
  url?: string;
  comments?: string;
  // Output: Scrape & Analyze
  siteProfile?: SiteProfile;
  crawlDir?: string;
  // Output: Site Audit (shape defined by @mintel/page-audit)
  siteAudit?: any;
  // Output: Research (shape defined by the research step)
  researchData?: any;
  // Output: Extract (raw LLM facts JSON)
  facts?: Record<string, any>;
  // Output: Audit (facts after the cost-controller pass)
  auditedFacts?: Record<string, any>;
  // Output: Strategy
  briefingSummary?: string;
  designVision?: string;
  // Output: Architecture
  sitemap?: SitemapCategory[];
  websiteTopic?: string;
  // Cost tracking
  usage: UsageStats;
}
/** Final output of the Concept Engine */
export interface ProjectConcept {
  domain: string;
  timestamp: string;
  briefing: string;
  auditedFacts: Record<string, any>;
  siteProfile?: SiteProfile;
  siteAudit?: any;
  researchData?: any;
  strategy: {
    briefingSummary: string;
    designVision: string;
  };
  architecture: {
    websiteTopic: string;
    sitemap: SitemapCategory[];
  };
  usage: UsageStats;
}
/** One sitemap category with its sub-pages */
export interface SitemapCategory {
  category: string;
  pages: { title: string; desc: string }[];
}
/** Aggregated token/cost usage across all steps */
export interface UsageStats {
  totalPromptTokens: number;
  totalCompletionTokens: number;
  // Total cost in USD (as reported by the LLM client).
  totalCost: number;
  perStep: StepUsage[];
}
/** Usage record for a single pipeline step */
export interface StepUsage {
  step: string;
  model: string;
  promptTokens: number;
  completionTokens: number;
  cost: number;
  durationMs: number;
}
/** Result of a single pipeline step */
export interface StepResult<T = any> {
  success: boolean;
  data?: T;
  error?: string;
  usage?: StepUsage;
}
/** Validation result from the deterministic validator */
export interface ValidationResult {
  passed: boolean;
  errors: ValidationError[];
  warnings: ValidationWarning[];
}
/** A hard validation failure */
export interface ValidationError {
  code: string;
  message: string;
  field?: string;
  expected?: any;
  actual?: any;
}
/** A non-fatal validation finding */
export interface ValidationWarning {
  code: string;
  message: string;
  suggestion?: string;
}
/** Step definition for the concept pipeline */
export interface PipelineStep {
  id: string;
  name: string;
  description: string;
  // Which model tier this step uses ("none" for deterministic steps).
  model: "flash" | "pro" | "opus" | "none";
  execute: (
    state: ConceptState,
    config: PipelineConfig,
  ) => Promise<StepResult>;
}