Files
at-mintel/packages/concept-engine/src/steps/01-extract.ts
Marc Mintel 5da88356a8
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
feat: migrate npm registry from Verdaccio to Gitea Packages
2026-02-27 00:12:00 +01:00

109 lines
4.3 KiB
TypeScript

// ============================================================================
// Step 01: Extract — Briefing Fact Extraction (Gemini Flash)
// ============================================================================
import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";
/**
 * System prompt for the extraction step. It is static, so it is built once at
 * module load instead of on every call. The text is sent verbatim to the model.
 */
const EXTRACT_SYSTEM_PROMPT = `
You are a precision fact extractor. Your only job: extract verifiable facts from the BRIEFING.
Output language: GERMAN (strict).
Output format: flat JSON at root level. No nesting except arrays.
### CRITICAL RULES:
1. "employeeCount": take from SITE ANALYSIS if available. Only override if briefing states something more specific.
2. External domains (e.g. "etib-ing.com") have their OWN website. NEVER include them as sub-pages.
3. Videos (Messefilm, Imagefilm) are CONTENT ASSETS, not pages.
4. If existing site already has search, include "search" in functions.
5. DO NOT invent pages not mentioned in briefing or existing navigation.
### CONSERVATIVE RULE:
- simple lists (Jobs, Referenzen, Messen) = pages, NOT features
- Assume "page" as default. Only add "feature" for complex interactive systems.
### OUTPUT FORMAT:
{
"companyName": string,
"companyAddress": string,
"personName": string,
"email": string,
"existingWebsite": string,
"websiteTopic": string, // MAX 3 words
"isRelaunch": boolean,
"employeeCount": string, // from site analysis, e.g. "über 50"
"pages": string[], // ALL pages: ["Startseite", "Über Uns", "Leistungen", ...]
"functions": string[], // search, forms, maps, video, cookie_consent, etc.
"assets": string[], // existing_website, logo, media, photos, videos
"deadline": string,
"targetAudience": string,
"cmsSetup": boolean,
"multilang": boolean
}
BANNED OUTPUT KEYS: "selectedPages", "otherPages", "features", "apiSystems" — use pages[] and functions[] ONLY.
`;

/**
 * Turns an arbitrary thrown value into a printable message.
 *
 * JavaScript allows throwing anything (strings, `null`, plain objects), so the
 * value caught in a `catch` clause cannot be assumed to be an `Error`.
 */
function describeError(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  // Error-like objects (e.g. API error payloads) that carry a `message` field.
  if (typeof err === "object" && err !== null && "message" in err) {
    return String((err as { message: unknown }).message);
  }
  return String(err);
}

/**
 * Step 01 of the pipeline: asks the "flash" model to extract facts from the
 * briefing as flat JSON.
 *
 * The user prompt is assembled from `state.briefing`, `state.comments` and,
 * when `state.siteProfile` is present, a textual summary of that profile.
 *
 * @param state  Pipeline state; `briefing`, `comments` and `siteProfile` are read.
 * @param config Pipeline configuration; `modelsOverride` is merged over
 *               `DEFAULT_MODELS` and `openrouterKey` is passed as the API key.
 * @returns On success `{ success: true, data, usage }`, where `data` is the
 *          value returned by `llmJsonRequest` (passed through unvalidated) and
 *          `usage` records tokens, cost and wall-clock duration for this step.
 *          If the request throws, `{ success: false, error }` is returned
 *          instead of rejecting.
 *
 * Note: the prompt is built before the `try` block, so an exception raised
 * while reading `state.siteProfile` propagates to the caller as a rejection.
 */
export async function executeExtract(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();

  // Build site context from the deterministic analyzer
  const site = state.siteProfile;
  let siteContext = "No existing website data available.";
  if (site) {
    // `employeeCount` is read through a narrow structural assertion rather than
    // `as any`, so the rest of `site` stays type-checked.
    // NOTE(review): assumes the value is a string or number — confirm against
    // the site-profile type.
    const employeeCount = (site as { employeeCount?: string | number })
      .employeeCount;
    const info = site.companyInfo;
    const socialLinks = Object.entries(site.socialLinks)
      .map(([k, v]) => `${k}: ${v}`)
      .join(", ");

    // The leading and trailing "" reproduce the blank first/last line that the
    // prompt has always had around this section.
    siteContext = [
      "",
      "EXISTING WEBSITE ANALYSIS (FACTS — verifiably crawled, NOT guessed):",
      `- Domain: ${site.domain}`,
      `- Total pages crawled: ${site.totalPages}`,
      `- Navigation items: ${site.navigation.map((n) => n.label).join(", ") || "nicht erkannt"}`,
      `- Existing features: ${site.existingFeatures.join(", ") || "keine"}`,
      `- Services / Kompetenzen: ${site.services.join(" | ") || "keine"}`,
      `- Employee count (from website text): ${employeeCount || "nicht genannt"}`,
      `- Company name: ${info.name || "unbekannt"}`,
      `- Address: ${info.address || "unbekannt"}`,
      `- Tax ID (USt-ID): ${info.taxId || "unbekannt"}`,
      `- HRB: ${info.registerNumber || "unbekannt"}`,
      `- Managing Director: ${info.managingDirector || "unbekannt"}`,
      `- External related domains (HAVE OWN WEBSITES — DO NOT include as sub-pages!): ${site.externalDomains.join(", ") || "keine"}`,
      `- Social links: ${socialLinks || "keine"}`,
      "",
    ].join("\n");
  }

  const userPrompt = [
    "BRIEFING (TRUTH SOURCE):",
    state.briefing,
    "COMMENTS:",
    state.comments || "keine",
    siteContext,
  ].join("\n");

  try {
    const { data, usage } = await llmJsonRequest({
      model: models.flash,
      systemPrompt: EXTRACT_SYSTEM_PROMPT,
      userPrompt,
      apiKey: config.openrouterKey,
    });

    return {
      success: true,
      data,
      usage: {
        step: "01-extract",
        model: models.flash,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err: unknown) {
    // Report the failure through the step result; `describeError` copes with
    // thrown values that are not `Error` instances.
    return {
      success: false,
      error: `Extract step failed: ${describeError(err)}`,
    };
  }
}