// at-mintel/packages/concept-engine/src/steps/01-extract.ts

// ============================================================================
// Step 01: Extract — Briefing Fact Extraction (Gemini Flash)
// ============================================================================

import { llmJsonRequest } from "../llm-client.js";
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
import { DEFAULT_MODELS } from "../types.js";

/**
 * Step 01 of the concept pipeline: asks the "flash" model to extract
 * structured facts from the briefing as flat JSON.
 *
 * The user prompt is assembled from `state.briefing`, `state.comments` and,
 * when `state.siteProfile` is present, a textual summary of the crawled
 * website. The system prompt fixes the output language and JSON shape.
 *
 * @param state  Pipeline state; reads `briefing`, `comments` and `siteProfile`.
 * @param config Pipeline configuration; reads `modelsOverride` and `openrouterKey`.
 * @returns On success `{ success: true, data, usage }`, where `data` is whatever
 *          `llmJsonRequest` returned and `usage` carries token, cost and timing
 *          figures for this step. If the LLM request throws or rejects, the
 *          failure is returned as `{ success: false, error }` instead of being
 *          rethrown.
 */
export async function executeExtract(
    state: ConceptState,
    config: PipelineConfig,
): Promise<StepResult> {
    // Per-run overrides win over the package defaults.
    const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
    const startTime = Date.now();

    // Build site context from the deterministic analyzer.
    // `||` (not `??`) is deliberate throughout: an empty join result ("")
    // must fall back to the German placeholder as well.
    const site = state.siteProfile;
    // NOTE(review): `employeeCount` is read through an `any` cast below, which
    // suggests it is missing from the declared profile type — confirm and add
    // it to the type so the cast can go.
    const siteContext = site
        ? `
EXISTING WEBSITE ANALYSIS (FACTS — verifiably crawled, NOT guessed):
- Domain: ${site.domain}
- Total pages crawled: ${site.totalPages}
- Navigation items: ${site.navigation.map((n) => n.label).join(", ") || "nicht erkannt"}
- Existing features: ${site.existingFeatures.join(", ") || "keine"}
- Services / Kompetenzen: ${site.services.join(" | ") || "keine"}
- Employee count (from website text): ${(site as any).employeeCount || "nicht genannt"}
- Company name: ${site.companyInfo.name || "unbekannt"}
- Address: ${site.companyInfo.address || "unbekannt"}
- Tax ID (USt-ID): ${site.companyInfo.taxId || "unbekannt"}
- HRB: ${site.companyInfo.registerNumber || "unbekannt"}
- Managing Director: ${site.companyInfo.managingDirector || "unbekannt"}
- External related domains (HAVE OWN WEBSITES — DO NOT include as sub-pages!): ${site.externalDomains.join(", ") || "keine"}
- Social links: ${Object.entries(site.socialLinks).map(([k, v]) => `${k}: ${v}`).join(", ") || "keine"}
`
        : "No existing website data available.";

    const systemPrompt = `
You are a precision fact extractor. Your only job: extract verifiable facts from the BRIEFING.
Output language: GERMAN (strict).
Output format: flat JSON at root level. No nesting except arrays.

### CRITICAL RULES:
1. "employeeCount": take from SITE ANALYSIS if available. Only override if briefing states something more specific.
2. External domains (e.g. "etib-ing.com") have their OWN website. NEVER include them as sub-pages.
3. Videos (Messefilm, Imagefilm) are CONTENT ASSETS, not pages.
4. If existing site already has search, include "search" in functions.
5. DO NOT invent pages not mentioned in briefing or existing navigation.

### CONSERVATIVE RULE:
- simple lists (Jobs, Referenzen, Messen) = pages, NOT features
- Assume "page" as default. Only add "feature" for complex interactive systems.

### OUTPUT FORMAT:
{
  "companyName": string,
  "companyAddress": string,
  "personName": string,
  "email": string,
  "existingWebsite": string,
  "websiteTopic": string,         // MAX 3 words
  "isRelaunch": boolean,
  "employeeCount": string,        // from site analysis, e.g. "über 50"
  "pages": string[],              // ALL pages: ["Startseite", "Über Uns", "Leistungen", ...]
  "functions": string[],          // search, forms, maps, video, cookie_consent, etc.
  "assets": string[],             // existing_website, logo, media, photos, videos
  "deadline": string,
  "targetAudience": string,
  "cmsSetup": boolean,
  "multilang": boolean
}

BANNED OUTPUT KEYS: "selectedPages", "otherPages", "features", "apiSystems" — use pages[] and functions[] ONLY.
`;

    const userPrompt = `BRIEFING (TRUTH SOURCE):
${state.briefing}

COMMENTS:
${state.comments || "keine"}

${siteContext}`;

    try {
        const { data, usage } = await llmJsonRequest({
            model: models.flash,
            systemPrompt,
            userPrompt,
            apiKey: config.openrouterKey,
        });

        return {
            success: true,
            data,
            usage: {
                step: "01-extract",
                model: models.flash,
                promptTokens: usage.promptTokens,
                completionTokens: usage.completionTokens,
                cost: usage.cost,
                durationMs: Date.now() - startTime,
            },
        };
    } catch (err: unknown) {
        // A rejection is not guaranteed to be an Error instance (strings and
        // plain objects can be thrown too); narrow before reading `.message`
        // so the reported reason is never the literal text "undefined".
        const reason = err instanceof Error ? err.message : String(err);
        return {
            success: false,
            error: `Extract step failed: ${reason}`,
        };
    }
}