feat: migrate npm registry from Verdaccio to Gitea Packages
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
This commit is contained in:
40
packages/concept-engine/src/_test_pipeline.ts
Normal file
40
packages/concept-engine/src/_test_pipeline.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
// Manual smoke-test script for the estimation pipeline.
// Loads a local briefing file and runs the full pipeline against a fixed URL.
import { config as dotenvConfig } from 'dotenv';
import * as path from 'node:path';
import * as fs from 'node:fs/promises';
// NOTE(review): pipeline.ts in this commit exports `ConceptPipeline`; confirm
// that `EstimationPipeline` actually exists in ./pipeline.js or this import fails.
import { EstimationPipeline } from './pipeline.js';

// Load environment variables from the monorepo root .env
// (assumes the script is executed from the package directory).
dotenvConfig({ path: path.resolve(process.cwd(), '../../.env') });

// Read the sample briefing used for this smoke test.
const briefing = await fs.readFile(
  path.resolve(process.cwd(), '../../data/briefings/etib.txt'),
  'utf8',
);

console.log(`Briefing loaded: ${briefing.length} chars`);

// Wire the pipeline with console-logging callbacks for each step.
const pipeline = new EstimationPipeline(
  {
    openrouterKey: process.env.OPENROUTER_API_KEY || '',
    zyteApiKey: process.env.ZYTE_API_KEY,
    outputDir: path.resolve(process.cwd(), '../../out/estimations'),
    crawlDir: path.resolve(process.cwd(), '../../data/crawls'),
  },
  {
    onStepStart: (id, name) => console.log(`[CB] Starting: ${id}`),
    onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
    onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
  },
);

try {
  const result = await pipeline.run({
    briefing,
    url: 'https://www.e-tib.com',
  });

  console.log('\n✨ Pipeline complete!');
  // NOTE(review): `validationResult` is not visible on the pipeline's result type
  // in this commit — confirm the returned object actually carries it.
  console.log('Validation:', result.validationResult?.passed ? 'PASSED' : 'FAILED');
} catch (err: any) {
  console.error('\n❌ Pipeline failed:', err.message);
  console.error(err.stack);
}
|
||||
334
packages/concept-engine/src/analyzer.ts
Normal file
334
packages/concept-engine/src/analyzer.ts
Normal file
@@ -0,0 +1,334 @@
|
||||
// ============================================================================
|
||||
// Analyzer — Deterministic Site Analysis (NO LLM!)
|
||||
// Builds a SiteProfile from crawled pages using pure code logic.
|
||||
// This is the core fix against hallucinated page structures.
|
||||
// ============================================================================
|
||||
|
||||
import type {
|
||||
CrawledPage,
|
||||
SiteProfile,
|
||||
NavItem,
|
||||
CompanyInfo,
|
||||
PageInventoryItem,
|
||||
} from "./types.js";
|
||||
|
||||
/**
|
||||
* Build a complete SiteProfile from an array of crawled pages.
|
||||
* This is 100% deterministic — no LLM calls involved.
|
||||
*/
|
||||
export function analyzeSite(pages: CrawledPage[], domain: string): SiteProfile {
|
||||
const navigation = extractNavigation(pages);
|
||||
const existingFeatures = extractExistingFeatures(pages);
|
||||
const services = extractAllServices(pages);
|
||||
const companyInfo = extractCompanyInfo(pages);
|
||||
const colors = extractColors(pages);
|
||||
const socialLinks = extractSocialLinks(pages);
|
||||
const externalDomains = extractExternalDomains(pages, domain);
|
||||
const images = extractAllImages(pages);
|
||||
const employeeCount = extractEmployeeCount(pages);
|
||||
const pageInventory = buildPageInventory(pages);
|
||||
|
||||
return {
|
||||
domain,
|
||||
crawledAt: new Date().toISOString(),
|
||||
totalPages: pages.filter((p) => p.type !== "legal").length,
|
||||
navigation,
|
||||
existingFeatures,
|
||||
services,
|
||||
companyInfo,
|
||||
pageInventory,
|
||||
colors,
|
||||
socialLinks,
|
||||
externalDomains,
|
||||
images,
|
||||
employeeCount,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the site's main navigation structure from <nav> elements.
|
||||
* Uses the HOME page's nav as the canonical source.
|
||||
*/
|
||||
function extractNavigation(pages: CrawledPage[]): NavItem[] {
|
||||
// Prefer the home page's nav
|
||||
const homePage = pages.find((p) => p.type === "home");
|
||||
const sourcePage = homePage || pages[0];
|
||||
if (!sourcePage) return [];
|
||||
|
||||
// Deduplicate nav items
|
||||
const seen = new Set<string>();
|
||||
const navItems: NavItem[] = [];
|
||||
|
||||
for (const label of sourcePage.navItems) {
|
||||
const normalized = label.toLowerCase().trim();
|
||||
if (seen.has(normalized)) continue;
|
||||
if (normalized.length < 2) continue;
|
||||
seen.add(normalized);
|
||||
navItems.push({ label, href: "" });
|
||||
}
|
||||
|
||||
return navItems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all detected interactive features across all pages.
|
||||
*/
|
||||
function extractExistingFeatures(pages: CrawledPage[]): string[] {
|
||||
const allFeatures = new Set<string>();
|
||||
for (const page of pages) {
|
||||
for (const feature of page.features) {
|
||||
allFeatures.add(feature);
|
||||
}
|
||||
}
|
||||
return [...allFeatures];
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all images found across all pages.
|
||||
*/
|
||||
function extractAllImages(pages: CrawledPage[]): string[] {
|
||||
const allImages = new Set<string>();
|
||||
for (const page of pages) {
|
||||
if (!page.images) continue;
|
||||
for (const img of page.images) {
|
||||
allImages.add(img);
|
||||
}
|
||||
}
|
||||
return [...allImages];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract employee count from page text.
|
||||
* Looks for patterns like "über 50 Mitarbeitern", "200 Mitarbeiter", "50+ employees".
|
||||
*/
|
||||
function extractEmployeeCount(pages: CrawledPage[]): string | null {
|
||||
const allText = pages.map((p) => p.text).join(" ");
|
||||
|
||||
// German patterns: 'über 50 Mitarbeitern', '120 Beschäftigte', '+200 MA'
|
||||
const patterns = [
|
||||
/(über|ca\.?|rund|mehr als|\+)?\s*(\d{1,4})\s*(Mitarbeiter(?:innen)?|Beschäftigte|MA|Fachkräfte)\b/gi,
|
||||
/(\d{1,4})\+?\s*(employees|team members)/gi,
|
||||
];
|
||||
|
||||
for (const pattern of patterns) {
|
||||
const match = allText.match(pattern);
|
||||
if (match && match[0]) {
|
||||
const num = match[0].match(/(\d{1,4})/)?.[1];
|
||||
const prefix = match[0].match(/über|ca\.?|rund|mehr als/i)?.[0];
|
||||
if (num) return prefix ? `${prefix} ${num}` : num;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract services/competencies from service-type pages.
|
||||
* Focuses on H2-H3 headings and list items on service pages.
|
||||
*/
|
||||
function extractAllServices(pages: CrawledPage[]): string[] {
|
||||
const servicePages = pages.filter(
|
||||
(p) => p.type === "service" || p.pathname.includes("kompetenz"),
|
||||
);
|
||||
|
||||
const services = new Set<string>();
|
||||
for (const page of servicePages) {
|
||||
// Use headings as primary service indicators
|
||||
for (const heading of page.headings) {
|
||||
const clean = heading.trim();
|
||||
if (clean.length > 3 && clean.length < 100) {
|
||||
// Skip generic headings
|
||||
if (/^(home|kontakt|impressum|datenschutz|menü|navigation|suche)/i.test(clean)) continue;
|
||||
services.add(clean);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If no service pages found, look at the home page headings too
|
||||
if (services.size === 0) {
|
||||
const homePage = pages.find((p) => p.type === "home");
|
||||
if (homePage) {
|
||||
for (const heading of homePage.headings) {
|
||||
const clean = heading.trim();
|
||||
if (clean.length > 3 && clean.length < 80) {
|
||||
services.add(clean);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...services];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract company information from Impressum / footer content.
|
||||
*/
|
||||
function extractCompanyInfo(pages: CrawledPage[]): CompanyInfo {
|
||||
const info: CompanyInfo = {};
|
||||
|
||||
// Find Impressum or legal page
|
||||
const legalPage = pages.find(
|
||||
(p) =>
|
||||
p.type === "legal" &&
|
||||
(p.pathname.includes("impressum") || p.title.toLowerCase().includes("impressum")),
|
||||
);
|
||||
|
||||
const sourceText = legalPage?.text || pages.find((p) => p.type === "home")?.text || "";
|
||||
|
||||
// USt-ID
|
||||
const taxMatch = sourceText.match(/USt[.\s-]*(?:ID[.\s-]*Nr\.?|IdNr\.?)[:\s]*([A-Z]{2}\d{9,11})/i);
|
||||
if (taxMatch) info.taxId = taxMatch[1];
|
||||
|
||||
// HRB number
|
||||
const hrbMatch = sourceText.match(/HRB[:\s]*(\d+\s*[A-Z]*)/i);
|
||||
if (hrbMatch) info.registerNumber = `HRB ${hrbMatch[1].trim()}`;
|
||||
|
||||
// Phone
|
||||
const phoneMatch = sourceText.match(/(?:Tel|Telefon|Fon)[.:\s]*([+\d\s()/-]{10,20})/i);
|
||||
if (phoneMatch) info.phone = phoneMatch[1].trim();
|
||||
|
||||
// Email
|
||||
const emailMatch = sourceText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
|
||||
if (emailMatch) info.email = emailMatch[0];
|
||||
|
||||
// Address (look for German postal code pattern)
|
||||
const addressMatch = sourceText.match(
|
||||
/(?:[\w\s.-]+(?:straße|str\.|weg|platz|ring|allee|gasse)\s*\d+[a-z]?\s*,?\s*)?(?:D-)?(\d{5})\s+\w+/i,
|
||||
);
|
||||
if (addressMatch) info.address = addressMatch[0].trim();
|
||||
|
||||
// GF / Geschäftsführer
|
||||
const gfMatch = sourceText.match(
|
||||
/Geschäftsführ(?:er|ung)[:\s]*([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+){1,3})/,
|
||||
);
|
||||
if (gfMatch) info.managingDirector = gfMatch[1].trim();
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract brand colors from HTML (inline styles, CSS variables).
|
||||
*/
|
||||
function extractColors(pages: CrawledPage[]): string[] {
|
||||
const colors = new Set<string>();
|
||||
const homePage = pages.find((p) => p.type === "home");
|
||||
if (!homePage) return [];
|
||||
|
||||
const hexMatches = homePage.html.match(/#(?:[0-9a-fA-F]{3}){1,2}\b/g) || [];
|
||||
for (const hex of hexMatches) {
|
||||
colors.add(hex.toLowerCase());
|
||||
if (colors.size >= 8) break;
|
||||
}
|
||||
|
||||
return [...colors];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract social media links from footers / headers.
|
||||
*/
|
||||
function extractSocialLinks(pages: CrawledPage[]): Record<string, string> {
|
||||
const socials: Record<string, string> = {};
|
||||
const platforms = [
|
||||
{ key: "linkedin", patterns: ["linkedin.com"] },
|
||||
{ key: "instagram", patterns: ["instagram.com"] },
|
||||
{ key: "facebook", patterns: ["facebook.com", "fb.com"] },
|
||||
{ key: "youtube", patterns: ["youtube.com", "youtu.be"] },
|
||||
{ key: "twitter", patterns: ["twitter.com", "x.com"] },
|
||||
{ key: "xing", patterns: ["xing.com"] },
|
||||
];
|
||||
|
||||
const homePage = pages.find((p) => p.type === "home");
|
||||
if (!homePage) return socials;
|
||||
|
||||
const urlMatches = homePage.html.match(/https?:\/\/[^\s"'<>]+/g) || [];
|
||||
for (const url of urlMatches) {
|
||||
for (const platform of platforms) {
|
||||
if (platform.patterns.some((p) => url.includes(p)) && !socials[platform.key]) {
|
||||
socials[platform.key] = url;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return socials;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find domains that are linked but separate from the main domain.
|
||||
* Critical for detecting sister companies with own websites (e.g. etib-ing.com).
|
||||
*/
|
||||
function extractExternalDomains(pages: CrawledPage[], mainDomain: string): string[] {
|
||||
const externalDomains = new Set<string>();
|
||||
const cleanMain = mainDomain.replace(/^www\./, "");
|
||||
// Extract meaningful base parts: "e-tib.com" → ["e", "tib", "etib"]
|
||||
const mainParts = cleanMain.split(".")[0].toLowerCase().split(/[-_]/).filter(p => p.length > 1);
|
||||
const mainJoined = mainParts.join(""); // "etib"
|
||||
|
||||
for (const page of pages) {
|
||||
const linkMatches = page.html.match(/https?:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
|
||||
for (const url of linkMatches) {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
const domain = urlObj.hostname.replace(/^www\./, "");
|
||||
// Skip same domain
|
||||
if (domain === cleanMain) continue;
|
||||
// Skip common third-party services
|
||||
if (
|
||||
domain.includes("google") ||
|
||||
domain.includes("facebook") ||
|
||||
domain.includes("twitter") ||
|
||||
domain.includes("linkedin") ||
|
||||
domain.includes("instagram") ||
|
||||
domain.includes("youtube") ||
|
||||
domain.includes("cookie") ||
|
||||
domain.includes("analytics") ||
|
||||
domain.includes("cdn") ||
|
||||
domain.includes("cloudflare") ||
|
||||
domain.includes("fonts") ||
|
||||
domain.includes("jquery") ||
|
||||
domain.includes("bootstrap") ||
|
||||
domain.includes("wordpress") ||
|
||||
domain.includes("jimdo") ||
|
||||
domain.includes("wix")
|
||||
)
|
||||
continue;
|
||||
|
||||
// Fuzzy match: check if the domain contains any base part of the main domain
|
||||
// e.g. main="e-tib.com" → mainParts=["e","tib"], mainJoined="etib"
|
||||
// target="etib-ing.com" → domainBase="etib-ing", domainJoined="etibing"
|
||||
const domainBase = domain.split(".")[0].toLowerCase();
|
||||
const domainJoined = domainBase.replace(/[-_]/g, "");
|
||||
|
||||
const isRelated =
|
||||
domainJoined.includes(mainJoined) ||
|
||||
mainJoined.includes(domainJoined) ||
|
||||
mainParts.some(part => part.length > 2 && domainBase.includes(part));
|
||||
|
||||
if (isRelated) {
|
||||
externalDomains.add(domain);
|
||||
}
|
||||
} catch {
|
||||
// Invalid URL
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...externalDomains];
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a structured inventory of all pages.
|
||||
*/
|
||||
function buildPageInventory(pages: CrawledPage[]): PageInventoryItem[] {
|
||||
return pages.map((page) => ({
|
||||
url: page.url,
|
||||
pathname: page.pathname,
|
||||
title: page.title,
|
||||
type: page.type,
|
||||
headings: page.headings.slice(0, 10),
|
||||
services: page.type === "service" ? page.headings.filter((h) => h.length > 3 && h.length < 80) : [],
|
||||
hasSearch: page.features.includes("search"),
|
||||
hasForms: page.features.includes("forms"),
|
||||
hasMap: page.features.includes("maps"),
|
||||
hasVideo: page.features.includes("video"),
|
||||
contentSummary: page.text.substring(0, 500),
|
||||
}));
|
||||
}
|
||||
149
packages/concept-engine/src/cli.ts
Normal file
149
packages/concept-engine/src/cli.ts
Normal file
@@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env node
|
||||
// ============================================================================
|
||||
// @mintel/concept-engine — CLI Entry Point
|
||||
// Simple commander-based CLI for concept generation.
|
||||
// ============================================================================
|
||||
|
||||
import { Command } from "commander";
|
||||
import * as path from "node:path";
|
||||
import * as fs from "node:fs/promises";
|
||||
import { existsSync } from "node:fs";
|
||||
import { config as dotenvConfig } from "dotenv";
|
||||
import { ConceptPipeline } from "./pipeline.js";
|
||||
|
||||
// Load .env from monorepo root
|
||||
dotenvConfig({ path: path.resolve(process.cwd(), "../../.env") });
|
||||
dotenvConfig({ path: path.resolve(process.cwd(), ".env") });
|
||||
|
||||
const program = new Command();
|
||||
|
||||
program
|
||||
.name("concept")
|
||||
.description("AI-powered project concept generator")
|
||||
.version("1.0.0");
|
||||
|
||||
program
|
||||
.command("run")
|
||||
.description("Run the full concept pipeline")
|
||||
.argument("[briefing]", "Briefing text or @path/to/file.txt")
|
||||
.option("--url <url>", "Target website URL")
|
||||
.option("--comments <comments>", "Additional notes")
|
||||
.option("--clear-cache", "Clear crawl cache and re-crawl")
|
||||
.option("--output <dir>", "Output directory", "../../out/concepts")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.action(async (briefingArg: string | undefined, options: any) => {
|
||||
const openrouterKey = process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
|
||||
if (!openrouterKey) {
|
||||
console.error("❌ OPENROUTER_API_KEY not found in environment.");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
let briefing = briefingArg || "";
|
||||
|
||||
// Handle @file references
|
||||
if (briefing.startsWith("@")) {
|
||||
const rawPath = briefing.substring(1);
|
||||
const filePath = rawPath.startsWith("/")
|
||||
? rawPath
|
||||
: path.resolve(process.cwd(), rawPath);
|
||||
if (!existsSync(filePath)) {
|
||||
console.error(`❌ Briefing file not found: ${filePath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
briefing = await fs.readFile(filePath, "utf8");
|
||||
console.log(`📄 Loaded briefing from: ${filePath}`);
|
||||
}
|
||||
|
||||
// Auto-discover URL from briefing
|
||||
let url = options.url;
|
||||
if (!url && briefing) {
|
||||
const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
|
||||
if (urlMatch) {
|
||||
url = urlMatch[0];
|
||||
console.log(`🔗 Discovered URL in briefing: ${url}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (!briefing && !url) {
|
||||
console.error("❌ Provide a briefing text or --url");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const pipeline = new ConceptPipeline(
|
||||
{
|
||||
openrouterKey,
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
outputDir: path.resolve(process.cwd(), options.output),
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
},
|
||||
{
|
||||
onStepStart: (id, name) => {
|
||||
// Will be enhanced with Ink spinner later
|
||||
},
|
||||
onStepComplete: (id, result) => {
|
||||
// Will be enhanced with Ink UI later
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
try {
|
||||
await pipeline.run({
|
||||
briefing,
|
||||
url,
|
||||
comments: options.comments,
|
||||
clearCache: options.clearCache,
|
||||
});
|
||||
|
||||
console.log("\n✨ Concept generation complete!");
|
||||
} catch (err) {
|
||||
console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
program
|
||||
.command("analyze")
|
||||
.description("Only crawl and analyze a website (no LLM)")
|
||||
.argument("<url>", "Website URL to analyze")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.option("--clear-cache", "Clear existing crawl cache")
|
||||
.action(async (url: string, options: any) => {
|
||||
const { crawlSite } = await import("./scraper.js");
|
||||
const { analyzeSite } = await import("./analyzer.js");
|
||||
|
||||
if (options.clearCache) {
|
||||
const { clearCrawlCache } = await import("./scraper.js");
|
||||
const domain = new URL(url).hostname;
|
||||
await clearCrawlCache(path.resolve(process.cwd(), options.crawlDir), domain);
|
||||
}
|
||||
|
||||
const pages = await crawlSite(url, {
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
});
|
||||
|
||||
const domain = new URL(url).hostname;
|
||||
const profile = analyzeSite(pages, domain);
|
||||
|
||||
console.log("\n📊 Site Profile:");
|
||||
console.log(` Domain: ${profile.domain}`);
|
||||
console.log(` Total Pages: ${profile.totalPages}`);
|
||||
console.log(` Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`);
|
||||
console.log(` Features: ${profile.existingFeatures.join(", ") || "none"}`);
|
||||
console.log(` Services: ${profile.services.join(", ") || "none"}`);
|
||||
console.log(` External Domains: ${profile.externalDomains.join(", ") || "none"}`);
|
||||
console.log(` Company: ${profile.companyInfo.name || "unbekannt"}`);
|
||||
console.log(` Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
|
||||
console.log(` Colors: ${profile.colors.join(", ")}`);
|
||||
console.log(` Images Found: ${profile.images.length}`);
|
||||
console.log(` Social: ${Object.entries(profile.socialLinks).map(([k, v]) => `${k}`).join(", ") || "none"}`);
|
||||
|
||||
const outputPath = path.join(
|
||||
path.resolve(process.cwd(), options.crawlDir),
|
||||
domain.replace(/\./g, "-"),
|
||||
"_site_profile.json",
|
||||
);
|
||||
console.log(`\n📦 Full profile saved to: ${outputPath}`);
|
||||
});
|
||||
|
||||
program.parse();
|
||||
10
packages/concept-engine/src/index.ts
Normal file
10
packages/concept-engine/src/index.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
// ============================================================================
// @mintel/concept-engine — Public API
// ============================================================================

// Pipeline orchestrator and its progress-callback contract.
export { ConceptPipeline } from "./pipeline.js";
export type { PipelineCallbacks } from "./pipeline.js";
// Deterministic crawling and site analysis (no LLM involved).
export { crawlSite, clearCrawlCache } from "./scraper.js";
export { analyzeSite } from "./analyzer.js";
// OpenRouter LLM helpers.
export { llmRequest, llmJsonRequest, cleanJson } from "./llm-client.js";
// All shared domain types.
export * from "./types.js";
|
||||
133
packages/concept-engine/src/llm-client.ts
Normal file
133
packages/concept-engine/src/llm-client.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
// ============================================================================
|
||||
// LLM Client — Unified interface with model routing via OpenRouter
|
||||
// ============================================================================
|
||||
|
||||
import axios from "axios";
|
||||
|
||||
// Parameters for a single chat-completion call routed through OpenRouter.
interface LLMRequestOptions {
  model: string; // OpenRouter model slug
  systemPrompt: string;
  userPrompt: string;
  jsonMode?: boolean; // request response_format json_object (llmRequest defaults this to true)
  apiKey: string; // OpenRouter API key
}

// Normalized LLM result: raw assistant text plus token/cost accounting.
interface LLMResponse {
  content: string;
  usage: {
    promptTokens: number;
    completionTokens: number;
    cost: number; // provider-reported when available, otherwise estimated in llmRequest
  };
}
|
||||
|
||||
/**
|
||||
* Clean raw LLM output to parseable JSON.
|
||||
* Handles markdown fences, control chars, trailing commas.
|
||||
*/
|
||||
export function cleanJson(str: string): string {
|
||||
let cleaned = str.replace(/```json\n?|```/g, "").trim();
|
||||
cleaned = cleaned.replace(
|
||||
/[\u0000-\u0009\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
|
||||
" ",
|
||||
);
|
||||
cleaned = cleaned.replace(/,\s*([\]}])/g, "$1");
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a request to an LLM via OpenRouter.
|
||||
*/
|
||||
export async function llmRequest(options: LLMRequestOptions): Promise<LLMResponse> {
|
||||
const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
const resp = await axios.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
{
|
||||
model,
|
||||
messages: [
|
||||
{ role: "system", content: systemPrompt },
|
||||
{ role: "user", content: userPrompt },
|
||||
],
|
||||
...(jsonMode ? { response_format: { type: "json_object" } } : {}),
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
timeout: 120000,
|
||||
},
|
||||
).catch(err => {
|
||||
if (err.response) {
|
||||
console.error("OpenRouter API Error:", JSON.stringify(err.response.data, null, 2));
|
||||
}
|
||||
throw err;
|
||||
});
|
||||
|
||||
const content = resp.data.choices?.[0]?.message?.content;
|
||||
if (!content) {
|
||||
throw new Error(`LLM returned no content. Model: ${model}`);
|
||||
}
|
||||
|
||||
let cost = 0;
|
||||
const usage = resp.data.usage || {};
|
||||
if (usage.cost !== undefined) {
|
||||
cost = usage.cost;
|
||||
} else {
|
||||
// Fallback estimation
|
||||
cost =
|
||||
(usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
|
||||
(usage.completion_tokens || 0) * (0.4 / 1_000_000);
|
||||
}
|
||||
|
||||
return {
|
||||
content,
|
||||
usage: {
|
||||
promptTokens: usage.prompt_tokens || 0,
|
||||
completionTokens: usage.completion_tokens || 0,
|
||||
cost,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a request and parse the response as JSON.
|
||||
*/
|
||||
export async function llmJsonRequest<T = any>(
|
||||
options: LLMRequestOptions,
|
||||
): Promise<{ data: T; usage: LLMResponse["usage"] }> {
|
||||
const response = await llmRequest({ ...options, jsonMode: true });
|
||||
const cleaned = cleanJson(response.content);
|
||||
|
||||
let parsed: T;
|
||||
try {
|
||||
parsed = JSON.parse(cleaned);
|
||||
} catch (e) {
|
||||
throw new Error(
|
||||
`Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Unwrap common LLM artifacts: {"0": {...}}, {"state": {...}}, etc.
|
||||
const unwrapped = unwrapResponse(parsed);
|
||||
|
||||
return { data: unwrapped as T, usage: response.usage };
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively unwrap common LLM wrapping patterns.
|
||||
*/
|
||||
function unwrapResponse(obj: any): any {
|
||||
if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
|
||||
const keys = Object.keys(obj);
|
||||
if (keys.length === 1) {
|
||||
const key = keys[0];
|
||||
if (key === "0" || key === "state" || key === "facts" || key === "result" || key === "data") {
|
||||
return unwrapResponse(obj[key]);
|
||||
}
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
257
packages/concept-engine/src/pipeline.ts
Normal file
257
packages/concept-engine/src/pipeline.ts
Normal file
@@ -0,0 +1,257 @@
|
||||
// ============================================================================
|
||||
// Pipeline Orchestrator
|
||||
// Runs all steps sequentially, tracks state, supports re-running individual steps.
|
||||
// ============================================================================
|
||||
|
||||
import * as fs from "node:fs/promises";
|
||||
import * as path from "node:path";
|
||||
import { existsSync } from "node:fs";
|
||||
import { crawlSite, clearCrawlCache } from "./scraper.js";
|
||||
import { analyzeSite } from "./analyzer.js";
|
||||
import { executeResearch } from "./steps/00b-research.js";
|
||||
import { executeExtract } from "./steps/01-extract.js";
|
||||
import { executeSiteAudit } from "./steps/00a-site-audit.js";
|
||||
import { executeAudit } from "./steps/02-audit.js";
|
||||
import { executeStrategize } from "./steps/03-strategize.js";
|
||||
import { executeArchitect } from "./steps/04-architect.js";
|
||||
import type {
|
||||
PipelineConfig,
|
||||
PipelineInput,
|
||||
ConceptState,
|
||||
ProjectConcept,
|
||||
StepResult,
|
||||
StepUsage,
|
||||
} from "./types.js";
|
||||
|
||||
// Optional progress hooks invoked by ConceptPipeline.runStep around each step.
export interface PipelineCallbacks {
  // Fired before a step's executor runs.
  onStepStart?: (stepId: string, stepName: string) => void;
  // Fired after a step finishes successfully, with its StepResult.
  onStepComplete?: (stepId: string, result: StepResult) => void;
  // Fired when a step fails; receives the error message.
  onStepError?: (stepId: string, error: string) => void;
}
|
||||
|
||||
/**
|
||||
* The main concept pipeline orchestrator.
|
||||
* Runs conceptual steps sequentially and builds the ProjectConcept.
|
||||
*/
|
||||
export class ConceptPipeline {
|
||||
// Directories and API keys for the run.
private config: PipelineConfig;
// Mutable accumulator for everything the steps produce (facts, sitemap, usage...).
private state: ConceptState;
// Optional progress hooks; defaults to an empty object (all hooks absent).
private callbacks: PipelineCallbacks;

/**
 * @param config    output/crawl directories plus API keys
 * @param callbacks optional step lifecycle hooks
 */
constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
  this.config = config;
  this.callbacks = callbacks;
  this.state = this.createInitialState();
}
||||
|
||||
/**
 * Produce the empty pipeline state: a blank briefing and a zeroed
 * token/cost accumulator with no per-step entries yet.
 */
private createInitialState(): ConceptState {
  const emptyUsage = {
    totalPromptTokens: 0,
    totalCompletionTokens: 0,
    totalCost: 0,
    perStep: [],
  };
  return { briefing: "", usage: emptyUsage };
}
|
||||
|
||||
/**
 * Run the full concept pipeline from scratch.
 *
 * Steps execute strictly in order; each step stores its output on
 * `this.state`, which later steps read. A step failure aborts the run
 * (runStep rethrows the step error).
 *
 * @param input briefing text plus optional URL, comments, and cache control
 * @returns the assembled ProjectConcept (also handed to saveState)
 */
async run(input: PipelineInput): Promise<ProjectConcept> {
  this.state.briefing = input.briefing;
  this.state.url = input.url;
  this.state.comments = input.comments;

  // Ensure output directories
  await fs.mkdir(this.config.outputDir, { recursive: true });
  await fs.mkdir(this.config.crawlDir, { recursive: true });

  // Step 0: Scrape & Analyze (deterministic)
  if (input.url) {
    if (input.clearCache) {
      const domain = new URL(input.url).hostname;
      await clearCrawlCache(this.config.crawlDir, domain);
    }
    await this.runStep("00-scrape", "Scraping & Analyzing Website", async () => {
      const pages = await crawlSite(input.url!, {
        zyteApiKey: this.config.zyteApiKey,
        crawlDir: this.config.crawlDir,
      });
      const domain = new URL(input.url!).hostname;
      const siteProfile = analyzeSite(pages, domain);
      this.state.siteProfile = siteProfile;
      // Crawl cache lives under a dot-free directory name, e.g. "e-tib-com".
      this.state.crawlDir = path.join(this.config.crawlDir, domain.replace(/\./g, "-"));

      // Save site profile
      await fs.writeFile(
        path.join(this.state.crawlDir!, "_site_profile.json"),
        JSON.stringify(siteProfile, null, 2),
      );

      // Deterministic step: a zeroed usage entry keeps the accounting uniform.
      return {
        success: true,
        data: siteProfile,
        usage: { step: "00-scrape", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
      };
    });
  }

  // Step 00a: Site Audit (DataForSEO)
  await this.runStep("00a-site-audit", "IST-Analysis (DataForSEO)", async () => {
    const result = await executeSiteAudit(this.state, this.config);
    if (result.success && result.data) {
      this.state.siteAudit = result.data;
    }
    return result;
  });

  // Step 00b: Research (real web data via journaling)
  await this.runStep("00b-research", "Industry & Company Research", async () => {
    const result = await executeResearch(this.state);
    if (result.success && result.data) {
      this.state.researchData = result.data;
    }
    return result;
  });

  // Step 1: Extract facts
  await this.runStep("01-extract", "Extracting Facts from Briefing", async () => {
    const result = await executeExtract(this.state, this.config);
    if (result.success) this.state.facts = result.data;
    return result;
  });

  // Step 2: Audit features
  await this.runStep("02-audit", "Auditing Features (Skeptical Review)", async () => {
    const result = await executeAudit(this.state, this.config);
    if (result.success) this.state.auditedFacts = result.data;
    return result;
  });

  // Step 3: Strategic analysis
  await this.runStep("03-strategize", "Strategic Analysis", async () => {
    const result = await executeStrategize(this.state, this.config);
    if (result.success) {
      this.state.briefingSummary = result.data.briefingSummary;
      this.state.designVision = result.data.designVision;
    }
    return result;
  });

  // Step 4: Sitemap architecture
  await this.runStep("04-architect", "Information Architecture", async () => {
    const result = await executeArchitect(this.state, this.config);
    if (result.success) {
      this.state.sitemap = result.data.sitemap;
      this.state.websiteTopic = result.data.websiteTopic;
    }
    return result;
  });

  // Assemble the final concept and persist it (saveState is defined elsewhere
  // in this class).
  const projectConcept = this.buildProjectConcept();
  await this.saveState(projectConcept);

  return projectConcept;
}
||||
|
||||
/**
|
||||
* Run a single step with callbacks and error handling.
|
||||
*/
|
||||
private async runStep(
|
||||
stepId: string,
|
||||
stepName: string,
|
||||
executor: () => Promise<StepResult>,
|
||||
): Promise<void> {
|
||||
this.callbacks.onStepStart?.(stepId, stepName);
|
||||
console.log(`\n📍 ${stepName}...`);
|
||||
|
||||
try {
|
||||
const result = await executor();
|
||||
if (result.usage) {
|
||||
this.state.usage.perStep.push(result.usage);
|
||||
this.state.usage.totalPromptTokens += result.usage.promptTokens;
|
||||
this.state.usage.totalCompletionTokens += result.usage.completionTokens;
|
||||
this.state.usage.totalCost += result.usage.cost;
|
||||
}
|
||||
|
||||
if (result.success) {
|
||||
const cost = result.usage?.cost ? ` ($${result.usage.cost.toFixed(4)})` : "";
|
||||
const duration = result.usage?.durationMs ? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]` : "";
|
||||
console.log(` ✅ ${stepName} complete${cost}${duration}`);
|
||||
this.callbacks.onStepComplete?.(stepId, result);
|
||||
} else {
|
||||
console.error(` ❌ ${stepName} failed: ${result.error}`);
|
||||
this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
|
||||
throw new Error(result.error);
|
||||
}
|
||||
} catch (err) {
|
||||
const errorMsg = (err as Error).message;
|
||||
this.callbacks.onStepError?.(stepId, errorMsg);
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the final Concept object.
|
||||
*/
|
||||
private buildProjectConcept(): ProjectConcept {
|
||||
return {
|
||||
domain: this.state.siteProfile?.domain || "unknown",
|
||||
timestamp: new Date().toISOString(),
|
||||
briefing: this.state.briefing,
|
||||
auditedFacts: this.state.auditedFacts || {},
|
||||
siteProfile: this.state.siteProfile,
|
||||
siteAudit: this.state.siteAudit,
|
||||
researchData: this.state.researchData,
|
||||
strategy: {
|
||||
briefingSummary: this.state.briefingSummary || "",
|
||||
designVision: this.state.designVision || "",
|
||||
},
|
||||
architecture: {
|
||||
websiteTopic: this.state.websiteTopic || "",
|
||||
sitemap: this.state.sitemap || [],
|
||||
},
|
||||
usage: this.state.usage,
|
||||
};
|
||||
}
|
||||
|
||||
  /**
   * Persist the finished concept (and a full debug trace of the internal
   * state) to `<outputDir>/concepts/`, then print a usage/cost summary.
   *
   * Two files are written per run, both keyed by company name + timestamp:
   *  - `<company>_<ts>.json`        — the final ProjectConcept
   *  - `<company>_<ts>_debug.json`  — the raw pipeline state for debugging
   *
   * @param concept - The assembled ProjectConcept to write.
   */
  private async saveState(concept: ProjectConcept): Promise<void> {
    // ':' and '.' are invalid/awkward in filenames on some platforms.
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const companyName = this.state.auditedFacts?.companyName || "unknown";

    const stateDir = path.join(this.config.outputDir, "concepts");
    await fs.mkdir(stateDir, { recursive: true });

    const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
    await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
    console.log(`\n📦 Saved Project Concept to: ${statePath}`);

    // Save debug trace — the complete internal state, not just the concept.
    const debugPath = path.join(stateDir, `${companyName}_${timestamp}_debug.json`);
    await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));

    // Print usage summary (only steps that actually incurred cost).
    console.log("\n──────────────────────────────────────────────");
    console.log("📊 PIPELINE USAGE SUMMARY");
    console.log("──────────────────────────────────────────────");
    for (const step of this.state.usage.perStep) {
      if (step.cost > 0) {
        console.log(` ${step.step}: ${step.model} — $${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`);
      }
    }
    console.log("──────────────────────────────────────────────");
    console.log(` TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
    console.log(` Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`);
    console.log("──────────────────────────────────────────────\n");
  }
|
||||
|
||||
  /**
   * Get the current internal pipeline state (for CLI inspection).
   *
   * Returns a live reference, not a copy — callers mutating the returned
   * object will affect the running pipeline.
   */
  getState(): ConceptState {
    return this.state;
  }
|
||||
}
|
||||
432
packages/concept-engine/src/scraper.ts
Normal file
432
packages/concept-engine/src/scraper.ts
Normal file
@@ -0,0 +1,432 @@
|
||||
// ============================================================================
|
||||
// Scraper — Zyte API + Local Persistence
|
||||
// Crawls all pages of a website, stores them locally for reuse.
|
||||
// ============================================================================
|
||||
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import * as fs from "node:fs/promises";
|
||||
import * as path from "node:path";
|
||||
import { existsSync } from "node:fs";
|
||||
import type { CrawledPage, PageType } from "./types.js";
|
||||
|
||||
/** Configuration for the site crawler. */
interface ScraperConfig {
  /** Zyte API key; when absent, pages are fetched via plain HTTP GET instead. */
  zyteApiKey?: string;
  /** Directory under which crawled HTML and per-page metadata are persisted. */
  crawlDir: string;
  /** Upper bound on pages to crawl; crawlSite() defaults this to 30. */
  maxPages?: number;
}
|
||||
|
||||
/**
|
||||
* Classify a URL pathname into a page type.
|
||||
*/
|
||||
function classifyPage(pathname: string): PageType {
|
||||
const p = pathname.toLowerCase();
|
||||
if (p === "/" || p === "" || p === "/index.html") return "home";
|
||||
if (p.includes("service") || p.includes("leistung") || p.includes("kompetenz"))
|
||||
return "service";
|
||||
if (p.includes("about") || p.includes("ueber") || p.includes("über") || p.includes("unternehmen"))
|
||||
return "about";
|
||||
if (p.includes("contact") || p.includes("kontakt")) return "contact";
|
||||
if (p.includes("job") || p.includes("karriere") || p.includes("career") || p.includes("human-resources"))
|
||||
return "career";
|
||||
if (p.includes("portfolio") || p.includes("referenz") || p.includes("projekt") || p.includes("case-study"))
|
||||
return "portfolio";
|
||||
if (p.includes("blog") || p.includes("news") || p.includes("aktuelles") || p.includes("magazin"))
|
||||
return "blog";
|
||||
if (p.includes("legal") || p.includes("impressum") || p.includes("datenschutz") || p.includes("privacy") || p.includes("agb"))
|
||||
return "legal";
|
||||
return "other";
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect interactive features present on a page.
|
||||
*/
|
||||
function detectFeatures($: cheerio.CheerioAPI): string[] {
|
||||
const features: string[] = [];
|
||||
|
||||
// Search
|
||||
if (
|
||||
$('input[type="search"]').length > 0 ||
|
||||
$('form[role="search"]').length > 0 ||
|
||||
$(".search-form, .search-box, #search, .searchbar").length > 0 ||
|
||||
$('input[name="q"], input[name="s"], input[name="search"]').length > 0
|
||||
) {
|
||||
features.push("search");
|
||||
}
|
||||
|
||||
// Forms (beyond search)
|
||||
const formCount = $("form").length;
|
||||
const searchForms = $('form[role="search"], .search-form').length;
|
||||
if (formCount > searchForms) {
|
||||
features.push("forms");
|
||||
}
|
||||
|
||||
// Maps
|
||||
if (
|
||||
$('iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]').length > 0
|
||||
) {
|
||||
features.push("maps");
|
||||
}
|
||||
|
||||
// Video
|
||||
if (
|
||||
$("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container").length > 0
|
||||
) {
|
||||
features.push("video");
|
||||
}
|
||||
|
||||
// Calendar / Events
|
||||
if ($(".calendar, .event, [data-calendar]").length > 0) {
|
||||
features.push("calendar");
|
||||
}
|
||||
|
||||
// Cookie consent
|
||||
if ($(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]").length > 0) {
|
||||
features.push("cookie-consent");
|
||||
}
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all internal links from a page.
|
||||
*/
|
||||
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
|
||||
const links: string[] = [];
|
||||
$("a[href]").each((_, el) => {
|
||||
const href = $(el).attr("href");
|
||||
if (!href) return;
|
||||
try {
|
||||
const url = new URL(href, origin);
|
||||
if (url.origin === origin) {
|
||||
// Skip assets
|
||||
if (/\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i.test(url.pathname)) return;
|
||||
// Skip anchors-only
|
||||
if (url.pathname === "/" && url.hash) return;
|
||||
links.push(url.pathname);
|
||||
}
|
||||
} catch {
|
||||
// Invalid URL, skip
|
||||
}
|
||||
});
|
||||
return [...new Set(links)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all images from a page.
|
||||
*/
|
||||
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
|
||||
const images: string[] = [];
|
||||
|
||||
// Regular img tags
|
||||
$("img[src]").each((_, el) => {
|
||||
const src = $(el).attr("src");
|
||||
if (src) images.push(src);
|
||||
});
|
||||
|
||||
// CSS background images (inline styles)
|
||||
$("[style*='background-image']").each((_, el) => {
|
||||
const style = $(el).attr("style");
|
||||
const match = style?.match(/url\(['"]?(.*?)['"]?\)/);
|
||||
if (match && match[1]) {
|
||||
images.push(match[1]);
|
||||
}
|
||||
});
|
||||
|
||||
// Resolve URLs to absolute
|
||||
const absoluteImages: string[] = [];
|
||||
for (const img of images) {
|
||||
if (img.startsWith("data:image")) continue; // Skip inline base64
|
||||
try {
|
||||
const url = new URL(img, origin);
|
||||
// Ignore small tracking pixels or generic vectors
|
||||
if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) continue;
|
||||
absoluteImages.push(url.href);
|
||||
} catch {
|
||||
// Invalid URL
|
||||
}
|
||||
}
|
||||
|
||||
return [...new Set(absoluteImages)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract services/competencies from text content.
|
||||
*/
|
||||
function extractServices(text: string): string[] {
|
||||
const services: string[] = [];
|
||||
// Common pattern: bulleted or newline-separated service lists
|
||||
const lines = text.split(/\n/).map((l) => l.trim()).filter((l) => l.length > 3 && l.length < 100);
|
||||
for (const line of lines) {
|
||||
// Skip generic boilerplate
|
||||
if (/cookie|datenschutz|impressum|copyright|©/i.test(line)) continue;
|
||||
if (/^(tel|fax|e-mail|mobil|web|http)/i.test(line)) continue;
|
||||
services.push(line);
|
||||
}
|
||||
return services;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a page via Zyte API with browser rendering.
|
||||
*/
|
||||
async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
|
||||
try {
|
||||
const resp = await axios.post(
|
||||
"https://api.zyte.com/v1/extract",
|
||||
{
|
||||
url,
|
||||
browserHtml: true,
|
||||
},
|
||||
{
|
||||
auth: { username: apiKey, password: "" },
|
||||
timeout: 60000,
|
||||
},
|
||||
);
|
||||
const html = resp.data.browserHtml || "";
|
||||
if (!html) {
|
||||
console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
|
||||
}
|
||||
return html;
|
||||
} catch (err: any) {
|
||||
if (err.response) {
|
||||
console.error(` ❌ Zyte API error ${err.response.status} for ${url}: ${err.response.data?.detail || err.response.statusText}`);
|
||||
// Rate limited — wait and retry once
|
||||
if (err.response.status === 429) {
|
||||
console.log(" ⏳ Rate limited, waiting 5s and retrying...");
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
return fetchWithZyte(url, apiKey);
|
||||
}
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a page via simple HTTP GET (fallback).
|
||||
*/
|
||||
async function fetchDirect(url: string): Promise<string> {
|
||||
const resp = await axios.get(url, {
|
||||
timeout: 30000,
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
||||
},
|
||||
});
|
||||
return typeof resp.data === "string" ? resp.data : "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse an HTML string into a CrawledPage.
|
||||
*/
|
||||
function parsePage(html: string, url: string): CrawledPage {
|
||||
const $ = cheerio.load(html);
|
||||
const urlObj = new URL(url);
|
||||
|
||||
const title = $("title").text().trim();
|
||||
const headings = $("h1, h2, h3")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((h) => h.length > 0);
|
||||
|
||||
const navItems = $("nav a")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((t) => t.length > 0 && t.length < 100);
|
||||
|
||||
const bodyText = $("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim();
|
||||
|
||||
const features = detectFeatures($);
|
||||
const links = extractInternalLinks($, urlObj.origin);
|
||||
const images = extractImages($, urlObj.origin);
|
||||
|
||||
const description = $('meta[name="description"]').attr("content") || undefined;
|
||||
const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
|
||||
const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
|
||||
|
||||
return {
|
||||
url,
|
||||
pathname: urlObj.pathname,
|
||||
title,
|
||||
html,
|
||||
text: bodyText,
|
||||
headings,
|
||||
navItems,
|
||||
features,
|
||||
type: classifyPage(urlObj.pathname),
|
||||
links,
|
||||
images,
|
||||
meta: { description, ogTitle, ogImage },
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Crawl a website breadth-first and persist every fetched page locally.
 *
 * If a completed crawl already exists on disk (marked by `_crawl_meta.json`),
 * it is loaded and returned instead of re-crawling. Otherwise up to
 * `config.maxPages` same-origin pages (default 30) are fetched — via Zyte
 * when an API key is configured, plain HTTP otherwise — and each page's
 * HTML plus a metadata JSON is written under `config.crawlDir/<domain>`.
 *
 * Note: a pathname counts toward the page budget once dequeued, even if
 * its fetch subsequently fails or returns a tiny response.
 *
 * @param targetUrl - Entry URL; its origin defines the crawl scope.
 * @param config - Crawl directory, optional Zyte key, optional page cap.
 * @returns All successfully fetched and parsed pages.
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const urlObj = new URL(targetUrl);
  const origin = urlObj.origin;
  const domain = urlObj.hostname;
  // Dots become dashes so the directory name is filesystem-friendly.
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));

  // Check for existing crawl — the manifest marks a completed crawl.
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }

  console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);

  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });

  const maxPages = config.maxPages || 30;
  // Visited is keyed by pathname, so the same path with different query
  // strings is fetched only once.
  const visited = new Set<string>();
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];

  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift()!;
    const urlPath = new URL(url).pathname;

    if (visited.has(urlPath)) continue;
    visited.add(urlPath);

    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);

      let html: string;
      if (config.zyteApiKey) {
        html = await fetchWithZyte(url, config.zyteApiKey);
      } else {
        html = await fetchDirect(url);
      }

      // Responses under 100 bytes are almost certainly errors/redirect stubs.
      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }

      const page = parsePage(html, url);
      pages.push(page);

      // Save HTML + metadata to disk. The metadata deliberately omits the
      // raw `html` and `text` fields (HTML lives in its own file).
      const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );

      // Discover new links (breadth-first: appended to the queue tail).
      for (const link of page.links) {
        if (!visited.has(link)) {
          const fullUrl = `${origin}${link}`;
          queue.push(fullUrl);
        }
      }
    } catch (err) {
      // A single failed page should not abort the whole crawl.
      console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
    }
  }

  // Save crawl metadata — written last so its presence implies completion.
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );

  console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
  return pages;
}
|
||||
|
||||
/**
|
||||
* Load a previously crawled site from disk.
|
||||
*/
|
||||
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
|
||||
const files = await fs.readdir(domainDir);
|
||||
const metaFiles = files.filter((f) => f.endsWith(".meta.json") && f !== "_crawl_meta.json");
|
||||
|
||||
const pages: CrawledPage[] = [];
|
||||
for (const metaFile of metaFiles) {
|
||||
const baseName = metaFile.replace(".meta.json", "");
|
||||
const htmlFile = `${baseName}.html`;
|
||||
|
||||
const meta = JSON.parse(await fs.readFile(path.join(domainDir, metaFile), "utf8"));
|
||||
let html = "";
|
||||
if (files.includes(htmlFile)) {
|
||||
html = await fs.readFile(path.join(domainDir, htmlFile), "utf8");
|
||||
}
|
||||
|
||||
const text = html
|
||||
? cheerio
|
||||
.load(html)("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim()
|
||||
: "";
|
||||
|
||||
pages.push({
|
||||
url: meta.url,
|
||||
pathname: meta.pathname,
|
||||
title: meta.title,
|
||||
html,
|
||||
text,
|
||||
headings: meta.headings || [],
|
||||
navItems: meta.navItems || [],
|
||||
features: meta.features || [],
|
||||
type: meta.type || "other",
|
||||
links: meta.links || [],
|
||||
images: meta.images || [],
|
||||
meta: meta.meta || {},
|
||||
});
|
||||
}
|
||||
|
||||
console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
|
||||
return pages;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a cached crawl to force re-crawl.
|
||||
*/
|
||||
export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
|
||||
const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
|
||||
if (existsSync(domainDir)) {
|
||||
await fs.rm(domainDir, { recursive: true, force: true });
|
||||
console.log(`🧹 Cleared crawl cache for ${domain}`);
|
||||
}
|
||||
}
|
||||
65
packages/concept-engine/src/steps/00a-site-audit.ts
Normal file
65
packages/concept-engine/src/steps/00a-site-audit.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
// ============================================================================
|
||||
// Step 00a: Site Audit (DataForSEO + AI)
|
||||
// ============================================================================
|
||||
|
||||
import { PageAuditor } from "@mintel/page-audit";
|
||||
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
|
||||
|
||||
export async function executeSiteAudit(
|
||||
state: ConceptState,
|
||||
config: PipelineConfig,
|
||||
): Promise<StepResult> {
|
||||
const startTime = Date.now();
|
||||
|
||||
if (!state.url) {
|
||||
return {
|
||||
success: true,
|
||||
data: null,
|
||||
usage: { step: "00a-site-audit", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const login = process.env.DATA_FOR_SEO_LOGIN || process.env.DATA_FOR_SEO_API_KEY?.split(":")?.[0];
|
||||
const password = process.env.DATA_FOR_SEO_PASSWORD || process.env.DATA_FOR_SEO_API_KEY?.split(":")?.slice(1)?.join(":");
|
||||
|
||||
if (!login || !password) {
|
||||
console.warn(" ⚠️ Site Audit skipped: DataForSEO credentials missing from environment.");
|
||||
return {
|
||||
success: true,
|
||||
data: null,
|
||||
usage: { step: "00a-site-audit", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
|
||||
};
|
||||
}
|
||||
|
||||
const auditor = new PageAuditor({
|
||||
dataForSeoLogin: login,
|
||||
dataForSeoPassword: password,
|
||||
openrouterKey: config.openrouterKey,
|
||||
outputDir: config.outputDir ? `${config.outputDir}/audits` : undefined,
|
||||
});
|
||||
|
||||
// Run audit (max 20 pages for the estimation phase to keep it fast)
|
||||
const result = await auditor.audit(state.url, { maxPages: 20 });
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: result,
|
||||
usage: {
|
||||
step: "00a-site-audit",
|
||||
model: "dataforseo",
|
||||
cost: 0, // DataForSEO cost tracking could be added later
|
||||
promptTokens: 0,
|
||||
completionTokens: 0,
|
||||
durationMs: Date.now() - startTime,
|
||||
},
|
||||
};
|
||||
} catch (err: any) {
|
||||
console.warn(` ⚠️ Site Audit failed, skipping: ${err.message}`);
|
||||
return {
|
||||
success: true,
|
||||
data: null,
|
||||
usage: { step: "00a-site-audit", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
|
||||
};
|
||||
}
|
||||
}
|
||||
121
packages/concept-engine/src/steps/00b-research.ts
Normal file
121
packages/concept-engine/src/steps/00b-research.ts
Normal file
@@ -0,0 +1,121 @@
|
||||
// ============================================================================
|
||||
// Step 00b: Research — Industry Research via @mintel/journaling (No LLM hallus)
|
||||
// Uses Serper API for real web search results about the industry/company.
|
||||
// ============================================================================
|
||||
|
||||
import type { ConceptState, StepResult } from "../types.js";
|
||||
|
||||
/** Grounded web-research findings collected by Step 00b. */
interface ResearchResult {
  /** Facts found about the company itself (capped at 5 entries). */
  companyContext: string[];
  /** Industry-level insights (capped at 5 entries). */
  industryInsights: string[];
  /** Competitor information — currently never populated by executeResearch. */
  competitorInfo: string[];
}
|
||||
|
||||
/**
 * Step 00b: research the company and industry using real web search data.
 *
 * Uses @mintel/journaling's ResearchAgent (dynamically imported) so results
 * are grounded in real sources rather than LLM recall. The step never fails
 * the pipeline: every error path resolves with `success: true` and empty
 * result arrays.
 *
 * NOTE: The journaling package can cause unhandled promise rejections that
 * would otherwise crash the process, so each agent call goes through
 * `safeCall`, which temporarily installs a process-level
 * "unhandledRejection" listener.
 *
 * @param state - Pipeline state; reads siteProfile (company name, services,
 *   domain) to build search queries.
 */
export async function executeResearch(
  state: ConceptState,
): Promise<StepResult<ResearchResult>> {
  const startTime = Date.now();

  const companyName = state.siteProfile?.companyInfo?.name || "";
  // Top 3 services double as a rough "industry topic" for the queries.
  const websiteTopic = state.siteProfile?.services?.slice(0, 3).join(", ") || "";
  const domain = state.siteProfile?.domain || "";

  // Nothing to search for — return an empty (but successful) result.
  if (!companyName && !websiteTopic && !domain) {
    return {
      success: true,
      data: { companyContext: [], industryInsights: [], competitorInfo: [] },
      usage: { step: "00b-research", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
    };
  }

  // Safety wrapper: catch ANY unhandled rejections during this step.
  // NOTE(review): the listener is process-global, so an unrelated unhandled
  // rejection elsewhere in the process during this window would also resolve
  // the call with `fallback` — acceptable here since fallback is "no data",
  // but worth confirming if this pattern is reused.
  const safeCall = <T>(fn: () => Promise<T>, fallback: T): Promise<T> => {
    return new Promise<T>((resolve) => {
      const handler = (err: any) => {
        console.warn(` ⚠️ Unhandled rejection caught in research: ${err?.message || err}`);
        process.removeListener("unhandledRejection", handler);
        resolve(fallback);
      };
      process.on("unhandledRejection", handler);

      fn()
        .then((result) => {
          process.removeListener("unhandledRejection", handler);
          resolve(result);
        })
        .catch((err) => {
          process.removeListener("unhandledRejection", handler);
          console.warn(` ⚠️ Research call failed: ${err?.message || err}`);
          resolve(fallback);
        });
    });
  };

  try {
    // Dynamic import keeps the journaling package optional at load time.
    const { ResearchAgent } = await import("@mintel/journaling");
    const agent = new ResearchAgent(process.env.OPENROUTER_API_KEY || "");

    const results: ResearchResult = {
      companyContext: [],
      industryInsights: [],
      competitorInfo: [],
    };

    // 1. Research the company itself
    if (companyName || domain) {
      const searchQuery = companyName
        ? `${companyName} ${websiteTopic} Unternehmen`
        : `site:${domain}`;

      console.log(` 🔍 Researching: "${searchQuery}"...`);
      const facts = await safeCall(
        () => agent.researchTopic(searchQuery),
        [] as any[],
      );
      // The agent's fact shape varies; accept any of the known field names.
      results.companyContext = (facts || [])
        .filter((f: any) => f?.fact || f?.value || f?.text || f?.statement)
        .map((f: any) => f.fact || f.value || f.text || f.statement)
        .slice(0, 5);
    }

    // 2. Industry research
    if (websiteTopic) {
      console.log(` 🔍 Researching industry: "${websiteTopic}"...`);
      const insights = await safeCall(
        () => agent.researchCompetitors(websiteTopic),
        [] as any[],
      );
      results.industryInsights = (insights || []).slice(0, 5);
    }

    const totalFacts = results.companyContext.length + results.industryInsights.length + results.competitorInfo.length;
    console.log(` 📊 Research found ${totalFacts} data points.`);

    return {
      success: true,
      data: results,
      usage: {
        step: "00b-research",
        model: "serper/datacommons",
        promptTokens: 0,
        completionTokens: 0,
        cost: 0,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    // Import failure or any other unexpected error: skip, don't abort.
    console.warn(` ⚠️ Research step skipped: ${(err as Error).message}`);
    return {
      success: true,
      data: { companyContext: [], industryInsights: [], competitorInfo: [] },
      usage: { step: "00b-research", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: Date.now() - startTime },
    };
  }
}
|
||||
108
packages/concept-engine/src/steps/01-extract.ts
Normal file
108
packages/concept-engine/src/steps/01-extract.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
// ============================================================================
|
||||
// Step 01: Extract — Briefing Fact Extraction (Gemini Flash)
|
||||
// ============================================================================
|
||||
|
||||
import { llmJsonRequest } from "../llm-client.js";
|
||||
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
|
||||
import { DEFAULT_MODELS } from "../types.js";
|
||||
|
||||
/**
 * Step 01: extract verifiable facts from the briefing via an LLM (flash model).
 *
 * Builds a site-analysis context block from the deterministic crawler output
 * (state.siteProfile) and sends it together with the briefing text to
 * `llmJsonRequest`, expecting a flat German-language JSON fact object.
 *
 * @param state - Pipeline state; reads briefing, comments, and siteProfile.
 * @param config - Pipeline configuration (model overrides, OpenRouter key).
 * @returns success with the parsed fact object and usage, or failure with
 *   an error message when the LLM call throws.
 */
export async function executeExtract(
  state: ConceptState,
  config: PipelineConfig,
): Promise<StepResult> {
  const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
  const startTime = Date.now();

  // Build site context from the deterministic analyzer.
  // The prompt text below is runtime data — it must stay exactly as-is.
  const siteContext = state.siteProfile
    ? `
EXISTING WEBSITE ANALYSIS (FACTS — verifiably crawled, NOT guessed):
- Domain: ${state.siteProfile.domain}
- Total pages crawled: ${state.siteProfile.totalPages}
- Navigation items: ${state.siteProfile.navigation.map((n) => n.label).join(", ") || "nicht erkannt"}
- Existing features: ${state.siteProfile.existingFeatures.join(", ") || "keine"}
- Services / Kompetenzen: ${state.siteProfile.services.join(" | ") || "keine"}
- Employee count (from website text): ${(state.siteProfile as any).employeeCount || "nicht genannt"}
- Company name: ${state.siteProfile.companyInfo.name || "unbekannt"}
- Address: ${state.siteProfile.companyInfo.address || "unbekannt"}
- Tax ID (USt-ID): ${state.siteProfile.companyInfo.taxId || "unbekannt"}
- HRB: ${state.siteProfile.companyInfo.registerNumber || "unbekannt"}
- Managing Director: ${state.siteProfile.companyInfo.managingDirector || "unbekannt"}
- External related domains (HAVE OWN WEBSITES — DO NOT include as sub-pages!): ${state.siteProfile.externalDomains.join(", ") || "keine"}
- Social links: ${Object.entries(state.siteProfile.socialLinks).map(([k, v]) => `${k}: ${v}`).join(", ") || "keine"}
`
    : "No existing website data available.";

  const systemPrompt = `
You are a precision fact extractor. Your only job: extract verifiable facts from the BRIEFING.
Output language: GERMAN (strict).
Output format: flat JSON at root level. No nesting except arrays.

### CRITICAL RULES:
1. "employeeCount": take from SITE ANALYSIS if available. Only override if briefing states something more specific.
2. External domains (e.g. "etib-ing.com") have their OWN website. NEVER include them as sub-pages.
3. Videos (Messefilm, Imagefilm) are CONTENT ASSETS, not pages.
4. If existing site already has search, include "search" in functions.
5. DO NOT invent pages not mentioned in briefing or existing navigation.

### CONSERVATIVE RULE:
- simple lists (Jobs, Referenzen, Messen) = pages, NOT features
- Assume "page" as default. Only add "feature" for complex interactive systems.

### OUTPUT FORMAT:
{
"companyName": string,
"companyAddress": string,
"personName": string,
"email": string,
"existingWebsite": string,
"websiteTopic": string, // MAX 3 words
"isRelaunch": boolean,
"employeeCount": string, // from site analysis, e.g. "über 50"
"pages": string[], // ALL pages: ["Startseite", "Über Uns", "Leistungen", ...]
"functions": string[], // search, forms, maps, video, cookie_consent, etc.
"assets": string[], // existing_website, logo, media, photos, videos
"deadline": string,
"targetAudience": string,
"cmsSetup": boolean,
"multilang": boolean
}

BANNED OUTPUT KEYS: "selectedPages", "otherPages", "features", "apiSystems" — use pages[] and functions[] ONLY.
`;

  const userPrompt = `BRIEFING (TRUTH SOURCE):
${state.briefing}

COMMENTS:
${state.comments || "keine"}

${siteContext}`;

  try {
    const { data, usage } = await llmJsonRequest({
      model: models.flash,
      systemPrompt,
      userPrompt,
      apiKey: config.openrouterKey,
    });

    return {
      success: true,
      data,
      usage: {
        step: "01-extract",
        model: models.flash,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        cost: usage.cost,
        durationMs: Date.now() - startTime,
      },
    };
  } catch (err) {
    return {
      success: false,
      error: `Extract step failed: ${(err as Error).message}`,
    };
  }
}
|
||||
110
packages/concept-engine/src/steps/02-audit.ts
Normal file
110
packages/concept-engine/src/steps/02-audit.ts
Normal file
@@ -0,0 +1,110 @@
|
||||
// ============================================================================
|
||||
// Step 02: Audit — Feature Auditor + Skeptical Review (Gemini Flash)
|
||||
// ============================================================================
|
||||
|
||||
import { llmJsonRequest } from "../llm-client.js";
|
||||
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
|
||||
import { DEFAULT_MODELS } from "../types.js";
|
||||
|
||||
export async function executeAudit(
|
||||
state: ConceptState,
|
||||
config: PipelineConfig,
|
||||
): Promise<StepResult> {
|
||||
const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
|
||||
const startTime = Date.now();
|
||||
|
||||
if (!state.facts) {
|
||||
return { success: false, error: "No facts from Step 01 available." };
|
||||
}
|
||||
|
||||
const systemPrompt = `
|
||||
You are a "Strict Cost Controller". Your mission is to prevent over-billing.
|
||||
Review the extracted FEATURES against the BRIEFING and the EXISTING SITE ANALYSIS.
|
||||
|
||||
### RULE OF THUMB:
|
||||
- A "Feature" (1.500 €) is ONLY justified for complex, dynamic systems (logic, database, CMS-driven management, advanced filtering).
|
||||
- Simple lists, information sections, or static descriptions (e.g., "Messen", "Team", "Historie", "Jobs" as mere text) are ALWAYS "Pages" (600 €).
|
||||
- If the briefing doesn't explicitly mention "Management System", "Filterable Database", or "Client Login", it is a PAGE.
|
||||
|
||||
### ADDITIONAL CHECKS:
|
||||
1. If any feature maps to an entity that has its own external website (listed in EXTERNAL_DOMAINS), remove it entirely — it's out of scope.
|
||||
2. Videos are ASSETS not pages. Remove any video-related entries from pages.
|
||||
3. If the existing site has features (search, forms, etc.), ensure they are in the functions list.
|
||||
|
||||
### MISSION:
|
||||
Return the corrected 'features', 'otherPages', and 'functions' arrays.
|
||||
|
||||
### OUTPUT FORMAT:
|
||||
{
|
||||
"features": string[],
|
||||
"otherPages": string[],
|
||||
"functions": string[],
|
||||
"removedItems": [{ "item": string, "reason": string }],
|
||||
"addedItems": [{ "item": string, "reason": string }]
|
||||
}
|
||||
`;
|
||||
|
||||
const userPrompt = `
|
||||
EXTRACTED FACTS:
|
||||
${JSON.stringify(state.facts, null, 2)}
|
||||
|
||||
BRIEFING:
|
||||
${state.briefing}
|
||||
|
||||
EXTERNAL DOMAINS (have own websites, OUT OF SCOPE):
|
||||
${state.siteProfile?.externalDomains?.join(", ") || "none"}
|
||||
|
||||
EXISTING FEATURES ON CURRENT SITE:
|
||||
${state.siteProfile?.existingFeatures?.join(", ") || "none"}
|
||||
`;
|
||||
|
||||
try {
|
||||
const { data, usage } = await llmJsonRequest({
|
||||
model: models.flash,
|
||||
systemPrompt,
|
||||
userPrompt,
|
||||
apiKey: config.openrouterKey,
|
||||
});
|
||||
|
||||
// Apply audit results to facts
|
||||
const auditedFacts = { ...state.facts };
|
||||
auditedFacts.features = data.features || [];
|
||||
auditedFacts.otherPages = [
|
||||
...new Set([...(auditedFacts.otherPages || []), ...(data.otherPages || [])]),
|
||||
];
|
||||
if (data.functions) {
|
||||
auditedFacts.functions = [
|
||||
...new Set([...(auditedFacts.functions || []), ...data.functions]),
|
||||
];
|
||||
}
|
||||
|
||||
// Log changes
|
||||
if (data.removedItems?.length) {
|
||||
console.log(" 📉 Audit removed:");
|
||||
for (const item of data.removedItems) {
|
||||
console.log(` - ${item.item}: ${item.reason}`);
|
||||
}
|
||||
}
|
||||
if (data.addedItems?.length) {
|
||||
console.log(" 📈 Audit added:");
|
||||
for (const item of data.addedItems) {
|
||||
console.log(` + ${item.item}: ${item.reason}`);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: auditedFacts,
|
||||
usage: {
|
||||
step: "02-audit",
|
||||
model: models.flash,
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
cost: usage.cost,
|
||||
durationMs: Date.now() - startTime,
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
return { success: false, error: `Audit step failed: ${(err as Error).message}` };
|
||||
}
|
||||
}
|
||||
99
packages/concept-engine/src/steps/03-strategize.ts
Normal file
99
packages/concept-engine/src/steps/03-strategize.ts
Normal file
@@ -0,0 +1,99 @@
|
||||
// ============================================================================
|
||||
// Step 03: Strategize — Briefing Summary + Design Vision (Gemini Pro)
|
||||
// ============================================================================
|
||||
|
||||
import { llmJsonRequest } from "../llm-client.js";
|
||||
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
|
||||
import { DEFAULT_MODELS } from "../types.js";
|
||||
|
||||
export async function executeStrategize(
|
||||
state: ConceptState,
|
||||
config: PipelineConfig,
|
||||
): Promise<StepResult> {
|
||||
const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
|
||||
const startTime = Date.now();
|
||||
|
||||
if (!state.auditedFacts) {
|
||||
return { success: false, error: "No audited facts from Step 02 available." };
|
||||
}
|
||||
|
||||
const systemPrompt = `
|
||||
You are a high-end Digital Architect. Your goal is to make the CUSTOMER feel 100% understood.
|
||||
Analyze the BRIEFING and the EXISTING WEBSITE context.
|
||||
|
||||
### OBJECTIVE:
|
||||
1. **briefingSummary**: Ein sachlicher, tiefgehender Überblick der Unternehmenslage.
|
||||
- STIL: Keine Ich-Form. Keine Marketing-Floskeln. Nutze präzise Fachbegriffe. Sei prägnant.
|
||||
- FORM: EXAKT ZWEI ABSÄTZE. Insgesamt ca. 6 Sätze.
|
||||
- INHALT: Status Quo, was der Kunde will, welcher Sprung notwendig ist.
|
||||
- ABSOLUTE REGEL: Keine Halluzinationen. Keine namentlichen Nennungen von Personen.
|
||||
- RELAUNCH-REGEL: Wenn isRelaunch=true, NICHT sagen "keine digitale Präsenz". Es GIBT eine Seite.
|
||||
- SORGLOS BETRIEB: MUSS erwähnt werden als Teil des Gesamtpakets.
|
||||
|
||||
2. **designVision**: Ein abstraktes, strategisches Konzept.
|
||||
- STIL: Rein konzeptionell. Keine Umsetzungsschritte. Keine Ich-Form. Sei prägnant.
|
||||
- FORM: EXAKT ZWEI ABSÄTZE. Insgesamt ca. 4 Sätze.
|
||||
- DATENSCHUTZ: KEINERLEI namentliche Nennungen.
|
||||
- FOKUS: Welche strategische Wirkung soll erzielt werden?
|
||||
|
||||
### RULES:
|
||||
- NO "wir/unser". NO "Ich/Mein". Objective, fact-oriented narrative.
|
||||
- NO marketing lingo. NO "innovativ", "revolutionär", "state-of-the-art".
|
||||
- NO hallucinations about features not in the briefing.
|
||||
- NO "SEO-Standards zur Fachkräftesicherung" or "B2B-Nutzerströme" — das ist Schwachsinn.
|
||||
Use specific industry terms from the briefing (e.g. "Kabeltiefbau", "HDD-Bohrverfahren").
|
||||
- LANGUAGE: Professional German. Simple but expert-level.
|
||||
|
||||
### OUTPUT FORMAT:
|
||||
{
|
||||
"briefingSummary": string,
|
||||
"designVision": string
|
||||
}
|
||||
`;
|
||||
|
||||
const userPrompt = `
|
||||
BRIEFING (TRUTH SOURCE):
|
||||
${state.briefing}
|
||||
|
||||
EXISTING WEBSITE DATA:
|
||||
- Services: ${state.siteProfile?.services?.join(", ") || "unbekannt"}
|
||||
- Navigation: ${state.siteProfile?.navigation?.map((n) => n.label).join(", ") || "unbekannt"}
|
||||
- Company: ${state.auditedFacts.companyName || "unbekannt"}
|
||||
|
||||
EXTRACTED & AUDITED FACTS:
|
||||
${JSON.stringify(state.auditedFacts, null, 2)}
|
||||
|
||||
${state.siteAudit?.report ? `
|
||||
TECHNICAL SITE AUDIT (IST-Analyse):
|
||||
Health: ${state.siteAudit.report.overallHealth} (SEO: ${state.siteAudit.report.seoScore}, UX: ${state.siteAudit.report.uxScore}, Perf: ${state.siteAudit.report.performanceScore})
|
||||
- Executive Summary: ${state.siteAudit.report.executiveSummary}
|
||||
- Strengths: ${state.siteAudit.report.strengths.join(", ")}
|
||||
- Critical Issues: ${state.siteAudit.report.criticalIssues.join(", ")}
|
||||
- Quick Wins: ${state.siteAudit.report.quickWins.join(", ")}
|
||||
` : ""}
|
||||
`;
|
||||
|
||||
try {
|
||||
const { data, usage } = await llmJsonRequest({
|
||||
model: models.pro,
|
||||
systemPrompt,
|
||||
userPrompt,
|
||||
apiKey: config.openrouterKey,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data,
|
||||
usage: {
|
||||
step: "03-strategize",
|
||||
model: models.pro,
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
cost: usage.cost,
|
||||
durationMs: Date.now() - startTime,
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
return { success: false, error: `Strategize step failed: ${(err as Error).message}` };
|
||||
}
|
||||
}
|
||||
133
packages/concept-engine/src/steps/04-architect.ts
Normal file
133
packages/concept-engine/src/steps/04-architect.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
// ============================================================================
|
||||
// Step 04: Architect — Sitemap & Information Architecture (Gemini Pro)
|
||||
// ============================================================================
|
||||
|
||||
import { llmJsonRequest } from "../llm-client.js";
|
||||
import type { ConceptState, StepResult, PipelineConfig } from "../types.js";
|
||||
import { DEFAULT_MODELS } from "../types.js";
|
||||
|
||||
export async function executeArchitect(
|
||||
state: ConceptState,
|
||||
config: PipelineConfig,
|
||||
): Promise<StepResult> {
|
||||
const models = { ...DEFAULT_MODELS, ...config.modelsOverride };
|
||||
const startTime = Date.now();
|
||||
|
||||
if (!state.auditedFacts) {
|
||||
return { success: false, error: "No audited facts available." };
|
||||
}
|
||||
|
||||
// Build navigation constraint from the real site
|
||||
const existingNav = state.siteProfile?.navigation?.map((n) => n.label).join(", ") || "unbekannt";
|
||||
const existingServices = state.siteProfile?.services?.join(", ") || "unbekannt";
|
||||
const externalDomains = state.siteProfile?.externalDomains?.join(", ") || "keine";
|
||||
|
||||
const systemPrompt = `
|
||||
Du bist ein Senior UX Architekt. Erstelle einen ECHTEN SEITENBAUM für die neue Website.
|
||||
Regelwerk für den Output:
|
||||
|
||||
### SEITENBAUM-REGELN:
|
||||
1. KEIN MARKETINGSPRECH als Kategoriename. Gültige Kategorien sind nur die echten Navigationspunkte der Website.
|
||||
ERLAUBT: "Startseite", "Leistungen", "Über uns", "Karriere", "Referenzen", "Kontakt", "Rechtliches"
|
||||
VERBOTEN: "Kern-Präsenz", "Vertrauen", "Business Areas", "Digitaler Auftritt"
|
||||
|
||||
2. LEISTUNGEN muss in ECHTE UNTERSEITEN aufgeteilt werden — nicht eine einzige "Leistungen"-Seite.
|
||||
Jede Kompetenz aus dem existierenden Leistungsspektrum = eine eigene Seite.
|
||||
Beispiel statt:
|
||||
{ category: "Leistungen", pages: [{ title: "Leistungen", desc: "..." }] }
|
||||
So:
|
||||
{ category: "Leistungen", pages: [
|
||||
{ title: "Kabeltiefbau", desc: "Mittelspannung, Niederspannung, Kabelpflugarbeiten..." },
|
||||
{ title: "Horizontalspülbohrungen", desc: "HDD in allen Bodenklassen..." },
|
||||
{ title: "Elektromontagen", desc: "Bis 110 kV, Glasfaserkabelmontagen..." },
|
||||
{ title: "Planung & Dokumentation", desc: "Genehmigungs- und Ausführungsplanung, Vermessung..." }
|
||||
]}
|
||||
|
||||
3. SEITENTITEL: Kurz, klar, faktisch. Kein Werbejargon.
|
||||
ERLAUBT: "Kabeltiefbau", "Über uns", "Karriere"
|
||||
VERBOTEN: "Unsere Expertise", "Kompetenzspektrum", "Community"
|
||||
|
||||
4. Gruppe die Leistungen nach dem ECHTEN Kompetenzkatalog der bestehenden Site — nicht erfinden.
|
||||
|
||||
5. Keine doppelten Seiten. Keine Phantomseiten.
|
||||
|
||||
6. Videos = Content-Assets, keine eigene Seite.
|
||||
|
||||
7. Entitäten mit eigener Domain (${externalDomains}) = NICHT als Seite. Nur als Teaser/Link wenn nötig.
|
||||
|
||||
### KONTEXT:
|
||||
Bestehende Navigation: ${existingNav}
|
||||
Bestehende Services: ${existingServices}
|
||||
Externe Domains (haben eigene Website): ${externalDomains}
|
||||
Angeforderte zusätzliche Seiten aus Briefing: ${(state.auditedFacts as any)?.pages?.join(", ") || "keine spezifischen"}
|
||||
|
||||
### OUTPUT FORMAT (JSON):
|
||||
{
|
||||
"websiteTopic": string, // MAX 3 Wörter, beschreibend
|
||||
"sitemap": [
|
||||
{
|
||||
"category": string, // Echter Nav-Eintrag. KEIN Marketingsprech.
|
||||
"pages": [
|
||||
{ "title": string, "desc": string } // Echte Unterseite, 1-2 Sätze Zweck
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
`;
|
||||
|
||||
const userPrompt = `
|
||||
BRIEFING:
|
||||
${state.briefing}
|
||||
|
||||
FAKTEN (aus Extraktion):
|
||||
${JSON.stringify({ facts: state.auditedFacts, strategy: { briefingSummary: state.briefingSummary } }, null, 2)}
|
||||
|
||||
Erstelle den Seitenbaum. Baue die Leistungen DETAILLIERT aus — echte Unterseiten pro Kompetenzbereich.
|
||||
`;
|
||||
|
||||
try {
|
||||
const { data, usage } = await llmJsonRequest({
|
||||
model: models.pro,
|
||||
systemPrompt,
|
||||
userPrompt,
|
||||
apiKey: config.openrouterKey,
|
||||
});
|
||||
|
||||
// Normalize sitemap structure
|
||||
let sitemap = data.sitemap;
|
||||
if (sitemap && !Array.isArray(sitemap)) {
|
||||
if (sitemap.categories) sitemap = sitemap.categories;
|
||||
else {
|
||||
const entries = Object.entries(sitemap);
|
||||
if (entries.every(([, v]) => Array.isArray(v))) {
|
||||
sitemap = entries.map(([category, pages]) => ({ category, pages }));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(sitemap)) {
|
||||
sitemap = sitemap.map((cat: any) => ({
|
||||
category: cat.category || cat.kategorie || cat.Kategorie || "Allgemein",
|
||||
pages: (cat.pages || cat.seiten || []).map((page: any) => ({
|
||||
title: page.title || page.titel || "Seite",
|
||||
desc: page.desc || page.beschreibung || page.description || "",
|
||||
})),
|
||||
}));
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: { websiteTopic: data.websiteTopic, sitemap },
|
||||
usage: {
|
||||
step: "04-architect",
|
||||
model: models.pro,
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
cost: usage.cost,
|
||||
durationMs: Date.now() - startTime,
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
return { success: false, error: `Architect step failed: ${(err as Error).message}` };
|
||||
}
|
||||
}
|
||||
233
packages/concept-engine/src/types.ts
Normal file
233
packages/concept-engine/src/types.ts
Normal file
@@ -0,0 +1,233 @@
|
||||
// ============================================================================
// @mintel/concept-engine — Core Type Definitions
// ============================================================================

/** Page types recognized during crawling. */
export type PageType =
  | "home"
  | "service"
  | "about"
  | "contact"
  | "career"
  | "portfolio"
  | "blog"
  | "legal"
  | "other";

/**
 * A single crawled page with extracted metadata.
 *
 * Holds both the raw capture (`html`, `text`) and structural data pulled
 * out of it (headings, nav items, links, images, meta tags).
 */
export interface CrawledPage {
  // Absolute URL the page was fetched from.
  url: string;
  // Path component of `url`.
  pathname: string;
  // Document title.
  title: string;
  // Raw HTML of the page.
  html: string;
  // Plain-text extraction of the page content.
  text: string;
  // Heading texts found on the page.
  headings: string[];
  // Labels of navigation entries found on the page.
  navItems: string[];
  // Detected feature markers — exact vocabulary defined by the crawler; confirm there.
  features: string[];
  // Classified page type.
  type: PageType;
  // Outgoing link URLs.
  links: string[];
  // Image URLs found on the page.
  images: string[];
  // Selected meta / OpenGraph tags.
  meta: {
    description?: string;
    ogTitle?: string;
    ogImage?: string;
  };
}

/** Navigation item extracted from <nav> elements. */
export interface NavItem {
  // Visible link text.
  label: string;
  // Link target.
  href: string;
  // Nested sub-navigation entries, if any.
  children?: NavItem[];
}

/**
 * Company info extracted from Impressum / footer.
 * All fields are best-effort and may be absent.
 */
export interface CompanyInfo {
  name?: string;
  address?: string;
  phone?: string;
  email?: string;
  // Tax identifier (presumably USt-IdNr.) — confirm against the extractor.
  taxId?: string;
  // Commercial register number (presumably HRB) — confirm against the extractor.
  registerNumber?: string;
  managingDirector?: string;
}
|
||||
|
||||
/**
 * A page in the site inventory — a condensed, LLM-friendly view of a
 * crawled page (no raw HTML).
 */
export interface PageInventoryItem {
  url: string;
  pathname: string;
  title: string;
  type: PageType;
  headings: string[];
  // Service names detected on this page.
  services: string[];
  hasSearch: boolean;
  hasForms: boolean;
  hasMap: boolean;
  hasVideo: boolean;
  // Short textual summary of the page content.
  contentSummary: string;
}

/** Full site profile — deterministic, no LLM involved. */
export interface SiteProfile {
  domain: string;
  // Crawl timestamp — presumably ISO 8601; confirm against the crawler.
  crawledAt: string;
  totalPages: number;
  navigation: NavItem[];
  // Features already present on the existing site (search, forms, …).
  existingFeatures: string[];
  services: string[];
  companyInfo: CompanyInfo;
  pageInventory: PageInventoryItem[];
  // Colors detected on the site — presumably brand colors; confirm against the crawler.
  colors: string[];
  // Social network name → profile URL.
  socialLinks: Record<string, string>;
  // Domains of related entities that have their own website (treated as out of scope).
  externalDomains: string[];
  images: string[];
  // Employee count as found in the site text, or null when not detected.
  employeeCount: string | null;
}
|
||||
|
||||
/** Configuration for the estimation pipeline. */
export interface PipelineConfig {
  // OpenRouter API key used for all LLM calls.
  openrouterKey: string;
  // Optional Zyte API key — presumably for crawling; confirm against the crawler.
  zyteApiKey?: string;
  // Directory where pipeline results are written.
  outputDir: string;
  // Directory where crawl data is stored.
  crawlDir: string;
  // Per-tier model overrides; steps merge these over DEFAULT_MODELS.
  modelsOverride?: Partial<ModelConfig>;
}
|
||||
|
||||
/**
 * Model routing configuration: OpenRouter model identifiers per
 * capability tier.
 */
export interface ModelConfig {
  // Fast, low-cost model (used by the extract and audit steps).
  flash: string;
  // Stronger model (used by the strategize and architect steps).
  pro: string;
  // Highest-capability model — usage not visible in this file.
  opus: string;
}

/** Default model routing; runs may override via PipelineConfig.modelsOverride. */
export const DEFAULT_MODELS: ModelConfig = {
  flash: "google/gemini-3-flash-preview",
  pro: "google/gemini-3.1-pro-preview",
  opus: "anthropic/claude-opus-4-6",
};
|
||||
|
||||
/** Input for a pipeline run. */
export interface PipelineInput {
  // Customer briefing text (the primary truth source for all steps).
  briefing: string;
  // URL of the customer's existing website, if any.
  url?: string;
  // Budget hint — free-form string.
  budget?: string;
  // Additional free-form comments.
  comments?: string;
  // Presumably discards cached crawl data when true — confirm against the pipeline.
  clearCache?: boolean;
}

/**
 * State that flows through all concept pipeline steps.
 * Each step reads earlier outputs and contributes its own.
 */
export interface ConceptState {
  // Input
  briefing: string;
  url?: string;
  comments?: string;

  // Output: Scrape & Analyze
  siteProfile?: SiteProfile;
  crawlDir?: string;

  // Output: Site Audit
  // NOTE(review): untyped; steps read `siteAudit.report.{overallHealth, seoScore,
  // uxScore, performanceScore, executiveSummary, strengths, criticalIssues,
  // quickWins}` — consider a concrete interface.
  siteAudit?: any;

  // Output: Research
  // NOTE(review): untyped; shape not visible in this file.
  researchData?: any;

  // Output: Extract (Step 01)
  facts?: Record<string, any>;

  // Output: Audit (Step 02)
  auditedFacts?: Record<string, any>;

  // Output: Strategy (Step 03)
  briefingSummary?: string;
  designVision?: string;

  // Output: Architecture (Step 04)
  sitemap?: SitemapCategory[];
  websiteTopic?: string;

  // Cost tracking
  usage: UsageStats;
}
|
||||
|
||||
/** Final output of the Concept Engine. */
export interface ProjectConcept {
  domain: string;
  // When the concept was produced — presumably ISO 8601; confirm at the call site.
  timestamp: string;
  briefing: string;
  auditedFacts: Record<string, any>;
  siteProfile?: SiteProfile;
  siteAudit?: any;
  researchData?: any;
  // Step 03 output.
  strategy: {
    briefingSummary: string;
    designVision: string;
  };
  // Step 04 output.
  architecture: {
    // Descriptive topic of the website (the prompt requests max 3 words).
    websiteTopic: string;
    sitemap: SitemapCategory[];
  };
  usage: UsageStats;
}

/** One sitemap category (a real navigation entry) with its sub-pages. */
export interface SitemapCategory {
  category: string;
  // Each page: short title plus a 1–2 sentence purpose description.
  pages: { title: string; desc: string }[];
}

/** Aggregated token/cost usage across all pipeline steps. */
export interface UsageStats {
  totalPromptTokens: number;
  totalCompletionTokens: number;
  // Total cost — presumably USD; confirm against the LLM client.
  totalCost: number;
  perStep: StepUsage[];
}

/** Usage record for a single pipeline step. */
export interface StepUsage {
  // Step identifier, e.g. "02-audit".
  step: string;
  // Model identifier used for the step.
  model: string;
  promptTokens: number;
  completionTokens: number;
  cost: number;
  durationMs: number;
}
|
||||
|
||||
/** Result of a single pipeline step. */
export interface StepResult<T = any> {
  // True when the step completed; `data` (and usually `usage`) are then set.
  success: boolean;
  // Step output; shape depends on the step.
  data?: T;
  // Human-readable error when `success` is false.
  error?: string;
  // Token/cost accounting, present when the step made an LLM call.
  usage?: StepUsage;
}

/** Validation result from the deterministic validator. */
export interface ValidationResult {
  passed: boolean;
  errors: ValidationError[];
  warnings: ValidationWarning[];
}

/** A hard validation failure. */
export interface ValidationError {
  // Machine-readable error code.
  code: string;
  message: string;
  // Field the error refers to, when applicable.
  field?: string;
  expected?: any;
  actual?: any;
}

/** A non-fatal validation finding. */
export interface ValidationWarning {
  code: string;
  message: string;
  // Suggested remediation, when available.
  suggestion?: string;
}

/** Step definition for the concept pipeline. */
export interface PipelineStep {
  // Stable step identifier, e.g. "02-audit".
  id: string;
  // Display name.
  name: string;
  description: string;
  // Model tier used by the step, or "none" for deterministic steps.
  model: "flash" | "pro" | "opus" | "none";
  // Executes the step against the shared state.
  execute: (
    state: ConceptState,
    config: PipelineConfig,
  ) => Promise<StepResult>;
}
|
||||
Reference in New Issue
Block a user