From 8e99c9d1215df18024d30f010d38cd1ca9619dff Mon Sep 17 00:00:00 2001 From: Marc Mintel Date: Sat, 7 Mar 2026 15:39:10 +0100 Subject: [PATCH] feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion - Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO) - Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync - Integrate kabelhandbuch.txt parsing into /api/sync-qdrant boot route - Add .gitignore entries for kabelhandbuch.txt --- .gitignore | 5 ++- app/api/sync-qdrant/route.ts | 41 ++++++++++++++++++++- src/lib/qdrant.ts | 37 +++++++------------ src/payload/collections/Pages.ts | 60 +++++++++++++++++++++++++++++++ src/payload/collections/Posts.ts | 61 ++++++++++++++++++++++++++++++++ src/scripts/ingest-pdf.ts | 1 + 6 files changed, 178 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 3d1e3e1f..33964874 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ backups/ .env # Payload CMS auto-generated -app/(payload)/admin/importMap.js \ No newline at end of file +app/(payload)/admin/importMap.js + +# Knowledge base source files +kabelhandbuch.txt \ No newline at end of file diff --git a/app/api/sync-qdrant/route.ts b/app/api/sync-qdrant/route.ts index bed2dbda..b82900f4 100644 --- a/app/api/sync-qdrant/route.ts +++ b/app/api/sync-qdrant/route.ts @@ -106,8 +106,47 @@ export async function GET() { } } + // ── Kabelhandbuch (Static Text) ── + const os = require('os'); + const path = require('path'); + const fs = require('fs'); + const crypto = await import('crypto'); + + const txtPath = path.join(process.cwd(), 'kabelhandbuch.txt'); + let manualChunks = 0; + + if (fs.existsSync(txtPath)) { + try { + const text = fs.readFileSync(txtPath, 'utf8'); + const chunks = text + .split(/\n\s*\n/) + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 50); + + for (let i = 0; i < chunks.length; i++) { + const chunkText = chunks[i]; + const syntheticId = crypto.randomUUID(); + + await upsertProductVector(syntheticId, chunkText, { + type: 'knowledge', + content: chunkText, + data: { + title: `Kabelhandbuch Wissen - Bereich ${i + 1}`, + source: 'Kabelhandbuch KLZ.pdf', + }, + }); + manualChunks++; + } + console.log(`[Qdrant Sync] ✅ ${manualChunks} Kabelhandbuch-Chunks synced`); + } catch (e: any) { + results.errors.push(`kabelhandbuch: ${e.message}`); + } + } else { + console.log(`[Qdrant Sync] ⚠️ skipped Kabelhandbuch: ${txtPath} not found`); + } + console.log( - `[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced`, + `[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced, ${manualChunks} manual chunks synced`, ); return NextResponse.json({ diff --git a/src/lib/qdrant.ts b/src/lib/qdrant.ts index 6ed12b75..c477e920 100644 --- a/src/lib/qdrant.ts +++ b/src/lib/qdrant.ts @@ -16,7 +16,7 @@ export const qdrant = new QdrantClient({ }); export const COLLECTION_NAME = 'klz_products'; -export const VECTOR_SIZE = 1536; // OpenAI text-embedding-3-small +export const VECTOR_SIZE = 1024; // Mistral mistral-embed // Cache TTLs const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic @@ -50,26 +50,15 @@ export async function ensureCollection() { } /** - * Simple hash for cache keys + * Hash text for cache key */ function hashKey(text: string): string { - let hash = 0; - for (let i = 0; i < text.length; i++) { - const chr = text.charCodeAt(i); - hash = (hash << 5) - hash + chr; - hash |= 0; - } - return hash.toString(36); + const { createHash } = require('crypto'); + return createHash('sha256').update(text).digest('hex').slice(0, 32); } /** - * Generate an embedding for a given text using OpenRouter (OpenAI embedding proxy). - * Results are cached in Redis for 24h since embeddings are deterministic. - * - * NOTE: We keep OpenRouter for embeddings because the Qdrant collection uses 1536-dim - * vectors (OpenAI text-embedding-3-small). Switching to Mistral embed (1024-dim) would - * require re-indexing the entire product catalog. - * User-facing chat uses Mistral AI directly for DSGVO compliance. + * Generate embedding using Mistral API (EU/DSGVO-compliant) */ export async function generateEmbedding(text: string): Promise { const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`; @@ -84,22 +73,20 @@ export async function generateEmbedding(text: string): Promise { // Redis down — proceed without cache } - const openRouterKey = process.env.OPENROUTER_API_KEY; - if (!openRouterKey) { - throw new Error('OPENROUTER_API_KEY is not set'); + const mistralKey = process.env.MISTRAL_API_KEY; + if (!mistralKey) { + throw new Error('MISTRAL_API_KEY is not set'); } - const response = await fetch('https://openrouter.ai/api/v1/embeddings', { + const response = await fetch('https://api.mistral.ai/v1/embeddings', { method: 'POST', headers: { - Authorization: `Bearer ${openRouterKey}`, + Authorization: `Bearer ${mistralKey}`, 'Content-Type': 'application/json', - 'HTTP-Referer': process.env.NEXT_PUBLIC_BASE_URL || 'https://klz-cables.com', - 'X-Title': 'KLZ Cables Search AI', }, body: JSON.stringify({ - model: 'openai/text-embedding-3-small', - input: text, + model: 'mistral-embed', + input: [text], }), }); diff --git a/src/payload/collections/Pages.ts b/src/payload/collections/Pages.ts index 517ce61f..03de368f 100644 --- a/src/payload/collections/Pages.ts +++ b/src/payload/collections/Pages.ts @@ -26,6 +26,66 @@ export const Pages: CollectionConfig = { }; }, }, + hooks: { + afterChange: [ + async ({ doc, req }) => { + // Run index sync asynchronously to not block the CMS save operation + setTimeout(async () => { + try { + const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant'); + + // Check if page is published + if (doc._status !== 'published') { + await deleteProductVector(`page_${doc.id}`); + req.payload.logger.info(`Removed drafted page ${doc.slug} from Qdrant`); + } else { + // Serialize payload + const contentText = [ + `Seite: ${doc.title}`, + doc.excerpt ? `Beschreibung: ${doc.excerpt}` : '', + ] + .filter(Boolean) + .join('\n'); + + const payload = { + type: 'knowledge', + content: contentText, + data: { + title: doc.title, + slug: doc.slug, + }, + }; + + await upsertProductVector(`page_${doc.id}`, contentText, payload); + req.payload.logger.info(`Upserted page ${doc.slug} to Qdrant`); + } + } catch (error) { + req.payload.logger.error({ + msg: 'Error syncing page to Qdrant', + err: error, + pageId: doc.id, + }); + } + }, 0); + return doc; + }, + ], + afterDelete: [ + async ({ id, req }) => { + try { + const { deleteProductVector } = await import('../../lib/qdrant'); + await deleteProductVector(`page_${id}`); + req.payload.logger.info(`Deleted page ${id} from Qdrant`); + } catch (error) { + req.payload.logger.error({ + msg: 'Error deleting page from Qdrant', + err: error, + pageId: id, + }); + } + }, + ], + }, fields: [ { name: 'title', diff --git a/src/payload/collections/Posts.ts b/src/payload/collections/Posts.ts index a497b8de..8cbb4c9b 100644 --- a/src/payload/collections/Posts.ts +++ b/src/payload/collections/Posts.ts @@ -45,6 +45,67 @@ export const Posts: CollectionConfig = { }; }, }, + hooks: { + afterChange: [ + async ({ doc, req }) => { + // Run index sync asynchronously to not block the CMS save operation + setTimeout(async () => { + try { + const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant'); + + // Check if post is published + if (doc._status !== 'published') { + await deleteProductVector(`post_${doc.id}`); + req.payload.logger.info(`Removed drafted post ${doc.slug} from Qdrant`); + } else { + // Serialize payload + const contentText = [ + `Blog-Artikel: ${doc.title}`, + doc.excerpt ? `Zusammenfassung: ${doc.excerpt}` : '', + doc.category ? `Kategorie: ${doc.category}` : '', + ] + .filter(Boolean) + .join('\n'); + + const payload = { + type: 'knowledge', + content: contentText, + data: { + title: doc.title, + slug: doc.slug, + }, + }; + + await upsertProductVector(`post_${doc.id}`, contentText, payload); + req.payload.logger.info(`Upserted post ${doc.slug} to Qdrant`); + } + } catch (error) { + req.payload.logger.error({ + msg: 'Error syncing post to Qdrant', + err: error, + postId: doc.id, + }); + } + }, 0); + return doc; + }, + ], + afterDelete: [ + async ({ id, req }) => { + try { + const { deleteProductVector } = await import('../../lib/qdrant'); + await deleteProductVector(`post_${id}`); + req.payload.logger.info(`Deleted post ${id} from Qdrant`); + } catch (error) { + req.payload.logger.error({ + msg: 'Error deleting post from Qdrant', + err: error, + postId: id, + }); + } + }, + ], + }, fields: [ { name: 'title', diff --git a/src/scripts/ingest-pdf.ts b/src/scripts/ingest-pdf.ts index e78437cf..138a9c07 100644 --- a/src/scripts/ingest-pdf.ts +++ b/src/scripts/ingest-pdf.ts @@ -1,6 +1,7 @@ import fs from 'fs'; import path from 'path'; import crypto from 'crypto'; +import 'dotenv/config'; // Override Qdrant URL for local script execution outside docker process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';