feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s

- Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO)
- Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync
- Integrate kabelhandbuch.txt parsing into /api/sync-qdrant boot route
- Add .gitignore entry for kabelhandbuch.txt
This commit is contained in:
2026-03-07 15:39:10 +01:00
parent 3acf0c3740
commit 8e99c9d121
6 changed files with 178 additions and 27 deletions

5
.gitignore vendored
View File

@@ -32,4 +32,7 @@ backups/
.env
# Payload CMS auto-generated
app/(payload)/admin/importMap.js
app/(payload)/admin/importMap.js
# Knowledge base source files
kabelhandbuch.txt

View File

@@ -106,8 +106,47 @@ export async function GET() {
}
}
// ── Kabelhandbuch (Static Text) ──
// Ingests kabelhandbuch.txt from the working directory (when present) as `knowledge`
// vectors, one per blank-line-separated chunk. A missing file is logged and skipped;
// a failure while reading or upserting is recorded in `results.errors`.
const path = require('path');
const fs = require('fs');
const crypto = await import('crypto');
const txtPath = path.join(process.cwd(), 'kabelhandbuch.txt');
let manualChunks = 0;
if (fs.existsSync(txtPath)) {
  try {
    const text: string = fs.readFileSync(txtPath, 'utf8');
    // Split on blank lines and drop fragments of 50 characters or fewer
    const chunks = text
      .split(/\n\s*\n/)
      .map((c) => c.trim())
      .filter((c) => c.length > 50);
    for (const [i, chunkText] of chunks.entries()) {
      // Derive the point ID from the chunk content instead of a random UUID, so a
      // repeated sync targets the same ID for unchanged text rather than minting a
      // new one on every run. The digest is laid out in UUID shape (8-4-4-4-12)
      // with the version/variant nibbles fixed.
      const digest = crypto.createHash('sha256').update(chunkText).digest('hex');
      const variantNibble = ((parseInt(digest.charAt(16), 16) & 0x3) | 0x8).toString(16);
      const syntheticId = [
        digest.slice(0, 8),
        digest.slice(8, 12),
        `5${digest.slice(13, 16)}`,
        `${variantNibble}${digest.slice(17, 20)}`,
        digest.slice(20, 32),
      ].join('-');
      await upsertProductVector(syntheticId, chunkText, {
        type: 'knowledge',
        content: chunkText,
        data: {
          title: `Kabelhandbuch Wissen - Bereich ${i + 1}`,
          source: 'Kabelhandbuch KLZ.pdf',
        },
      });
      manualChunks++;
    }
    console.log(`[Qdrant Sync] ✅ ${manualChunks} Kabelhandbuch-Chunks synced`);
  } catch (e: unknown) {
    // Thrown values are not guaranteed to be Error instances
    const message = e instanceof Error ? e.message : String(e);
    results.errors.push(`kabelhandbuch: ${message}`);
  }
} else {
  console.log(`[Qdrant Sync] ⚠️ skipped Kabelhandbuch: ${txtPath} not found`);
}
console.log(
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced`,
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced, ${manualChunks} manual chunks synced`,
);
return NextResponse.json({

View File

@@ -16,7 +16,7 @@ export const qdrant = new QdrantClient({
});
export const COLLECTION_NAME = 'klz_products';
export const VECTOR_SIZE = 1536; // OpenAI text-embedding-3-small
export const VECTOR_SIZE = 1024; // Mistral mistral-embed
// Cache TTLs
const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic
@@ -50,26 +50,15 @@ export async function ensureCollection() {
}
/**
* Simple hash for cache keys
* Hash text for cache key
*/
function hashKey(text: string): string {
let hash = 0;
for (let i = 0; i < text.length; i++) {
const chr = text.charCodeAt(i);
hash = (hash << 5) - hash + chr;
hash |= 0;
}
return hash.toString(36);
const { createHash } = require('crypto');
return createHash('sha256').update(text).digest('hex').slice(0, 32);
}
/**
* Generate an embedding for a given text using OpenRouter (OpenAI embedding proxy).
* Results are cached in Redis for 24h since embeddings are deterministic.
*
* NOTE: We keep OpenRouter for embeddings because the Qdrant collection uses 1536-dim
* vectors (OpenAI text-embedding-3-small). Switching to Mistral embed (1024-dim) would
* require re-indexing the entire product catalog.
* User-facing chat uses Mistral AI directly for DSGVO compliance.
* Generate embedding using Mistral API (EU/DSGVO-compliant)
*/
export async function generateEmbedding(text: string): Promise<number[]> {
const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`;
@@ -84,22 +73,20 @@ export async function generateEmbedding(text: string): Promise<number[]> {
// Redis down — proceed without cache
}
const openRouterKey = process.env.OPENROUTER_API_KEY;
if (!openRouterKey) {
throw new Error('OPENROUTER_API_KEY is not set');
const mistralKey = process.env.MISTRAL_API_KEY;
if (!mistralKey) {
throw new Error('MISTRAL_API_KEY is not set');
}
const response = await fetch('https://openrouter.ai/api/v1/embeddings', {
const response = await fetch('https://api.mistral.ai/v1/embeddings', {
method: 'POST',
headers: {
Authorization: `Bearer ${openRouterKey}`,
Authorization: `Bearer ${mistralKey}`,
'Content-Type': 'application/json',
'HTTP-Referer': process.env.NEXT_PUBLIC_BASE_URL || 'https://klz-cables.com',
'X-Title': 'KLZ Cables Search AI',
},
body: JSON.stringify({
model: 'openai/text-embedding-3-small',
input: text,
model: 'mistral-embed',
input: [text],
}),
});

View File

@@ -26,6 +26,66 @@ export const Pages: CollectionConfig = {
};
},
},
hooks: {
  afterChange: [
    /**
     * Mirrors a saved page into Qdrant. The sync is scheduled on a zero-delay timer
     * so the CMS save returns without waiting for it; failures are only logged.
     */
    async ({ doc, req }) => {
      const syncPage = async () => {
        try {
          const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');

          // Anything that is not published is removed from the index
          if (doc._status !== 'published') {
            await deleteProductVector(`page_${doc.id}`);
            req.payload.logger.info(`Removed drafted page ${doc.slug} from Qdrant`);
            return;
          }

          // Text that gets embedded: title line plus the excerpt when one exists
          const lines = [`Seite: ${doc.title}`];
          if (doc.excerpt) {
            lines.push(`Beschreibung: ${doc.excerpt}`);
          }
          const contentText = lines.join('\n');

          await upsertProductVector(`page_${doc.id}`, contentText, {
            type: 'knowledge',
            content: contentText,
            data: {
              title: doc.title,
              slug: doc.slug,
            },
          });
          req.payload.logger.info(`Upserted page ${doc.slug} to Qdrant`);
        } catch (err) {
          req.payload.logger.error({
            msg: 'Error syncing page to Qdrant',
            err,
            pageId: doc.id,
          });
        }
      };

      setTimeout(syncPage, 0);
      return doc;
    },
  ],
  afterDelete: [
    /** Removes the deleted page's vector from Qdrant; failures are only logged. */
    async ({ id, req }) => {
      try {
        const qdrantModule = await import('../../lib/qdrant');
        const { deleteProductVector } = qdrantModule;
        await deleteProductVector(`page_${id}`);
        req.payload.logger.info(`Deleted page ${id} from Qdrant`);
      } catch (err) {
        req.payload.logger.error({
          msg: 'Error deleting page from Qdrant',
          err,
          pageId: id,
        });
      }
    },
  ],
},
fields: [
{
name: 'title',

View File

@@ -45,6 +45,67 @@ export const Posts: CollectionConfig = {
};
},
},
hooks: {
  afterChange: [
    /**
     * Mirrors a saved post into Qdrant. The sync is scheduled on a zero-delay timer
     * so the CMS save returns without waiting for it; failures are only logged.
     */
    async ({ doc, req }) => {
      const syncPost = async () => {
        try {
          const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');

          // Anything that is not published is removed from the index
          if (doc._status !== 'published') {
            await deleteProductVector(`post_${doc.id}`);
            req.payload.logger.info(`Removed drafted post ${doc.slug} from Qdrant`);
            return;
          }

          // Text that gets embedded: title line plus excerpt/category when present
          const lines = [`Blog-Artikel: ${doc.title}`];
          if (doc.excerpt) {
            lines.push(`Zusammenfassung: ${doc.excerpt}`);
          }
          if (doc.category) {
            lines.push(`Kategorie: ${doc.category}`);
          }
          const contentText = lines.join('\n');

          await upsertProductVector(`post_${doc.id}`, contentText, {
            type: 'knowledge',
            content: contentText,
            data: {
              title: doc.title,
              slug: doc.slug,
            },
          });
          req.payload.logger.info(`Upserted post ${doc.slug} to Qdrant`);
        } catch (err) {
          req.payload.logger.error({
            msg: 'Error syncing post to Qdrant',
            err,
            postId: doc.id,
          });
        }
      };

      setTimeout(syncPost, 0);
      return doc;
    },
  ],
  afterDelete: [
    /** Removes the deleted post's vector from Qdrant; failures are only logged. */
    async ({ id, req }) => {
      try {
        const qdrantModule = await import('../../lib/qdrant');
        const { deleteProductVector } = qdrantModule;
        await deleteProductVector(`post_${id}`);
        req.payload.logger.info(`Deleted post ${id} from Qdrant`);
      } catch (err) {
        req.payload.logger.error({
          msg: 'Error deleting post from Qdrant',
          err,
          postId: id,
        });
      }
    },
  ],
},
fields: [
{
name: 'title',

View File

@@ -1,6 +1,7 @@
import fs from 'fs';
import path from 'path';
import crypto from 'crypto';
import 'dotenv/config';
// When the script runs outside docker and no Qdrant URL is configured,
// fall back to the instance on localhost:6333
if (!process.env.QDRANT_URL) {
  process.env.QDRANT_URL = 'http://localhost:6333';
}