feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
- Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO)
- Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync
- Integrate kabelhandbuch.txt parsing into the /api/sync-qdrant boot route
- Add a .gitignore entry for kabelhandbuch.txt
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -32,4 +32,7 @@ backups/
|
||||
.env
|
||||
|
||||
# Payload CMS auto-generated
|
||||
app/(payload)/admin/importMap.js
|
||||
app/(payload)/admin/importMap.js
|
||||
|
||||
# Knowledge base source files
|
||||
kabelhandbuch.txt
|
||||
@@ -106,8 +106,47 @@ export async function GET() {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Kabelhandbuch (Static Text) ──
|
||||
const os = require('os');
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const crypto = await import('crypto');
|
||||
|
||||
const txtPath = path.join(process.cwd(), 'kabelhandbuch.txt');
|
||||
let manualChunks = 0;
|
||||
|
||||
if (fs.existsSync(txtPath)) {
|
||||
try {
|
||||
const text = fs.readFileSync(txtPath, 'utf8');
|
||||
const chunks = text
|
||||
.split(/\n\s*\n/)
|
||||
.map((c: string) => c.trim())
|
||||
.filter((c: string) => c.length > 50);
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunkText = chunks[i];
|
||||
const syntheticId = crypto.randomUUID();
|
||||
|
||||
await upsertProductVector(syntheticId, chunkText, {
|
||||
type: 'knowledge',
|
||||
content: chunkText,
|
||||
data: {
|
||||
title: `Kabelhandbuch Wissen - Bereich ${i + 1}`,
|
||||
source: 'Kabelhandbuch KLZ.pdf',
|
||||
},
|
||||
});
|
||||
manualChunks++;
|
||||
}
|
||||
console.log(`[Qdrant Sync] ✅ ${manualChunks} Kabelhandbuch-Chunks synced`);
|
||||
} catch (e: any) {
|
||||
results.errors.push(`kabelhandbuch: ${e.message}`);
|
||||
}
|
||||
} else {
|
||||
console.log(`[Qdrant Sync] ⚠️ skipped Kabelhandbuch: ${txtPath} not found`);
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced`,
|
||||
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced, ${manualChunks} manual chunks synced`,
|
||||
);
|
||||
|
||||
return NextResponse.json({
|
||||
|
||||
@@ -16,7 +16,7 @@ export const qdrant = new QdrantClient({
|
||||
});
|
||||
|
||||
export const COLLECTION_NAME = 'klz_products';
|
||||
export const VECTOR_SIZE = 1536; // OpenAI text-embedding-3-small
|
||||
export const VECTOR_SIZE = 1024; // Mistral mistral-embed
|
||||
|
||||
// Cache TTLs
|
||||
const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic
|
||||
@@ -50,26 +50,15 @@ export async function ensureCollection() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple hash for cache keys
|
||||
* Hash text for cache key
|
||||
*/
|
||||
function hashKey(text: string): string {
|
||||
let hash = 0;
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
const chr = text.charCodeAt(i);
|
||||
hash = (hash << 5) - hash + chr;
|
||||
hash |= 0;
|
||||
}
|
||||
return hash.toString(36);
|
||||
const { createHash } = require('crypto');
|
||||
return createHash('sha256').update(text).digest('hex').slice(0, 32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate an embedding for a given text using OpenRouter (OpenAI embedding proxy).
|
||||
* Results are cached in Redis for 24h since embeddings are deterministic.
|
||||
*
|
||||
* NOTE: We keep OpenRouter for embeddings because the Qdrant collection uses 1536-dim
|
||||
* vectors (OpenAI text-embedding-3-small). Switching to Mistral embed (1024-dim) would
|
||||
* require re-indexing the entire product catalog.
|
||||
* User-facing chat uses Mistral AI directly for DSGVO compliance.
|
||||
* Generate embedding using Mistral API (EU/DSGVO-compliant)
|
||||
*/
|
||||
export async function generateEmbedding(text: string): Promise<number[]> {
|
||||
const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`;
|
||||
@@ -84,22 +73,20 @@ export async function generateEmbedding(text: string): Promise<number[]> {
|
||||
// Redis down — proceed without cache
|
||||
}
|
||||
|
||||
const openRouterKey = process.env.OPENROUTER_API_KEY;
|
||||
if (!openRouterKey) {
|
||||
throw new Error('OPENROUTER_API_KEY is not set');
|
||||
const mistralKey = process.env.MISTRAL_API_KEY;
|
||||
if (!mistralKey) {
|
||||
throw new Error('MISTRAL_API_KEY is not set');
|
||||
}
|
||||
|
||||
const response = await fetch('https://openrouter.ai/api/v1/embeddings', {
|
||||
const response = await fetch('https://api.mistral.ai/v1/embeddings', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
Authorization: `Bearer ${openRouterKey}`,
|
||||
Authorization: `Bearer ${mistralKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
'HTTP-Referer': process.env.NEXT_PUBLIC_BASE_URL || 'https://klz-cables.com',
|
||||
'X-Title': 'KLZ Cables Search AI',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: 'openai/text-embedding-3-small',
|
||||
input: text,
|
||||
model: 'mistral-embed',
|
||||
input: [text],
|
||||
}),
|
||||
});
|
||||
|
||||
|
||||
@@ -26,6 +26,66 @@ export const Pages: CollectionConfig = {
|
||||
};
|
||||
},
|
||||
},
|
||||
hooks: {
|
||||
afterChange: [
|
||||
async ({ doc, req }) => {
|
||||
// Run index sync asynchronously to not block the CMS save operation
|
||||
setTimeout(async () => {
|
||||
try {
|
||||
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||
|
||||
// Check if page is published
|
||||
if (doc._status !== 'published') {
|
||||
await deleteProductVector(`page_${doc.id}`);
|
||||
req.payload.logger.info(`Removed drafted page ${doc.slug} from Qdrant`);
|
||||
} else {
|
||||
// Serialize payload
|
||||
const contentText = [
|
||||
`Seite: ${doc.title}`,
|
||||
doc.excerpt ? `Beschreibung: ${doc.excerpt}` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
|
||||
const payload = {
|
||||
type: 'knowledge',
|
||||
content: contentText,
|
||||
data: {
|
||||
title: doc.title,
|
||||
slug: doc.slug,
|
||||
},
|
||||
};
|
||||
|
||||
await upsertProductVector(`page_${doc.id}`, contentText, payload);
|
||||
req.payload.logger.info(`Upserted page ${doc.slug} to Qdrant`);
|
||||
}
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error syncing page to Qdrant',
|
||||
err: error,
|
||||
pageId: doc.id,
|
||||
});
|
||||
}
|
||||
}, 0);
|
||||
return doc;
|
||||
},
|
||||
],
|
||||
afterDelete: [
|
||||
async ({ id, req }) => {
|
||||
try {
|
||||
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||
await deleteProductVector(`page_${id}`);
|
||||
req.payload.logger.info(`Deleted page ${id} from Qdrant`);
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error deleting page from Qdrant',
|
||||
err: error,
|
||||
pageId: id,
|
||||
});
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
fields: [
|
||||
{
|
||||
name: 'title',
|
||||
|
||||
@@ -45,6 +45,67 @@ export const Posts: CollectionConfig = {
|
||||
};
|
||||
},
|
||||
},
|
||||
hooks: {
|
||||
afterChange: [
|
||||
async ({ doc, req }) => {
|
||||
// Run index sync asynchronously to not block the CMS save operation
|
||||
setTimeout(async () => {
|
||||
try {
|
||||
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||
|
||||
// Check if post is published
|
||||
if (doc._status !== 'published') {
|
||||
await deleteProductVector(`post_${doc.id}`);
|
||||
req.payload.logger.info(`Removed drafted post ${doc.slug} from Qdrant`);
|
||||
} else {
|
||||
// Serialize payload
|
||||
const contentText = [
|
||||
`Blog-Artikel: ${doc.title}`,
|
||||
doc.excerpt ? `Zusammenfassung: ${doc.excerpt}` : '',
|
||||
doc.category ? `Kategorie: ${doc.category}` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
|
||||
const payload = {
|
||||
type: 'knowledge',
|
||||
content: contentText,
|
||||
data: {
|
||||
title: doc.title,
|
||||
slug: doc.slug,
|
||||
},
|
||||
};
|
||||
|
||||
await upsertProductVector(`post_${doc.id}`, contentText, payload);
|
||||
req.payload.logger.info(`Upserted post ${doc.slug} to Qdrant`);
|
||||
}
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error syncing post to Qdrant',
|
||||
err: error,
|
||||
postId: doc.id,
|
||||
});
|
||||
}
|
||||
}, 0);
|
||||
return doc;
|
||||
},
|
||||
],
|
||||
afterDelete: [
|
||||
async ({ id, req }) => {
|
||||
try {
|
||||
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||
await deleteProductVector(`post_${id}`);
|
||||
req.payload.logger.info(`Deleted post ${id} from Qdrant`);
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error deleting post from Qdrant',
|
||||
err: error,
|
||||
postId: id,
|
||||
});
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
fields: [
|
||||
{
|
||||
name: 'title',
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import crypto from 'crypto';
|
||||
import 'dotenv/config';
|
||||
|
||||
// Override Qdrant URL for local script execution outside docker
|
||||
process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';
|
||||
|
||||
Reference in New Issue
Block a user