feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
- Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO)
- Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync
- Integrate kabelhandbuch.txt parsing into /api/sync-qdrant boot route
- Add .gitignore entries for kabelhandbuch.txt
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -32,4 +32,7 @@ backups/
|
|||||||
.env
|
.env
|
||||||
|
|
||||||
# Payload CMS auto-generated
|
# Payload CMS auto-generated
|
||||||
app/(payload)/admin/importMap.js
|
app/(payload)/admin/importMap.js
|
||||||
|
|
||||||
|
# Knowledge base source files
|
||||||
|
kabelhandbuch.txt
|
||||||
@@ -106,8 +106,47 @@ export async function GET() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Kabelhandbuch (Static Text) ──
|
||||||
|
const os = require('os');
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
const crypto = await import('crypto');
|
||||||
|
|
||||||
|
const txtPath = path.join(process.cwd(), 'kabelhandbuch.txt');
|
||||||
|
let manualChunks = 0;
|
||||||
|
|
||||||
|
if (fs.existsSync(txtPath)) {
|
||||||
|
try {
|
||||||
|
const text = fs.readFileSync(txtPath, 'utf8');
|
||||||
|
const chunks = text
|
||||||
|
.split(/\n\s*\n/)
|
||||||
|
.map((c: string) => c.trim())
|
||||||
|
.filter((c: string) => c.length > 50);
|
||||||
|
|
||||||
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
|
const chunkText = chunks[i];
|
||||||
|
const syntheticId = crypto.randomUUID();
|
||||||
|
|
||||||
|
await upsertProductVector(syntheticId, chunkText, {
|
||||||
|
type: 'knowledge',
|
||||||
|
content: chunkText,
|
||||||
|
data: {
|
||||||
|
title: `Kabelhandbuch Wissen - Bereich ${i + 1}`,
|
||||||
|
source: 'Kabelhandbuch KLZ.pdf',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
manualChunks++;
|
||||||
|
}
|
||||||
|
console.log(`[Qdrant Sync] ✅ ${manualChunks} Kabelhandbuch-Chunks synced`);
|
||||||
|
} catch (e: any) {
|
||||||
|
results.errors.push(`kabelhandbuch: ${e.message}`);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
console.log(`[Qdrant Sync] ⚠️ skipped Kabelhandbuch: ${txtPath} not found`);
|
||||||
|
}
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced`,
|
`[Qdrant Sync] ✅ ${results.products} products, ${results.posts} posts, ${results.pages} pages synced, ${manualChunks} manual chunks synced`,
|
||||||
);
|
);
|
||||||
|
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ export const qdrant = new QdrantClient({
|
|||||||
});
|
});
|
||||||
|
|
||||||
export const COLLECTION_NAME = 'klz_products';
|
export const COLLECTION_NAME = 'klz_products';
|
||||||
export const VECTOR_SIZE = 1536; // OpenAI text-embedding-3-small
|
export const VECTOR_SIZE = 1024; // Mistral mistral-embed
|
||||||
|
|
||||||
// Cache TTLs
|
// Cache TTLs
|
||||||
const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic
|
const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic
|
||||||
@@ -50,26 +50,15 @@ export async function ensureCollection() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple hash for cache keys
|
* Hash text for cache key
|
||||||
*/
|
*/
|
||||||
function hashKey(text: string): string {
|
function hashKey(text: string): string {
|
||||||
let hash = 0;
|
const { createHash } = require('crypto');
|
||||||
for (let i = 0; i < text.length; i++) {
|
return createHash('sha256').update(text).digest('hex').slice(0, 32);
|
||||||
const chr = text.charCodeAt(i);
|
|
||||||
hash = (hash << 5) - hash + chr;
|
|
||||||
hash |= 0;
|
|
||||||
}
|
|
||||||
return hash.toString(36);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate an embedding for a given text using OpenRouter (OpenAI embedding proxy).
|
* Generate embedding using Mistral API (EU/DSGVO-compliant)
|
||||||
* Results are cached in Redis for 24h since embeddings are deterministic.
|
|
||||||
*
|
|
||||||
* NOTE: We keep OpenRouter for embeddings because the Qdrant collection uses 1536-dim
|
|
||||||
* vectors (OpenAI text-embedding-3-small). Switching to Mistral embed (1024-dim) would
|
|
||||||
* require re-indexing the entire product catalog.
|
|
||||||
* User-facing chat uses Mistral AI directly for DSGVO compliance.
|
|
||||||
*/
|
*/
|
||||||
export async function generateEmbedding(text: string): Promise<number[]> {
|
export async function generateEmbedding(text: string): Promise<number[]> {
|
||||||
const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`;
|
const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`;
|
||||||
@@ -84,22 +73,20 @@ export async function generateEmbedding(text: string): Promise<number[]> {
|
|||||||
// Redis down — proceed without cache
|
// Redis down — proceed without cache
|
||||||
}
|
}
|
||||||
|
|
||||||
const openRouterKey = process.env.OPENROUTER_API_KEY;
|
const mistralKey = process.env.MISTRAL_API_KEY;
|
||||||
if (!openRouterKey) {
|
if (!mistralKey) {
|
||||||
throw new Error('OPENROUTER_API_KEY is not set');
|
throw new Error('MISTRAL_API_KEY is not set');
|
||||||
}
|
}
|
||||||
|
|
||||||
const response = await fetch('https://openrouter.ai/api/v1/embeddings', {
|
const response = await fetch('https://api.mistral.ai/v1/embeddings', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${openRouterKey}`,
|
Authorization: `Bearer ${mistralKey}`,
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'HTTP-Referer': process.env.NEXT_PUBLIC_BASE_URL || 'https://klz-cables.com',
|
|
||||||
'X-Title': 'KLZ Cables Search AI',
|
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: 'openai/text-embedding-3-small',
|
model: 'mistral-embed',
|
||||||
input: text,
|
input: [text],
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,66 @@ export const Pages: CollectionConfig = {
|
|||||||
};
|
};
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
hooks: {
|
||||||
|
afterChange: [
|
||||||
|
async ({ doc, req }) => {
|
||||||
|
// Run index sync asynchronously to not block the CMS save operation
|
||||||
|
setTimeout(async () => {
|
||||||
|
try {
|
||||||
|
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||||
|
|
||||||
|
// Check if page is published
|
||||||
|
if (doc._status !== 'published') {
|
||||||
|
await deleteProductVector(`page_${doc.id}`);
|
||||||
|
req.payload.logger.info(`Removed drafted page ${doc.slug} from Qdrant`);
|
||||||
|
} else {
|
||||||
|
// Serialize payload
|
||||||
|
const contentText = [
|
||||||
|
`Seite: ${doc.title}`,
|
||||||
|
doc.excerpt ? `Beschreibung: ${doc.excerpt}` : '',
|
||||||
|
]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join('\n');
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
type: 'knowledge',
|
||||||
|
content: contentText,
|
||||||
|
data: {
|
||||||
|
title: doc.title,
|
||||||
|
slug: doc.slug,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
await upsertProductVector(`page_${doc.id}`, contentText, payload);
|
||||||
|
req.payload.logger.info(`Upserted page ${doc.slug} to Qdrant`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
req.payload.logger.error({
|
||||||
|
msg: 'Error syncing page to Qdrant',
|
||||||
|
err: error,
|
||||||
|
pageId: doc.id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}, 0);
|
||||||
|
return doc;
|
||||||
|
},
|
||||||
|
],
|
||||||
|
afterDelete: [
|
||||||
|
async ({ id, req }) => {
|
||||||
|
try {
|
||||||
|
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||||
|
await deleteProductVector(`page_${id}`);
|
||||||
|
req.payload.logger.info(`Deleted page ${id} from Qdrant`);
|
||||||
|
} catch (error) {
|
||||||
|
req.payload.logger.error({
|
||||||
|
msg: 'Error deleting page from Qdrant',
|
||||||
|
err: error,
|
||||||
|
pageId: id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
fields: [
|
fields: [
|
||||||
{
|
{
|
||||||
name: 'title',
|
name: 'title',
|
||||||
|
|||||||
@@ -45,6 +45,67 @@ export const Posts: CollectionConfig = {
|
|||||||
};
|
};
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
hooks: {
|
||||||
|
afterChange: [
|
||||||
|
async ({ doc, req }) => {
|
||||||
|
// Run index sync asynchronously to not block the CMS save operation
|
||||||
|
setTimeout(async () => {
|
||||||
|
try {
|
||||||
|
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||||
|
|
||||||
|
// Check if post is published
|
||||||
|
if (doc._status !== 'published') {
|
||||||
|
await deleteProductVector(`post_${doc.id}`);
|
||||||
|
req.payload.logger.info(`Removed drafted post ${doc.slug} from Qdrant`);
|
||||||
|
} else {
|
||||||
|
// Serialize payload
|
||||||
|
const contentText = [
|
||||||
|
`Blog-Artikel: ${doc.title}`,
|
||||||
|
doc.excerpt ? `Zusammenfassung: ${doc.excerpt}` : '',
|
||||||
|
doc.category ? `Kategorie: ${doc.category}` : '',
|
||||||
|
]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join('\n');
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
type: 'knowledge',
|
||||||
|
content: contentText,
|
||||||
|
data: {
|
||||||
|
title: doc.title,
|
||||||
|
slug: doc.slug,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
await upsertProductVector(`post_${doc.id}`, contentText, payload);
|
||||||
|
req.payload.logger.info(`Upserted post ${doc.slug} to Qdrant`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
req.payload.logger.error({
|
||||||
|
msg: 'Error syncing post to Qdrant',
|
||||||
|
err: error,
|
||||||
|
postId: doc.id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}, 0);
|
||||||
|
return doc;
|
||||||
|
},
|
||||||
|
],
|
||||||
|
afterDelete: [
|
||||||
|
async ({ id, req }) => {
|
||||||
|
try {
|
||||||
|
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||||
|
await deleteProductVector(`post_${id}`);
|
||||||
|
req.payload.logger.info(`Deleted post ${id} from Qdrant`);
|
||||||
|
} catch (error) {
|
||||||
|
req.payload.logger.error({
|
||||||
|
msg: 'Error deleting post from Qdrant',
|
||||||
|
err: error,
|
||||||
|
postId: id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
fields: [
|
fields: [
|
||||||
{
|
{
|
||||||
name: 'title',
|
name: 'title',
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
|
import 'dotenv/config';
|
||||||
|
|
||||||
// Override Qdrant URL for local script execution outside docker
|
// Override Qdrant URL for local script execution outside docker
|
||||||
process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';
|
process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';
|
||||||
|
|||||||
Reference in New Issue
Block a user