feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
- Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO)
- Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync
- Integrate kabelhandbuch.txt parsing into /api/sync-qdrant boot route
- Add .gitignore entries for kabelhandbuch.txt
This commit is contained in:
@@ -16,7 +16,7 @@ export const qdrant = new QdrantClient({
|
||||
});
|
||||
|
||||
export const COLLECTION_NAME = 'klz_products';
|
||||
export const VECTOR_SIZE = 1536; // OpenAI text-embedding-3-small
|
||||
export const VECTOR_SIZE = 1024; // Mistral mistral-embed
|
||||
|
||||
// Cache TTLs
|
||||
const EMBEDDING_CACHE_TTL = 60 * 60 * 24; // 24h — embeddings are deterministic
|
||||
@@ -50,26 +50,15 @@ export async function ensureCollection() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple hash for cache keys
|
||||
* Hash text for cache key
|
||||
*/
|
||||
function hashKey(text: string): string {
|
||||
let hash = 0;
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
const chr = text.charCodeAt(i);
|
||||
hash = (hash << 5) - hash + chr;
|
||||
hash |= 0;
|
||||
}
|
||||
return hash.toString(36);
|
||||
const { createHash } = require('crypto');
|
||||
return createHash('sha256').update(text).digest('hex').slice(0, 32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate an embedding for a given text using OpenRouter (OpenAI embedding proxy).
|
||||
* Results are cached in Redis for 24h since embeddings are deterministic.
|
||||
*
|
||||
* NOTE: We keep OpenRouter for embeddings because the Qdrant collection uses 1536-dim
|
||||
* vectors (OpenAI text-embedding-3-small). Switching to Mistral embed (1024-dim) would
|
||||
* require re-indexing the entire product catalog.
|
||||
* User-facing chat uses Mistral AI directly for DSGVO compliance.
|
||||
* Generate embedding using Mistral API (EU/DSGVO-compliant)
|
||||
*/
|
||||
export async function generateEmbedding(text: string): Promise<number[]> {
|
||||
const cacheKey = `emb:${hashKey(text.toLowerCase().trim())}`;
|
||||
@@ -84,22 +73,20 @@ export async function generateEmbedding(text: string): Promise<number[]> {
|
||||
// Redis down — proceed without cache
|
||||
}
|
||||
|
||||
const openRouterKey = process.env.OPENROUTER_API_KEY;
|
||||
if (!openRouterKey) {
|
||||
throw new Error('OPENROUTER_API_KEY is not set');
|
||||
const mistralKey = process.env.MISTRAL_API_KEY;
|
||||
if (!mistralKey) {
|
||||
throw new Error('MISTRAL_API_KEY is not set');
|
||||
}
|
||||
|
||||
const response = await fetch('https://openrouter.ai/api/v1/embeddings', {
|
||||
const response = await fetch('https://api.mistral.ai/v1/embeddings', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
Authorization: `Bearer ${openRouterKey}`,
|
||||
Authorization: `Bearer ${mistralKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
'HTTP-Referer': process.env.NEXT_PUBLIC_BASE_URL || 'https://klz-cables.com',
|
||||
'X-Title': 'KLZ Cables Search AI',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: 'openai/text-embedding-3-small',
|
||||
input: text,
|
||||
model: 'mistral-embed',
|
||||
input: [text],
|
||||
}),
|
||||
});
|
||||
|
||||
|
||||
@@ -26,6 +26,66 @@ export const Pages: CollectionConfig = {
|
||||
};
|
||||
},
|
||||
},
|
||||
hooks: {
|
||||
afterChange: [
|
||||
async ({ doc, req }) => {
|
||||
// Run index sync asynchronously to not block the CMS save operation
|
||||
setTimeout(async () => {
|
||||
try {
|
||||
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||
|
||||
// Check if page is published
|
||||
if (doc._status !== 'published') {
|
||||
await deleteProductVector(`page_${doc.id}`);
|
||||
req.payload.logger.info(`Removed drafted page ${doc.slug} from Qdrant`);
|
||||
} else {
|
||||
// Serialize payload
|
||||
const contentText = [
|
||||
`Seite: ${doc.title}`,
|
||||
doc.excerpt ? `Beschreibung: ${doc.excerpt}` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
|
||||
const payload = {
|
||||
type: 'knowledge',
|
||||
content: contentText,
|
||||
data: {
|
||||
title: doc.title,
|
||||
slug: doc.slug,
|
||||
},
|
||||
};
|
||||
|
||||
await upsertProductVector(`page_${doc.id}`, contentText, payload);
|
||||
req.payload.logger.info(`Upserted page ${doc.slug} to Qdrant`);
|
||||
}
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error syncing page to Qdrant',
|
||||
err: error,
|
||||
pageId: doc.id,
|
||||
});
|
||||
}
|
||||
}, 0);
|
||||
return doc;
|
||||
},
|
||||
],
|
||||
afterDelete: [
|
||||
async ({ id, req }) => {
|
||||
try {
|
||||
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||
await deleteProductVector(`page_${id}`);
|
||||
req.payload.logger.info(`Deleted page ${id} from Qdrant`);
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error deleting page from Qdrant',
|
||||
err: error,
|
||||
pageId: id,
|
||||
});
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
fields: [
|
||||
{
|
||||
name: 'title',
|
||||
|
||||
@@ -45,6 +45,67 @@ export const Posts: CollectionConfig = {
|
||||
};
|
||||
},
|
||||
},
|
||||
hooks: {
|
||||
afterChange: [
|
||||
async ({ doc, req }) => {
|
||||
// Run index sync asynchronously to not block the CMS save operation
|
||||
setTimeout(async () => {
|
||||
try {
|
||||
const { upsertProductVector, deleteProductVector } = await import('../../lib/qdrant');
|
||||
|
||||
// Check if post is published
|
||||
if (doc._status !== 'published') {
|
||||
await deleteProductVector(`post_${doc.id}`);
|
||||
req.payload.logger.info(`Removed drafted post ${doc.slug} from Qdrant`);
|
||||
} else {
|
||||
// Serialize payload
|
||||
const contentText = [
|
||||
`Blog-Artikel: ${doc.title}`,
|
||||
doc.excerpt ? `Zusammenfassung: ${doc.excerpt}` : '',
|
||||
doc.category ? `Kategorie: ${doc.category}` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
|
||||
const payload = {
|
||||
type: 'knowledge',
|
||||
content: contentText,
|
||||
data: {
|
||||
title: doc.title,
|
||||
slug: doc.slug,
|
||||
},
|
||||
};
|
||||
|
||||
await upsertProductVector(`post_${doc.id}`, contentText, payload);
|
||||
req.payload.logger.info(`Upserted post ${doc.slug} to Qdrant`);
|
||||
}
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error syncing post to Qdrant',
|
||||
err: error,
|
||||
postId: doc.id,
|
||||
});
|
||||
}
|
||||
}, 0);
|
||||
return doc;
|
||||
},
|
||||
],
|
||||
afterDelete: [
|
||||
async ({ id, req }) => {
|
||||
try {
|
||||
const { deleteProductVector } = await import('../../lib/qdrant');
|
||||
await deleteProductVector(`post_${id}`);
|
||||
req.payload.logger.info(`Deleted post ${id} from Qdrant`);
|
||||
} catch (error) {
|
||||
req.payload.logger.error({
|
||||
msg: 'Error deleting post from Qdrant',
|
||||
err: error,
|
||||
postId: id,
|
||||
});
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
fields: [
|
||||
{
|
||||
name: 'title',
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import crypto from 'crypto';
|
||||
import 'dotenv/config';
|
||||
|
||||
// Override Qdrant URL for local script execution outside docker
|
||||
process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';
|
||||
|
||||
Reference in New Issue
Block a user