feat: Automate Qdrant PDF ingestion via Media hooks
All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s

This commit is contained in:
2026-03-08 01:08:55 +01:00
parent 7f1aeaee7e
commit 1dc52da677
4 changed files with 161 additions and 1 deletions

View File

@@ -45,4 +45,81 @@ export const Media: CollectionConfig = {
type: 'text',
},
],
hooks: {
afterChange: [
  /**
   * Indexes an uploaded PDF into Qdrant after a Media document is saved.
   *
   * Non-PDF documents are ignored. For a PDF, the file is read from
   * `<cwd>/public/media/<filename>`, its text is extracted with `pdf-parse`,
   * any chunks previously stored for this media id are deleted, and each
   * paragraph-sized chunk is upserted under a deterministic UUID.
   *
   * Ingestion is best-effort: every failure is logged through the Payload
   * logger and never propagates to the caller, so a broken PDF cannot fail
   * the save itself.
   *
   * @param doc - The saved media document (`id`, `filename`, `mimeType` are read).
   * @param req - The request; only `req.payload.logger` is used.
   */
  async ({ doc, req }) => {
    // Only process PDF files
    if (doc.mimeType !== 'application/pdf') {
      return;
    }

    try {
      // Resolved lazily so these modules are only loaded when a PDF is actually handled.
      const fs = require('fs');
      const path = require('path');
      const crypto = require('crypto');
      const pdfParse = require('pdf-parse');
      const { upsertProductVector, deleteKnowledgeByMediaId } = require('../../lib/qdrant');

      const filePath = path.join(process.cwd(), 'public/media', doc.filename);

      // Read asynchronously in a single step: a synchronous exists-check followed by a
      // synchronous read would block the event loop and could race with the file being
      // removed between the two calls.
      let dataBuffer;
      try {
        dataBuffer = await fs.promises.readFile(filePath);
      } catch (readError: unknown) {
        const code = (readError as { code?: unknown } | null)?.code;
        if (code === 'ENOENT') {
          // The file is not on local disk; skip ingestion but leave a trace in the logs.
          req.payload.logger.warn(
            `Skipping Qdrant ingestion, PDF not found on disk: ${doc.filename}`,
          );
          return;
        }
        // Any other read failure (permissions, path is a directory, ...) is reported below.
        throw readError;
      }

      req.payload.logger.info(`Extracting text from PDF: ${doc.filename}`);
      const data = await pdfParse(dataBuffer);

      // Clear any previously indexed chunks for this file just in case it's an update
      await deleteKnowledgeByMediaId(doc.id);

      // Split on blank lines and drop fragments of 50 characters or fewer
      const chunks = data.text
        .split(/\n\s*\n/)
        .map((c: string) => c.trim())
        .filter((c: string) => c.length > 50);

      for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];

        // Deterministic id derived from doc ID and chunk index, so re-ingesting the same
        // document addresses the same points. MD5 is used purely as a stable 128-bit
        // digest here, not for security.
        const hash = crypto.createHash('md5').update(`${doc.id}-${i}`).digest('hex');

        // Qdrant strictly requires UUID: 8-4-4-4-12
        const uuid = [
          hash.substring(0, 8),
          hash.substring(8, 12),
          hash.substring(12, 16),
          hash.substring(16, 20),
          hash.substring(20, 32),
        ].join('-');

        // Upserts stay sequential and awaited, as before; the first failure aborts the
        // remaining chunks and is logged by the catch below.
        await upsertProductVector(uuid, chunk, {
          type: 'knowledge',
          title: `${doc.filename} - Teil ${i + 1}`,
          content: chunk,
          source: doc.filename,
          mediaId: doc.id,
        });
      }

      // Reaching this line means every chunk was upserted.
      req.payload.logger.info(
        `Successfully ingested ${chunks.length} chunks from ${doc.filename} into Qdrant`,
      );
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances are guaranteed to carry `.message`.
      const message = e instanceof Error ? e.message : String(e);
      req.payload.logger.error(`Error parsing PDF ${doc.filename}: ${message}`);
    }
  },
],
afterDelete: [
  /**
   * Removes the Qdrant chunks associated with a deleted PDF.
   *
   * Non-PDF documents are ignored. Cleanup is best-effort: a failure is logged
   * through the Payload logger and never propagates to the caller.
   *
   * @param id - Id of the deleted media document; passed to `deleteKnowledgeByMediaId`.
   * @param doc - The deleted document (`mimeType` and `filename` are read).
   * @param req - The request; only `req.payload.logger` is used.
   */
  async ({ id, doc, req }) => {
    if (doc.mimeType !== 'application/pdf') {
      return;
    }

    try {
      // Resolved lazily so the Qdrant helper is only loaded when a PDF is actually deleted.
      const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
      await deleteKnowledgeByMediaId(id);
      req.payload.logger.info(`Removed Qdrant chunks for deleted PDF: ${doc.filename}`);
    } catch (e: unknown) {
      // Anything can be thrown; only Error instances are guaranteed to carry `.message`.
      const message = e instanceof Error ? e.message : String(e);
      req.payload.logger.error(
        `Error removing Qdrant chunks for ${doc.filename}: ${message}`,
      );
    }
  },
],
},
};