feat: Automate Qdrant PDF ingestion via Media hooks
All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
This commit is contained in:
@@ -45,4 +45,81 @@ export const Media: CollectionConfig = {
|
||||
type: 'text',
|
||||
},
|
||||
],
|
||||
hooks: {
|
||||
afterChange: [
|
||||
async ({ doc, req }) => {
  // Ingest uploaded PDF files into Qdrant: extract text, split into
  // paragraph chunks, and upsert each chunk as a 'knowledge' vector.
  // Non-PDF uploads are ignored. Errors are logged, never thrown, so a
  // failed ingestion does not block the Media create/update itself.
  if (doc.mimeType !== 'application/pdf') return;

  try {
    const fs = require('fs');
    const path = require('path');
    const crypto = require('crypto');
    const pdfParse = require('pdf-parse');
    const { upsertProductVector, deleteKnowledgeByMediaId } = require('../../lib/qdrant');

    const filePath = path.join(process.cwd(), 'public/media', doc.filename);

    // Read asynchronously instead of existsSync + readFileSync: avoids
    // blocking the event loop and the check-then-read race. A missing
    // file is logged explicitly rather than silently skipped.
    let dataBuffer: Buffer;
    try {
      dataBuffer = await fs.promises.readFile(filePath);
    } catch (readErr: unknown) {
      if ((readErr as NodeJS.ErrnoException)?.code === 'ENOENT') {
        req.payload.logger.warn(`PDF not found on disk, skipping Qdrant ingestion: ${filePath}`);
        return;
      }
      throw readErr;
    }

    req.payload.logger.info(`Extracting text from PDF: ${doc.filename}`);
    const data = await pdfParse(dataBuffer);

    // Clear any previously indexed chunks for this file in case this is an update.
    await deleteKnowledgeByMediaId(doc.id);

    // Chunk on blank lines (same strategy as the ingest script); drop
    // fragments too short to be useful search results.
    const chunks = data.text
      .split(/\n\s*\n/)
      .map((c: string) => c.trim())
      .filter((c: string) => c.length > 50);

    let successCount = 0;
    for (let i = 0; i < chunks.length; i++) {
      // Deterministic point ID derived from doc ID + chunk index, so
      // re-ingesting the same document overwrites rather than duplicates.
      const hash = crypto.createHash('md5').update(`${doc.id}-${i}`).digest('hex');
      // Qdrant strictly requires UUID format: 8-4-4-4-12 hex groups.
      const uuid = [
        hash.substring(0, 8),
        hash.substring(8, 12),
        hash.substring(12, 16),
        hash.substring(16, 20),
        hash.substring(20, 32),
      ].join('-');

      await upsertProductVector(uuid, chunks[i], {
        type: 'knowledge',
        title: `${doc.filename} - Teil ${i + 1}`,
        content: chunks[i],
        source: doc.filename,
        mediaId: doc.id,
      });
      successCount++;
    }

    req.payload.logger.info(
      `Successfully ingested ${successCount} chunks from ${doc.filename} into Qdrant`,
    );
  } catch (e: unknown) {
    // Narrow the unknown catch value instead of assuming Error (strict mode).
    const message = e instanceof Error ? e.message : String(e);
    req.payload.logger.error(`Error parsing PDF ${doc.filename}: ${message}`);
  }
},
|
||||
],
|
||||
afterDelete: [
|
||||
async ({ id, doc, req }) => {
  // Keep Qdrant in sync on deletion: remove every knowledge chunk that
  // was indexed for this PDF. Non-PDF media has nothing indexed, so skip.
  // Errors are logged, never thrown, so a Qdrant outage cannot block the
  // Media delete itself.
  if (doc.mimeType !== 'application/pdf') return;

  try {
    const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
    await deleteKnowledgeByMediaId(id);
    req.payload.logger.info(`Removed Qdrant chunks for deleted PDF: ${doc.filename}`);
  } catch (e: unknown) {
    // Narrow the unknown catch value instead of assuming Error (strict mode).
    const message = e instanceof Error ? e.message : String(e);
    req.payload.logger.error(
      `Error removing Qdrant chunks for ${doc.filename}: ${message}`,
    );
  }
},
|
||||
],
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user