From 1dc52da67734efe28bf5ac7997624c7ce61e5bf7 Mon Sep 17 00:00:00 2001
From: Marc Mintel
Date: Sun, 8 Mar 2026 01:08:55 +0100
Subject: [PATCH] feat: Automate Qdrant PDF ingestion via Media hooks

---
Review notes (this section is not part of the commit message):
- qdrant.ts: deleteKnowledgeByMediaId stays best-effort but now returns a
  boolean, so callers can tell a failed delete from a successful one.
- Media.ts: the PDF is read asynchronously, a missing file is logged instead
  of silently skipped, caught errors are narrowed from `unknown`, and the
  afterDelete hook only logs success when the delete actually succeeded.
- upload-pdfs.ts: the source directory is configurable (CLI argument, then
  PDF_UPLOAD_DIR, then ~/Downloads), `.PDF` files are matched, failures are
  counted and produce exit code 1, and a rejected uploadPDFs() is handled.
- The commit id above and the post-image blob ids on the `index` lines are
  carried over from the original patch and no longer match the revised content.

 package.json                     |  2 +-
 src/lib/qdrant.ts                | 33 ++++++++++++
 src/payload/collections/Media.ts | 91 ++++++++++++++++++++++++++++++++
 src/scripts/upload-pdfs.ts       | 79 +++++++++++++++++++++++++++
 4 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 src/scripts/upload-pdfs.ts

diff --git a/package.json b/package.json
index 6cdda423..f4683932 100644
--- a/package.json
+++ b/package.json
@@ -106,7 +106,7 @@
   },
   "scripts": {
     "dev": "bash -c '[ -f .env ] || (cp .env.example .env && sed -i.bak \"s/TRAEFIK_HOST=klz-cables.com/TRAEFIK_HOST=klz.localhost/\" .env && rm -f .env.bak && echo \"✅ Created .env from .env.example\"); trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; docker network create infra 2>/dev/null || true && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up klz-app klz-db klz-proxy klz-qdrant klz-redis --remove-orphans'",
-    "dev:local": "bash -c 'trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy klz-qdrant klz-redis && POSTGRES_URI=NODE_ENV=development next dev --webpack --port 3100 --hostname 0.0.0.0'",
+    "dev:local": "bash -c 'trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy klz-qdrant klz-redis && POSTGRES_URI=\"\" NODE_ENV=development next dev --webpack --port 3100 --hostname 0.0.0.0'",
     "dev:infra": "COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy",
     "build": "next build",
     "start": "next start",
diff --git a/src/lib/qdrant.ts b/src/lib/qdrant.ts
index c477e920..31477344 100644
--- a/src/lib/qdrant.ts
+++ b/src/lib/qdrant.ts
@@ -152,6 +152,39 @@ export async function deleteProductVector(id: string | number) {
   }
 }
 
+/**
+ * Delete every knowledge chunk whose payload `mediaId` matches the given Media document.
+ *
+ * Best-effort: failures are logged and reported through the return value instead of
+ * being thrown, so callers can decide how to react.
+ *
+ * @param mediaId - ID of the source Media document.
+ * @returns `true` when the delete request succeeded, `false` when it failed.
+ */
+export async function deleteKnowledgeByMediaId(mediaId: string | number): Promise<boolean> {
+  try {
+    await ensureCollection();
+    await qdrant.delete(COLLECTION_NAME, {
+      wait: true,
+      filter: {
+        must: [
+          {
+            key: 'mediaId',
+            match: {
+              value: mediaId,
+            },
+          },
+        ],
+      },
+    });
+    console.log(`Successfully deleted Qdrant chunks for Media ID: ${mediaId}`);
+    return true;
+  } catch (error) {
+    console.error('Error deleting knowledge by Media ID from Qdrant:', error);
+    return false;
+  }
+}
+
 /**
  * Search products in Qdrant.
  * Results are cached in Redis for 30 minutes keyed by query text.
diff --git a/src/payload/collections/Media.ts b/src/payload/collections/Media.ts
index dd4a6244..9ba3865b 100644
--- a/src/payload/collections/Media.ts
+++ b/src/payload/collections/Media.ts
@@ -45,4 +45,95 @@ export const Media: CollectionConfig = {
       type: 'text',
     },
   ],
+  hooks: {
+    afterChange: [
+      // Index the text of an uploaded PDF in Qdrant as `knowledge` chunks.
+      async ({ doc, req }) => {
+        // Only process PDF files
+        if (doc.mimeType !== 'application/pdf') return;
+
+        try {
+          // NOTE(review): modules are loaded lazily via require(), presumably so they are only
+          // resolved when a PDF is actually processed - confirm before hoisting to imports.
+          const fs = require('fs');
+          const path = require('path');
+          const crypto = require('crypto');
+          const pdfParse = require('pdf-parse');
+          const { upsertProductVector, deleteKnowledgeByMediaId } = require('../../lib/qdrant');
+
+          const filePath = path.join(process.cwd(), 'public/media', doc.filename);
+
+          if (!fs.existsSync(filePath)) {
+            // Surface the skip instead of silently indexing nothing
+            req.payload.logger.warn(`PDF not found at ${filePath}, skipping Qdrant ingestion`);
+            return;
+          }
+
+          req.payload.logger.info(`Extracting text from PDF: ${doc.filename}`);
+
+          // Read asynchronously so a large PDF does not block the event loop
+          const dataBuffer = await fs.promises.readFile(filePath);
+          const data = await pdfParse(dataBuffer);
+
+          // Clear any previously indexed chunks for this file just in case it's an update
+          await deleteKnowledgeByMediaId(doc.id);
+
+          // Chunk the text like we did in the ingest script
+          const chunks: string[] = data.text
+            .split(/\n\s*\n/)
+            .map((c: string) => c.trim())
+            .filter((c: string) => c.length > 50);
+
+          let successCount = 0;
+          for (let i = 0; i < chunks.length; i++) {
+            // Generate a deterministic UUID based on doc ID and chunk index
+            const hash = crypto.createHash('md5').update(`${doc.id}-${i}`).digest('hex');
+            // Qdrant strictly requires UUID: 8-4-4-4-12
+            const uuid = [
+              hash.substring(0, 8),
+              hash.substring(8, 12),
+              hash.substring(12, 16),
+              hash.substring(16, 20),
+              hash.substring(20, 32),
+            ].join('-');
+
+            await upsertProductVector(uuid, chunks[i], {
+              type: 'knowledge',
+              title: `${doc.filename} - Teil ${i + 1}`,
+              content: chunks[i],
+              source: doc.filename,
+              mediaId: doc.id,
+            });
+            successCount++;
+          }
+          req.payload.logger.info(
+            `Successfully ingested ${successCount} chunks from ${doc.filename} into Qdrant`,
+          );
+        } catch (e: unknown) {
+          const message = e instanceof Error ? e.message : String(e);
+          req.payload.logger.error(`Error parsing PDF ${doc.filename}: ${message}`);
+        }
+      },
+    ],
+    afterDelete: [
+      // Remove the Qdrant chunks that belong to a deleted PDF.
+      async ({ id, doc, req }) => {
+        if (doc.mimeType !== 'application/pdf') return;
+
+        try {
+          const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
+          const removed = await deleteKnowledgeByMediaId(id);
+          if (removed) {
+            req.payload.logger.info(`Removed Qdrant chunks for deleted PDF: ${doc.filename}`);
+          } else {
+            // The helper logs the cause itself and only reports failure via its return value
+            req.payload.logger.error(`Error removing Qdrant chunks for ${doc.filename}`);
+          }
+        } catch (e: unknown) {
+          const message = e instanceof Error ? e.message : String(e);
+          req.payload.logger.error(`Error removing Qdrant chunks for ${doc.filename}: ${message}`);
+        }
+      },
+    ],
+  },
 };
diff --git a/src/scripts/upload-pdfs.ts b/src/scripts/upload-pdfs.ts
new file mode 100644
index 00000000..44e69211
--- /dev/null
+++ b/src/scripts/upload-pdfs.ts
@@ -0,0 +1,79 @@
+import fs from 'fs';
+import os from 'os';
+import path from 'path';
+import 'dotenv/config';
+import { getPayload } from 'payload';
+import configPromise from '@payload-config';
+
+/**
+ * Upload every PDF found in a directory to the Payload `media` collection.
+ *
+ * The directory is taken from the first CLI argument, then from the `PDF_UPLOAD_DIR`
+ * environment variable, and finally falls back to the current user's `Downloads` folder.
+ * Files whose filename already exists in the collection are skipped.
+ * The process exits with code 1 when at least one upload failed.
+ */
+async function uploadPDFs() {
+  const payload = await getPayload({ config: configPromise });
+
+  const downloadDir =
+    process.argv[2] ?? process.env.PDF_UPLOAD_DIR ?? path.join(os.homedir(), 'Downloads');
+  // Match the extension case-insensitively so `.PDF` files are not skipped
+  const files = fs.readdirSync(downloadDir).filter((f) => f.toLowerCase().endsWith('.pdf'));
+
+  console.log(`Found ${files.length} PDFs in ${downloadDir}.`);
+
+  let failed = 0;
+  for (const file of files) {
+    const filePath = path.join(downloadDir, file);
+    try {
+      // Check if it already exists
+      const existing = await payload.find({
+        collection: 'media',
+        where: {
+          filename: {
+            equals: file,
+          },
+        },
+      });
+
+      if (existing.docs.length > 0) {
+        console.log(`Skipping ${file} - already exists in CMS`);
+        continue;
+      }
+
+      console.log(`Uploading ${file}...`);
+      // Read once and derive the size from the buffer that is actually uploaded
+      const data = fs.readFileSync(filePath);
+      await payload.create({
+        collection: 'media',
+        data: {
+          alt: file,
+        },
+        file: {
+          data,
+          mimetype: 'application/pdf',
+          name: file,
+          size: data.length,
+        },
+      });
+      console.log(`✅ Uploaded ${file}`);
+    } catch (err) {
+      failed++;
+      console.error(`❌ Failed to upload ${file}:`, err);
+    }
+  }
+
+  if (failed > 0) {
+    console.error(`Done with errors: ${failed} of ${files.length} PDFs could not be uploaded.`);
+    process.exit(1);
+  }
+
+  console.log('Done uploading PDFs to Payload CMS. Payload hooks have synced them to Qdrant.');
+  process.exit(0);
+}
+
+uploadPDFs().catch((err: unknown) => {
+  console.error('PDF upload aborted:', err);
+  process.exit(1);
+});