feat: Automate Qdrant PDF ingestion via Media hooks
All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
This commit is contained in:
@@ -106,7 +106,7 @@
|
|||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "bash -c '[ -f .env ] || (cp .env.example .env && sed -i.bak \"s/TRAEFIK_HOST=klz-cables.com/TRAEFIK_HOST=klz.localhost/\" .env && rm -f .env.bak && echo \"✅ Created .env from .env.example\"); trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; docker network create infra 2>/dev/null || true && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up klz-app klz-db klz-proxy klz-qdrant klz-redis --remove-orphans'",
|
"dev": "bash -c '[ -f .env ] || (cp .env.example .env && sed -i.bak \"s/TRAEFIK_HOST=klz-cables.com/TRAEFIK_HOST=klz.localhost/\" .env && rm -f .env.bak && echo \"✅ Created .env from .env.example\"); trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; docker network create infra 2>/dev/null || true && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down && COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up klz-app klz-db klz-proxy klz-qdrant klz-redis --remove-orphans'",
|
||||||
"dev:local": "bash -c 'trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy klz-qdrant klz-redis && POSTGRES_URI=NODE_ENV=development next dev --webpack --port 3100 --hostname 0.0.0.0'",
|
"dev:local": "bash -c 'trap \"COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml down\" EXIT INT TERM; COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy klz-qdrant klz-redis && POSTGRES_URI=\"\" NODE_ENV=development next dev --webpack --port 3100 --hostname 0.0.0.0'",
|
||||||
"dev:infra": "COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy",
|
"dev:infra": "COMPOSE_PROJECT_NAME=klz-2026 docker-compose -f docker-compose.dev.yml up -d klz-db klz-proxy",
|
||||||
"build": "next build",
|
"build": "next build",
|
||||||
"start": "next start",
|
"start": "next start",
|
||||||
|
|||||||
@@ -152,6 +152,31 @@ export async function deleteProductVector(id: string | number) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes the knowledge chunks that were indexed from a given Media document.
 *
 * Issues a filtered delete against the collection, matching points whose
 * `mediaId` payload key equals the supplied ID. Any failure is written to the
 * console and is not rethrown.
 *
 * @param mediaId - ID of the source Media document.
 */
export async function deleteKnowledgeByMediaId(mediaId: string | number) {
  try {
    await ensureCollection();

    // Single condition: payload key `mediaId` must equal the given ID.
    const byMediaId = { key: 'mediaId', match: { value: mediaId } };
    await qdrant.delete(COLLECTION_NAME, {
      wait: true,
      filter: { must: [byMediaId] },
    });

    console.log(`Successfully deleted Qdrant chunks for Media ID: ${mediaId}`);
  } catch (error) {
    console.error('Error deleting knowledge by Media ID from Qdrant:', error);
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search products in Qdrant.
|
* Search products in Qdrant.
|
||||||
* Results are cached in Redis for 30 minutes keyed by query text.
|
* Results are cached in Redis for 30 minutes keyed by query text.
|
||||||
|
|||||||
@@ -45,4 +45,81 @@ export const Media: CollectionConfig = {
|
|||||||
type: 'text',
|
type: 'text',
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
hooks: {
  afterChange: [
    /**
     * Indexes an uploaded PDF into Qdrant after its Media document is created or updated.
     *
     * Reads the file from `public/media`, extracts its text with `pdf-parse`, removes any
     * chunks previously stored for this Media ID, then upserts one vector per text chunk.
     * Errors are logged through the Payload logger and never rethrown, so a failed
     * ingestion does not fail the surrounding Media operation.
     */
    async ({ doc, req }) => {
      // Only process PDF files
      if (doc.mimeType !== 'application/pdf') return;

      try {
        const fs = require('fs');
        const path = require('path');
        const crypto = require('crypto');
        const pdfParse = require('pdf-parse');
        const { upsertProductVector, deleteKnowledgeByMediaId } = require('../../lib/qdrant');

        const filePath = path.join(process.cwd(), 'public/media', doc.filename);

        if (!fs.existsSync(filePath)) {
          // Make the skip visible instead of silently leaving the PDF un-indexed.
          req.payload.logger.warn(
            `Skipping Qdrant ingestion for ${doc.filename}: file not found at ${filePath}`,
          );
          return;
        }

        req.payload.logger.info(`Extracting text from PDF: ${doc.filename}`);

        // Asynchronous read so a large PDF does not block the event loop.
        const dataBuffer = await fs.promises.readFile(filePath);
        const data = await pdfParse(dataBuffer);

        // Clear any previously indexed chunks for this file just in case it's an update
        await deleteKnowledgeByMediaId(doc.id);

        // Split on blank lines and drop fragments of 50 characters or fewer.
        const chunks: string[] = data.text
          .split(/\n\s*\n/)
          .map((c: string) => c.trim())
          .filter((c: string) => c.length > 50);

        let successCount = 0;
        for (const [i, chunk] of chunks.entries()) {
          // Deterministic point ID: MD5 of "<doc id>-<chunk index>", laid out in
          // the 8-4-4-4-12 UUID format expected for Qdrant point IDs.
          const hash: string = crypto.createHash('md5').update(`${doc.id}-${i}`).digest('hex');
          const uuid = [
            hash.substring(0, 8),
            hash.substring(8, 12),
            hash.substring(12, 16),
            hash.substring(16, 20),
            hash.substring(20, 32),
          ].join('-');

          await upsertProductVector(uuid, chunk, {
            type: 'knowledge',
            title: `${doc.filename} - Teil ${i + 1}`,
            content: chunk,
            source: doc.filename,
            mediaId: doc.id,
          });
          successCount++;
        }

        req.payload.logger.info(
          `Successfully ingested ${successCount} chunks from ${doc.filename} into Qdrant`,
        );
      } catch (e: unknown) {
        // Thrown values are not guaranteed to be Error instances; narrow before reading `.message`.
        const message = e instanceof Error ? e.message : String(e);
        req.payload.logger.error(`Error parsing PDF ${doc.filename}: ${message}`);
      }
    },
  ],
  afterDelete: [
    /**
     * Removes the Qdrant chunks of a PDF once its Media document has been deleted.
     * Errors are logged through the Payload logger and never rethrown.
     */
    async ({ id, doc, req }) => {
      if (doc.mimeType !== 'application/pdf') return;

      try {
        const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
        await deleteKnowledgeByMediaId(id);
        req.payload.logger.info(`Removed Qdrant chunks for deleted PDF: ${doc.filename}`);
      } catch (e: unknown) {
        const message = e instanceof Error ? e.message : String(e);
        req.payload.logger.error(
          `Error removing Qdrant chunks for ${doc.filename}: ${message}`,
        );
      }
    },
  ],
},
|
||||||
};
|
};
|
||||||
|
|||||||
58
src/scripts/upload-pdfs.ts
Normal file
58
src/scripts/upload-pdfs.ts
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
import fs from 'fs';
|
||||||
|
import path from 'path';
|
||||||
|
import 'dotenv/config';
|
||||||
|
import { getPayload } from 'payload';
|
||||||
|
import configPromise from '@payload-config';
|
||||||
|
|
||||||
|
/**
 * Uploads every PDF found in a local directory to the Payload `media` collection.
 *
 * Files whose name already exists in the collection are skipped. A failure on one
 * file is logged and does not stop the remaining uploads. The process exits with
 * code 1 if any upload failed and 0 otherwise.
 *
 * @param downloadDir - Directory to scan for PDFs. Defaults to the first CLI
 *   argument, then the `PDF_UPLOAD_DIR` environment variable, then the path this
 *   script originally hard-coded.
 */
async function uploadPDFs(
  downloadDir: string = process.argv[2] ??
    process.env.PDF_UPLOAD_DIR ??
    '/Users/marcmintel/Downloads',
) {
  const payload = await getPayload({ config: configPromise });

  // Match the extension case-insensitively so files such as "Datasheet.PDF" are included.
  const files = fs.readdirSync(downloadDir).filter((f) => f.toLowerCase().endsWith('.pdf'));

  console.log(`Found ${files.length} PDFs in ${downloadDir}.`);

  let failedCount = 0;

  for (const file of files) {
    const filePath = path.join(downloadDir, file);
    try {
      // Check if it already exists; one match is enough to decide.
      const existing = await payload.find({
        collection: 'media',
        where: {
          filename: {
            equals: file,
          },
        },
        limit: 1,
      });

      if (existing.docs.length > 0) {
        console.log(`Skipping ${file} - already exists in CMS`);
        continue;
      }

      console.log(`Uploading ${file}...`);

      // Read once, only for files that will be uploaded; the buffer length is the file size.
      const data = fs.readFileSync(filePath);
      await payload.create({
        collection: 'media',
        data: {
          alt: file,
        },
        file: {
          data,
          mimetype: 'application/pdf',
          name: file,
          size: data.length,
        },
      });
      console.log(`✅ Uploaded ${file}`);
    } catch (err) {
      failedCount++;
      console.error(`❌ Failed to upload ${file}:`, err);
    }
  }

  if (failedCount > 0) {
    console.error(`Finished with ${failedCount} failed upload(s) out of ${files.length} PDFs.`);
  } else {
    console.log('Done uploading PDFs to Payload CMS. Payload hooks have synced them to Qdrant.');
  }

  // Report failures through the exit code so callers and CI can detect them.
  process.exit(failedCount > 0 ? 1 : 0);
}

// Catch failures outside the per-file loop (e.g. Payload failing to initialise or an
// unreadable directory) instead of leaving an unhandled promise rejection.
uploadPDFs().catch((err: unknown) => {
  console.error('❌ PDF upload script failed:', err);
  process.exit(1);
});
|
||||||
Reference in New Issue
Block a user