All checks were successful
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🧪 QA (push) Successful in 1m1s
Build & Deploy / 🏗️ Build (push) Successful in 4m22s
Build & Deploy / 🚀 Deploy (push) Successful in 1m41s
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
126 lines
3.8 KiB
TypeScript
126 lines
3.8 KiB
TypeScript
import type { CollectionConfig } from 'payload';
|
|
|
|
/** Upload directory, relative to the process working directory. */
const MEDIA_STATIC_DIR = 'public/media';

/** MIME type of the uploads whose text is indexed into Qdrant. */
const PDF_MIME_TYPE = 'application/pdf';

/** A paragraph must be strictly longer than this many characters to be indexed. */
const MIN_CHUNK_LENGTH = 50;

/** Splits extracted text on blank lines and keeps only trimmed paragraphs longer than MIN_CHUNK_LENGTH. */
function splitIntoChunks(text: string | null | undefined): string[] {
  return (text ?? '')
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > MIN_CHUNK_LENGTH);
}

/** Formats the first 32 characters of a hex digest in the 8-4-4-4-12 UUID layout. */
function hexToUuid(hex: string): string {
  return [
    hex.substring(0, 8),
    hex.substring(8, 12),
    hex.substring(12, 16),
    hex.substring(16, 20),
    hex.substring(20, 32),
  ].join('-');
}

/** Produces a loggable message from whatever value was thrown. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Tells whether a thrown value is a Node "file does not exist" (ENOENT) error. */
function isFileNotFound(error: unknown): boolean {
  return (
    typeof error === 'object' && error !== null && (error as { code?: unknown }).code === 'ENOENT'
  );
}

/**
 * Media upload collection.
 *
 * Documents are readable by everyone and three width-constrained image sizes are
 * generated per upload. PDF uploads are additionally parsed after every change and
 * their paragraphs are pushed into Qdrant as `knowledge` entries; those entries are
 * removed again when the document is deleted or its file is replaced by a non-PDF.
 *
 * Hook failures are logged and never rethrown, so a broken PDF or an unreachable
 * Qdrant instance does not fail the save/delete operation itself.
 */
export const Media: CollectionConfig = {
  slug: 'media',
  access: {
    read: () => true,
  },
  admin: {
    useAsTitle: 'filename',
    defaultColumns: ['filename', 'alt', 'updatedAt'],
  },
  upload: {
    staticDir: MEDIA_STATIC_DIR,
    adminThumbnail: 'thumbnail',
    imageSizes: [
      {
        name: 'thumbnail',
        width: 600,
        // height: undefined allows wide 5:1 aspect ratios to be preserved without cropping
        height: undefined,
        position: 'centre',
      },
      {
        name: 'card',
        width: 768,
        height: undefined,
        position: 'centre',
      },
      {
        name: 'tablet',
        width: 1024,
        height: undefined,
        position: 'centre',
      },
    ],
  },
  fields: [
    {
      name: 'alt',
      type: 'text',
      required: true,
    },
    {
      name: 'caption',
      type: 'text',
    },
  ],
  hooks: {
    afterChange: [
      async ({ doc, previousDoc, req }) => {
        const logger = req.payload.logger;

        if (doc.mimeType !== PDF_MIME_TYPE) {
          // The file was swapped from a PDF to something else: the chunks indexed
          // for the old PDF would otherwise stay in Qdrant forever.
          if (previousDoc?.mimeType === PDF_MIME_TYPE) {
            try {
              const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
              await deleteKnowledgeByMediaId(doc.id);
              logger.info(`Removed Qdrant chunks for replaced PDF: ${previousDoc.filename}`);
            } catch (e: unknown) {
              logger.error(
                `Error removing Qdrant chunks for ${previousDoc.filename}: ${describeError(e)}`,
              );
            }
          }
          return doc;
        }

        try {
          // Modules are loaded lazily inside the hook, as in the original code.
          // NOTE(review): presumably this keeps Node-only modules out of bundles
          // that import this config - confirm before converting to static imports.
          const fs = require('fs');
          const path = require('path');
          const crypto = require('crypto');
          const pdfParse = require('pdf-parse');
          const { upsertProductVector, deleteKnowledgeByMediaId } = require('../../lib/qdrant');

          const filePath = path.join(process.cwd(), MEDIA_STATIC_DIR, doc.filename);

          // Read without blocking the event loop; a single read also avoids the
          // exists-then-read race of a separate existence check.
          let dataBuffer;
          try {
            dataBuffer = await fs.promises.readFile(filePath);
          } catch (readError: unknown) {
            if (!isFileNotFound(readError)) {
              throw readError;
            }
            logger.warn(`Skipping PDF ingestion, file not found on disk: ${filePath}`);
            return doc;
          }

          logger.info(`Extracting text from PDF: ${doc.filename}`);
          const data = await pdfParse(dataBuffer);

          // Clear any previously indexed chunks for this file just in case it's an update
          await deleteKnowledgeByMediaId(doc.id);

          // Chunk the text like we did in the ingest script
          const chunks = splitIntoChunks(data.text);

          // Upserts run one at a time, preserving the original sequential behaviour.
          for (const [index, chunk] of chunks.entries()) {
            // Deterministic ID derived from doc ID and chunk index, so re-ingesting
            // the same document targets the same points.
            const hash = crypto.createHash('md5').update(`${doc.id}-${index}`).digest('hex');
            // Qdrant strictly requires UUID: 8-4-4-4-12
            const uuid = hexToUuid(hash);

            await upsertProductVector(uuid, chunk, {
              type: 'knowledge',
              title: `${doc.filename} - Teil ${index + 1}`,
              content: chunk,
              source: doc.filename,
              mediaId: doc.id,
            });
          }

          // Any failed upsert jumps to the catch below, so reaching this line
          // means every chunk was written.
          logger.info(
            `Successfully ingested ${chunks.length} chunks from ${doc.filename} into Qdrant`,
          );
        } catch (e: unknown) {
          logger.error(`Error parsing PDF ${doc.filename}: ${describeError(e)}`);
        }

        return doc;
      },
    ],
    afterDelete: [
      async ({ id, doc, req }) => {
        if (doc.mimeType !== PDF_MIME_TYPE) {
          return;
        }

        try {
          const { deleteKnowledgeByMediaId } = require('../../lib/qdrant');
          await deleteKnowledgeByMediaId(id);
          req.payload.logger.info(`Removed Qdrant chunks for deleted PDF: ${doc.filename}`);
        } catch (e: unknown) {
          req.payload.logger.error(
            `Error removing Qdrant chunks for ${doc.filename}: ${describeError(e)}`,
          );
        }
      },
    ],
  },
};
|