Files
klz-cables.com/src/scripts/ingest-pdf.ts
Marc Mintel 8e99c9d121
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 6s
Build & Deploy / 🧪 QA (push) Failing after 55s
Build & Deploy / 🏗️ Build (push) Has been skipped
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 2s
feat: automated Qdrant sync with Mistral embeddings + Kabelhandbuch ingestion
- Switch embedding API from OpenRouter to Mistral mistral-embed (1024-dim, EU/DSGVO)
- Add afterChange/afterDelete hooks to Posts.ts and Pages.ts for live sync
- Integrate kabelhandbuch.txt parsing into /api/sync-qdrant boot route
- Add .gitignore entries for kabelhandbuch.txt
2026-03-07 15:39:10 +01:00

65 lines
2.2 KiB
TypeScript

import fs from 'fs';
import path from 'path';
import crypto from 'crypto';
import 'dotenv/config';
// Local runs outside docker: default to a Qdrant instance on localhost unless
// QDRANT_URL is already set to a non-empty value.
// NOTE(review): this sits above the '../lib/qdrant' import, presumably so that
// module sees the value at load time. Under native ESM, imports are evaluated
// before this statement runs — confirm the module format used to run the script.
if (!process.env.QDRANT_URL) {
  process.env.QDRANT_URL = 'http://localhost:6333';
}
import { upsertProductVector } from '../lib/qdrant';
/**
 * Splits raw text into paragraph chunks.
 *
 * Paragraphs are separated by one or more blank (whitespace-only) lines. Each
 * paragraph is trimmed, and paragraphs of 50 characters or fewer are dropped.
 *
 * @param text Full text extracted from the PDF.
 * @returns The remaining paragraphs in document order.
 */
function splitIntoChunks(text: string): string[] {
  return text
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 50);
}

/**
 * Derives a stable, name-based UUID (version 5 layout) for a chunk.
 *
 * The ID depends only on the source name and the chunk text, so ingesting the
 * same document again yields the same IDs and the upsert overwrites the
 * existing points instead of duplicating them. Two chunks with identical text
 * from the same source share one ID.
 *
 * @param source Name of the document the chunk comes from.
 * @param chunkText The chunk content.
 * @returns A lowercase, hyphenated UUID string.
 */
function deterministicChunkId(source: string, chunkText: string): string {
  // Fixed namespace (the RFC 4122 URL namespace); any constant works as long
  // as it never changes between runs.
  const namespace = Buffer.from('6ba7b8119dad11d180b400c04fd430c8', 'hex');
  const bytes = crypto
    .createHash('sha1')
    .update(namespace)
    .update(`${source}\n${chunkText}`, 'utf8')
    .digest()
    .subarray(0, 16);

  // Stamp the version (5) and RFC 4122 variant bits so the value is a
  // well-formed UUID.
  bytes.writeUInt8((bytes.readUInt8(6) & 0x0f) | 0x50, 6);
  bytes.writeUInt8((bytes.readUInt8(8) & 0x3f) | 0x80, 8);

  const hex = bytes.toString('hex');
  return [
    hex.slice(0, 8),
    hex.slice(8, 12),
    hex.slice(12, 16),
    hex.slice(16, 20),
    hex.slice(20),
  ].join('-');
}

/**
 * Ingests the extracted Kabelhandbuch text into Qdrant as distinct knowledge topics.
 *
 * Reads the UTF-8 text file at `txtPath`, splits it into paragraph chunks and
 * upserts every chunk individually through `upsertProductVector` with a
 * `type: 'knowledge'` payload. Chunks are neither merged nor truncated here.
 *
 * Point IDs are deterministic (see `deterministicChunkId`), so re-running the
 * script updates existing points rather than adding a second copy of each chunk.
 *
 * This is a CLI entry point: it ends the process with exit code 0 on success
 * and exit code 1 when the file does not exist or any step throws.
 *
 * @param txtPath Path to the text file extracted from the PDF.
 */
async function ingestPDF(txtPath: string): Promise<void> {
  if (!fs.existsSync(txtPath)) {
    console.error(`File not found: ${txtPath}`);
    process.exit(1);
  }

  try {
    const text = fs.readFileSync(txtPath, 'utf8');
    const chunks = splitIntoChunks(text);

    console.log(`Extracted ${text.length} characters from PDF.`);
    console.log(`Generated ${chunks.length} chunks for vector ingestion.\n`);

    const source = 'Kabelhandbuch KLZ.pdf';

    // Upserts are awaited one at a time, in document order.
    for (const [index, chunkText] of chunks.entries()) {
      // Qdrant point IDs must be a UUID or an unsigned integer; a UUID also
      // keeps these points from colliding with Payload product IDs.
      const pointId = deterministicChunkId(source, chunkText);

      const payloadData = {
        type: 'knowledge', // Custom flag to differentiate from 'product'
        title: `Kabelhandbuch Wissen - Bereich ${index + 1}`,
        content: chunkText,
        source,
      };

      // Reuse the existing upsert helper: it embeds the text and stores the payload.
      await upsertProductVector(pointId, chunkText, payloadData);
      console.log(`✅ Upserted chunk ${index + 1}/${chunks.length}`);
    }

    console.log('🎉 PDF Ingestion Complete!');
    process.exit(0);
  } catch (err) {
    console.error('Failed to parse PDF:', err);
    process.exit(1);
  }
}
// Run the ingestion. The text file may be passed as the first CLI argument;
// without one, the original hard-coded developer path is used so existing
// invocations keep working.
const targetTxt = path.resolve(
  process.argv[2] || '/Users/marcmintel/Downloads/kabelhandbuch.txt',
);
// ingestPDF reports its own errors and exits the process, so the promise is
// intentionally not awaited here.
void ingestPDF(targetTxt);