import fs from 'fs';
import path from 'path';
import crypto from 'crypto';
import 'dotenv/config';

// Override Qdrant URL for local script execution outside docker.
// `||` (not `??`) is deliberate: an empty QDRANT_URL also falls back to localhost.
// NOTE(review): when this file runs as a native ES module, all imports are
// evaluated before this assignment. Confirm that ../lib/qdrant reads
// QDRANT_URL lazily (at call time) rather than at import time.
process.env.QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333';

import { upsertProductVector } from '../lib/qdrant';

/** Value stored in the `source` payload field of every ingested chunk. */
const KNOWLEDGE_SOURCE = 'Kabelhandbuch KLZ.pdf';

/** Chunks with this many characters or fewer (after trimming) are discarded. */
const MIN_CHUNK_LENGTH = 50;

/** RFC 4122 "URL" namespace, used as the namespace for name-based chunk IDs. */
const UUID_NAMESPACE_URL = '6ba7b811-9dad-11d1-80b4-00c04fd430c8';

/**
 * Splits raw text into paragraph chunks.
 *
 * Paragraphs are separated by blank lines (a newline, optional whitespace,
 * another newline). Each chunk is trimmed, and chunks that are not longer than
 * {@link MIN_CHUNK_LENGTH} characters are dropped.
 *
 * No upper bound is applied: long paragraphs are neither split nor truncated.
 */
function splitIntoChunks(text: string): string[] {
  return text
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > MIN_CHUNK_LENGTH);
}

/**
 * Derives a stable, name-based (version 5, SHA-1) UUID from `name`.
 *
 * The same name always yields the same UUID, which lets repeated ingestion
 * runs address the same points instead of creating new ones.
 */
function deterministicUuid(name: string): string {
  const namespaceBytes = Buffer.from(UUID_NAMESPACE_URL.replace(/-/g, ''), 'hex');
  const bytes = crypto
    .createHash('sha1')
    .update(namespaceBytes)
    .update(name, 'utf8')
    .digest()
    .subarray(0, 16);

  // Stamp the version (5) and the RFC 4122 variant bits.
  bytes.writeUInt8((bytes.readUInt8(6) & 0x0f) | 0x50, 6);
  bytes.writeUInt8((bytes.readUInt8(8) & 0x3f) | 0x80, 8);

  const hex = bytes.toString('hex');
  return [
    hex.slice(0, 8),
    hex.slice(8, 12),
    hex.slice(12, 16),
    hex.slice(16, 20),
    hex.slice(20, 32),
  ].join('-');
}

/**
 * Ingests the extracted Kabelhandbuch text into Qdrant as distinct knowledge topics.
 *
 * Reads the UTF-8 text file at `txtPath`, splits it into paragraph chunks and
 * upserts each chunk sequentially with a payload of `type: 'knowledge'`.
 *
 * This is a script entry point: it terminates the process itself, with exit
 * code 0 on success and 1 when the file is missing or any step throws.
 *
 * @param txtPath Path to the plain-text extraction of the handbook.
 */
async function ingestPDF(txtPath: string) {
  if (!fs.existsSync(txtPath)) {
    console.error(`File not found: ${txtPath}`);
    process.exit(1);
  }

  try {
    const text = fs.readFileSync(txtPath, 'utf8');

    // Simple paragraph chunking. As a baseline every paragraph is indexed on
    // its own; merging short paragraphs or cutting over-long ones (to keep
    // embedding requests small) is not implemented here.
    const chunks = splitIntoChunks(text);

    console.log(`Extracted ${text.length} characters from PDF.`);
    console.log(`Generated ${chunks.length} chunks for vector ingestion.\n`);

    for (const [index, chunkText] of chunks.entries()) {
      // Qdrant requires point IDs to be a UUID or an unsigned integer.
      // The ID is derived from the source and the chunk position instead of
      // being random, so re-running the script overwrites the same points
      // rather than duplicating the whole handbook on every run.
      const syntheticId = deterministicUuid(`knowledge:${KNOWLEDGE_SOURCE}:${index}`);

      const payloadData = {
        type: 'knowledge', // Custom flag to differentiate from 'product'
        title: `Kabelhandbuch Wissen - Bereich ${index + 1}`,
        content: chunkText,
        source: KNOWLEDGE_SOURCE,
      };

      // Reuses the product upsert helper; per the original author's note it
      // just embeds the text and stores the payload.
      await upsertProductVector(syntheticId, chunkText, payloadData);
      console.log(`✅ Upserted chunk ${index + 1}/${chunks.length}`);
    }

    console.log('🎉 PDF Ingestion Complete!');
    process.exit(0);
  } catch (err) {
    console.error('Failed to parse PDF:', err);
    process.exit(1);
  }
}

// Run mapping. An optional first CLI argument overrides the default input file.
const DEFAULT_TXT_PATH = '/Users/marcmintel/Downloads/kabelhandbuch.txt';
const targetTxt = path.resolve(process.argv[2] ?? DEFAULT_TXT_PATH);

// ingestPDF handles its own errors and exits the process; `void` marks the
// promise as intentionally not awaited.
void ingestPDF(targetTxt);