Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 11s
Build & Deploy / 🧪 QA (push) Successful in 1m18s
Build & Deploy / 🚀 Deploy (push) Has been cancelled
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been cancelled
Build & Deploy / 🔔 Notify (push) Has been cancelled
Build & Deploy / 🏗️ Build (push) Has been cancelled
CI - Lint, Typecheck & Test / quality-assurance (pull_request) Failing after 3m55s
64 lines
2.2 KiB
TypeScript
64 lines
2.2 KiB
TypeScript
import fs from 'fs';
|
|
import path from 'path';
|
|
import crypto from 'crypto';
|
|
|
|
// Override Qdrant URL for local script execution outside docker
|
|
// Fall back to the local Qdrant endpoint when QDRANT_URL is unset or empty.
// NOTE(review): if this file runs as a native ES module, the static import of
// '../lib/qdrant' is evaluated before this statement — confirm that module
// reads QDRANT_URL lazily rather than at import time.
if (!process.env.QDRANT_URL) {
  process.env.QDRANT_URL = 'http://localhost:6333';
}
|
|
|
|
import { upsertProductVector } from '../lib/qdrant';
|
|
|
|
/** A paragraph is ingested only if its trimmed length exceeds this many characters. */
const MIN_CHUNK_LENGTH = 50;

/** Value written to the `source` field of every knowledge payload. */
const KNOWLEDGE_SOURCE = 'Kabelhandbuch KLZ.pdf';

/**
 * Splits text into paragraph chunks: breaks on blank lines, trims each piece
 * and drops pieces that are not longer than MIN_CHUNK_LENGTH characters.
 */
function splitIntoChunks(text: string): string[] {
  return text
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > MIN_CHUNK_LENGTH);
}

/**
 * Builds a stable, UUID-formatted point ID from the chunk's position and text.
 *
 * The same input file therefore yields the same IDs on every run, so
 * re-running the script overwrites existing points instead of adding a
 * duplicate set. The version/variant nibbles are forced so the result is a
 * well-formed RFC 4122 style UUID string.
 */
function deterministicChunkId(index: number, chunkText: string): string {
  const hex = crypto
    .createHash('sha256')
    .update(`${KNOWLEDGE_SOURCE}\u0000${index}\u0000${chunkText}`)
    .digest('hex');
  // (x & 0x3f) | 0x80 is always in 0x80..0xbf, i.e. exactly two hex digits.
  const variantByte = ((parseInt(hex.slice(16, 18), 16) & 0x3f) | 0x80).toString(16);
  return [
    hex.slice(0, 8),
    hex.slice(8, 12),
    `5${hex.slice(13, 16)}`,
    `${variantByte}${hex.slice(18, 20)}`,
    hex.slice(20, 32),
  ].join('-');
}

/**
 * Ingests the extracted Kabelhandbuch text into Qdrant as distinct knowledge topics.
 *
 * Reads the UTF-8 text file at `txtPath`, splits it into paragraph chunks and
 * passes each chunk to `upsertProductVector` together with a payload flagged
 * `type: 'knowledge'`. Chunks are indexed individually; no merging of short
 * paragraphs or cutting of long ones is performed.
 *
 * This is a script entry point and terminates the process itself: exit code 1
 * when the file is missing or any step throws, exit code 0 after all chunks
 * have been upserted.
 *
 * @param txtPath - Path to the plain-text file extracted from the PDF.
 * @returns A promise; in practice the process exits before it settles.
 */
async function ingestPDF(txtPath: string): Promise<void> {
  if (!fs.existsSync(txtPath)) {
    console.error(`File not found: ${txtPath}`);
    process.exit(1);
  }

  try {
    const text = fs.readFileSync(txtPath, 'utf8');
    const chunks = splitIntoChunks(text);

    console.log(`Extracted ${text.length} characters from PDF.`);
    console.log(`Generated ${chunks.length} chunks for vector ingestion.\n`);

    // Upserts run strictly one after another, as in the original loop.
    for (const [i, chunkText] of chunks.entries()) {
      // Qdrant point IDs must be a UUID or an unsigned integer; a UUID also
      // keeps these points from colliding with Payload Product IDs.
      const syntheticId = deterministicChunkId(i, chunkText);

      const payloadData = {
        type: 'knowledge', // Custom flag to differentiate from 'product'
        title: `Kabelhandbuch Wissen - Bereich ${i + 1}`,
        content: chunkText,
        source: KNOWLEDGE_SOURCE,
      };

      // Reuses the product upsert helper; per the original author's note it
      // only embeds the text and stores the payload.
      await upsertProductVector(syntheticId, chunkText, payloadData);
      console.log(`✅ Upserted chunk ${i + 1}/${chunks.length}`);
    }

    console.log('🎉 PDF Ingestion Complete!');
    process.exit(0);
  } catch (err: unknown) {
    // Covers both reading the file and the upsert calls, not PDF parsing.
    console.error('Failed to ingest PDF text:', err);
    process.exit(1);
  }
}
|
|
|
|
// Run mapping
// The input file may be given as the first CLI argument; without one the
// script falls back to the original hard-coded location.
const DEFAULT_TXT_PATH = '/Users/marcmintel/Downloads/kabelhandbuch.txt';
const cliTxtPath = process.argv[2];
const targetTxt = cliTxtPath ? path.resolve(cliTxtPath) : DEFAULT_TXT_PATH;

// `void` marks the promise as intentionally not awaited: ingestPDF handles its
// own errors and ends the process via process.exit.
void ingestPDF(targetTxt);
|