feat(ai-search): add interactive WebGL Orb, Markdown support, and Sentry tracking
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 11s
Build & Deploy / 🧪 QA (push) Successful in 1m18s
Build & Deploy / 🚀 Deploy (push) Has been cancelled
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been cancelled
Build & Deploy / 🔔 Notify (push) Has been cancelled
Build & Deploy / 🏗️ Build (push) Has been cancelled
CI - Lint, Typecheck & Test / quality-assurance (pull_request) Failing after 3m55s
Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 11s
Build & Deploy / 🧪 QA (push) Successful in 1m18s
Build & Deploy / 🚀 Deploy (push) Has been cancelled
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been cancelled
Build & Deploy / 🔔 Notify (push) Has been cancelled
Build & Deploy / 🏗️ Build (push) Has been cancelled
CI - Lint, Typecheck & Test / quality-assurance (pull_request) Failing after 3m55s
This commit is contained in:
63
src/scripts/ingest-pdf.ts
Normal file
63
src/scripts/ingest-pdf.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import crypto from 'crypto';
|
||||
|
||||
// When this script is run directly on the host (outside docker), default the
// Qdrant endpoint to the local instance unless QDRANT_URL is already set.
// NOTE(review): static `import` declarations are hoisted in ES modules, so if
// '../lib/qdrant' reads QDRANT_URL at module load time this default may be
// applied too late — confirm against that module / the script runner.
if (!process.env.QDRANT_URL) {
  process.env.QDRANT_URL = 'http://localhost:6333';
}
|
||||
|
||||
import { upsertProductVector } from '../lib/qdrant';
|
||||
|
||||
// Ingests the extracted Kabelhandbuch text into Qdrant as distinct knowledge topics.
|
||||
/**
 * Builds a stable, UUID-formatted Qdrant point ID for one knowledge chunk.
 *
 * The ID is derived from a SHA-256 hash of the source name and the chunk's
 * position, so re-running the ingestion overwrites the same points instead of
 * piling up duplicates (which a random UUID per run would do).
 */
function knowledgePointId(source: string, index: number): string {
  const hex = crypto.createHash('sha256').update(`${source}#${index}`).digest('hex');
  // Stamp the RFC 4122 variant bits (10xx) and a version nibble so the result
  // is a well-formed UUID string.
  const variant = ((parseInt(hex.charAt(16), 16) & 0x3) | 0x8).toString(16);
  return [
    hex.slice(0, 8),
    hex.slice(8, 12),
    `5${hex.slice(13, 16)}`,
    `${variant}${hex.slice(17, 20)}`,
    hex.slice(20, 32),
  ].join('-');
}

/**
 * Ingests the extracted Kabelhandbuch text into Qdrant as distinct knowledge topics.
 *
 * The text file is split on blank lines into paragraph chunks; chunks of 50
 * characters or fewer are dropped. Each remaining chunk is embedded and stored
 * through `upsertProductVector` with a payload flagged as `type: 'knowledge'`.
 *
 * This function terminates the process: exit code 0 on success, 1 when the
 * file is missing or any step of the ingestion throws.
 *
 * @param txtPath Path to the plain-text extraction of the PDF.
 */
async function ingestPDF(txtPath: string): Promise<void> {
  if (!fs.existsSync(txtPath)) {
    console.error(`File not found: ${txtPath}`);
    process.exit(1);
  }

  const source = 'Kabelhandbuch KLZ.pdf';

  try {
    const text = fs.readFileSync(txtPath, 'utf8');

    // Simple paragraph chunking: split on blank lines and discard fragments
    // that are too short to carry useful content.
    const chunks = text
      .split(/\n\s*\n/)
      .map((c) => c.trim())
      .filter((c) => c.length > 50);

    console.log(`Extracted ${text.length} characters from PDF.`);
    console.log(`Generated ${chunks.length} chunks for vector ingestion.\n`);

    if (chunks.length === 0) {
      console.warn('No chunks longer than 50 characters were found; nothing to ingest.');
    }

    for (let i = 0; i < chunks.length; i++) {
      // Baseline strategy: every paragraph chunk is indexed individually as
      // 'knowledge'. Chunk sizes are NOT capped here yet; merging short
      // paragraphs or splitting very long ones (to keep the embedding request
      // small) is left as a follow-up.
      const chunkText = chunks[i];

      // Qdrant strictly requires a UUID or unsigned int as point ID. A
      // hash-derived UUID keeps re-runs idempotent; NOTE(review): this assumes
      // Payload Product IDs are never UUIDs of this exact form — confirm.
      const syntheticId = knowledgePointId(source, i);

      const payloadData = {
        type: 'knowledge', // Custom flag to differentiate from 'product'
        title: `Kabelhandbuch Wissen - Bereich ${i + 1}`,
        content: chunkText,
        source,
      };

      // Reuse the existing upsert helper; it is called with the ID, the text
      // to embed, and the payload to store alongside the vector.
      await upsertProductVector(syntheticId, chunkText, payloadData);
      console.log(`✅ Upserted chunk ${i + 1}/${chunks.length}`);
    }

    console.log('🎉 PDF Ingestion Complete!');
    process.exit(0);
  } catch (err: unknown) {
    // Covers both reading the text file and the embedding/upsert calls.
    console.error('Failed to ingest PDF text:', err);
    process.exit(1);
  }
}
|
||||
|
||||
// Script entry point.
// The text file to ingest can be passed as the first CLI argument; when it is
// omitted, the original hard-coded developer path is used as a fallback.
const targetTxt = process.argv[2] ?? '/Users/marcmintel/Downloads/kabelhandbuch.txt';

// ingestPDF handles its own errors and exits the process itself, so the
// returned promise is intentionally not awaited.
void ingestPDF(targetTxt);
|
||||
Reference in New Issue
Block a user