/**
 * Index all published blog posts into Qdrant for AI search.
 *
 * Usage: pnpm --filter @mintel/web run index:posts
 */
import { getPayload } from 'payload';
import configPromise from '../payload.config';
import { upsertPostVector } from '../src/lib/qdrant';

/** Total number of connection attempts before giving up. */
const MAX_CONNECT_ATTEMPTS = 5;
/** Pause between two connection attempts. */
const RETRY_DELAY_MS = 3000;
/** Number of posts requested per `payload.find` page. */
const PAGE_SIZE = 100;
/** Pause after each upsert, to avoid rate limiting on the embedding API. */
const UPSERT_DELAY_MS = 200;
/** Maximum number of content characters embedded per post (avoids token overflow). */
const MAX_CONTENT_CHARS = 2000;
/** Node types whose text is terminated with a line break in the extracted plain text. */
const BLOCK_NODE_TYPES = new Set(['paragraph', 'heading', 'listitem', 'quote']);

/** Resolves after `ms` milliseconds. */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => setTimeout(resolve, ms));

/**
 * Recursively flattens a rich-text value into plain text.
 *
 * Accepts a string, an array of nodes, a node carrying `text` or `children`,
 * or a wrapper object carrying `root`. Anything else yields an empty string.
 *
 * @param node - The value to flatten.
 * @returns The concatenated text; block-level nodes are followed by `\n`.
 */
function extractPlainText(node: any): string {
  if (!node) return '';

  if (typeof node === 'string') return node;

  if (Array.isArray(node)) {
    return node.map(extractPlainText).join('');
  }

  if (typeof node !== 'object') return '';

  // Text node.
  if (node.text) return node.text;

  // A line break node has neither text nor children; without this the words
  // on either side of it would be glued together.
  if (node.type === 'linebreak') return '\n';

  // Element node: concatenate the children, terminating block-level elements.
  if (Array.isArray(node.children)) {
    const childText = node.children.map(extractPlainText).join('');
    return BLOCK_NODE_TYPES.has(node.type) ? `${childText}\n` : childText;
  }

  // Wrapper object around the root node.
  if (node.root) {
    return extractPlainText(node.root);
  }

  return '';
}

/**
 * Returns a loggable form of a connection string with the password masked.
 *
 * @param uri - The raw connection string, if any.
 * @returns `'default'` when no URI is set, the URI with its password replaced
 *   by `***` when it parses, or a placeholder when it does not parse.
 */
function redactConnectionString(uri: string | undefined): string {
  if (!uri) return 'default';
  try {
    const parsed = new URL(uri);
    if (parsed.password) parsed.password = '***';
    return parsed.toString();
  } catch {
    // Never echo a string we could not parse: it may still contain a secret.
    return '<redacted>';
  }
}

/** Tells whether an error thrown while connecting means "database not reachable yet". */
function isRetryableConnectionError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false;

  const { code, message } = error as { code?: unknown; message?: unknown };
  if (code === 'ECONNREFUSED' || code === 'ENOTFOUND') return true;

  return (
    typeof message === 'string' &&
    (message.includes('ECONNREFUSED') || message.includes('cannot connect to Postgres'))
  );
}

/**
 * Initializes Payload, retrying while the database is not reachable.
 *
 * @returns The initialized Payload instance.
 * @throws The original error when it is not a connection error, or an `Error`
 *   once all attempts have failed.
 */
async function connectWithRetry() {
  for (let attempt = 1; attempt <= MAX_CONNECT_ATTEMPTS; attempt++) {
    try {
      console.log(
        `Connecting to database (URI: ${redactConnectionString(process.env.DATABASE_URI)})...`,
      );
      return await getPayload({ config: configPromise });
    } catch (error: unknown) {
      if (!isRetryableConnectionError(error)) throw error;

      const retriesLeft = MAX_CONNECT_ATTEMPTS - attempt;
      // Nothing left to retry: do not announce a retry or wait for one.
      if (retriesLeft === 0) break;

      console.log(
        `Database not ready, retrying in ${RETRY_DELAY_MS / 1000}s... \n(${retriesLeft} retries left)`,
      );
      await sleep(RETRY_DELAY_MS);
    }
  }

  throw new Error('Failed to connect to database after multiple retries.');
}

/** Joins the `tag` values of a post's tag entries into a comma-separated string. */
function formatTags(tags: unknown): string {
  if (!Array.isArray(tags)) return '';
  return tags
    .map((entry) => entry?.tag)
    .filter(Boolean)
    .join(', ');
}

/**
 * Indexes every published post into Qdrant, one page of posts at a time,
 * then exits the process with status 0.
 */
async function run() {
  console.log('🔍 Starting blog post indexing for AI search...');

  const payload = await connectWithRetry();

  let indexed = 0;
  let page = 1;
  let hasNextPage = true;

  // Walk all result pages so that no post is skipped, however many there are.
  while (hasNextPage) {
    const result = await payload.find({
      collection: 'posts',
      limit: PAGE_SIZE,
      page,
      where: {
        _status: { equals: 'published' },
      },
    });

    if (page === 1) {
      console.log(`Found ${result.totalDocs} published posts to index.`);
    }

    for (const post of result.docs) {
      const plainContent = extractPlainText(post.content);
      const tags = formatTags(post.tags);

      // Build searchable text: title + description + tags + content.
      // Empty parts are left out so that no literal "undefined" gets embedded.
      const searchableText = [
        `Titel: ${post.title}`,
        post.description ? `Beschreibung: ${post.description}` : '',
        tags ? `Tags: ${tags}` : '',
        `Inhalt: ${plainContent.substring(0, MAX_CONTENT_CHARS)}`,
      ]
        .filter(Boolean)
        .join('\n\n');

      // Upsert into Qdrant
      await upsertPostVector(post.id, searchableText, {
        content: searchableText,
        data: {
          id: post.id,
          title: post.title,
          slug: post.slug,
          description: post.description,
          tags,
        },
      });

      indexed++;
      console.log(` ✅ [${indexed}/${result.totalDocs}] ${post.title}`);

      // Small delay to avoid rate limiting on the embedding API
      await sleep(UPSERT_DELAY_MS);
    }

    hasNextPage = result.hasNextPage;
    page += 1;
  }

  console.log(`\n🎉 Successfully indexed ${indexed} posts into Qdrant.`);
  // Exit explicitly once indexing is complete.
  process.exit(0);
}

run().catch((e) => {
  console.error('Indexing failed:', e);
  process.exit(1);
});