Some checks failed
Build & Deploy / 🔍 Prepare (push) Successful in 7s
Build & Deploy / 🏗️ Build (push) Failing after 18m2s
Build & Deploy / 🚀 Deploy (push) Has been skipped
Build & Deploy / 🧪 QA (push) Has been skipped
Build & Deploy / 🧪 Post-Deploy Verification (push) Has been skipped
Build & Deploy / 🔔 Notify (push) Successful in 3s
128 lines
3.7 KiB
TypeScript
128 lines
3.7 KiB
TypeScript
/**
 * Index all published blog posts into Qdrant for AI search.
 *
 * Usage: pnpm --filter @mintel/web run index:posts
 */
|
|
import { getPayload } from 'payload';
|
|
import configPromise from '../payload.config';
|
|
import { upsertPostVector } from '../src/lib/qdrant';
|
|
|
|
/** Node types after which a newline is emitted so blocks do not run together. */
const BLOCK_LEVEL_NODE_TYPES: ReadonlySet<string> = new Set([
  'paragraph',
  'heading',
  'listitem',
  'quote',
]);

/**
 * Flattens a rich-text value into plain text.
 *
 * Accepts a plain string, a node object, an array of nodes, or a wrapper
 * object exposing a `root` node, and walks the tree recursively.
 *
 * @param node - Value to flatten; anything unrecognised yields an empty string.
 * @returns The concatenated text. Block-level nodes (paragraph, heading,
 *   listitem, quote) are terminated with `\n`, and `linebreak` nodes become `\n`.
 */
function extractPlainText(node: unknown): string {
  if (!node) return '';

  // Plain strings are already text.
  if (typeof node === 'string') return node;

  // A list of nodes: flatten each one and concatenate.
  if (Array.isArray(node)) {
    return node.map((child) => extractPlainText(child)).join('');
  }

  // Remaining primitives (numbers, booleans, ...) carry no text.
  if (typeof node !== 'object') return '';

  const { text, type, children, root } = node as {
    text?: unknown;
    type?: unknown;
    children?: unknown;
    root?: unknown;
  };

  // Text node. Only accept real strings so the return type holds at runtime.
  if (typeof text === 'string' && text.length > 0) return text;

  // Soft line breaks have no `text`; without this the words on either side
  // of the break would be glued together.
  if (type === 'linebreak') return '\n';

  // Element node: flatten the children, then terminate block-level elements.
  if (Array.isArray(children)) {
    const childText = children.map((child) => extractPlainText(child)).join('');
    const isBlock = typeof type === 'string' && BLOCK_LEVEL_NODE_TYPES.has(type);
    return isBlock ? `${childText}\n` : childText;
  }

  // Wrapper object holding the document under `root`.
  if (root) return extractPlainText(root);

  return '';
}
|
|
|
|
/** Maximum number of database connection attempts before giving up. */
const DB_CONNECT_ATTEMPTS = 5;
/** Pause between database connection attempts, in milliseconds. */
const DB_RETRY_DELAY_MS = 3000;
/** Number of posts requested from Payload per page. */
const POSTS_PAGE_SIZE = 100;
/** Pause after each upsert to avoid rate limiting on the embedding API. */
const EMBEDDING_DELAY_MS = 200;
/** Maximum number of body characters embedded per post (avoids token overflow). */
const MAX_CONTENT_CHARS = 2000;

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Returns a log-safe description of a connection URI: scheme, host and path only. */
function describeDatabaseTarget(uri: string | undefined): string {
  if (!uri) return 'default';
  try {
    const parsed = new URL(uri);
    // Drop user info and query string: both may carry credentials.
    return `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
  } catch {
    return '<redacted>';
  }
}

/** Tells whether an error looks like "database not reachable yet" and is worth retrying. */
function isDatabaseNotReady(err: unknown): boolean {
  if (typeof err !== 'object' || err === null) return false;
  const { code, message } = err as { code?: unknown; message?: unknown };
  if (code === 'ECONNREFUSED' || code === 'ENOTFOUND') return true;
  return (
    typeof message === 'string' &&
    (message.includes('ECONNREFUSED') || message.includes('cannot connect to Postgres'))
  );
}

/**
 * Indexes every published blog post into Qdrant.
 *
 * Connects to Payload (retrying while the database is not reachable yet),
 * pages through all posts whose `_status` is `published`, builds a searchable
 * text for each one (title, description, tags, truncated body) and upserts it
 * through `upsertPostVector`. Exits the process with code 0 on success.
 *
 * @throws Error when the database stays unreachable after all attempts, and
 *   rethrows any connection error that is not a "not ready" error as well as
 *   any error raised while fetching or upserting posts.
 */
async function run(): Promise<void> {
  console.log('🔍 Starting blog post indexing for AI search...');

  let payload: Awaited<ReturnType<typeof getPayload>> | undefined;
  let lastError: unknown;

  for (let attempt = 1; attempt <= DB_CONNECT_ATTEMPTS; attempt++) {
    try {
      // Never log the raw URI: it usually embeds the database password.
      console.log(
        `Connecting to database (URI: ${describeDatabaseTarget(process.env.DATABASE_URI)})...`,
      );
      payload = await getPayload({ config: configPromise });
      break;
    } catch (e: unknown) {
      if (!isDatabaseNotReady(e)) throw e;
      lastError = e;

      const retriesLeft = DB_CONNECT_ATTEMPTS - attempt;
      // No point in sleeping when there is no further attempt.
      if (retriesLeft === 0) break;

      console.log(
        `Database not ready, retrying in ${DB_RETRY_DELAY_MS / 1000}s... (${retriesLeft} retries left)`,
      );
      await sleep(DB_RETRY_DELAY_MS);
    }
  }

  if (!payload) {
    // Surface the underlying cause instead of discarding it.
    console.error('Last connection error:', lastError);
    throw new Error('Failed to connect to database after multiple retries.');
  }

  let indexed = 0;
  let total = 0;
  let page = 1;
  let hasNextPage = true;

  // Page through the collection so posts beyond the first page are not skipped.
  while (hasNextPage) {
    const result = await payload.find({
      collection: 'posts',
      limit: POSTS_PAGE_SIZE,
      page,
      where: {
        _status: { equals: 'published' },
      },
    });

    if (page === 1) {
      total = result.totalDocs;
      console.log(`Found ${total} published posts to index.`);
    }

    for (const post of result.docs) {
      const plainContent = extractPlainText(post.content);

      // Tag entries are read via their `tag` property; the element type is not visible here.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const rawTags = (post.tags as any[] | null | undefined) ?? [];
      const tags = rawTags
        .map((entry) => entry?.tag)
        .filter(Boolean)
        .join(', ');

      // Build searchable text: title + description + tags + content.
      // Empty parts are skipped so no "undefined" literal gets embedded.
      const searchableText = [
        `Titel: ${post.title}`,
        post.description ? `Beschreibung: ${post.description}` : '',
        tags ? `Tags: ${tags}` : '',
        `Inhalt: ${plainContent.substring(0, MAX_CONTENT_CHARS)}`,
      ]
        .filter(Boolean)
        .join('\n\n');

      // Upsert into Qdrant
      await upsertPostVector(post.id, searchableText, {
        content: searchableText,
        data: {
          id: post.id,
          title: post.title,
          slug: post.slug,
          description: post.description,
          tags,
        },
      });

      indexed++;
      console.log(`  ✅ [${indexed}/${total}] ${post.title}`);

      await sleep(EMBEDDING_DELAY_MS);
    }

    hasNextPage = result.hasNextPage;
    page += 1;
  }

  console.log(`\n🎉 Successfully indexed ${indexed} posts into Qdrant.`);
  // Exit explicitly rather than waiting for the event loop to drain.
  process.exit(0);
}
|
|
|
|
/** Logs a fatal indexing error and terminates the process with exit code 1. */
function reportIndexingFailure(reason: unknown): never {
  console.error('Indexing failed:', reason);
  process.exit(1);
}

// Script entry point: start indexing and route any rejection to the failure handler.
run().catch(reportIndexingFailure);
|