feat(journaling): implement secondary LLM validation for YouTube video selection

2026-02-22 17:43:37 +01:00
parent f4507ef121
commit 02bffbc67f
1 changed file with 51 additions and 7 deletions
--- a/packages/journaling/src/agent.ts
+++ b/packages/journaling/src/agent.ts
@@ -204,7 +204,7 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
        {
          role: "user",
          content: `CONTEXT: ${topic}`,
-        }
+        },
      ],
      response_format: { type: "json_object" },
    });
@@ -226,9 +226,15 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
      }
      // Filter for youtube results
-      const ytVideos = videos.filter(
+      const ytVideos = videos
-        (v) => v.link && v.link.includes("youtube.com/watch"),
+        .filter(
-      );
+          (v) =>
            v.link &&
            v.link.includes("youtube.com/watch") &&
            v.title &&
            v.channel,
        )
        .slice(0, 5); // Take top 5 for evaluation
      if (ytVideos.length === 0) {
        console.warn(`⚠️ [Serper] No YouTube videos in search results.`);
@@ -236,8 +242,46 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
        return [];
      }
-      // Pick the best one (usually the first result)
+      // Step 3: Ask the LLM to evaluate the relevance of the found videos
-      const bestVideo = ytVideos[0];
+      const evalPrompt = `You are a strict technical evaluator. You must select the MOST RELEVANT educational tech video from the list below based on this context: "${topic.slice(0, 500)}..."
 Videos:
 ${ytVideos.map((v, i) => `[ID: ${i}] Title: "${v.title}" | Channel: "${v.channel}" | Snippet: "${v.snippet || "none"}"`).join("\n")}
 RULES:
 1. The video MUST be highly relevant to the context.
 2. The channel SHOULD be a tech, development, or professional business channel (avoid gaming, vlogs, unrelated topics).
 3. If none are truly relevant, return -1.
 4. If one is highly relevant, return its ID number.
 Return ONLY a JSON object: {"bestVideoId": number}`;
      const evalResponse = await this.openai.chat.completions.create({
        model: "google/gemini-2.5-flash",
        messages: [{ role: "system", content: evalPrompt }],
        response_format: { type: "json_object" },
      });
      let bestIdx = -1;
      try {
        const evalParsed = JSON.parse(
          evalResponse.choices[0].message.content || '{"bestVideoId": -1}',
        );
        bestIdx = evalParsed.bestVideoId;
      } catch (e) {
        console.warn("Failed to parse video evaluation response");
      }
      if (bestIdx < 0 || bestIdx >= ytVideos.length) {
        console.warn(`⚠️ [Serper] LLM rejected all videos as irrelevant.`);
        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
        return [];
      }
      const bestVideo = ytVideos[bestIdx];
      console.log(
        `✅ [Serper] AI selected video: ${bestVideo.title} (Channel: ${bestVideo.channel})`,
      );
      // Extract the 11-char video ID from the link (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)
      const urlObj = new URL(bestVideo.link);
@@ -299,7 +343,7 @@ CRITICAL: Do NOT provide more than 2 trendsKeywords. Keep it extremely focused.`
    try {
      let parsed = JSON.parse(
        response.choices[0].message.content ||
-        '{"trendsKeywords": [], "dcVariables": []}',
+          '{"trendsKeywords": [], "dcVariables": []}',
      );
      if (Array.isArray(parsed)) {
        parsed = parsed[0] || { trendsKeywords: [], dcVariables: [] };