feat(journaling): implement secondary LLM validation for YouTube video selection

2026-02-22 17:43:37 +01:00
parent f4507ef121
commit 02bffbc67f
1 changed file with 51 additions and 7 deletions
--- a/packages/journaling/src/agent.ts
+++ b/packages/journaling/src/agent.ts
@@ -204,7 +204,7 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
        {
          role: "user",
          content: `CONTEXT: ${topic}`,
-        }
+        },
      ],
      response_format: { type: "json_object" },
    });
@@ -226,9 +226,15 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
      }

      // Filter for youtube results
-      const ytVideos = videos.filter(
-        (v) => v.link && v.link.includes("youtube.com/watch"),
-      );
+      const ytVideos = videos
+        .filter(
+          (v) =>
+            v.link &&
+            v.link.includes("youtube.com/watch") &&
+            v.title &&
+            v.channel,
+        )
+        .slice(0, 5); // Take top 5 for evaluation

      if (ytVideos.length === 0) {
        console.warn(`⚠️ [Serper] No YouTube videos in search results.`);
@@ -236,8 +242,46 @@ Return a JSON object with a single string field "query". Example: {"query": "cor
        return [];
      }

-      // Pick the best one (usually the first result)
-      const bestVideo = ytVideos[0];
+      // Step 3: Ask the LLM to evaluate the relevance of the found videos
+      const evalPrompt = `You are a strict technical evaluator. You must select the MOST RELEVANT educational tech video from the list below based on this context: "${topic.slice(0, 500)}..."
+
+Videos:
+${ytVideos.map((v, i) => `[ID: ${i}] Title: "${v.title}" | Channel: "${v.channel}" | Snippet: "${v.snippet || "none"}"`).join("\n")}
+
+RULES:
+1. The video MUST be highly relevant to the context.
+2. The channel SHOULD be a tech, development, or professional business channel (avoid gaming, vlogs, unrelated topics).
+3. If none are truly relevant, return -1.
+4. If one is highly relevant, return its ID number.
+
+Return ONLY a JSON object: {"bestVideoId": number}`;
+
+      const evalResponse = await this.openai.chat.completions.create({
+        model: "google/gemini-2.5-flash",
+        messages: [{ role: "system", content: evalPrompt }],
+        response_format: { type: "json_object" },
+      });
+
+      let bestIdx = -1;
+      try {
+        const evalParsed = JSON.parse(
+          evalResponse.choices[0].message.content || '{"bestVideoId": -1}',
+        );
+        bestIdx = evalParsed.bestVideoId;
+      } catch (e) {
+        console.warn("Failed to parse video evaluation response");
+      }
+
+      if (bestIdx < 0 || bestIdx >= ytVideos.length) {
+        console.warn(`⚠️ [Serper] LLM rejected all videos as irrelevant.`);
+        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
+        return [];
+      }
+
+      const bestVideo = ytVideos[bestIdx];
+      console.log(
+        `✅ [Serper] AI selected video: ${bestVideo.title} (Channel: ${bestVideo.channel})`,
+      );

      // Extract the 11-char video ID from the link (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)
      const urlObj = new URL(bestVideo.link);
@@ -299,7 +343,7 @@ CRITICAL: Do NOT provide more than 2 trendsKeywords. Keep it extremely focused.`
    try {
      let parsed = JSON.parse(
        response.choices[0].message.content ||
-        '{"trendsKeywords": [], "dcVariables": []}',
+          '{"trendsKeywords": [], "dcVariables": []}',
      );
      if (Array.isArray(parsed)) {
        parsed = parsed[0] || { trendsKeywords: [], dcVariables: [] };