feat: content engine

2026-02-22 02:39:27 +01:00
parent a9adb2eff7
commit 3a1a88db89
11 changed files with 942 additions and 172 deletions
--- a/packages/journaling/src/agent.ts
+++ b/packages/journaling/src/agent.ts
@@ -1,6 +1,7 @@
 import OpenAI from "openai";
 import { DataCommonsClient } from "./clients/data-commons";
 import { TrendsClient } from "./clients/trends";
+import { SerperClient, type SerperVideoResult } from "./clients/serper";

 export interface Fact {
  statement: string;
@@ -20,6 +21,7 @@ export class ResearchAgent {
  private openai: OpenAI;
  private dcClient: DataCommonsClient;
  private trendsClient: TrendsClient;
+  private serperClient: SerperClient;

  constructor(apiKey: string) {
    this.openai = new OpenAI({
@@ -31,7 +33,8 @@ export class ResearchAgent {
      },
    });
    this.dcClient = new DataCommonsClient();
-    this.trendsClient = new TrendsClient();
+    this.trendsClient = new TrendsClient(apiKey);
+    this.serperClient = new SerperClient(process.env.SERPER_API_KEY);
  }

  async researchTopic(topic: string): Promise<Fact[]> {
@@ -107,120 +110,151 @@ Return JSON: { "facts": [ { "statement": "...", "source": "Organization Name Onl
    return result.facts || [];
  }

-  async findSocialPosts(
+  /**
+   * Extracts existing social media embeds from MDX content via regex.
+   * No LLM involved — purely deterministic parsing.
+   * Only returns posts that are already present in the article.
+   *
+   * Three self-closing tags are recognized, matched case-insensitively:
+   * `<YouTubeEmbed videoId="..." />`, `<TwitterEmbed tweetId="..." />` and
+   * `<LinkedInEmbed url="..." />` / `<LinkedInEmbed urn="..." />`.
+   * Only double-quoted attribute values are captured, and the tag must end in `/>`.
+   *
+   * Results are grouped by platform (YouTube, then Twitter/X, then LinkedIn) and
+   * keep document order within each platform. An ID is added only once; the
+   * duplicate check compares `embedId` against every post collected so far,
+   * regardless of platform.
+   *
+   * Side effect: logs a one-line summary of how many embeds were found.
+   *
+   * @param content - Article source to scan.
+   * @returns One `SocialPost` per distinct embed ID, each carrying a fixed generic
+   *   description; an empty array when nothing matches.
+   */
+  extractSocialPosts(content: string): SocialPost[] {
+    const posts: SocialPost[] = [];
+
+    // YouTube: <YouTubeEmbed videoId="..." />
+    const ytMatches = [
+      ...content.matchAll(/<YouTubeEmbed[^>]*videoId="([^"]+)"[^>]*\/>/gi),
+    ];
+    for (const match of ytMatches) {
+      if (!posts.some((p) => p.embedId === match[1])) {
+        posts.push({
+          platform: "youtube",
+          embedId: match[1],
+          description: "Existing YouTube embed",
+        });
+      }
+    }
+
+    // Twitter/X: <TwitterEmbed tweetId="..." />
+    const twMatches = [
+      ...content.matchAll(/<TwitterEmbed[^>]*tweetId="([^"]+)"[^>]*\/>/gi),
+    ];
+    for (const match of twMatches) {
+      if (!posts.some((p) => p.embedId === match[1])) {
+        posts.push({
+          platform: "twitter",
+          embedId: match[1],
+          description: "Existing Twitter/X embed",
+        });
+      }
+    }
+
+    // LinkedIn: <LinkedInEmbed url="..." /> or <LinkedInEmbed urn="..." />
+    const liMatches = [
+      ...content.matchAll(/<LinkedInEmbed[^>]*(?:url|urn)="([^"]+)"[^>]*\/>/gi),
+    ];
+    for (const match of liMatches) {
+      if (!posts.some((p) => p.embedId === match[1])) {
+        posts.push({
+          platform: "linkedin",
+          embedId: match[1],
+          description: "Existing LinkedIn embed",
+        });
+      }
+    }
+
+    if (posts.length > 0) {
+      console.log(
+        `📱 Extracted ${posts.length} existing social media embed(s) from content`,
+      );
+    } else {
+      console.log(`📱 No existing social media embeds found in content`);
+    }
+
+    return posts;
+  }
+
+  /**
+   * Fetches real, verified social media posts using the Serper API (Google Video Search).
+   * This completely prevents hallucinations as it relies on actual search results.
+   *
+   * Flow:
+   * 1. The `google/gemini-2.5-flash` model is asked for a JSON object `{ "query": ... }`.
+   *    An empty or missing `query` falls back to `"<topic> tutorial explanation"`.
+   * 2. The query is sent to `serperClient.searchVideos`. Results whose `link`
+   *    contains `youtube.com/watch` are kept and the first one is used; its video ID
+   *    is read from the `v` query parameter of the link.
+   *
+   * At most ONE post is returned, always with `platform: "youtube"`.
+   *
+   * Retry behavior: when the search yields no videos, or none with a matching
+   * YouTube link, the whole method — including the LLM query generation — is
+   * invoked again with `retries - 1`. There is no retry when the video ID cannot
+   * be extracted from the chosen link.
+   *
+   * Error behavior: failures inside the `try` (JSON parsing, the search call, URL
+   * parsing) are logged and produce `[]`. The LLM request is awaited BEFORE the
+   * `try`, so a rejection from it propagates to the caller.
+   *
+   * NOTE(review): links of the form `youtu.be/<id>` do not contain
+   * `youtube.com/watch` and are filtered out — confirm this is intended.
+   *
+   * @param topic - Subject used to build the search query.
+   * @param retries - Remaining retry attempts for empty search results (default 1).
+   * @returns An array with a single YouTube post, or an empty array.
+   */
+  async fetchRealSocialPosts(
    topic: string,
-    retries = 2,
-    previousFailures: string[] = [],
+    retries = 1,
  ): Promise<SocialPost[]> {
    console.log(
-      `📱 Searching for relevant Social Media Posts: "${topic}"${retries < 2 ? ` (Retry ${2 - retries}/2)` : ""}`,
+      `🌐 [Serper] Fetching real social media posts for topic: "${topic}"...`,
    );

-    const failureContext =
-      previousFailures.length > 0
-        ? `\nCRITICAL FAILURE WARNING: The following IDs you generated previously returned 404 Not Found and were Hallucinations: ${previousFailures.join(", ")}. You MUST provide REAL, verifiable IDs. If you cannot 100% guarantee an ID exists, return an empty array instead of guessing.`
-        : "";
-
-    const response = await this.openai.chat.completions.create({
-      model: "google/gemini-2.5-pro",
+    // Step 1: Ask the LLM to generate a highly specific YouTube search query
+    // We want tutorials, explanations, or deep dives.
+    const queryGen = await this.openai.chat.completions.create({
+      model: "google/gemini-2.5-flash",
      messages: [
        {
          role: "system",
-          content: `You are a social media researcher finding high-value, real expert posts and videos to embed in a B2B Tech Blog post about: "${topic}".
-                    
-Your Goal: Identify 1-3 REAL, highly relevant social media posts (YouTube, Twitter/X, LinkedIn) that provide social proof, expert opinions, or deep dives.${failureContext}
-
-Constraint: You MUST provide the exact mathematical or alphanumeric ID for the embed.
- YouTube: The 11-character video ID (e.g. "dQw4w9WgXcQ")
- Twitter: The numerical tweet ID (e.g. "1753464161943834945")
- LinkedIn: The activity URN (e.g. "urn:li:activity:7153664326573674496" or just the numerical 19-digit ID)
-
-Return JSON exactly as follows:
-{
-  "posts": [
-    { "platform": "youtube", "embedId": "dQw4w9WgXcQ", "description": "Google Web Dev explaining Core Web Vitals" }
-  ]
-}
-Return ONLY the JSON.`,
+          content: `Generate a YouTube search query to find a high-quality, professional educational video about: "${topic}".
+Prefer official tech channels or well-known developers (e.g., Google Chrome Developers, Vercel, Theo - t3.gg, Fireship, etc.). 
+Return a JSON object with a single string field "query". Example: {"query": "core web vitals explanation google developers"}.
+DO NOT USE QUOTES IN THE QUERY ITSELF.`,
        },
      ],
      response_format: { type: "json_object" },
    });

-    if (
-      !response.choices ||
-      response.choices.length === 0 ||
-      !response.choices[0].message
-    ) {
-      console.warn(`⚠️ Social post search failed for concept: "${topic}"`);
+    try {
+      let queryStr = "";
+      const parsed = JSON.parse(
+        queryGen.choices[0].message.content || '{"query": ""}',
+      );
+      queryStr = parsed.query || `${topic} tutorial explanation`;
+
+      // Step 2: Search via Serper Video Search
+      const videos = await this.serperClient.searchVideos(queryStr);
+
+      if (!videos || videos.length === 0) {
+        console.warn(`⚠️ [Serper] No videos found for query: "${queryStr}"`);
+        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
+        return [];
+      }
+
+      // Filter for youtube results
+      const ytVideos = videos.filter(
+        (v) => v.link && v.link.includes("youtube.com/watch"),
+      );
+
+      if (ytVideos.length === 0) {
+        console.warn(`⚠️ [Serper] No YouTube videos in search results.`);
+        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
+        return [];
+      }
+
+      // Pick the best one (usually the first result)
+      const bestVideo = ytVideos[0];
+
+      // Extract the 11-char video ID from the link (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)
+      const urlObj = new URL(bestVideo.link);
+      const videoId = urlObj.searchParams.get("v");
+
+      if (!videoId) {
+        console.warn(
+          `⚠️ [Serper] Could not extract video ID from: ${bestVideo.link}`,
+        );
+        return [];
+      }
+
+      console.log(
+        `✅ [Serper] Found valid YouTube Video: ${videoId} ("${bestVideo.title}")`,
+      );
+
+      return [
+        {
+          platform: "youtube",
+          embedId: videoId,
+          description: bestVideo.title || "YouTube Video",
+        },
+      ];
+    } catch (e) {
+      console.error("❌ Failed to fetch real social posts:", e);
      return [];
    }
-
-    const result = JSON.parse(response.choices[0].message.content || "{}");
-    const rawPosts: SocialPost[] = result.posts || [];
-
-    // CRITICAL WORKFLOW FIX: Absolutely forbid hallucinations by verifying via oEmbed APIs
-    const verifiedPosts: SocialPost[] = [];
-    if (rawPosts.length > 0) {
-      console.log(
-        `🛡️ Verifying ${rawPosts.length} generated social ID(s) against network...`,
-      );
-    }
-
-    const failedIdsForThisRun: string[] = [];
-
-    for (const post of rawPosts) {
-      let isValid = false;
-      try {
-        if (post.platform === "youtube") {
-          const res = await fetch(
-            `https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=${post.embedId}`,
-          );
-          isValid = res.ok;
-        } else if (post.platform === "twitter") {
-          const res = await fetch(
-            `https://publish.twitter.com/oembed?url=https://twitter.com/x/status/${post.embedId}`,
-          );
-          isValid = res.ok;
-        } else if (post.platform === "linkedin") {
-          // LinkedIn doesn't have an unauthenticated oEmbed, so we use heuristic URL/URN format validation
-          if (
-            post.embedId.includes("urn:li:") ||
-            post.embedId.includes("linkedin.com") ||
-            /^\d{19}$/.test(post.embedId)
-          ) {
-            isValid = true;
-          }
-        }
-      } catch (e) {
-        isValid = false;
-      }
-
-      if (isValid) {
-        verifiedPosts.push(post);
-        console.log(
-          `✅ Verified real post ID: ${post.embedId} (${post.platform})`,
-        );
-      } else {
-        failedIdsForThisRun.push(post.embedId);
-        console.warn(
-          `🛑 Dropped hallucinated or dead post ID: ${post.embedId} (${post.platform})`,
-        );
-      }
-    }
-
-    // AGENT SELF-HEALING: If all found posts were hallucinations and we have retries, challenge the LLM to try again
-    if (verifiedPosts.length === 0 && rawPosts.length > 0 && retries > 0) {
-      console.warn(
-        `🔄 Self-Healing triggered: All IDs were hallucinations. Challenging agent to find real IDs...`,
-      );
-      return this.findSocialPosts(topic, retries - 1, [
-        ...previousFailures,
-        ...failedIdsForThisRun,
-      ]);
-    }
-
-    return verifiedPosts;
  }

  private async planResearch(
@@ -273,4 +307,60 @@ CRITICAL: Do NOT provide more than 2 trendsKeywords. Keep it extremely focused.`
      return { trendsKeywords: [], dcVariables: [] };
    }
  }
+
+  /**
+   * Researches the top-ranking competitors on Google for a given topic.
+   * Extracts their titles and snippets to guide the LLM to write better content.
+   *
+   * Flow:
+   * 1. The `google/gemini-2.5-flash` model is asked for a JSON object `{ "query": ... }`.
+   *    An empty or missing `query` falls back to the raw `topic`.
+   * 2. The query is sent to `serperClient.searchWeb` requesting 5 results.
+   * 3. Each result becomes one string of the form
+   *    `[Rank #n] Title: "..." | Snippet: "..."`, where `n` is the 1-based position
+   *    in the returned array (the result's own `position` field is not used).
+   *
+   * Retry behavior: when the search returns nothing, the whole method — including
+   * the LLM query generation — is invoked again with `retries - 1`.
+   *
+   * Error behavior: failures inside the `try` (JSON parsing, the search call) are
+   * logged and produce `[]`. The LLM request is awaited BEFORE the `try`, so a
+   * rejection from it propagates to the caller.
+   *
+   * The start-up log line shows only the first 50 characters of `topic`.
+   *
+   * @param topic - Subject to research.
+   * @param retries - Remaining retry attempts for empty search results (default 1).
+   * @returns One formatted insight string per search result, or an empty array.
+   */
+  async researchCompetitors(topic: string, retries = 1): Promise<string[]> {
+    console.log(
+      `🔍 [Competitor Research] Fetching top ranking web pages for topic: "${topic.slice(0, 50)}..."`,
+    );
+
+    // Step 1: LLM generates the optimal Google Search query
+    const queryGen = await this.openai.chat.completions.create({
+      model: "google/gemini-2.5-flash",
+      messages: [
+        {
+          role: "system",
+          content: `Generate a Google Search query that a B2B decision maker would use to research the following topic: "${topic}".
+Focus on intent-driven keywords.
+Return a JSON object with a single string field "query". Example: {"query": "Next.js performance optimization agency"}.
+DO NOT USE QUOTES IN THE QUERY ITSELF.`,
+        },
+      ],
+      response_format: { type: "json_object" },
+    });
+
+    try {
+      const parsed = JSON.parse(
+        queryGen.choices[0].message.content || '{"query": ""}',
+      );
+      const queryStr = parsed.query || topic;
+
+      // Step 2: Search via Serper Web Search
+      const organicResults = await this.serperClient.searchWeb(queryStr, 5);
+
+      if (!organicResults || organicResults.length === 0) {
+        console.warn(
+          `⚠️ [Competitor Research] No web results found for query: "${queryStr}"`,
+        );
+        if (retries > 0) return this.researchCompetitors(topic, retries - 1);
+        return [];
+      }
+
+      // Map to structured insights string
+      const insights = organicResults.map((result, i) => {
+        return `[Rank #${i + 1}] Title: "${result.title}" | Snippet: "${result.snippet}"`;
+      });
+
+      console.log(
+        `✅ [Competitor Research] Analyzed top ${insights.length} competitor articles.`,
+      );
+      return insights;
+    } catch (e) {
+      console.error("❌ Failed to fetch competitor research:", e);
+      return [];
+    }
+  }
 }
--- a/packages/journaling/src/clients/serper.ts
+++ b/packages/journaling/src/clients/serper.ts
@@ -0,0 +1,128 @@
+export interface SerperVideoResult { // One element of SerperVideoResponse.videos, as returned by searchVideos()
+  title: string;
+  link: string; // URL of the video hit
+  snippet?: string;
+  date?: string; // NOTE(review): format not established in this file — confirm against the Serper docs
+  duration?: string;
+  channel?: string;
+}
+
+export interface SerperVideoResponse { // Shape the JSON body of the /videos call is cast to (not validated at runtime)
+  searchParameters: any; // untyped; not read anywhere in this file
+  videos: SerperVideoResult[]; // searchVideos() substitutes [] when this is missing
+}
+
+export interface SerperWebResult { // One element of SerperWebResponse.organic, as returned by searchWeb()
+  title: string;
+  link: string;
+  snippet: string;
+  date?: string; // NOTE(review): format not established in this file — confirm against the Serper docs
+  sitelinks?: any[]; // untyped; not read anywhere in this file
+  position: number; // presumably the rank reported by Serper — TODO confirm
+}
+
+export interface SerperWebResponse { // Shape the JSON body of the /search call is cast to (not validated at runtime)
+  searchParameters: any; // untyped; not read anywhere in this file
+  organic: SerperWebResult[]; // searchWeb() substitutes [] when this is missing
+}
+
+export class SerperClient { // fetch-based client for the Serper endpoints at google.serper.dev
+  private apiKey: string;
+
+  constructor(apiKey?: string) { // an explicit key wins; otherwise SERPER_API_KEY is read from the environment
+    const key = apiKey || process.env.SERPER_API_KEY;
+    if (!key) {
+      console.warn("⚠️ SERPER_API_KEY is not defined. SerperClient will fail.");
+    }
+    this.apiKey = key || ""; // an empty key makes both search methods return [] without a request
+  }
+
+  /**
+   * Performs a video search via Serper (Google Video Search).
+   * Great for finding relevant YouTube videos.
+   *
+   * Sends a POST to `https://google.serper.dev/videos` with the API key in the
+   * `X-API-KEY` header. The locale is fixed to Germany / German
+   * (`gl: "de"`, `hl: "de"`).
+   *
+   * Failures do not reject: a missing API key, a non-OK HTTP status (status and
+   * response body are logged), or a thrown request/parse error all yield `[]`.
+   * The JSON body is cast to `SerperVideoResponse` without runtime validation.
+   *
+   * @param query - Search query, sent as `q`.
+   * @param num - Requested number of results, sent as `num` (default 5).
+   * @returns The `videos` array of the response, or `[]` when it is absent or the call failed.
+   */
+  async searchVideos(
+    query: string,
+    num: number = 5,
+  ): Promise<SerperVideoResult[]> {
+    if (!this.apiKey) {
+      console.error("❌ SERPER_API_KEY missing - cannot execute search.");
+      return [];
+    }
+
+    try {
+      console.log(`🔍 [Serper] Searching videos for: "${query}"`);
+      const response = await fetch("https://google.serper.dev/videos", {
+        method: "POST",
+        headers: {
+          "X-API-KEY": this.apiKey,
+          "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+          q: query,
+          num: num,
+          gl: "de", // Germany for localized results
+          hl: "de", // German language
+        }),
+      });
+
+      if (!response.ok) {
+        console.error(
+          `❌ [Serper] API Error: ${response.status} ${response.statusText}`,
+        );
+        const text = await response.text();
+        console.error(text);
+        return [];
+      }
+
+      const data = (await response.json()) as SerperVideoResponse;
+      return data.videos || [];
+    } catch (e) {
+      console.error("❌ [Serper] Request failed", e);
+      return [];
+    }
+  }
+
+  /**
+   * Performs a standard web search via Serper.
+   * Crucial for B2B competitor analysis and context gathering.
+   *
+   * Sends a POST to `https://google.serper.dev/search` with the API key in the
+   * `X-API-KEY` header. The locale is fixed to Germany / German
+   * (`gl: "de"`, `hl: "de"`).
+   *
+   * Failures do not reject: a missing API key, a non-OK HTTP status (status and
+   * response body are logged), or a thrown request/parse error all yield `[]`.
+   * The JSON body is cast to `SerperWebResponse` without runtime validation.
+   *
+   * @param query - Search query, sent as `q`.
+   * @param num - Requested number of results, sent as `num` (default 5).
+   * @returns The `organic` array of the response, or `[]` when it is absent or the call failed.
+   */
+  async searchWeb(query: string, num: number = 5): Promise<SerperWebResult[]> {
+    if (!this.apiKey) {
+      console.error("❌ SERPER_API_KEY missing - cannot execute web search.");
+      return [];
+    }
+
+    try {
+      console.log(`🔍 [Serper] Web Search for Competitor Insights: "${query}"`);
+      const response = await fetch("https://google.serper.dev/search", {
+        method: "POST",
+        headers: {
+          "X-API-KEY": this.apiKey,
+          "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+          q: query,
+          num: num,
+          gl: "de", // Germany for localized results
+          hl: "de", // German language
+        }),
+      });
+
+      if (!response.ok) {
+        console.error(
+          `❌ [Serper] API Error: ${response.status} ${response.statusText}`,
+        );
+        const text = await response.text();
+        console.error(text);
+        return [];
+      }
+
+      const data = (await response.json()) as SerperWebResponse;
+      return data.organic || [];
+    } catch (e) {
+      console.error("❌ [Serper] Web Request failed", e);
+      return [];
+    }
+  }
+}
--- a/packages/journaling/src/index.ts
+++ b/packages/journaling/src/index.ts
@@ -1,3 +1,4 @@
 export * from "./clients/data-commons";
 export * from "./clients/trends";
+export * from "./clients/serper";
 export * from "./agent";