Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
426 lines
14 KiB
TypeScript
426 lines
14 KiB
TypeScript
import OpenAI from "openai";
|
|
import { DataCommonsClient } from "./clients/data-commons";
|
|
import { TrendsClient } from "./clients/trends";
|
|
import { SerperClient } from "./clients/serper";
|
|
|
|
export interface Fact {
  // Human-readable, self-contained factual claim (ideally a statistic).
  statement: string;
  // Name of the publishing organization (never "General Knowledge";
  // see the synthesis prompt in ResearchAgent.researchTopic).
  source: string;
  // Optional link to the source. LLM-synthesized facts deliberately omit
  // this to avoid hallucinated URLs; only verified pipelines should set it.
  url?: string;
  // Self-assessed reliability of the claim.
  confidence: "high" | "medium" | "low";
  // Raw supporting data points (e.g. Google Trends samples).
  // NOTE(review): untyped `any` — consider `unknown` once consumers are known.
  data?: any;
}
|
|
|
|
export interface SocialPost {
  // Hosting platform of the embedded post.
  platform: "youtube" | "twitter" | "linkedin";
  // Platform-specific identifier consumed by the embed component:
  // YouTube video id, tweet id, or the LinkedIn url/urn attribute value.
  embedId: string;
  // Short human-readable label (e.g. the video title or a generic placeholder).
  description: string;
}
|
|
|
|
export class ResearchAgent {
|
|
private openai: OpenAI;
|
|
private dcClient: DataCommonsClient;
|
|
private trendsClient: TrendsClient;
|
|
private serperClient: SerperClient;
|
|
|
|
constructor(apiKey: string) {
|
|
this.openai = new OpenAI({
|
|
apiKey,
|
|
baseURL: "https://openrouter.ai/api/v1",
|
|
defaultHeaders: {
|
|
"HTTP-Referer": "https://mintel.me",
|
|
"X-Title": "Mintel Journaling Agent",
|
|
},
|
|
});
|
|
this.dcClient = new DataCommonsClient();
|
|
this.trendsClient = new TrendsClient(apiKey);
|
|
this.serperClient = new SerperClient(process.env.SERPER_API_KEY);
|
|
}
|
|
|
|
async researchTopic(topic: string): Promise<Fact[]> {
|
|
console.log(`🔎 Researching: ${topic}`);
|
|
|
|
// 1. Plan Research
|
|
const plan = await this.planResearch(topic);
|
|
console.log(`📋 Research Plan:`, plan);
|
|
|
|
const facts: Fact[] = [];
|
|
|
|
// 2. Execute Plan
|
|
// Google Trends
|
|
for (const kw of plan.trendsKeywords) {
|
|
try {
|
|
const data = await this.trendsClient.getInterestOverTime(kw);
|
|
if (data.length > 0) {
|
|
// Analyze trend
|
|
const latest = data[data.length - 1];
|
|
facts.push({
|
|
statement: `Interest in "${kw}" is currently at ${latest.value}% of peak popularity.`,
|
|
source: "Google Trends",
|
|
confidence: "high",
|
|
data: data.slice(-5), // Last 5 points
|
|
});
|
|
}
|
|
} catch (e) {
|
|
console.error(`Error fetching trends for ${kw}`, e);
|
|
}
|
|
}
|
|
|
|
// Data Commons
|
|
// We need DCIDs. LLM should have provided them or we need a search.
|
|
// For this POC, let's assume the LLM provides plausible DCIDs or we skip deep DC integration for now
|
|
// and rely on the LLM's own knowledge + the verified trends.
|
|
// However, if the plan has dcVariables, let's try.
|
|
|
|
// 3. Synthesize & Verify
|
|
// Ask LLM to verify its own knowledge against the data we found (if any) or just use its training data
|
|
// but formatted as "facts".
|
|
|
|
const synthesis = await this.openai.chat.completions.create({
|
|
model: "google/gemini-3-flash-preview",
|
|
messages: [
|
|
{
|
|
role: "system",
|
|
content: `You are a professional digital researcher and fact-checker.
|
|
Topic: "${topic}"
|
|
|
|
Your Goal: Provide 5-7 concrete, verifiable, statistical facts.
|
|
Constraint 1: Cite real sources (e.g. "Google Developers", "HTTP Archive", "Deloitte", "Nielsen Norman Group").
|
|
Constraint 2: DO NOT cite "General Knowledge".
|
|
Constraint 3: CRITICAL MANDATE - NEVER generate or guess URLs. You must hallucinate NO links. Use ONLY the Organization's Name as the "source" field.
|
|
|
|
Return JSON: { "facts": [ { "statement": "...", "source": "Organization Name Only", "confidence": "high" } ] }`,
|
|
},
|
|
{ role: "user", content: "Extract facts." },
|
|
],
|
|
response_format: { type: "json_object" },
|
|
});
|
|
|
|
if (
|
|
!synthesis.choices ||
|
|
synthesis.choices.length === 0 ||
|
|
!synthesis.choices[0].message
|
|
) {
|
|
console.warn(`⚠️ Research synthesis failed for concept: "${topic}"`);
|
|
return [];
|
|
}
|
|
|
|
const result = JSON.parse(synthesis.choices[0].message.content || "{}");
|
|
return result.facts || [];
|
|
}
|
|
|
|
/**
|
|
* Extracts existing social media embeds from MDX content via regex.
|
|
* No LLM involved — purely deterministic parsing.
|
|
* Only returns posts that are already present in the article.
|
|
*/
|
|
extractSocialPosts(content: string): SocialPost[] {
|
|
const posts: SocialPost[] = [];
|
|
|
|
// YouTube: <YouTubeEmbed videoId="..." />
|
|
const ytMatches = [
|
|
...content.matchAll(/<YouTubeEmbed[^>]*videoId="([^"]+)"[^>]*\/>/gi),
|
|
];
|
|
for (const match of ytMatches) {
|
|
if (!posts.some((p) => p.embedId === match[1])) {
|
|
posts.push({
|
|
platform: "youtube",
|
|
embedId: match[1],
|
|
description: "Existing YouTube embed",
|
|
});
|
|
}
|
|
}
|
|
|
|
// Twitter/X: <TwitterEmbed tweetId="..." />
|
|
const twMatches = [
|
|
...content.matchAll(/<TwitterEmbed[^>]*tweetId="([^"]+)"[^>]*\/>/gi),
|
|
];
|
|
for (const match of twMatches) {
|
|
if (!posts.some((p) => p.embedId === match[1])) {
|
|
posts.push({
|
|
platform: "twitter",
|
|
embedId: match[1],
|
|
description: "Existing Twitter/X embed",
|
|
});
|
|
}
|
|
}
|
|
|
|
// LinkedIn: <LinkedInEmbed url="..." /> or <LinkedInEmbed urn="..." />
|
|
const liMatches = [
|
|
...content.matchAll(/<LinkedInEmbed[^>]*(?:url|urn)="([^"]+)"[^>]*\/>/gi),
|
|
];
|
|
for (const match of liMatches) {
|
|
if (!posts.some((p) => p.embedId === match[1])) {
|
|
posts.push({
|
|
platform: "linkedin",
|
|
embedId: match[1],
|
|
description: "Existing LinkedIn embed",
|
|
});
|
|
}
|
|
}
|
|
|
|
if (posts.length > 0) {
|
|
console.log(
|
|
`📱 Extracted ${posts.length} existing social media embed(s) from content`,
|
|
);
|
|
} else {
|
|
console.log(`📱 No existing social media embeds found in content`);
|
|
}
|
|
|
|
return posts;
|
|
}
|
|
|
|
/**
|
|
* Fetches real, verified social media posts using the Serper API (Google Video Search).
|
|
* This completely prevents hallucinations as it relies on actual search results.
|
|
*/
|
|
async fetchRealSocialPosts(
|
|
topic: string,
|
|
customSources?: string[],
|
|
retries = 1,
|
|
): Promise<SocialPost[]> {
|
|
console.log(
|
|
`🌐 [Serper] Fetching real social media posts for topic: "${topic}"...`,
|
|
);
|
|
|
|
// Step 1: Ask the LLM to generate a highly specific YouTube search query
|
|
// We want tutorials, explanations, or deep dives.
|
|
const queryGen = await this.openai.chat.completions.create({
|
|
model: "google/gemini-3-flash-preview",
|
|
messages: [
|
|
{
|
|
role: "system",
|
|
content: `You generate ultra-short, highly relevant YouTube search queries based on a given text context.
|
|
|
|
RULES:
|
|
1. Extract only the 2-4 most important technical or business keywords from the provided text.
|
|
2. Ignore all markdown syntax, frontmatter (---), titles, and descriptions.
|
|
3. Keep the query generic enough to find popular educational tech videos, BUT ensure it specifically targets the core technical subject. Append "tutorial" or "b2b explanation" if necessary to find high-quality content.
|
|
4. DO NOT append specific channel names (e.g., "Fireship", "Vercel") to the query.
|
|
5. DO NOT USE QUOTES IN THE QUERY.
|
|
|
|
Return a JSON object with a single string field "query". Example: {"query": "core web vitals performance tutorial"}`,
|
|
},
|
|
{
|
|
role: "user",
|
|
content: `CONTEXT: ${topic}`,
|
|
},
|
|
],
|
|
response_format: { type: "json_object" },
|
|
});
|
|
|
|
try {
|
|
let queryStr = "";
|
|
const parsed = JSON.parse(
|
|
queryGen.choices[0].message.content || '{"query": ""}',
|
|
);
|
|
queryStr = parsed.query || `${topic} tutorial explanation`;
|
|
|
|
// Step 2: Search via Serper Video Search
|
|
const videos = await this.serperClient.searchVideos(queryStr);
|
|
|
|
if (!videos || videos.length === 0) {
|
|
console.warn(`⚠️ [Serper] No videos found for query: "${queryStr}"`);
|
|
if (retries > 0) return this.fetchRealSocialPosts(topic, customSources, retries - 1);
|
|
return [];
|
|
}
|
|
|
|
// Filter for youtube results
|
|
const ytVideos = videos
|
|
.filter(
|
|
(v) =>
|
|
v.link &&
|
|
v.link.includes("youtube.com/watch") &&
|
|
v.title &&
|
|
v.channel,
|
|
)
|
|
.slice(0, 5); // Take top 5 for evaluation
|
|
|
|
if (ytVideos.length === 0) {
|
|
console.warn(`⚠️ [Serper] No YouTube videos in search results.`);
|
|
if (retries > 0) return this.fetchRealSocialPosts(topic, customSources, retries - 1);
|
|
return [];
|
|
}
|
|
|
|
// Step 3: Ask the LLM to evaluate the relevance of the found videos
|
|
|
|
const sourceExamples = customSources && customSources.length > 0
|
|
? `Specifically prioritize content from: ${customSources.join(", ")}.`
|
|
: `(e.g., Google Developers, Vercel, Theo - t3.gg, Fireship, Syntax, ByteByteGo, IBM Technology, McKinsey, Gartner, Deloitte).`;
|
|
|
|
const evalPrompt = `You are a strict technical evaluator. You must select the MOST RELEVANT educational tech video from the list below based on this core article context: "${topic.slice(0, 800)}..."
|
|
|
|
Videos:
|
|
${ytVideos.map((v, i) => `[ID: ${i}] Title: "${v.title}" | Channel: "${v.channel}" | Snippet: "${v.snippet || "none"}"`).join("\n")}
|
|
|
|
RULES:
|
|
1. The video MUST be highly relevant to the EXACT technical topic of the context.
|
|
2. The channel SHOULD be a high-quality tech, development, or professional B2B channel ${sourceExamples} AVOID gaming, generic vlogs, clickbait, off-topic podcasts, or unrelated topics.
|
|
3. If none of the videos are strictly relevant to the core technical or business subject (e.g. they are just casually mentioning the word), YOU MUST RETURN -1. Be extremely critical. Do not just pick the "best of the worst".
|
|
4. If one is highly relevant, return its ID number.
|
|
|
|
Return ONLY a JSON object: {"bestVideoId": number}`;
|
|
|
|
const evalResponse = await this.openai.chat.completions.create({
|
|
model: "google/gemini-3-flash-preview",
|
|
messages: [{ role: "system", content: evalPrompt }],
|
|
response_format: { type: "json_object" },
|
|
});
|
|
|
|
let bestIdx = -1;
|
|
try {
|
|
const evalParsed = JSON.parse(
|
|
evalResponse.choices[0].message.content || '{"bestVideoId": -1}',
|
|
);
|
|
bestIdx = evalParsed.bestVideoId;
|
|
} catch {
|
|
console.warn("Failed to parse video evaluation response");
|
|
}
|
|
|
|
if (bestIdx < 0 || bestIdx >= ytVideos.length) {
|
|
console.warn(`⚠️ [Serper] LLM rejected all videos as irrelevant.`);
|
|
if (retries > 0) return this.fetchRealSocialPosts(topic, customSources, retries - 1);
|
|
return [];
|
|
}
|
|
|
|
const bestVideo = ytVideos[bestIdx];
|
|
console.log(
|
|
`✅ [Serper] AI selected video: ${bestVideo.title} (Channel: ${bestVideo.channel})`,
|
|
);
|
|
|
|
// Extract the 11-char video ID from the link (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)
|
|
const urlObj = new URL(bestVideo.link);
|
|
const videoId = urlObj.searchParams.get("v");
|
|
|
|
if (!videoId) {
|
|
console.warn(
|
|
`⚠️ [Serper] Could not extract video ID from: ${bestVideo.link}`,
|
|
);
|
|
return [];
|
|
}
|
|
|
|
console.log(
|
|
`✅ [Serper] Found valid YouTube Video: ${videoId} ("${bestVideo.title}")`,
|
|
);
|
|
|
|
return [
|
|
{
|
|
platform: "youtube",
|
|
embedId: videoId,
|
|
description: bestVideo.title || "YouTube Video",
|
|
},
|
|
];
|
|
} catch (e) {
|
|
console.error("❌ Failed to fetch real social posts:", e);
|
|
return [];
|
|
}
|
|
}
|
|
|
|
private async planResearch(
|
|
topic: string,
|
|
): Promise<{ trendsKeywords: string[]; dcVariables: string[] }> {
|
|
const response = await this.openai.chat.completions.create({
|
|
model: "google/gemini-3-flash-preview",
|
|
messages: [
|
|
{
|
|
role: "system",
|
|
content: `Plan research for: "${topic}".
|
|
Return JSON:
|
|
{
|
|
"trendsKeywords": ["list", "of", "max", "2", "keywords"],
|
|
"dcVariables": ["StatisticalVariables", "if", "known", "otherwise", "empty"]
|
|
}
|
|
CRITICAL: Do NOT provide more than 2 trendsKeywords. Keep it extremely focused.`,
|
|
},
|
|
],
|
|
response_format: { type: "json_object" },
|
|
});
|
|
|
|
if (
|
|
!response.choices ||
|
|
response.choices.length === 0 ||
|
|
!response.choices[0].message
|
|
) {
|
|
console.warn(`⚠️ Research planning failed for concept: "${topic}"`);
|
|
return { trendsKeywords: [], dcVariables: [] };
|
|
}
|
|
|
|
try {
|
|
let parsed = JSON.parse(
|
|
response.choices[0].message.content ||
|
|
'{"trendsKeywords": [], "dcVariables": []}',
|
|
);
|
|
if (Array.isArray(parsed)) {
|
|
parsed = parsed[0] || { trendsKeywords: [], dcVariables: [] };
|
|
}
|
|
return {
|
|
trendsKeywords: Array.isArray(parsed.trendsKeywords)
|
|
? parsed.trendsKeywords
|
|
: [],
|
|
dcVariables: Array.isArray(parsed.dcVariables)
|
|
? parsed.dcVariables
|
|
: [],
|
|
};
|
|
} catch (e) {
|
|
console.error("Failed to parse research plan JSON", e);
|
|
return { trendsKeywords: [], dcVariables: [] };
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Researches the top-ranking competitors on Google for a given topic.
|
|
* Extracts their titles and snippets to guide the LLM to write better content.
|
|
*/
|
|
async researchCompetitors(topic: string, retries = 1): Promise<string[]> {
|
|
console.log(
|
|
`🔍 [Competitor Research] Fetching top ranking web pages for topic: "${topic.slice(0, 50)}..."`,
|
|
);
|
|
|
|
// Step 1: LLM generates the optimal Google Search query
|
|
const queryGen = await this.openai.chat.completions.create({
|
|
model: "google/gemini-3-flash-preview",
|
|
messages: [
|
|
{
|
|
role: "system",
|
|
content: `Generate a Google Search query that a B2B decision maker would use to research the following topic: "${topic}".
|
|
Focus on intent-driven keywords.
|
|
Return a JSON object with a single string field "query". Example: {"query": "Next.js performance optimization agency"}.
|
|
DO NOT USE QUOTES IN THE QUERY ITSELF.`,
|
|
},
|
|
],
|
|
response_format: { type: "json_object" },
|
|
});
|
|
|
|
try {
|
|
const parsed = JSON.parse(
|
|
queryGen.choices[0].message.content || '{"query": ""}',
|
|
);
|
|
const queryStr = parsed.query || topic;
|
|
|
|
// Step 2: Search via Serper Web Search
|
|
const organicResults = await this.serperClient.searchWeb(queryStr, 5);
|
|
|
|
if (!organicResults || organicResults.length === 0) {
|
|
console.warn(
|
|
`⚠️ [Competitor Research] No web results found for query: "${queryStr}"`,
|
|
);
|
|
if (retries > 0) return this.researchCompetitors(topic, retries - 1);
|
|
return [];
|
|
}
|
|
|
|
// Map to structured insights string
|
|
const insights = organicResults.map((result, i) => {
|
|
return `[Rank #${i + 1}] Title: "${result.title}" | Snippet: "${result.snippet}"`;
|
|
});
|
|
|
|
console.log(
|
|
`✅ [Competitor Research] Analyzed top ${insights.length} competitor articles.`,
|
|
);
|
|
return insights;
|
|
} catch (e) {
|
|
console.error("❌ Failed to fetch competitor research:", e);
|
|
return [];
|
|
}
|
|
}
|
|
}
|