Files
at-mintel/packages/journaling/src/agent.ts
Marc Mintel f4507ef121
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧪 Test (push) Successful in 59s
Monorepo Pipeline / 🧹 Lint (push) Failing after 2m0s
Monorepo Pipeline / 🏗️ Build (push) Successful in 5m9s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
fix(journaling): optimize serper video search queries to prevent MDX hallucination
2026-02-22 17:35:38 +01:00

377 lines
12 KiB
TypeScript

import OpenAI from "openai";
import { DataCommonsClient } from "./clients/data-commons";
import { TrendsClient } from "./clients/trends";
import { SerperClient, type SerperVideoResult } from "./clients/serper";
/**
 * A single research finding produced by {@link ResearchAgent.researchTopic}.
 */
export interface Fact {
// The factual claim, phrased as a complete sentence.
statement: string;
// Organization name only — the synthesis prompt forbids the LLM from
// emitting URLs here to prevent hallucinated links.
source: string;
// Optional link. NOTE(review): never populated by researchTopic in this
// file — presumably filled by verified non-LLM sources elsewhere; confirm.
url?: string;
// Self-reported reliability of the claim.
confidence: "high" | "medium" | "low";
// Raw supporting data (e.g. the last 5 Google Trends points).
// TODO(review): tighten from `any` once the payload shapes are settled.
data?: any;
}
/**
 * A social media embed, either extracted from existing MDX content or
 * resolved from a real search result (see fetchRealSocialPosts).
 */
export interface SocialPost {
// Which embed component the id belongs to.
platform: "youtube" | "twitter" | "linkedin";
// Platform-specific identifier: YouTube video id, tweet id, or LinkedIn url/urn.
embedId: string;
// Human-readable label (video title or a fixed "Existing … embed" marker).
description: string;
}
/**
 * LLM-assisted research agent.
 *
 * Combines OpenRouter-hosted LLM calls with verifiable external data
 * (Google Trends via TrendsClient, Google Search/Video via SerperClient)
 * so that generated articles cite real sources and embed real videos
 * instead of hallucinated ones.
 */
export class ResearchAgent {
  private openai: OpenAI;
  private dcClient: DataCommonsClient;
  private trendsClient: TrendsClient;
  private serperClient: SerperClient;

  constructor(apiKey: string) {
    // All LLM calls are routed through OpenRouter using the OpenAI SDK.
    this.openai = new OpenAI({
      apiKey,
      baseURL: "https://openrouter.ai/api/v1",
      defaultHeaders: {
        "HTTP-Referer": "https://mintel.me",
        "X-Title": "Mintel Journaling Agent",
      },
    });
    this.dcClient = new DataCommonsClient();
    this.trendsClient = new TrendsClient(apiKey);
    // NOTE(review): process.env.SERPER_API_KEY may be undefined at runtime —
    // confirm SerperClient tolerates a missing key, or fail fast here instead.
    this.serperClient = new SerperClient(process.env.SERPER_API_KEY);
  }

  /**
   * Researches a topic and returns source-attributed facts.
   *
   * Pipeline: (1) LLM produces a research plan, (2) Google Trends data is
   * fetched for the planned keywords, (3) the LLM synthesizes 5-7 facts.
   * The synthesis prompt forbids URLs to prevent hallucinated links.
   */
  async researchTopic(topic: string): Promise<Fact[]> {
    console.log(`🔎 Researching: ${topic}`);
    // 1. Plan Research
    const plan = await this.planResearch(topic);
    console.log(`📋 Research Plan:`, plan);
    const facts: Fact[] = [];
    // 2. Execute Plan
    // Google Trends
    for (const kw of plan.trendsKeywords) {
      try {
        const data = await this.trendsClient.getInterestOverTime(kw);
        if (data.length > 0) {
          // Trends values are reported as 0-100 relative to peak interest,
          // so the latest point is used directly as "% of peak".
          const latest = data[data.length - 1];
          facts.push({
            statement: `Interest in "${kw}" is currently at ${latest.value}% of peak popularity.`,
            source: "Google Trends",
            confidence: "high",
            data: data.slice(-5), // Last 5 points
          });
        }
      } catch (e) {
        // Best-effort: one failing keyword must not abort the whole run.
        console.error(`Error fetching trends for ${kw}`, e);
      }
    }
    // Data Commons
    // We need DCIDs. LLM should have provided them or we need a search.
    // For this POC, let's assume the LLM provides plausible DCIDs or we skip deep DC integration for now
    // and rely on the LLM's own knowledge + the verified trends.
    // However, if the plan has dcVariables, let's try.
    // 3. Synthesize & Verify
    // Ask LLM to verify its own knowledge against the data we found (if any) or just use its training data
    // but formatted as "facts".
    const synthesis = await this.openai.chat.completions.create({
      model: "google/gemini-2.0-flash-001",
      messages: [
        {
          role: "system",
          content: `You are a professional digital researcher and fact-checker.
Topic: "${topic}"
Your Goal: Provide 5-7 concrete, verifiable, statistical facts.
Constraint 1: Cite real sources (e.g. "Google Developers", "HTTP Archive", "Deloitte", "Nielsen Norman Group").
Constraint 2: DO NOT cite "General Knowledge".
Constraint 3: CRITICAL MANDATE - NEVER generate or guess URLs. You must hallucinate NO links. Use ONLY the Organization's Name as the "source" field.
Return JSON: { "facts": [ { "statement": "...", "source": "Organization Name Only", "confidence": "high" } ] }`,
        },
        { role: "user", content: "Extract facts." },
      ],
      response_format: { type: "json_object" },
    });
    if (
      !synthesis.choices ||
      synthesis.choices.length === 0 ||
      !synthesis.choices[0].message
    ) {
      console.warn(`⚠️ Research synthesis failed for concept: "${topic}"`);
      return [];
    }
    const result = JSON.parse(synthesis.choices[0].message.content || "{}");
    return result.facts || [];
  }

  /**
   * Extracts existing social media embeds from MDX content via regex.
   * No LLM involved — purely deterministic parsing.
   * Only returns posts that are already present in the article.
   */
  extractSocialPosts(content: string): SocialPost[] {
    const posts: SocialPost[] = [];
    // One self-closing MDX component per platform; the captured group is the
    // platform-specific identifier. Order (youtube, twitter, linkedin) and
    // the global de-duplication by embedId match the original behavior.
    const patterns: Array<{
      platform: SocialPost["platform"];
      regex: RegExp;
      description: string;
    }> = [
      {
        platform: "youtube",
        regex: /<YouTubeEmbed[^>]*videoId="([^"]+)"[^>]*\/>/gi,
        description: "Existing YouTube embed",
      },
      {
        platform: "twitter",
        regex: /<TwitterEmbed[^>]*tweetId="([^"]+)"[^>]*\/>/gi,
        description: "Existing Twitter/X embed",
      },
      {
        platform: "linkedin",
        regex: /<LinkedInEmbed[^>]*(?:url|urn)="([^"]+)"[^>]*\/>/gi,
        description: "Existing LinkedIn embed",
      },
    ];
    for (const { platform, regex, description } of patterns) {
      for (const match of content.matchAll(regex)) {
        if (!posts.some((p) => p.embedId === match[1])) {
          posts.push({ platform, embedId: match[1], description });
        }
      }
    }
    if (posts.length > 0) {
      console.log(
        `📱 Extracted ${posts.length} existing social media embed(s) from content`,
      );
    } else {
      console.log(`📱 No existing social media embeds found in content`);
    }
    return posts;
  }

  /**
   * Fetches real, verified social media posts using the Serper API (Google Video Search).
   * This completely prevents hallucinations as it relies on actual search results.
   *
   * @param topic - Article context used to derive the search query.
   * @param retries - Remaining retry attempts when the search yields nothing.
   */
  async fetchRealSocialPosts(
    topic: string,
    retries = 1,
  ): Promise<SocialPost[]> {
    console.log(
      `🌐 [Serper] Fetching real social media posts for topic: "${topic}"...`,
    );
    // Step 1: Ask the LLM to generate a highly specific YouTube search query
    // We want tutorials, explanations, or deep dives.
    const queryGen = await this.openai.chat.completions.create({
      model: "google/gemini-2.5-flash",
      messages: [
        {
          role: "system",
          content: `You generate ultra-short, highly relevant YouTube search queries based on a given text context.
RULES:
1. Extract only the 2-4 most important technical or business keywords from the provided text.
2. Ignore all markdown syntax, frontmatter (---), titles, and descriptions.
3. Keep the query generic enough to find popular educational tech videos.
4. DO NOT append specific channel names (e.g., "Fireship", "Vercel") to the query.
5. DO NOT USE QUOTES IN THE QUERY.
Return a JSON object with a single string field "query". Example: {"query": "core web vitals performance"}`,
        },
        {
          role: "user",
          content: `CONTEXT: ${topic}`,
        },
      ],
      response_format: { type: "json_object" },
    });
    try {
      // Guard with optional chaining: an empty `choices` array falls through
      // to the JSON fallback instead of throwing (consistent with the guards
      // in researchTopic/planResearch).
      const parsed = JSON.parse(
        queryGen.choices?.[0]?.message?.content || '{"query": ""}',
      );
      const queryStr = parsed.query || `${topic} tutorial explanation`;
      // Step 2: Search via Serper Video Search
      const videos = await this.serperClient.searchVideos(queryStr);
      if (!videos || videos.length === 0) {
        console.warn(`⚠️ [Serper] No videos found for query: "${queryStr}"`);
        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
        return [];
      }
      // Filter for youtube results
      const ytVideos = videos.filter(
        (v) => v.link && v.link.includes("youtube.com/watch"),
      );
      if (ytVideos.length === 0) {
        console.warn(`⚠️ [Serper] No YouTube videos in search results.`);
        if (retries > 0) return this.fetchRealSocialPosts(topic, retries - 1);
        return [];
      }
      // Pick the best one (usually the first result)
      const bestVideo = ytVideos[0];
      // Extract the 11-char video ID from the link (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)
      const urlObj = new URL(bestVideo.link);
      const videoId = urlObj.searchParams.get("v");
      if (!videoId) {
        console.warn(
          `⚠️ [Serper] Could not extract video ID from: ${bestVideo.link}`,
        );
        return [];
      }
      console.log(
        `✅ [Serper] Found valid YouTube Video: ${videoId} ("${bestVideo.title}")`,
      );
      return [
        {
          platform: "youtube",
          embedId: videoId,
          description: bestVideo.title || "YouTube Video",
        },
      ];
    } catch (e) {
      console.error("❌ Failed to fetch real social posts:", e);
      return [];
    }
  }

  /**
   * Asks the LLM for a focused research plan: at most two Google Trends
   * keywords plus any known Data Commons statistical variables.
   * Returns empty lists on any planning or parsing failure.
   */
  private async planResearch(
    topic: string,
  ): Promise<{ trendsKeywords: string[]; dcVariables: string[] }> {
    const response = await this.openai.chat.completions.create({
      model: "google/gemini-2.0-flash-001",
      messages: [
        {
          role: "system",
          content: `Plan research for: "${topic}".
Return JSON:
{
"trendsKeywords": ["list", "of", "max", "2", "keywords"],
"dcVariables": ["StatisticalVariables", "if", "known", "otherwise", "empty"]
}
CRITICAL: Do NOT provide more than 2 trendsKeywords. Keep it extremely focused.`,
        },
        // A user turn is included for consistency with the other completion
        // calls in this class (some providers reject system-only requests).
        { role: "user", content: "Plan the research." },
      ],
      response_format: { type: "json_object" },
    });
    if (
      !response.choices ||
      response.choices.length === 0 ||
      !response.choices[0].message
    ) {
      console.warn(`⚠️ Research planning failed for concept: "${topic}"`);
      return { trendsKeywords: [], dcVariables: [] };
    }
    try {
      let parsed = JSON.parse(
        response.choices[0].message.content ||
          '{"trendsKeywords": [], "dcVariables": []}',
      );
      // Some models wrap the object in a one-element array; unwrap it.
      if (Array.isArray(parsed)) {
        parsed = parsed[0] || { trendsKeywords: [], dcVariables: [] };
      }
      // Coerce malformed fields to empty arrays rather than propagating junk.
      return {
        trendsKeywords: Array.isArray(parsed.trendsKeywords)
          ? parsed.trendsKeywords
          : [],
        dcVariables: Array.isArray(parsed.dcVariables)
          ? parsed.dcVariables
          : [],
      };
    } catch (e) {
      console.error("Failed to parse research plan JSON", e);
      return { trendsKeywords: [], dcVariables: [] };
    }
  }

  /**
   * Researches the top-ranking competitors on Google for a given topic.
   * Extracts their titles and snippets to guide the LLM to write better content.
   *
   * @param topic - Article topic to derive a search-intent query from.
   * @param retries - Remaining retry attempts when the search yields nothing.
   * @returns One "[Rank #n] Title | Snippet" line per organic result.
   */
  async researchCompetitors(topic: string, retries = 1): Promise<string[]> {
    console.log(
      `🔍 [Competitor Research] Fetching top ranking web pages for topic: "${topic.slice(0, 50)}..."`,
    );
    // Step 1: LLM generates the optimal Google Search query
    const queryGen = await this.openai.chat.completions.create({
      model: "google/gemini-2.5-flash",
      messages: [
        {
          role: "system",
          content: `Generate a Google Search query that a B2B decision maker would use to research the following topic: "${topic}".
Focus on intent-driven keywords.
Return a JSON object with a single string field "query". Example: {"query": "Next.js performance optimization agency"}.
DO NOT USE QUOTES IN THE QUERY ITSELF.`,
        },
      ],
      response_format: { type: "json_object" },
    });
    try {
      // Optional chaining guards against an empty `choices` array (consistent
      // with the rest of the class); the fallback topic query still runs.
      const parsed = JSON.parse(
        queryGen.choices?.[0]?.message?.content || '{"query": ""}',
      );
      const queryStr = parsed.query || topic;
      // Step 2: Search via Serper Web Search
      const organicResults = await this.serperClient.searchWeb(queryStr, 5);
      if (!organicResults || organicResults.length === 0) {
        console.warn(
          `⚠️ [Competitor Research] No web results found for query: "${queryStr}"`,
        );
        if (retries > 0) return this.researchCompetitors(topic, retries - 1);
        return [];
      }
      // Map to structured insights string
      const insights = organicResults.map((result, i) => {
        return `[Rank #${i + 1}] Title: "${result.title}" | Snippet: "${result.snippet}"`;
      });
      console.log(
        `✅ [Competitor Research] Analyzed top ${insights.length} competitor articles.`,
      );
      return insights;
    } catch (e) {
      console.error("❌ Failed to fetch competitor research:", e);
      return [];
    }
  }
}