// ============================================================================
// @mintel/page-audit — DataForSEO API Client
// Uses native fetch (no axios) to avoid Node event loop exit during polling.
// Docs: https://docs.dataforseo.com/v3/on_page/
// ============================================================================

import type { PageAuditData, AuditIssue } from "./types.js";

const API_BASE = "https://api.dataforseo.com/v3";

/**
 * Authenticated DataForSEO client.
 *
 * Every request carries an HTTP Basic `Authorization` header built from the
 * login/password given to the constructor. Response bodies are returned as
 * parsed JSON without schema validation, hence the `any` return types.
 */
export class DataForSeoClient {
  /** Base64-encoded `login:password` pair used for HTTP Basic auth. */
  private readonly auth: string;

  /**
   * @param login - DataForSEO API login.
   * @param password - DataForSEO API password.
   */
  constructor(login: string, password: string) {
    this.auth = Buffer.from(`${login}:${password}`).toString("base64");
  }

  /** Headers sent with every request (GET and POST alike). */
  private get headers(): Record<string, string> {
    return {
      Authorization: `Basic ${this.auth}`,
      "Content-Type": "application/json",
    };
  }

  /**
   * Shared fetch wrapper for `apiGet` / `apiPost`.
   *
   * @param method - HTTP verb; also used in the error message.
   * @param path - Path appended to `API_BASE`.
   * @param body - Optional payload, serialized with `JSON.stringify`.
   * @returns The parsed JSON response body.
   * @throws Error when the HTTP response status is not in the 2xx range.
   */
  private async request(method: "GET" | "POST", path: string, body?: unknown): Promise<any> {
    const init: { method: string; headers: Record<string, string>; body?: string } = {
      method,
      headers: this.headers,
    };
    if (body !== undefined) {
      init.body = JSON.stringify(body);
    }

    const resp = await fetch(`${API_BASE}${path}`, init);
    if (!resp.ok) {
      throw new Error(`DataForSEO ${method} ${path} failed: ${resp.status}`);
    }
    return resp.json();
  }

  /** Issue an authenticated GET against `path` and return the parsed JSON. */
  private async apiGet(path: string): Promise<any> {
    return this.request("GET", path);
  }

  /** Issue an authenticated POST with a JSON `body` and return the parsed JSON. */
  private async apiPost(path: string, body: any): Promise<any> {
    return this.request("POST", path, body);
  }

  /**
   * Start an On-Page crawl for a domain and return the task ID.
   *
   * @param domain - Bare host (`example.com`) or full URL. `https://` is
   *   prepended unless the value already starts with `http://` or `https://`.
   * @param maxCrawlPages - Upper bound on crawled pages (default 50).
   * @returns The ID of the created task.
   * @throws Error when the response contains no task ID.
   */
  async startCrawl(domain: string, maxCrawlPages = 50): Promise<string> {
    // Test for a real scheme prefix: a plain startsWith("http") would also
    // match hosts such as "httpbin.org" and send them without a scheme.
    const hasScheme = /^https?:\/\//i.test(domain);
    const url = hasScheme ? domain : `https://${domain}`;

    const data = await this.apiPost("/on_page/task_post", [
      {
        target: url,
        max_crawl_pages: maxCrawlPages,
        load_resources: true,
        enable_javascript: true,
        enable_browser_rendering: true,
        check_spell: false,
        calculate_keyword_density: false,
      },
    ]);

    const task = data?.tasks?.[0];
    if (!task?.id) {
      throw new Error(`DataForSEO task creation failed: ${JSON.stringify(task?.status_message || "unknown")}`);
    }
    return task.id;
  }

  /**
   * Check if a task is ready via the tasks_ready endpoint.
   *
   * @param taskId - ID returned by `startCrawl`.
   * @returns `true` when `taskId` appears in the list of ready tasks.
   */
  async isTaskReady(taskId: string): Promise<boolean> {
    const data = await this.apiGet("/on_page/tasks_ready");
    const readyTasks: string[] = data?.tasks?.[0]?.result?.map((t: any) => t.id) || [];
    return readyTasks.includes(taskId);
  }

  /**
   * Poll for task completion using the tasks_ready endpoint.
   * DataForSEO crawls can take 2-5 minutes.
   *
   * The first poll happens after an initial 15 s wait; the wait grows by 30 %
   * per round up to 30 s. Each wait is capped at the time left in the budget
   * so the method does not run substantially past `timeoutMs`.
   *
   * @param taskId - ID returned by `startCrawl`.
   * @param timeoutMs - Overall polling budget in milliseconds (default 300 000).
   * @throws Error when the task is not ready within `timeoutMs`.
   */
  async waitForTask(taskId: string, timeoutMs = 300_000): Promise<void> {
    const start = Date.now();
    let delay = 15_000;
    let pollCount = 0;

    // Force Node event loop active
    const keepAlive = setInterval(() => {}, 1000);

    try {
      while (Date.now() - start < timeoutMs) {
        // Never sleep longer than what is left of the timeout budget.
        const remaining = Math.max(timeoutMs - (Date.now() - start), 0);
        await this.sleep(Math.min(delay, remaining));
        pollCount++;

        const ready = await this.isTaskReady(taskId);
        const elapsed = Math.round((Date.now() - start) / 1000);
        console.log(` 📊 Poll #${pollCount}: ${ready ? "READY ✅" : "not ready"} (${elapsed}s elapsed)`);

        if (ready) {
          // Short grace period so the pages endpoint settles
          await this.sleep(5_000);
          return;
        }

        delay = Math.min(delay * 1.3, 30_000);
      }

      throw new Error(`DataForSEO task ${taskId} timed out after ${timeoutMs / 1000}s`);
    } finally {
      clearInterval(keepAlive);
    }
  }

  /**
   * Sleep that keeps the Node event loop alive.
   *
   * @param ms - Duration in milliseconds.
   */
  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => {
      const timer = setTimeout(resolve, ms);
      // Explicitly ref the timer to prevent Node from exiting
      if (timer && typeof timer === "object" && "ref" in timer) {
        (timer as NodeJS.Timeout).ref();
      }
    });
  }

  /**
   * Fetch the crawl summary.
   *
   * @param taskId - ID returned by `startCrawl`.
   * @returns The first result entry of the first task, or `null` when absent.
   */
  async getCrawlSummary(taskId: string): Promise<any> {
    const data = await this.apiGet(`/on_page/summary/${taskId}`);
    return data?.tasks?.[0]?.result?.[0] || null;
  }

  /**
   * Fetch page-level results.
   *
   * @param taskId - ID returned by `startCrawl`.
   * @param limit - Maximum number of items requested (default 100).
   * @param offset - Number of items to skip (default 0).
   * @returns The raw page items, or an empty array when the response has none.
   */
  async getPages(taskId: string, limit = 100, offset = 0): Promise<any[]> {
    const data = await this.apiPost("/on_page/pages", [{ id: taskId, limit, offset }]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }

  /**
   * Fetch non-indexable pages (first 100 entries).
   *
   * @param taskId - ID returned by `startCrawl`.
   * @returns The raw items, or an empty array when the response has none.
   */
  async getNonIndexable(taskId: string): Promise<any[]> {
    const data = await this.apiPost("/on_page/non_indexable", [{ id: taskId, limit: 100, offset: 0 }]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }

  /**
   * Fetch broken resources (404s, timeouts, etc.).
   *
   * Requests up to 100 resources filtered on `status_code > "399"`.
   *
   * @param taskId - ID returned by `startCrawl`.
   * @returns The raw resource items, or an empty array when the response has none.
   */
  async getBrokenResources(taskId: string): Promise<any[]> {
    const data = await this.apiPost("/on_page/resources", [
      { id: taskId, limit: 100, filters: [["status_code", ">", "399"]] },
    ]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }
}

/**
 * Normalize a DataForSEO raw page result into our PageAuditData type.
 *
 * Each truthy flag in `raw.checks` that this function knows about becomes one
 * `AuditIssue`. Timing metrics are passed through unchanged, including a
 * legitimate value of `0`; only a missing metric becomes `null`.
 *
 * @param raw - One page item as returned by `DataForSeoClient.getPages`.
 * @returns The normalized page record including the derived issue list.
 */
export function normalizePage(raw: any): PageAuditData {
  const issues: AuditIssue[] = [];
  const checks = raw.checks || {};

  if (checks.no_title) {
    issues.push({ code: "NO_TITLE", severity: "critical", message: "Seite hat keinen Title-Tag" });
  }
  if (checks.title_too_long) {
    issues.push({
      code: "TITLE_TOO_LONG",
      severity: "warning",
      message: `Title zu lang (${raw.meta?.title?.length || "?"} Zeichen, max 60)`,
    });
  }
  if (checks.no_description) {
    issues.push({ code: "NO_META_DESCRIPTION", severity: "warning", message: "Keine Meta-Description" });
  }
  if (checks.description_too_long) {
    issues.push({ code: "META_DESC_TOO_LONG", severity: "info", message: "Meta-Description zu lang (max 160)" });
  }
  if (checks.no_h1_tag) {
    issues.push({ code: "NO_H1", severity: "critical", message: "Kein H1-Tag auf der Seite" });
  }
  if (checks.duplicate_h1_tag) {
    issues.push({ code: "DUPLICATE_H1", severity: "warning", message: "Mehrere H1-Tags gefunden" });
  }
  if (checks.is_broken) {
    issues.push({
      code: "PAGE_BROKEN",
      severity: "critical",
      message: `HTTP ${raw.status_code}: Seite nicht erreichbar`,
    });
  }
  if (checks.low_content_rate) {
    issues.push({ code: "THIN_CONTENT", severity: "warning", message: "Zu wenig Content (dünne Seite)" });
  }
  if (checks.has_render_blocking_resources) {
    issues.push({
      code: "RENDER_BLOCKING",
      severity: "warning",
      message: "Render-blockierende Ressourcen gefunden",
    });
  }
  if (checks.image_not_optimized) {
    issues.push({ code: "UNOPTIMIZED_IMAGES", severity: "info", message: "Nicht-optimierte Bilder vorhanden" });
  }

  // The check is a single page-level flag, so when it is set every image on
  // the page is counted as missing alt text.
  const imagesWithoutAlt = raw.checks?.no_image_alt ? (raw.meta?.images_count || 0) : 0;

  return {
    url: raw.url,
    statusCode: raw.status_code,
    pageTitle: raw.meta?.title || null,
    metaDescription: raw.meta?.description || null,
    h1: raw.meta?.htags?.h1?.[0] || null,
    wordCount: raw.meta?.content?.words_count || 0,
    // `??` keeps a measured value of 0 instead of turning it into null.
    loadTime: raw.page_timing?.time_to_interactive ?? null,
    links: {
      internal: raw.meta?.internal_links_count || 0,
      external: raw.meta?.external_links_count || 0,
      // Not derived from the page item; always reported as 0 here.
      broken: 0,
    },
    images: {
      total: raw.meta?.images_count || 0,
      missingAlt: imagesWithoutAlt,
    },
    seo: {
      hasViewport: !raw.checks?.no_viewport_tag,
      hasCanonical: !!raw.meta?.canonical,
      // Based solely on the 4xx/5xx check flags.
      isIndexable: !raw.checks?.is_4xx_code && !raw.checks?.is_5xx_code,
      robotsTxt: raw.meta?.robots || null,
      ogTitle: raw.meta?.social_media_tags?.og_title || null,
      ogDescription: raw.meta?.social_media_tags?.og_description || null,
    },
    performance: {
      // `??` rather than `||`: 0 is a valid (and ideal) metric value.
      cls: raw.page_timing?.cumulative_layout_shift ?? null,
      lcp: raw.page_timing?.largest_contentful_paint ?? null,
      fid: raw.page_timing?.first_input_delay ?? null,
      ttfb: raw.page_timing?.waiting_time ?? null,
    },
    issues,
  };
}