Files
at-mintel/packages/page-audit/src/dataforseo.ts
Marc Mintel a43d96dd0e
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 1m1s
Monorepo Pipeline / 🧪 Test (push) Failing after 1m7s
Monorepo Pipeline / 🏗️ Build (push) Failing after 1m10s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
fix(pdf): decouple 6 distinct PDFs, fix layout issues and DataForSEO event loop
2026-02-27 18:26:00 +01:00

220 lines
8.4 KiB
TypeScript

// ============================================================================
// @mintel/page-audit — DataForSEO API Client
// Uses native fetch (no axios) to avoid Node event loop exit during polling.
// Docs: https://docs.dataforseo.com/v3/on_page/
// ============================================================================
import type { PageAuditData, AuditIssue } from "./types.js";
/** Base URL of the DataForSEO v3 REST API; request paths in this module are appended to it. */
const API_BASE = "https://api.dataforseo.com/v3";
/**
 * Authenticated client for the DataForSEO v3 On-Page API.
 *
 * Covers the crawl lifecycle used by the page audit: create a task, poll until
 * it is listed as ready, then fetch summary, page, non-indexable and resource
 * results. All requests go through the global `fetch` with HTTP Basic auth.
 */
export class DataForSeoClient {
  /** Base64-encoded `login:password`, sent in the `Authorization` header. */
  private readonly auth: string;

  /**
   * @param login DataForSEO API login.
   * @param password DataForSEO API password.
   */
  constructor(login: string, password: string) {
    this.auth = Buffer.from(`${login}:${password}`).toString("base64");
  }

  /** Headers attached to every request (Basic auth + JSON content type). */
  private get headers(): Record<string, string> {
    return {
      Authorization: `Basic ${this.auth}`,
      "Content-Type": "application/json",
    };
  }

  /**
   * Send a single request and return the parsed JSON body.
   *
   * @param method HTTP verb to use.
   * @param path Path appended to the API base URL (must start with `/`).
   * @param body JSON-serializable payload; only sent for POST requests.
   * @throws Error when the response status is not in the 2xx range.
   */
  private async request(method: "GET" | "POST", path: string, body?: unknown): Promise<any> {
    const init: { method: string; headers: Record<string, string>; body?: string } = {
      method,
      headers: this.headers,
    };
    if (method === "POST") {
      init.body = JSON.stringify(body);
    }
    const resp = await fetch(`${API_BASE}${path}`, init);
    if (!resp.ok) throw new Error(`DataForSEO ${method} ${path} failed: ${resp.status}`);
    return resp.json();
  }

  /** GET `path` and return the parsed JSON response. */
  private async apiGet(path: string): Promise<any> {
    return this.request("GET", path);
  }

  /** POST `body` as JSON to `path` and return the parsed JSON response. */
  private async apiPost(path: string, body: any): Promise<any> {
    return this.request("POST", path, body);
  }

  /**
   * Start an On-Page crawl for a domain and return the task ID.
   *
   * @param domain Bare host (`example.com`) or full `http(s)://` URL. Bare hosts
   *   are prefixed with `https://`.
   * @param maxCrawlPages Upper bound on the number of pages to crawl.
   * @returns The ID of the created task.
   * @throws Error when the response contains no task ID.
   */
  async startCrawl(domain: string, maxCrawlPages = 50): Promise<string> {
    // Test for an actual scheme, not just an "http" prefix: hosts such as
    // "httpbin.org" start with "http" but still need the scheme added.
    const hasScheme = /^https?:\/\//i.test(domain);
    const url = hasScheme ? domain : `https://${domain}`;

    const data = await this.apiPost("/on_page/task_post", [
      {
        target: url,
        max_crawl_pages: maxCrawlPages,
        load_resources: true,
        enable_javascript: true,
        enable_browser_rendering: true,
        check_spell: false,
        calculate_keyword_density: false,
      },
    ]);

    const task = data?.tasks?.[0];
    if (!task?.id) {
      throw new Error(`DataForSEO task creation failed: ${JSON.stringify(task?.status_message || "unknown")}`);
    }
    return task.id;
  }

  /**
   * Check if a task is ready via the tasks_ready endpoint.
   *
   * @returns `true` when `taskId` appears in the first task's result list.
   */
  async isTaskReady(taskId: string): Promise<boolean> {
    const data = await this.apiGet("/on_page/tasks_ready");
    const result = data?.tasks?.[0]?.result;
    // A missing/empty result simply means "nothing ready yet".
    if (!Array.isArray(result)) return false;
    return result.some((entry: any) => entry?.id === taskId);
  }

  /**
   * Poll the tasks_ready endpoint until the task is listed or the timeout elapses.
   * DataForSEO crawls can take 2-5 minutes.
   *
   * Polling starts at a 15s interval and backs off by 30% per attempt up to 30s.
   * Each wait is clamped to the remaining time budget so the call does not
   * overrun `timeoutMs` by a full polling interval.
   *
   * @param taskId ID returned by `startCrawl`.
   * @param timeoutMs Maximum time to wait before giving up.
   * @throws Error when the task is not ready within `timeoutMs`.
   */
  async waitForTask(taskId: string, timeoutMs = 300_000): Promise<void> {
    const start = Date.now();
    let delay = 15_000;
    let pollCount = 0;
    // Force Node event loop active while we wait between polls.
    const keepAlive = setInterval(() => { }, 1000);
    try {
      for (;;) {
        const remaining = timeoutMs - (Date.now() - start);
        if (remaining <= 0) break;

        await this.sleep(Math.min(delay, remaining));
        pollCount++;
        const ready = await this.isTaskReady(taskId);
        const elapsed = Math.round((Date.now() - start) / 1000);
        console.log(` 📊 Poll #${pollCount}: ${ready ? "READY ✅" : "not ready"} (${elapsed}s elapsed)`);
        if (ready) {
          // Short grace period so the pages endpoint settles
          await this.sleep(5_000);
          return;
        }
        delay = Math.min(delay * 1.3, 30_000);
      }
      throw new Error(`DataForSEO task ${taskId} timed out after ${timeoutMs / 1000}s`);
    } finally {
      clearInterval(keepAlive);
    }
  }

  /**
   * Sleep that keeps the Node event loop alive.
   */
  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => {
      const timer = setTimeout(resolve, ms);
      // Explicitly ref the timer to prevent Node from exiting
      if (timer && typeof timer === "object" && "ref" in timer) {
        (timer as NodeJS.Timeout).ref();
      }
    });
  }

  /**
   * Fetch the crawl summary.
   *
   * @returns The first result entry of the summary response, or `null` if absent.
   */
  async getCrawlSummary(taskId: string): Promise<any> {
    const data = await this.apiGet(`/on_page/summary/${taskId}`);
    return data?.tasks?.[0]?.result?.[0] || null;
  }

  /**
   * Fetch one window of page-level results.
   *
   * @param limit Maximum number of items to request.
   * @param offset Number of items to skip.
   * @returns The `items` array of the first result, or `[]` if absent.
   */
  async getPages(taskId: string, limit = 100, offset = 0): Promise<any[]> {
    const data = await this.apiPost("/on_page/pages", [
      { id: taskId, limit, offset },
    ]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }

  /**
   * Fetch non-indexable pages (first 100).
   *
   * @returns The `items` array of the first result, or `[]` if absent.
   */
  async getNonIndexable(taskId: string): Promise<any[]> {
    const data = await this.apiPost("/on_page/non_indexable", [
      { id: taskId, limit: 100, offset: 0 },
    ]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }

  /**
   * Fetch broken resources (404s, timeouts, etc.), first 100.
   *
   * @returns The `items` array of the first result, or `[]` if absent.
   */
  async getBrokenResources(taskId: string): Promise<any[]> {
    // NOTE(review): the filter is sent as a nested array with a string operand;
    // confirm against the API's filter syntax that this matches status codes >= 400.
    const data = await this.apiPost("/on_page/resources", [
      { id: taskId, limit: 100, filters: [["status_code", ">", "399"]] },
    ]);
    return data?.tasks?.[0]?.result?.[0]?.items || [];
  }
}
/**
 * Normalize a raw DataForSEO On-Page page item into our `PageAuditData` shape.
 *
 * Reads `raw.checks`, `raw.meta` and `raw.page_timing`. Missing sections or
 * fields fall back to `null` or `0` instead of throwing. Issues are emitted in
 * a fixed order, one per truthy check flag.
 *
 * @param raw A single page item from the On-Page results; must be an object.
 * @returns The normalized audit record, including the derived issue list.
 */
export function normalizePage(raw: any): PageAuditData {
  const checks = raw.checks || {};
  const meta = raw.meta ?? {};
  const timing = raw.page_timing ?? {};
  const social = meta.social_media_tags ?? {};

  // Ordered rule table: [check flag, issue factory]. The order defines the
  // order of the resulting issue list.
  const issueRules: Array<[string, () => AuditIssue]> = [
    ["no_title", () => ({ code: "NO_TITLE", severity: "critical", message: "Seite hat keinen <title> Tag" })],
    ["title_too_long", () => ({ code: "TITLE_TOO_LONG", severity: "warning", message: `Title zu lang (${meta.title?.length || "?"} Zeichen, max 60)` })],
    ["no_description", () => ({ code: "NO_META_DESCRIPTION", severity: "warning", message: "Keine Meta-Description" })],
    ["description_too_long", () => ({ code: "META_DESC_TOO_LONG", severity: "info", message: "Meta-Description zu lang (max 160)" })],
    ["no_h1_tag", () => ({ code: "NO_H1", severity: "critical", message: "Kein H1-Tag auf der Seite" })],
    ["duplicate_h1_tag", () => ({ code: "DUPLICATE_H1", severity: "warning", message: "Mehrere H1-Tags gefunden" })],
    ["is_broken", () => ({ code: "PAGE_BROKEN", severity: "critical", message: `HTTP ${raw.status_code}: Seite nicht erreichbar` })],
    ["low_content_rate", () => ({ code: "THIN_CONTENT", severity: "warning", message: "Zu wenig Content (dünne Seite)" })],
    ["has_render_blocking_resources", () => ({ code: "RENDER_BLOCKING", severity: "warning", message: "Render-blockierende Ressourcen gefunden" })],
    ["image_not_optimized", () => ({ code: "UNOPTIMIZED_IMAGES", severity: "info", message: "Nicht-optimierte Bilder vorhanden" })],
  ];

  const issues: AuditIssue[] = [];
  for (const [flag, buildIssue] of issueRules) {
    if (checks[flag]) issues.push(buildIssue());
  }

  const imageCount = meta.images_count || 0;
  // The check is a page-level boolean, so when it is set every image on the
  // page is counted as missing alt text (upper bound, not an exact count).
  const imagesWithoutAlt = checks.no_image_alt ? imageCount : 0;

  // A CLS of 0 is a valid (ideal) score, so only null/undefined mean "unknown".
  // NOTE(review): falls back to `meta.cumulative_layout_shift` in case the API
  // reports CLS under `meta` rather than `page_timing` — confirm against the docs.
  const cls = timing.cumulative_layout_shift ?? meta.cumulative_layout_shift ?? null;

  return {
    url: raw.url,
    statusCode: raw.status_code,
    pageTitle: meta.title || null,
    metaDescription: meta.description || null,
    h1: meta.htags?.h1?.[0] || null,
    wordCount: meta.content?.words_count || 0,
    loadTime: timing.time_to_interactive || null,
    links: {
      internal: meta.internal_links_count || 0,
      external: meta.external_links_count || 0,
      broken: 0,
    },
    images: {
      total: imageCount,
      missingAlt: imagesWithoutAlt,
    },
    seo: {
      hasViewport: !checks.no_viewport_tag,
      hasCanonical: !!meta.canonical,
      isIndexable: !checks.is_4xx_code && !checks.is_5xx_code,
      robotsTxt: meta.robots || null,
      // NOTE(review): also accepts the colon-style Open Graph keys
      // ("og:title") in case the API returns raw property names — confirm.
      ogTitle: social.og_title || social["og:title"] || null,
      ogDescription: social.og_description || social["og:description"] || null,
    },
    performance: {
      cls,
      // Timing values of 0 are reported as null (treated as "not measured").
      lcp: timing.largest_contentful_paint || null,
      fid: timing.first_input_delay || null,
      ttfb: timing.waiting_time || null,
    },
    issues,
  };
}