fix(pdf): decouple 6 distinct PDFs, fix layout issues and DataForSEO event loop
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 1m1s
Monorepo Pipeline / 🧪 Test (push) Failing after 1m7s
Monorepo Pipeline / 🏗️ Build (push) Failing after 1m10s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped

This commit is contained in:
2026-02-27 18:26:00 +01:00
parent 60a2709999
commit a43d96dd0e
12 changed files with 187 additions and 102 deletions

View File

@@ -1,9 +1,8 @@
// ============================================================================
// Scraper — Zyte API + Local Persistence
// Crawls all pages of a website, stores them locally for reuse.
// Crawls all pages of a website, stores them locally for reuse.
// ============================================================================
import axios from "axios";
import * as cheerio from "cheerio";
import * as fs from "node:fs/promises";
import * as path from "node:path";
@@ -171,32 +170,39 @@ function extractServices(text: string): string[] {
*/
async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
  // Fetches the browser-rendered HTML of `url` through the Zyte extract API.
  //
  // url    - page to render.
  // apiKey - Zyte API key; sent as the user name of an HTTP Basic credential
  //          with an empty password.
  //
  // Returns the `browserHtml` string from the response, or "" (with a
  // warning) when the response carries no usable HTML.
  // Throws Error("HTTP <status>: <body>") for any non-2xx response that is
  // not retried, and propagates network/timeout errors raised by fetch().
  const maxRateLimitRetries = 1; // "retry once": bounded so a persistent 429 cannot loop forever
  const auth = Buffer.from(`${apiKey}:`).toString("base64");

  for (let attempt = 0; ; attempt += 1) {
    const resp = await fetch("https://api.zyte.com/v1/extract", {
      method: "POST",
      headers: {
        Authorization: `Basic ${auth}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url,
        browserHtml: true,
      }),
      // A fresh 60s deadline per attempt; a signal cannot be reused once it fired.
      signal: AbortSignal.timeout(60000),
    });

    if (!resp.ok) {
      const errorText = await resp.text();
      console.error(` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`);
      // Rate limited — wait and retry once
      if (resp.status === 429 && attempt < maxRateLimitRetries) {
        console.log(" ⏳ Rate limited, waiting 5s and retrying...");
        await new Promise<void>((resolve) => setTimeout(resolve, 5000));
        continue;
      }
      throw new Error(`HTTP ${resp.status}: ${errorText}`);
    }

    // The payload is untyped JSON: only accept `browserHtml` when it is a string.
    const data: unknown = await resp.json();
    const rawHtml =
      typeof data === "object" && data !== null
        ? (data as { browserHtml?: unknown }).browserHtml
        : undefined;
    const html = typeof rawHtml === "string" ? rawHtml : "";
    if (!html) {
      console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
    }
    return html;
  }
}
@@ -205,14 +211,19 @@ async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
* Fetch a page via simple HTTP GET (fallback).
*/
async function fetchDirect(url: string): Promise<string> {
  // Best-effort fallback: plain HTTP GET of `url` with a browser-like
  // User-Agent and a 30s timeout.
  //
  // Returns the response body as text, or "" when the server answers with a
  // non-2xx status or the request fails (network error, timeout). Never
  // throws; the failure reason is logged instead of being silently dropped.
  try {
    const resp = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      },
      signal: AbortSignal.timeout(30000),
    });
    if (!resp.ok) {
      console.warn(` ⚠️ Direct fetch returned HTTP ${resp.status} for ${url}`);
      return "";
    }
    return await resp.text();
  } catch (err: unknown) {
    // Keep the "" contract for callers, but say why nothing was fetched.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` ⚠️ Direct fetch failed for ${url}: ${reason}`);
    return "";
  }
}
/**