fix(pdf): decouple 6 distinct PDFs, fix layout issues and DataForSEO event loop
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 1m1s
Monorepo Pipeline / 🧪 Test (push) Failing after 1m7s
Monorepo Pipeline / 🏗️ Build (push) Failing after 1m10s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 1m1s
Monorepo Pipeline / 🧪 Test (push) Failing after 1m7s
Monorepo Pipeline / 🏗️ Build (push) Failing after 1m10s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
This commit is contained in:
@@ -1,9 +1,8 @@
|
||||
// ============================================================================
|
||||
// Scraper — Zyte API + Local Persistence
|
||||
// Crawls all pages of a website, stores them locally for reuse.
|
||||
// Crawls all pages of a website, stores them locally for reuse.
|
||||
// ============================================================================
|
||||
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import * as fs from "node:fs/promises";
|
||||
import * as path from "node:path";
|
||||
@@ -171,32 +170,39 @@ function extractServices(text: string): string[] {
|
||||
*/
|
||||
async function fetchWithZyte(
  url: string,
  apiKey: string,
  opts: { retries?: number; retryDelayMs?: number } = {},
): Promise<string> {
  // Contract
  //   url               — page to extract; sent as `url` in the JSON request body.
  //   apiKey            — Zyte API key; used as the HTTP Basic username with an
  //                       empty password.
  //   opts.retries      — how many times a 429 response is retried (default 1,
  //                       i.e. "retry once").
  //   opts.retryDelayMs — pause before each retry in milliseconds (default 5000).
  // Resolves with the response's `browserHtml` string, or "" when it is missing.
  // Rejects with Error("HTTP <status>: <body>") for a non-OK response once the
  // retry budget is spent; network failures and the 60 s abort propagate as-is.
  const retries = opts.retries ?? 1;
  const retryDelayMs = opts.retryDelayMs ?? 5000;
  const auth = Buffer.from(`${apiKey}:`).toString("base64");

  // A loop instead of self-recursion keeps the number of retries bounded.
  for (let attempt = 0; ; attempt++) {
    const resp = await fetch("https://api.zyte.com/v1/extract", {
      method: "POST",
      headers: {
        "Authorization": `Basic ${auth}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url,
        browserHtml: true,
      }),
      signal: AbortSignal.timeout(60000),
    });

    if (!resp.ok) {
      const errorText = await resp.text();
      console.error(` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`);
      // Rate limited — wait and retry, but only while retries remain.
      if (resp.status === 429 && attempt < retries) {
        console.log(` ⏳ Rate limited, waiting ${retryDelayMs / 1000}s and retrying...`);
        await new Promise<void>((resolve) => setTimeout(resolve, retryDelayMs));
        continue;
      }
      throw new Error(`HTTP ${resp.status}: ${errorText}`);
    }

    // The payload is external input: accept `browserHtml` only if it is a string.
    const data: unknown = await resp.json();
    const { browserHtml } = (data ?? {}) as { browserHtml?: unknown };
    const html = typeof browserHtml === "string" ? browserHtml : "";
    if (!html) {
      console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
    }
    return html;
  }
}
|
||||
@@ -205,14 +211,19 @@ async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
|
||||
* Fetch a page via simple HTTP GET (fallback).
|
||||
*/
|
||||
async function fetchDirect(url: string): Promise<string> {
  // Best-effort fallback: a plain GET with a browser-like User-Agent and a
  // 30 s abort timeout. Never rejects — resolves with the response body text,
  // or "" when the response is not OK or the request fails for any reason.
  try {
    const resp = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      },
      signal: AbortSignal.timeout(30000),
    });
    if (!resp.ok) {
      // Still a soft failure, but say why so an empty page can be diagnosed.
      console.warn(` ⚠️ Direct fetch returned HTTP ${resp.status} for ${url}`);
      return "";
    }
    return await resp.text();
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` ⚠️ Direct fetch failed for ${url}: ${reason}`);
    return "";
  }
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user