feat: migrate npm registry from Verdaccio to Gitea Packages
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
Some checks failed
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 1s
Monorepo Pipeline / 🧹 Lint (push) Failing after 35s
Monorepo Pipeline / 🧪 Test (push) Failing after 35s
Monorepo Pipeline / 🏗️ Build (push) Failing after 12s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
This commit is contained in:
432
packages/concept-engine/src/scraper.ts
Normal file
432
packages/concept-engine/src/scraper.ts
Normal file
@@ -0,0 +1,432 @@
|
||||
// ============================================================================
|
||||
// Scraper — Zyte API + Local Persistence
|
||||
// Crawls all pages of a website, stores them locally for reuse.
|
||||
// ============================================================================
|
||||
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import * as fs from "node:fs/promises";
|
||||
import * as path from "node:path";
|
||||
import { existsSync } from "node:fs";
|
||||
import type { CrawledPage, PageType } from "./types.js";
|
||||
|
||||
/**
 * Options accepted by the site crawler.
 */
interface ScraperConfig {
  /** Root directory under which one sub-folder per crawled domain is created. */
  crawlDir: string;
  /** Zyte API key. When set, pages are fetched through Zyte; otherwise via a plain HTTP GET. */
  zyteApiKey?: string;
  /** Upper bound on the number of URLs visited in one crawl (the crawler falls back to 30). */
  maxPages?: number;
}
|
||||
|
||||
/**
 * Classify a URL pathname into a page type.
 *
 * The pathname is percent-decoded and lower-cased, then matched against
 * keyword lists in a fixed priority order; the first list with a hit wins.
 *
 * @param pathname - Path component of a URL, typically taken from `URL#pathname`.
 * @returns The matching page type, or `"other"` when no keyword matches.
 */
function classifyPage(pathname: string): PageType {
  // `URL#pathname` is percent-encoded ("/%C3%BCber-uns"), so without decoding
  // the non-ASCII keyword "über" could never match.
  let decoded = pathname;
  try {
    decoded = decodeURIComponent(pathname);
  } catch {
    // Malformed escape sequence (e.g. a stray "%"): classify the raw path instead.
  }
  const p = decoded.toLowerCase();

  if (p === "/" || p === "" || p === "/index.html") return "home";

  // Order matters: earlier entries take precedence when several keywords occur.
  const rules: ReadonlyArray<readonly [PageType, readonly string[]]> = [
    ["service", ["service", "leistung", "kompetenz"]],
    ["about", ["about", "ueber", "über", "unternehmen"]],
    ["contact", ["contact", "kontakt"]],
    ["career", ["job", "karriere", "career", "human-resources"]],
    ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
    ["blog", ["blog", "news", "aktuelles", "magazin"]],
    ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
  ];

  for (const [type, keywords] of rules) {
    if (keywords.some((keyword) => p.includes(keyword))) return type;
  }
  return "other";
}
|
||||
|
||||
/**
 * Detect interactive features present on a page.
 *
 * Each feature is recognised purely by the presence of characteristic
 * elements, classes or attributes in the parsed document.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns Feature names in a fixed order: `search`, `forms`, `maps`,
 *          `video`, `calendar`, `cookie-consent` (only those detected).
 */
function detectFeatures($: cheerio.CheerioAPI): string[] {
  const has = (selector: string): boolean => $(selector).length > 0;
  const features: string[] = [];

  // Search
  const hasSearch =
    has('input[type="search"]') ||
    has('form[role="search"]') ||
    has(".search-form, .search-box, #search, .searchbar") ||
    has('input[name="q"], input[name="s"], input[name="search"]');
  if (hasSearch) features.push("search");

  // Forms (beyond search). Count the non-search forms directly rather than
  // subtracting a count of search markers: `.search-form` is frequently a
  // wrapper <div>, which would inflate the subtracted number and hide a real
  // (e.g. contact) form on the same page.
  const nonSearchForms = $("form").not('[role="search"], .search-form, .search-form form');
  if (nonSearchForms.length > 0) features.push("forms");

  // Remaining features are plain "does any element match" checks.
  const presenceChecks: ReadonlyArray<readonly [string, string]> = [
    [
      "maps",
      'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
    ],
    ["video", "video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container"],
    ["calendar", ".calendar, .event, [data-calendar]"],
    ["cookie-consent", ".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]"],
  ];
  for (const [feature, selector] of presenceChecks) {
    if (has(selector)) features.push(feature);
  }

  return features;
}
|
||||
|
||||
/**
 * Collect the distinct same-origin link paths found on a page.
 *
 * Every `href` is resolved against `origin`; links to other origins, to
 * static assets (by file extension) and pure fragment links on the root
 * path are ignored. Only the pathname is kept, so query strings and
 * fragments are dropped.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @param origin - Origin of the crawled site, e.g. `https://example.com`.
 * @returns Unique pathnames in order of first appearance.
 */
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
  const assetExtension = /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;
  const uniquePaths = new Set<string>();

  $("a[href]").each((_, anchor) => {
    const rawHref = $(anchor).attr("href");
    if (!rawHref) return;

    let resolved: URL;
    try {
      resolved = new URL(rawHref, origin);
    } catch {
      // Not a parseable URL: ignore this anchor.
      return;
    }

    const sameOrigin = resolved.origin === origin;
    const pointsToAsset = assetExtension.test(resolved.pathname);
    const rootFragmentOnly = resolved.pathname === "/" && Boolean(resolved.hash);

    if (sameOrigin && !pointsToAsset && !rootFragmentOnly) {
      uniquePaths.add(resolved.pathname);
    }
  });

  return [...uniquePaths];
}
|
||||
|
||||
/**
 * Collect the distinct absolute image URLs referenced by a page.
 *
 * Sources are `<img src>` attributes and the first `url(...)` found in the
 * inline style of elements whose style mentions `background-image`.
 * Inline `data:image` URIs are skipped, as are SVGs whose path does not
 * contain "logo".
 *
 * @param $ - Cheerio handle for the loaded document.
 * @param origin - Base used to resolve relative image references.
 * @returns Unique absolute URLs in order of first appearance.
 */
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
  const candidates: string[] = [];

  // <img src="...">
  $("img[src]").each((_, node) => {
    const source = $(node).attr("src");
    if (source) candidates.push(source);
  });

  // Inline-style background images.
  $("[style*='background-image']").each((_, node) => {
    const inlineStyle = $(node).attr("style");
    const urlMatch = inlineStyle?.match(/url\(['"]?(.*?)['"]?\)/);
    if (urlMatch && urlMatch[1]) candidates.push(urlMatch[1]);
  });

  // Resolve to absolute URLs, filtering as we go; the Set de-duplicates.
  const resolved = new Set<string>();
  for (const candidate of candidates) {
    if (candidate.startsWith("data:image")) continue;

    let parsed: URL;
    try {
      parsed = new URL(candidate, origin);
    } catch {
      continue;
    }

    const genericSvg = parsed.pathname.endsWith(".svg") && !parsed.pathname.includes("logo");
    if (!genericSvg) resolved.add(parsed.href);
  }

  return [...resolved];
}
|
||||
|
||||
/**
 * Pull candidate service/competency names out of free text.
 *
 * The text is split on newlines; a trimmed line is kept when it is between
 * 4 and 99 characters long, contains no legal/cookie boilerplate keyword and
 * does not start with a contact-detail prefix (tel, fax, e-mail, ...).
 *
 * @param text - Newline-separated text content.
 * @returns The surviving lines in their original order (duplicates kept).
 */
function extractServices(text: string): string[] {
  const boilerplate = /cookie|datenschutz|impressum|copyright|©/i;
  const contactPrefix = /^(tel|fax|e-mail|mobil|web|http)/i;

  return text.split(/\n/).reduce<string[]>((kept, rawLine) => {
    const line = rawLine.trim();
    const plausibleLength = line.length > 3 && line.length < 100;
    if (plausibleLength && !boilerplate.test(line) && !contactPrefix.test(line)) {
      kept.push(line);
    }
    return kept;
  }, []);
}
|
||||
|
||||
/**
 * Fetch a page via Zyte API with browser rendering.
 *
 * @param url - Page to fetch.
 * @param apiKey - Zyte API key, sent as the basic-auth username.
 * @param retriesLeft - How many more times a rate-limited (HTTP 429) request
 *        may be retried. Defaults to 1, i.e. a single retry.
 * @returns The rendered HTML, or an empty string when Zyte returned none.
 * @throws Re-throws the underlying request error once retries are exhausted
 *         or for any failure other than rate limiting.
 */
async function fetchWithZyte(url: string, apiKey: string, retriesLeft = 1): Promise<string> {
  try {
    const resp = await axios.post(
      "https://api.zyte.com/v1/extract",
      {
        url,
        browserHtml: true,
      },
      {
        auth: { username: apiKey, password: "" },
        timeout: 60000,
      },
    );
    const html: string = resp.data.browserHtml || "";
    if (!html) {
      console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
    }
    return html;
  } catch (err: unknown) {
    // Only errors that carry an HTTP response are reported in detail.
    const response = axios.isAxiosError(err) ? err.response : undefined;
    if (response) {
      console.error(
        ` ❌ Zyte API error ${response.status} for ${url}: ${response.data?.detail || response.statusText}`,
      );
      // Rate limited — wait and retry, but only while the retry budget lasts.
      // Without the budget a persistently throttled key would recurse forever.
      if (response.status === 429 && retriesLeft > 0) {
        console.log(" ⏳ Rate limited, waiting 5s and retrying...");
        await new Promise<void>((resolve) => setTimeout(resolve, 5000));
        return fetchWithZyte(url, apiKey, retriesLeft - 1);
      }
    }
    throw err;
  }
}
|
||||
|
||||
/**
 * Fallback fetcher: download a page with a plain HTTP GET.
 *
 * Sends a desktop-browser User-Agent and gives up after 30 seconds.
 *
 * @param url - Page to fetch.
 * @returns The response body when it is a string, otherwise an empty string.
 */
async function fetchDirect(url: string): Promise<string> {
  const requestOptions = {
    timeout: 30000,
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    },
  };

  const { data } = await axios.get(url, requestOptions);
  if (typeof data !== "string") return "";
  return data;
}
|
||||
|
||||
/**
 * Turn raw HTML into a structured CrawledPage record.
 *
 * Extracts the title, h1–h3 headings, navigation labels, whitespace-collapsed
 * body text (capped at 50,000 characters), detected features, internal links,
 * images and a few meta tags. The raw HTML is carried along unchanged.
 *
 * @param html - Markup of the page.
 * @param url - Absolute URL the markup was fetched from.
 * @returns The populated page record.
 */
function parsePage(html: string, url: string): CrawledPage {
  const $ = cheerio.load(html);
  const { origin, pathname } = new URL(url);

  // Trimmed text of every element matching the selector.
  const trimmedTexts = (selector: string): string[] =>
    $(selector)
      .map((_, node) => $(node).text().trim())
      .get();

  // `content` attribute of a meta tag, normalising empty/missing to undefined.
  const metaContent = (selector: string): string | undefined =>
    $(selector).attr("content") || undefined;

  const collapsedBody = $("body").text().replace(/\s+/g, " ");

  return {
    url,
    pathname,
    title: $("title").text().trim(),
    html,
    text: collapsedBody.substring(0, 50000).trim(),
    headings: trimmedTexts("h1, h2, h3").filter((heading) => heading.length > 0),
    navItems: trimmedTexts("nav a").filter((label) => label.length > 0 && label.length < 100),
    features: detectFeatures($),
    type: classifyPage(pathname),
    links: extractInternalLinks($, origin),
    images: extractImages($, origin),
    meta: {
      description: metaContent('meta[name="description"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogImage: metaContent('meta[property="og:image"]'),
    },
  };
}
|
||||
|
||||
/**
 * Crawl a website and persist all pages locally.
 *
 * Pages are discovered breadth-first from `targetUrl` by following
 * same-origin links. Each fetched page is written to
 * `<crawlDir>/<domain-with-dashes>/` as `<name>.html` plus `<name>.meta.json`,
 * and a `_crawl_meta.json` marker is written once the crawl finishes. When
 * that marker already exists the cached crawl is loaded from disk instead.
 *
 * `maxPages` limits the number of URLs *visited*: URLs whose fetch failed or
 * returned an empty/tiny body still count against the budget.
 *
 * @param targetUrl - Absolute URL where the crawl starts.
 * @param config - Crawl options (output directory, optional Zyte key, page limit).
 * @returns The crawled (or cached) pages.
 * @throws If `targetUrl` is not a valid URL or the output files cannot be written.
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const { origin, hostname: domain } = new URL(targetUrl);
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));

  // Check for existing crawl
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }

  console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);

  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });

  // Unset, zero, negative or NaN limits all fall back to the default of 30.
  const maxPages = config.maxPages !== undefined && config.maxPages > 0 ? config.maxPages : 30;
  const visited = new Set<string>();
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];

  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift();
    if (url === undefined) break;
    const urlPath = new URL(url).pathname;

    if (visited.has(urlPath)) continue;
    visited.add(urlPath);

    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);

      const html = config.zyteApiKey
        ? await fetchWithZyte(url, config.zyteApiKey)
        : await fetchDirect(url);

      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }

      const page = parsePage(html, url);
      pages.push(page);

      // Save HTML + metadata to disk. The file name is the path with slashes
      // flattened to underscores ("/" itself becomes "index").
      const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );

      // Discover new links
      for (const link of page.links) {
        if (!visited.has(link)) {
          queue.push(`${origin}${link}`);
        }
      }
    } catch (err) {
      // A single failing page must not abort the whole crawl.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` ⚠️ Failed to fetch ${url}: ${reason}`);
    }
  }

  // The marker file makes every later call short-circuit to the disk cache.
  // Writing it for an empty crawl (e.g. the site was down or every request
  // failed) would pin an empty result until the cache is cleared manually.
  if (pages.length === 0) {
    console.warn(`⚠️ No pages could be crawled for ${domain}; crawl cache was not written.`);
    return pages;
  }

  // Save crawl metadata
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );

  console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
  return pages;
}
|
||||
|
||||
/**
 * Rebuild the page list of an earlier crawl from its on-disk files.
 *
 * Every `*.meta.json` file in the directory yields one page. The matching
 * `.html` file, when present, supplies the raw markup, and the body text is
 * re-derived from it (whitespace collapsed, capped at 50,000 characters).
 *
 * @param domainDir - Directory holding the persisted crawl of one domain.
 * @returns One CrawledPage per metadata file, in directory-listing order.
 */
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
  const entries = await fs.readdir(domainDir);
  const pages: CrawledPage[] = [];

  for (const entry of entries) {
    // Only per-page metadata files are of interest here.
    if (!entry.endsWith(".meta.json") || entry === "_crawl_meta.json") continue;

    const rawMeta = await fs.readFile(path.join(domainDir, entry), "utf8");
    const meta = JSON.parse(rawMeta);

    const htmlName = `${entry.replace(".meta.json", "")}.html`;
    const html = entries.includes(htmlName)
      ? await fs.readFile(path.join(domainDir, htmlName), "utf8")
      : "";

    // Text is not persisted; recompute it from the stored markup.
    let text = "";
    if (html) {
      const $ = cheerio.load(html);
      text = $("body").text().replace(/\s+/g, " ").substring(0, 50000).trim();
    }

    pages.push({
      url: meta.url,
      pathname: meta.pathname,
      title: meta.title,
      html,
      text,
      headings: meta.headings || [],
      navItems: meta.navItems || [],
      features: meta.features || [],
      type: meta.type || "other",
      links: meta.links || [],
      images: meta.images || [],
      meta: meta.meta || {},
    });
  }

  console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
  return pages;
}
|
||||
|
||||
/**
 * Remove the persisted crawl of a domain so the next crawl starts fresh.
 *
 * Does nothing (and logs nothing) when no cache directory exists.
 *
 * @param crawlDir - Root directory that holds the per-domain crawl folders.
 * @param domain - Hostname whose cache should be removed, e.g. `example.com`.
 */
export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
  const folderName = domain.replace(/\./g, "-");
  const target = path.join(crawlDir, folderName);

  if (!existsSync(target)) return;

  await fs.rm(target, { recursive: true, force: true });
  console.log(`🧹 Cleared crawl cache for ${domain}`);
}
|
||||
Reference in New Issue
Block a user