Files
at-mintel/packages/concept-engine/src/scraper.ts
Marc Mintel 7702310a9c
All checks were successful
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 3s
Monorepo Pipeline / 🧹 Lint (push) Successful in 1m19s
Monorepo Pipeline / 🧪 Test (push) Successful in 1m5s
Monorepo Pipeline / 🏗️ Build (push) Successful in 1m26s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
chore: remove Directus CMS and related dependencies
2026-02-27 19:06:06 +01:00

479 lines
12 KiB
TypeScript

// ============================================================================
// Scraper — Zyte API + Local Persistence
// Crawls all pages of a website, stores them locally for reuse.
// ============================================================================
import * as cheerio from "cheerio";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import type { CrawledPage, PageType } from "./types.js";
/** Configuration for {@link crawlSite}. */
interface ScraperConfig {
  /** Zyte API key; when absent, pages are fetched via plain HTTP GET instead. */
  zyteApiKey?: string;
  /** Base directory where crawled HTML + metadata are persisted, one subfolder per domain. */
  crawlDir: string;
  /** Maximum number of pages to visit per crawl (defaults to 30). */
  maxPages?: number;
}
/**
 * Classify a URL pathname into a coarse page type.
 *
 * Matching is case-insensitive substring search; the first matching rule
 * wins, so the rule order below is significant.
 */
function classifyPage(pathname: string): PageType {
  const lower = pathname.toLowerCase();

  // The homepage is an exact match, not a keyword match.
  if (lower === "/" || lower === "" || lower === "/index.html") return "home";

  // Ordered keyword rules (German + English variants).
  const rules: Array<[PageType, string[]]> = [
    ["service", ["service", "leistung", "kompetenz"]],
    ["about", ["about", "ueber", "über", "unternehmen"]],
    ["contact", ["contact", "kontakt"]],
    ["career", ["job", "karriere", "career", "human-resources"]],
    ["portfolio", ["portfolio", "referenz", "projekt", "case-study"]],
    ["blog", ["blog", "news", "aktuelles", "magazin"]],
    ["legal", ["legal", "impressum", "datenschutz", "privacy", "agb"]],
  ];

  for (const [type, keywords] of rules) {
    if (keywords.some((kw) => lower.includes(kw))) return type;
  }
  return "other";
}
/**
 * Detect interactive features present on a page.
 *
 * Returns feature identifiers in a fixed order:
 * "search", "forms", "maps", "video", "calendar", "cookie-consent".
 */
function detectFeatures($: cheerio.CheerioAPI): string[] {
  const found: string[] = [];
  const present = (selector: string): boolean => $(selector).length > 0;

  // Search: native search inputs, search-role forms, or common class names.
  const hasSearch =
    present('input[type="search"]') ||
    present('form[role="search"]') ||
    present(".search-form, .search-box, #search, .searchbar") ||
    present('input[name="q"], input[name="s"], input[name="search"]');
  if (hasSearch) found.push("search");

  // Forms beyond search: any form that is not a search form.
  const totalForms = $("form").length;
  const searchForms = $('form[role="search"], .search-form').length;
  if (totalForms > searchForms) found.push("forms");

  // Embedded maps (Google Maps / OpenStreetMap iframes or map containers).
  if (
    present(
      'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
    )
  ) {
    found.push("maps");
  }

  // Native video elements or YouTube/Vimeo embeds.
  if (
    present("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")
  ) {
    found.push("video");
  }

  // Calendar / event widgets.
  if (present(".calendar, .event, [data-calendar]")) {
    found.push("calendar");
  }

  // Cookie consent banners.
  if (present(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]")) {
    found.push("cookie-consent");
  }

  return found;
}
/**
 * Extract all internal links (same-origin pathnames) from a page.
 *
 * Static assets and root-only anchor links are excluded; the result is
 * deduplicated while preserving first-seen order.
 */
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
  const assetPattern =
    /\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i;
  const found = new Set<string>();

  $("a[href]").each((_, el) => {
    const href = $(el).attr("href");
    if (!href) return;

    let url: URL;
    try {
      url = new URL(href, origin);
    } catch {
      return; // malformed href — skip
    }

    if (url.origin !== origin) return; // external link
    if (assetPattern.test(url.pathname)) return; // static asset
    if (url.pathname === "/" && url.hash) return; // in-page anchor only

    found.add(url.pathname);
  });

  return [...found];
}
/**
 * Extract all images from a page as absolute URLs.
 *
 * Collects <img src> values and inline-style background images, resolves
 * them against the page origin, and skips data: URIs and non-logo SVGs.
 * Result is deduplicated.
 */
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
  const candidates: string[] = [];

  // Regular <img> tags.
  $("img[src]").each((_, el) => {
    const src = $(el).attr("src");
    if (src) candidates.push(src);
  });

  // Inline-style CSS background images.
  $("[style*='background-image']").each((_, el) => {
    const match = $(el).attr("style")?.match(/url\(['"]?(.*?)['"]?\)/);
    if (match?.[1]) candidates.push(match[1]);
  });

  // Resolve to absolute URLs, dropping unusable entries.
  const resolved = new Set<string>();
  for (const candidate of candidates) {
    if (candidate.startsWith("data:image")) continue; // inline base64
    try {
      const url = new URL(candidate, origin);
      // Ignore small tracking pixels or generic vectors.
      if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) {
        continue;
      }
      resolved.add(url.href);
    } catch {
      // Invalid URL — skip.
    }
  }

  return [...resolved];
}
/**
 * Fetch a page via the Zyte API with browser rendering.
 *
 * @param url - Page URL to render.
 * @param apiKey - Zyte API key (sent as the Basic-auth username).
 * @param retriesLeft - Remaining retries for HTTP 429 responses. Defaults to 1
 *   so a rate-limited request is retried exactly once; the previous version
 *   recursed unconditionally and could retry forever under sustained 429s.
 * @returns The rendered HTML, or "" when Zyte returns an empty browserHtml.
 * @throws Error on any non-OK status (after 429 retries are exhausted).
 */
async function fetchWithZyte(
  url: string,
  apiKey: string,
  retriesLeft = 1,
): Promise<string> {
  // Zyte uses HTTP Basic auth: API key as username, empty password.
  const auth = Buffer.from(`${apiKey}:`).toString("base64");
  const resp = await fetch("https://api.zyte.com/v1/extract", {
    method: "POST",
    headers: {
      Authorization: `Basic ${auth}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url,
      browserHtml: true, // render with Zyte's headless browser
    }),
    signal: AbortSignal.timeout(60000),
  });
  if (!resp.ok) {
    const errorText = await resp.text();
    console.error(
      ` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`,
    );
    // Rate limited — wait and retry, but only while retries remain.
    if (resp.status === 429 && retriesLeft > 0) {
      console.log(" ⏳ Rate limited, waiting 5s and retrying...");
      await new Promise((r) => setTimeout(r, 5000));
      return fetchWithZyte(url, apiKey, retriesLeft - 1);
    }
    throw new Error(`HTTP ${resp.status}: ${errorText}`);
  }
  const data = await resp.json();
  const html = data.browserHtml || "";
  if (!html) {
    console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
  }
  return html;
}
/**
 * Fetch a page via plain HTTP GET (fallback when no Zyte key is configured).
 *
 * Network failures and non-OK statuses resolve to "" rather than throwing;
 * the desktop User-Agent avoids trivial bot blocking.
 */
async function fetchDirect(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    },
    signal: AbortSignal.timeout(30000),
  }).catch(() => null);

  if (response?.ok) {
    return await response.text();
  }
  return "";
}
/**
 * Parse an HTML string into a CrawledPage.
 *
 * Extracts title, headings (h1–h3), nav labels, collapsed body text (capped
 * at 50k chars), detected features, same-origin links, images, and common
 * meta/OpenGraph tags.
 */
function parsePage(html: string, url: string): CrawledPage {
  const $ = cheerio.load(html);
  const parsed = new URL(url);

  // Meta / OpenGraph tags — undefined when absent or empty.
  const description =
    $('meta[name="description"]').attr("content") || undefined;
  const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
  const ogImage = $('meta[property="og:image"]').attr("content") || undefined;

  // Visible text with whitespace collapsed, truncated to 50k chars.
  const text = $("body")
    .text()
    .replace(/\s+/g, " ")
    .substring(0, 50000)
    .trim();

  return {
    url,
    pathname: parsed.pathname,
    title: $("title").text().trim(),
    html,
    text,
    headings: $("h1, h2, h3")
      .map((_, el) => $(el).text().trim())
      .get()
      .filter((h) => h.length > 0),
    navItems: $("nav a")
      .map((_, el) => $(el).text().trim())
      .get()
      .filter((t) => t.length > 0 && t.length < 100),
    features: detectFeatures($),
    type: classifyPage(parsed.pathname),
    links: extractInternalLinks($, parsed.origin),
    images: extractImages($, parsed.origin),
    meta: { description, ogTitle, ogImage },
  };
}
/**
 * Crawl a website and persist all pages locally.
 *
 * Performs a same-origin breadth-first crawl starting at targetUrl, fetching
 * each page via Zyte (when an API key is configured) or direct HTTP, parsing
 * it, and writing the raw HTML plus a `<name>.meta.json` summary into a
 * per-domain folder under config.crawlDir. If a `_crawl_meta.json` marker
 * already exists for the domain, the cached crawl is loaded from disk and no
 * network requests are made.
 *
 * Returns an array of CrawledPage objects.
 */
export async function crawlSite(
  targetUrl: string,
  config: ScraperConfig,
): Promise<CrawledPage[]> {
  const urlObj = new URL(targetUrl);
  const origin = urlObj.origin;
  const domain = urlObj.hostname;
  // Dots become dashes so the folder name is filesystem-friendly.
  const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
  // Check for existing crawl — the marker file is only written after a
  // completed crawl (see bottom of this function), so its presence means a
  // full cached copy is available.
  const metaFile = path.join(domainDir, "_crawl_meta.json");
  if (existsSync(metaFile)) {
    console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
    return loadCrawlFromDisk(domainDir);
  }
  console.log(
    `🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`,
  );
  // Ensure output dir
  await fs.mkdir(domainDir, { recursive: true });
  const maxPages = config.maxPages || 30;
  // `visited` is keyed by pathname, so the same path with different query
  // strings or hashes is fetched only once.
  const visited = new Set<string>();
  const queue: string[] = [targetUrl];
  const pages: CrawledPage[] = [];
  while (queue.length > 0 && visited.size < maxPages) {
    const url = queue.shift()!;
    const urlPath = new URL(url).pathname;
    if (visited.has(urlPath)) continue;
    // NOTE: marked visited before fetching, so pages that fail or return an
    // empty body still consume one of the maxPages slots.
    visited.add(urlPath);
    try {
      console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
      let html: string;
      if (config.zyteApiKey) {
        html = await fetchWithZyte(url, config.zyteApiKey);
      } else {
        html = await fetchDirect(url);
      }
      // Guard against empty fetches and trivially small error/placeholder pages.
      if (!html || html.length < 100) {
        console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
        continue;
      }
      const page = parsePage(html, url);
      pages.push(page);
      // Save HTML + metadata to disk. "/" maps to "index"; other paths have
      // slashes replaced with underscores (e.g. "/a/b" -> "a_b").
      const safeName =
        urlPath === "/"
          ? "index"
          : urlPath.replace(/\//g, "_").replace(/^_/, "");
      await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
      // The .meta.json mirrors CrawledPage minus the bulky `html`/`text`
      // fields; loadCrawlFromDisk rebuilds those from the saved HTML.
      await fs.writeFile(
        path.join(domainDir, `${safeName}.meta.json`),
        JSON.stringify(
          {
            url: page.url,
            pathname: page.pathname,
            title: page.title,
            type: page.type,
            headings: page.headings,
            navItems: page.navItems,
            features: page.features,
            links: page.links,
            images: page.images,
            meta: page.meta,
          },
          null,
          2,
        ),
      );
      // Discover new links (same-origin pathnames) and enqueue unseen ones.
      for (const link of page.links) {
        if (!visited.has(link)) {
          const fullUrl = `${origin}${link}`;
          queue.push(fullUrl);
        }
      }
    } catch (err) {
      // A single failed page should not abort the whole crawl.
      console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
    }
  }
  // Save crawl metadata — this marker makes future calls use the disk cache.
  await fs.writeFile(
    metaFile,
    JSON.stringify(
      {
        domain,
        crawledAt: new Date().toISOString(),
        totalPages: pages.length,
        urls: pages.map((p) => p.url),
      },
      null,
      2,
    ),
  );
  console.log(
    `✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`,
  );
  return pages;
}
/**
 * Load a previously crawled site from disk.
 *
 * Reads every `<name>.meta.json` in the domain folder (except the
 * `_crawl_meta.json` marker), pairs it with its saved `<name>.html`, and
 * rebuilds the body text from the HTML.
 */
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
  const entries = await fs.readdir(domainDir);
  const pages: CrawledPage[] = [];

  for (const entry of entries) {
    // Only per-page metadata files; skip the crawl-level marker.
    if (!entry.endsWith(".meta.json") || entry === "_crawl_meta.json") continue;

    const meta = JSON.parse(
      await fs.readFile(path.join(domainDir, entry), "utf8"),
    );

    // The HTML sibling may be missing; fall back to an empty document.
    const htmlName = `${entry.replace(".meta.json", "")}.html`;
    let html = "";
    if (entries.includes(htmlName)) {
      html = await fs.readFile(path.join(domainDir, htmlName), "utf8");
    }

    // Rebuild collapsed body text (capped at 50k chars) from the saved HTML.
    let text = "";
    if (html) {
      text = cheerio
        .load(html)("body")
        .text()
        .replace(/\s+/g, " ")
        .substring(0, 50000)
        .trim();
    }

    pages.push({
      url: meta.url,
      pathname: meta.pathname,
      title: meta.title,
      html,
      text,
      headings: meta.headings || [],
      navItems: meta.navItems || [],
      features: meta.features || [],
      type: meta.type || "other",
      links: meta.links || [],
      images: meta.images || [],
      meta: meta.meta || {},
    });
  }

  console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
  return pages;
}
/**
 * Delete a cached crawl to force re-crawl.
 *
 * Removes the per-domain folder (dots in the domain mapped to dashes, the
 * same scheme crawlSite uses) and logs only when something was deleted.
 */
export async function clearCrawlCache(
  crawlDir: string,
  domain: string,
): Promise<void> {
  const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
  if (!existsSync(domainDir)) return;
  await fs.rm(domainDir, { recursive: true, force: true });
  console.log(`🧹 Cleared crawl cache for ${domain}`);
}