// at-mintel/packages/cloner-library/src/WebsiteCloner.ts

import { PlaywrightCrawler, RequestQueue } from "crawlee";
import * as path from "node:path";
import * as fs from "node:fs";
import { execFileSync, execSync } from "node:child_process";

/**
 * Configuration accepted by the `WebsiteCloner` constructor.
 */
export interface WebsiteClonerOptions {
  /** Directory under which each clone's own output folder is created. */
  baseOutputDir: string;
  /**
   * Forwarded to the crawler as its `maxRequestsPerCrawl` setting.
   * The constructor defaults this to 100 when omitted.
   */
  maxRequestsPerCrawl?: number;
  /**
   * Forwarded to the crawler as its `maxConcurrency` setting.
   * The constructor defaults this to 3 when omitted.
   */
  maxConcurrency?: number;
}

/**
 * Clones a website into a local folder for offline browsing.
 *
 * Every page reachable from the start URL (within the crawler's `same-domain`
 * strategy) is captured as a self-contained HTML file with the SingleFile CLI,
 * after which links between captured pages are rewritten to relative file
 * paths.
 */
export class WebsiteCloner {
  /** URL path extensions that are never enqueued as pages to capture. */
  private static readonly SKIPPED_EXTENSIONS =
    /\.(download|pdf|zip|gz|exe|png|jpg|jpeg|gif|svg|css|js)$/i;

  private readonly options: WebsiteClonerOptions;

  /**
   * @param options Cloner configuration; `maxRequestsPerCrawl` defaults to 100
   *   and `maxConcurrency` to 3 when omitted or `undefined`.
   */
  constructor(options: WebsiteClonerOptions) {
    // Apply defaults with `??` so an explicitly passed `undefined` cannot
    // overwrite them (a plain spread would).
    this.options = {
      ...options,
      maxRequestsPerCrawl: options.maxRequestsPerCrawl ?? 100,
      maxConcurrency: options.maxConcurrency ?? 3,
    };
  }

  /**
   * Crawls `targetUrl`, stores every captured page below the output folder and
   * rewrites same-site links so the copy can be navigated offline.
   *
   * Any existing output folder with the same name is deleted first.
   *
   * @param targetUrl Absolute URL at which the crawl starts.
   * @param outputDirName Folder name below `baseOutputDir`; defaults to the
   *   hostname with dots replaced by dashes.
   * @returns Absolute path of the folder containing the clone.
   * @throws TypeError if `targetUrl` is not a valid absolute URL.
   */
  public async clone(
    targetUrl: string,
    outputDirName?: string,
  ): Promise<string> {
    const domain = new URL(targetUrl).hostname;
    // `||` (not `??`) on purpose: an empty name would resolve to the shared
    // base directory itself, which is about to be wiped.
    const finalOutputDirName = outputDirName || domain.replace(/\./g, "-");
    const baseOutputDir = path.resolve(
      this.options.baseOutputDir,
      finalOutputDirName,
    );

    // `force: true` makes this a no-op when the folder does not exist yet.
    fs.rmSync(baseOutputDir, { recursive: true, force: true });
    fs.mkdirSync(baseOutputDir, { recursive: true });

    console.log(`🚀 Starting perfect recursive clone of ${targetUrl}...`);
    console.log(`📂 Output: ${baseOutputDir}`);

    // Use a private, uniquely named queue. The default queue is shared by
    // every caller in the process, so a second clone() would otherwise see
    // already-handled or leftover requests from an earlier run.
    const queueName = `website-cloner-${Date.now()}-${Math.random()
      .toString(36)
      .slice(2, 10)}`;
    const requestQueue = await RequestQueue.open(queueName);

    try {
      await requestQueue.addRequest({ url: targetUrl });

      const crawler = new PlaywrightCrawler({
        requestQueue,
        maxRequestsPerCrawl: this.options.maxRequestsPerCrawl,
        maxConcurrency: this.options.maxConcurrency,

        async requestHandler({ request, enqueueLinks, log }) {
          const url = request.url;
          log.info(`Capturing ${url}...`);

          const relPath = WebsiteCloner.toLocalPath(new URL(url).pathname);
          const fullPath = path.join(baseOutputDir, relPath);
          fs.mkdirSync(path.dirname(fullPath), { recursive: true });

          try {
            // Note: This assumes single-file-cli is available in the environment.
            // Arguments are passed as an array (no shell), so characters in a
            // crawled URL such as $(), backticks or quotes cannot be
            // interpreted as shell syntax.
            execFileSync(
              "npx",
              [
                "single-file-cli",
                url,
                fullPath,
                "--browser-headless=true",
                "--browser-wait-until=networkidle0",
              ],
              { stdio: "inherit" },
            );
          } catch (error: unknown) {
            // Best effort: one failed page must not abort the whole crawl.
            const reason =
              error instanceof Error ? error.message : String(error);
            log.error(`Failed to capture ${url} with SingleFile: ${reason}`);
          }

          await enqueueLinks({
            strategy: "same-domain",
            transformRequestFunction: (req) => {
              // Test the path only, so "/file.pdf?download=1" is skipped too.
              let pathname = req.url;
              try {
                pathname = new URL(req.url).pathname;
              } catch {
                // Not an absolute URL: fall back to testing the raw string.
              }
              if (WebsiteCloner.SKIPPED_EXTENSIONS.test(pathname)) return false;
              return req;
            },
          });
        },
      });

      await crawler.run();
    } finally {
      // Named queues are not purged automatically; remove ours.
      await requestQueue.drop();
    }

    console.log("🔗 Rewriting internal links for offline navigation...");
    const htmlFiles = this.getFiles(baseOutputDir).filter((f) =>
      f.endsWith(".html"),
    );

    for (const file of htmlFiles) {
      const original = fs.readFileSync(file, "utf8");
      const rewritten = WebsiteCloner.rewriteLinks(
        original,
        path.relative(baseOutputDir, file),
        targetUrl,
        domain,
      );
      if (rewritten !== original) fs.writeFileSync(file, rewritten);
    }

    console.log(`\n✅ Done! Perfect clone complete in: ${baseOutputDir}`);
    return baseOutputDir;
  }

  /**
   * Maps a URL pathname to the relative file path (forward slashes, no leading
   * slash) under which that page is stored.
   *
   * Directory-like paths ("/", "/docs/", "/docs") map to an `index.html`
   * inside that directory; paths that already carry an extension are kept.
   */
  private static toLocalPath(pathname: string): string {
    let relPath = pathname === "" ? "/" : pathname;
    if (relPath.endsWith("/")) {
      relPath += "index.html";
    } else if (!relPath.endsWith(".html") && !path.posix.extname(relPath)) {
      relPath += "/index.html";
    }
    return relPath.startsWith("/") ? relPath.substring(1) : relPath;
  }

  /**
   * Rewrites every double-quoted `href` in `html` that points at an http(s)
   * URL on `domain` into a relative link to the matching local file.
   *
   * @param html Content of a captured page.
   * @param fileRelToRoot Path of that page relative to the clone root.
   * @param targetUrl Base URL used to resolve non-absolute hrefs.
   * @param domain Hostname whose links are rewritten.
   * @returns The HTML with same-site links made relative; other links,
   *   fragment-only links and unparsable hrefs are left untouched.
   */
  private static rewriteLinks(
    html: string,
    fileRelToRoot: string,
    targetUrl: string,
    domain: string,
  ): string {
    // Work with rooted POSIX paths so the result neither depends on the
    // process working directory nor contains platform separators.
    const fromDir = path.posix.dirname(
      "/" + fileRelToRoot.split(path.sep).join("/"),
    );

    return html.replace(/href="([^"]+)"/g, (match: string, href: string) => {
      // In-page anchors already work offline; resolving them against
      // targetUrl would redirect them to the start page.
      if (href.startsWith("#")) return match;

      let linkUrl: URL;
      try {
        linkUrl = new URL(href, targetUrl);
      } catch {
        return match; // Unparsable href: leave it as it is.
      }

      const isWebLink =
        linkUrl.protocol === "http:" || linkUrl.protocol === "https:";
      if (!isWebLink || linkUrl.hostname !== domain) return match;

      const localPath = "/" + WebsiteCloner.toLocalPath(linkUrl.pathname);
      // Files are stored under the still percent-encoded pathname, so a
      // literal "%" in the file name must itself be escaped in the link.
      const relativeLink = path.posix
        .relative(fromDir, localPath)
        .replace(/%/g, "%25");

      // Keep the fragment so links to a section of another page still work.
      return `href="${relativeLink}${linkUrl.hash}"`;
    });
  }

  /**
   * Recursively collects the paths of all non-directory entries below `dir`.
   *
   * @param dir Directory to walk.
   * @param fileList Accumulator that found paths are appended to.
   * @returns `fileList`, containing every file path found.
   */
  private getFiles(dir: string, fileList: string[] = []): string[] {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const name = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        this.getFiles(name, fileList);
      } else {
        fileList.push(name);
      }
    }
    return fileList;
  }
}