chore: remove Directus CMS and related dependencies
All checks were successful
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 3s
Monorepo Pipeline / 🧹 Lint (push) Successful in 1m19s
Monorepo Pipeline / 🧪 Test (push) Successful in 1m5s
Monorepo Pipeline / 🏗️ Build (push) Successful in 1m26s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped

This commit is contained in:
2026-02-27 19:06:06 +01:00
parent fbf2153430
commit 7702310a9c
79 changed files with 1733 additions and 14597 deletions

View File

@@ -1,40 +1,39 @@
import { config as dotenvConfig } from 'dotenv';
import * as path from 'node:path';
import * as fs from 'node:fs/promises';
import { EstimationPipeline } from './pipeline.js';
import { config as dotenvConfig } from "dotenv";
import * as path from "node:path";
import * as fs from "node:fs/promises";
import { ConceptPipeline } from "./pipeline.js";
dotenvConfig({ path: path.resolve(process.cwd(), '../../.env') });
dotenvConfig({ path: path.resolve(process.cwd(), "../../.env") });
const briefing = await fs.readFile(
path.resolve(process.cwd(), '../../data/briefings/etib.txt'),
'utf8',
path.resolve(process.cwd(), "../../data/briefings/etib.txt"),
"utf8",
);
console.log(`Briefing loaded: ${briefing.length} chars`);
const pipeline = new EstimationPipeline(
{
openrouterKey: process.env.OPENROUTER_API_KEY || '',
zyteApiKey: process.env.ZYTE_API_KEY,
outputDir: path.resolve(process.cwd(), '../../out/estimations'),
crawlDir: path.resolve(process.cwd(), '../../data/crawls'),
},
{
onStepStart: (id, name) => console.log(`[CB] Starting: ${id}`),
onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
},
const pipeline = new ConceptPipeline(
{
openrouterKey: process.env.OPENROUTER_API_KEY || "",
zyteApiKey: process.env.ZYTE_API_KEY,
outputDir: path.resolve(process.cwd(), "../../out/estimations"),
crawlDir: path.resolve(process.cwd(), "../../data/crawls"),
},
{
onStepStart: (id, _name) => console.log(`[CB] Starting: ${id}`),
onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
},
);
try {
const result = await pipeline.run({
briefing,
url: 'https://www.e-tib.com',
});
await pipeline.run({
briefing,
url: "https://www.e-tib.com",
});
console.log('\n✨ Pipeline complete!');
console.log('Validation:', result.validationResult?.passed ? 'PASSED' : 'FAILED');
console.log("\n✨ Pipeline complete!");
} catch (err: any) {
console.error('\n❌ Pipeline failed:', err.message);
console.error(err.stack);
console.error("\n❌ Pipeline failed:", err.message);
console.error(err.stack);
}

View File

@@ -18,132 +18,146 @@ dotenvConfig({ path: path.resolve(process.cwd(), ".env") });
const program = new Command();
program
.name("concept")
.description("AI-powered project concept generator")
.version("1.0.0");
.name("concept")
.description("AI-powered project concept generator")
.version("1.0.0");
program
.command("run")
.description("Run the full concept pipeline")
.argument("[briefing]", "Briefing text or @path/to/file.txt")
.option("--url <url>", "Target website URL")
.option("--comments <comments>", "Additional notes")
.option("--clear-cache", "Clear crawl cache and re-crawl")
.option("--output <dir>", "Output directory", "../../out/concepts")
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
.action(async (briefingArg: string | undefined, options: any) => {
const openrouterKey = process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
if (!openrouterKey) {
console.error("❌ OPENROUTER_API_KEY not found in environment.");
process.exit(1);
}
.command("run")
.description("Run the full concept pipeline")
.argument("[briefing]", "Briefing text or @path/to/file.txt")
.option("--url <url>", "Target website URL")
.option("--comments <comments>", "Additional notes")
.option("--clear-cache", "Clear crawl cache and re-crawl")
.option("--output <dir>", "Output directory", "../../out/concepts")
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
.action(async (briefingArg: string | undefined, options: any) => {
const openrouterKey =
process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
if (!openrouterKey) {
console.error("❌ OPENROUTER_API_KEY not found in environment.");
process.exit(1);
}
let briefing = briefingArg || "";
let briefing = briefingArg || "";
// Handle @file references
if (briefing.startsWith("@")) {
const rawPath = briefing.substring(1);
const filePath = rawPath.startsWith("/")
? rawPath
: path.resolve(process.cwd(), rawPath);
if (!existsSync(filePath)) {
console.error(`❌ Briefing file not found: ${filePath}`);
process.exit(1);
}
briefing = await fs.readFile(filePath, "utf8");
console.log(`📄 Loaded briefing from: ${filePath}`);
}
// Handle @file references
if (briefing.startsWith("@")) {
const rawPath = briefing.substring(1);
const filePath = rawPath.startsWith("/")
? rawPath
: path.resolve(process.cwd(), rawPath);
if (!existsSync(filePath)) {
console.error(`❌ Briefing file not found: ${filePath}`);
process.exit(1);
}
briefing = await fs.readFile(filePath, "utf8");
console.log(`📄 Loaded briefing from: ${filePath}`);
}
// Auto-discover URL from briefing
let url = options.url;
if (!url && briefing) {
const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
if (urlMatch) {
url = urlMatch[0];
console.log(`🔗 Discovered URL in briefing: ${url}`);
}
}
// Auto-discover URL from briefing
let url = options.url;
if (!url && briefing) {
const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
if (urlMatch) {
url = urlMatch[0];
console.log(`🔗 Discovered URL in briefing: ${url}`);
}
}
if (!briefing && !url) {
console.error("❌ Provide a briefing text or --url");
process.exit(1);
}
if (!briefing && !url) {
console.error("❌ Provide a briefing text or --url");
process.exit(1);
}
const pipeline = new ConceptPipeline(
{
openrouterKey,
zyteApiKey: process.env.ZYTE_API_KEY,
outputDir: path.resolve(process.cwd(), options.output),
crawlDir: path.resolve(process.cwd(), options.crawlDir),
},
{
onStepStart: (id, name) => {
// Will be enhanced with Ink spinner later
},
onStepComplete: (id, result) => {
// Will be enhanced with Ink UI later
},
},
);
const pipeline = new ConceptPipeline(
{
openrouterKey,
zyteApiKey: process.env.ZYTE_API_KEY,
outputDir: path.resolve(process.cwd(), options.output),
crawlDir: path.resolve(process.cwd(), options.crawlDir),
},
{
onStepStart: (_id, _name) => {
// Will be enhanced with Ink spinner later
},
onStepComplete: (_id, _result) => {
// Will be enhanced with Ink UI later
},
},
);
try {
await pipeline.run({
briefing,
url,
comments: options.comments,
clearCache: options.clearCache,
});
try {
await pipeline.run({
briefing,
url,
comments: options.comments,
clearCache: options.clearCache,
});
console.log("\n✨ Concept generation complete!");
} catch (err) {
console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
process.exit(1);
}
});
console.log("\n✨ Concept generation complete!");
} catch (err) {
console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
process.exit(1);
}
});
program
.command("analyze")
.description("Only crawl and analyze a website (no LLM)")
.argument("<url>", "Website URL to analyze")
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
.option("--clear-cache", "Clear existing crawl cache")
.action(async (url: string, options: any) => {
const { crawlSite } = await import("./scraper.js");
const { analyzeSite } = await import("./analyzer.js");
.command("analyze")
.description("Only crawl and analyze a website (no LLM)")
.argument("<url>", "Website URL to analyze")
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
.option("--clear-cache", "Clear existing crawl cache")
.action(async (url: string, options: any) => {
const { crawlSite } = await import("./scraper.js");
const { analyzeSite } = await import("./analyzer.js");
if (options.clearCache) {
const { clearCrawlCache } = await import("./scraper.js");
const domain = new URL(url).hostname;
await clearCrawlCache(path.resolve(process.cwd(), options.crawlDir), domain);
}
if (options.clearCache) {
const { clearCrawlCache } = await import("./scraper.js");
const domain = new URL(url).hostname;
await clearCrawlCache(
path.resolve(process.cwd(), options.crawlDir),
domain,
);
}
const pages = await crawlSite(url, {
zyteApiKey: process.env.ZYTE_API_KEY,
crawlDir: path.resolve(process.cwd(), options.crawlDir),
});
const domain = new URL(url).hostname;
const profile = analyzeSite(pages, domain);
console.log("\n📊 Site Profile:");
console.log(` Domain: ${profile.domain}`);
console.log(` Total Pages: ${profile.totalPages}`);
console.log(` Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`);
console.log(` Features: ${profile.existingFeatures.join(", ") || "none"}`);
console.log(` Services: ${profile.services.join(", ") || "none"}`);
console.log(` External Domains: ${profile.externalDomains.join(", ") || "none"}`);
console.log(` Company: ${profile.companyInfo.name || "unbekannt"}`);
console.log(` Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
console.log(` Colors: ${profile.colors.join(", ")}`);
console.log(` Images Found: ${profile.images.length}`);
console.log(` Social: ${Object.entries(profile.socialLinks).map(([k, v]) => `${k}`).join(", ") || "none"}`);
const outputPath = path.join(
path.resolve(process.cwd(), options.crawlDir),
domain.replace(/\./g, "-"),
"_site_profile.json",
);
console.log(`\n📦 Full profile saved to: ${outputPath}`);
const pages = await crawlSite(url, {
zyteApiKey: process.env.ZYTE_API_KEY,
crawlDir: path.resolve(process.cwd(), options.crawlDir),
});
const domain = new URL(url).hostname;
const profile = analyzeSite(pages, domain);
console.log("\n📊 Site Profile:");
console.log(` Domain: ${profile.domain}`);
console.log(` Total Pages: ${profile.totalPages}`);
console.log(
` Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`,
);
console.log(` Features: ${profile.existingFeatures.join(", ") || "none"}`);
console.log(` Services: ${profile.services.join(", ") || "none"}`);
console.log(
` External Domains: ${profile.externalDomains.join(", ") || "none"}`,
);
console.log(` Company: ${profile.companyInfo.name || "unbekannt"}`);
console.log(` Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
console.log(` Colors: ${profile.colors.join(", ")}`);
console.log(` Images Found: ${profile.images.length}`);
console.log(
` Social: ${
Object.entries(profile.socialLinks)
.map(([_k, _v]) => `${_k}`)
.join(", ") || "none"
}`,
);
const outputPath = path.join(
path.resolve(process.cwd(), options.crawlDir),
domain.replace(/\./g, "-"),
"_site_profile.json",
);
console.log(`\n📦 Full profile saved to: ${outputPath}`);
});
program.parse();

View File

@@ -0,0 +1,7 @@
import { describe, it, expect } from "vitest";
describe("concept-engine", () => {
it("should pass", () => {
expect(true).toBe(true);
});
});

View File

@@ -5,20 +5,20 @@
import axios from "axios";
interface LLMRequestOptions {
model: string;
systemPrompt: string;
userPrompt: string;
jsonMode?: boolean;
apiKey: string;
model: string;
systemPrompt: string;
userPrompt: string;
jsonMode?: boolean;
apiKey: string;
}
interface LLMResponse {
content: string;
usage: {
promptTokens: number;
completionTokens: number;
cost: number;
};
content: string;
usage: {
promptTokens: number;
completionTokens: number;
cost: number;
};
}
/**
@@ -26,108 +26,117 @@ interface LLMResponse {
* Handles markdown fences, control chars, trailing commas.
*/
export function cleanJson(str: string): string {
let cleaned = str.replace(/```json\n?|```/g, "").trim();
cleaned = cleaned.replace(
/[\u0000-\u0009\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
" ",
);
cleaned = cleaned.replace(/,\s*([\]}])/g, "$1");
return cleaned;
let cleaned = str.replace(/```json\n?|```/g, "").trim();
// eslint-disable-next-line no-control-regex
cleaned = cleaned.replace(/[\x00-\x1f\x7f-\x9f]/gi, " ");
cleaned = cleaned.replace(/,\s*([\]}])/g, "$1");
return cleaned;
}
/**
* Send a request to an LLM via OpenRouter.
*/
export async function llmRequest(options: LLMRequestOptions): Promise<LLMResponse> {
const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
export async function llmRequest(
options: LLMRequestOptions,
): Promise<LLMResponse> {
const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
const startTime = Date.now();
const resp = await axios.post(
"https://openrouter.ai/api/v1/chat/completions",
{
model,
messages: [
{ role: "system", content: systemPrompt },
{ role: "user", content: userPrompt },
],
...(jsonMode ? { response_format: { type: "json_object" } } : {}),
const resp = await axios
.post(
"https://openrouter.ai/api/v1/chat/completions",
{
model,
messages: [
{ role: "system", content: systemPrompt },
{ role: "user", content: userPrompt },
],
...(jsonMode ? { response_format: { type: "json_object" } } : {}),
},
{
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
{
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
timeout: 120000,
},
).catch(err => {
if (err.response) {
console.error("OpenRouter API Error:", JSON.stringify(err.response.data, null, 2));
}
throw err;
timeout: 120000,
},
)
.catch((err) => {
if (err.response) {
console.error(
"OpenRouter API Error:",
JSON.stringify(err.response.data, null, 2),
);
}
throw err;
});
const content = resp.data.choices?.[0]?.message?.content;
if (!content) {
throw new Error(`LLM returned no content. Model: ${model}`);
}
const content = resp.data.choices?.[0]?.message?.content;
if (!content) {
throw new Error(`LLM returned no content. Model: ${model}`);
}
let cost = 0;
const usage = resp.data.usage || {};
if (usage.cost !== undefined) {
cost = usage.cost;
} else {
// Fallback estimation
cost =
(usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
(usage.completion_tokens || 0) * (0.4 / 1_000_000);
}
let cost = 0;
const usage = resp.data.usage || {};
if (usage.cost !== undefined) {
cost = usage.cost;
} else {
// Fallback estimation
cost =
(usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
(usage.completion_tokens || 0) * (0.4 / 1_000_000);
}
return {
content,
usage: {
promptTokens: usage.prompt_tokens || 0,
completionTokens: usage.completion_tokens || 0,
cost,
},
};
return {
content,
usage: {
promptTokens: usage.prompt_tokens || 0,
completionTokens: usage.completion_tokens || 0,
cost,
},
};
}
/**
* Send a request and parse the response as JSON.
*/
export async function llmJsonRequest<T = any>(
options: LLMRequestOptions,
options: LLMRequestOptions,
): Promise<{ data: T; usage: LLMResponse["usage"] }> {
const response = await llmRequest({ ...options, jsonMode: true });
const cleaned = cleanJson(response.content);
const response = await llmRequest({ ...options, jsonMode: true });
const cleaned = cleanJson(response.content);
let parsed: T;
try {
parsed = JSON.parse(cleaned);
} catch (e) {
throw new Error(
`Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
);
}
let parsed: T;
try {
parsed = JSON.parse(cleaned);
} catch (e) {
throw new Error(
`Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
);
}
// Unwrap common LLM artifacts: {"0": {...}}, {"state": {...}}, etc.
const unwrapped = unwrapResponse(parsed);
// Unwrap common LLM artifacts: {"0": {...}}, {"state": {...}}, etc.
const unwrapped = unwrapResponse(parsed);
return { data: unwrapped as T, usage: response.usage };
return { data: unwrapped as T, usage: response.usage };
}
/**
* Recursively unwrap common LLM wrapping patterns.
*/
function unwrapResponse(obj: any): any {
if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
const keys = Object.keys(obj);
if (keys.length === 1) {
const key = keys[0];
if (key === "0" || key === "state" || key === "facts" || key === "result" || key === "data") {
return unwrapResponse(obj[key]);
}
if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
const keys = Object.keys(obj);
if (keys.length === 1) {
const key = keys[0];
if (
key === "0" ||
key === "state" ||
key === "facts" ||
key === "result" ||
key === "data"
) {
return unwrapResponse(obj[key]);
}
return obj;
}
return obj;
}

View File

@@ -5,7 +5,6 @@
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { existsSync } from "node:fs";
import { crawlSite, clearCrawlCache } from "./scraper.js";
import { analyzeSite } from "./analyzer.js";
import { executeResearch } from "./steps/00b-research.js";
@@ -15,18 +14,17 @@ import { executeAudit } from "./steps/02-audit.js";
import { executeStrategize } from "./steps/03-strategize.js";
import { executeArchitect } from "./steps/04-architect.js";
import type {
PipelineConfig,
PipelineInput,
ConceptState,
ProjectConcept,
StepResult,
StepUsage,
PipelineConfig,
PipelineInput,
ConceptState,
ProjectConcept,
StepResult,
} from "./types.js";
export interface PipelineCallbacks {
onStepStart?: (stepId: string, stepName: string) => void;
onStepComplete?: (stepId: string, result: StepResult) => void;
onStepError?: (stepId: string, error: string) => void;
onStepStart?: (stepId: string, stepName: string) => void;
onStepComplete?: (stepId: string, result: StepResult) => void;
onStepError?: (stepId: string, error: string) => void;
}
/**
@@ -34,224 +32,265 @@ export interface PipelineCallbacks {
* Runs conceptual steps sequentially and builds the ProjectConcept.
*/
export class ConceptPipeline {
private config: PipelineConfig;
private state: ConceptState;
private callbacks: PipelineCallbacks;
private config: PipelineConfig;
private state: ConceptState;
private callbacks: PipelineCallbacks;
constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
this.config = config;
this.callbacks = callbacks;
this.state = this.createInitialState();
}
constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
this.config = config;
this.callbacks = callbacks;
this.state = this.createInitialState();
}
private createInitialState(): ConceptState {
return {
briefing: "",
private createInitialState(): ConceptState {
return {
briefing: "",
usage: {
totalPromptTokens: 0,
totalCompletionTokens: 0,
totalCost: 0,
perStep: [],
},
};
}
/**
* Run the full concept pipeline from scratch.
*/
async run(input: PipelineInput): Promise<ProjectConcept> {
this.state.briefing = input.briefing;
this.state.url = input.url;
this.state.comments = input.comments;
// Ensure output directories
await fs.mkdir(this.config.outputDir, { recursive: true });
await fs.mkdir(this.config.crawlDir, { recursive: true });
// Step 0: Scrape & Analyze (deterministic)
if (input.url) {
if (input.clearCache) {
const domain = new URL(input.url).hostname;
await clearCrawlCache(this.config.crawlDir, domain);
}
await this.runStep(
"00-scrape",
"Scraping & Analyzing Website",
async () => {
const pages = await crawlSite(input.url!, {
zyteApiKey: this.config.zyteApiKey,
crawlDir: this.config.crawlDir,
});
const domain = new URL(input.url!).hostname;
const siteProfile = analyzeSite(pages, domain);
this.state.siteProfile = siteProfile;
this.state.crawlDir = path.join(
this.config.crawlDir,
domain.replace(/\./g, "-"),
);
// Save site profile
await fs.writeFile(
path.join(this.state.crawlDir!, "_site_profile.json"),
JSON.stringify(siteProfile, null, 2),
);
return {
success: true,
data: siteProfile,
usage: {
totalPromptTokens: 0,
totalCompletionTokens: 0,
totalCost: 0,
perStep: [],
step: "00-scrape",
model: "none",
promptTokens: 0,
completionTokens: 0,
cost: 0,
durationMs: 0,
},
};
};
},
);
}
/**
* Run the full concept pipeline from scratch.
*/
async run(input: PipelineInput): Promise<ProjectConcept> {
this.state.briefing = input.briefing;
this.state.url = input.url;
this.state.comments = input.comments;
// Ensure output directories
await fs.mkdir(this.config.outputDir, { recursive: true });
await fs.mkdir(this.config.crawlDir, { recursive: true });
// Step 0: Scrape & Analyze (deterministic)
if (input.url) {
if (input.clearCache) {
const domain = new URL(input.url).hostname;
await clearCrawlCache(this.config.crawlDir, domain);
}
await this.runStep("00-scrape", "Scraping & Analyzing Website", async () => {
const pages = await crawlSite(input.url!, {
zyteApiKey: this.config.zyteApiKey,
crawlDir: this.config.crawlDir,
});
const domain = new URL(input.url!).hostname;
const siteProfile = analyzeSite(pages, domain);
this.state.siteProfile = siteProfile;
this.state.crawlDir = path.join(this.config.crawlDir, domain.replace(/\./g, "-"));
// Save site profile
await fs.writeFile(
path.join(this.state.crawlDir!, "_site_profile.json"),
JSON.stringify(siteProfile, null, 2),
);
return {
success: true,
data: siteProfile,
usage: { step: "00-scrape", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
};
});
// Step 00a: Site Audit (DataForSEO)
await this.runStep(
"00a-site-audit",
"IST-Analysis (DataForSEO)",
async () => {
const result = await executeSiteAudit(this.state, this.config);
if (result.success && result.data) {
this.state.siteAudit = result.data;
}
return result;
},
);
// Step 00a: Site Audit (DataForSEO)
await this.runStep("00a-site-audit", "IST-Analysis (DataForSEO)", async () => {
const result = await executeSiteAudit(this.state, this.config);
if (result.success && result.data) {
this.state.siteAudit = result.data;
}
return result;
});
// Step 00b: Research (real web data via journaling)
await this.runStep("00b-research", "Industry & Company Research", async () => {
const result = await executeResearch(this.state);
if (result.success && result.data) {
this.state.researchData = result.data;
}
return result;
});
// Step 1: Extract facts
await this.runStep("01-extract", "Extracting Facts from Briefing", async () => {
const result = await executeExtract(this.state, this.config);
if (result.success) this.state.facts = result.data;
return result;
});
// Step 2: Audit features
await this.runStep("02-audit", "Auditing Features (Skeptical Review)", async () => {
const result = await executeAudit(this.state, this.config);
if (result.success) this.state.auditedFacts = result.data;
return result;
});
// Step 3: Strategic analysis
await this.runStep("03-strategize", "Strategic Analysis", async () => {
const result = await executeStrategize(this.state, this.config);
if (result.success) {
this.state.briefingSummary = result.data.briefingSummary;
this.state.designVision = result.data.designVision;
}
return result;
});
// Step 4: Sitemap architecture
await this.runStep("04-architect", "Information Architecture", async () => {
const result = await executeArchitect(this.state, this.config);
if (result.success) {
this.state.sitemap = result.data.sitemap;
this.state.websiteTopic = result.data.websiteTopic;
}
return result;
});
const projectConcept = this.buildProjectConcept();
await this.saveState(projectConcept);
return projectConcept;
}
/**
* Run a single step with callbacks and error handling.
*/
private async runStep(
stepId: string,
stepName: string,
executor: () => Promise<StepResult>,
): Promise<void> {
this.callbacks.onStepStart?.(stepId, stepName);
console.log(`\n📍 ${stepName}...`);
try {
const result = await executor();
if (result.usage) {
this.state.usage.perStep.push(result.usage);
this.state.usage.totalPromptTokens += result.usage.promptTokens;
this.state.usage.totalCompletionTokens += result.usage.completionTokens;
this.state.usage.totalCost += result.usage.cost;
}
if (result.success) {
const cost = result.usage?.cost ? ` ($${result.usage.cost.toFixed(4)})` : "";
const duration = result.usage?.durationMs ? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]` : "";
console.log(`${stepName} complete${cost}${duration}`);
this.callbacks.onStepComplete?.(stepId, result);
} else {
console.error(`${stepName} failed: ${result.error}`);
this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
throw new Error(result.error);
}
} catch (err) {
const errorMsg = (err as Error).message;
this.callbacks.onStepError?.(stepId, errorMsg);
throw err;
// Step 00b: Research (real web data via journaling)
await this.runStep(
"00b-research",
"Industry & Company Research",
async () => {
const result = await executeResearch(this.state);
if (result.success && result.data) {
this.state.researchData = result.data;
}
return result;
},
);
// Step 1: Extract facts
await this.runStep(
"01-extract",
"Extracting Facts from Briefing",
async () => {
const result = await executeExtract(this.state, this.config);
if (result.success) this.state.facts = result.data;
return result;
},
);
// Step 2: Audit features
await this.runStep(
"02-audit",
"Auditing Features (Skeptical Review)",
async () => {
const result = await executeAudit(this.state, this.config);
if (result.success) this.state.auditedFacts = result.data;
return result;
},
);
// Step 3: Strategic analysis
await this.runStep("03-strategize", "Strategic Analysis", async () => {
const result = await executeStrategize(this.state, this.config);
if (result.success) {
this.state.briefingSummary = result.data.briefingSummary;
this.state.designVision = result.data.designVision;
}
return result;
});
// Step 4: Sitemap architecture
await this.runStep("04-architect", "Information Architecture", async () => {
const result = await executeArchitect(this.state, this.config);
if (result.success) {
this.state.sitemap = result.data.sitemap;
this.state.websiteTopic = result.data.websiteTopic;
}
return result;
});
const projectConcept = this.buildProjectConcept();
await this.saveState(projectConcept);
return projectConcept;
}
/**
* Run a single step with callbacks and error handling.
*/
private async runStep(
stepId: string,
stepName: string,
executor: () => Promise<StepResult>,
): Promise<void> {
this.callbacks.onStepStart?.(stepId, stepName);
console.log(`\n📍 ${stepName}...`);
try {
const result = await executor();
if (result.usage) {
this.state.usage.perStep.push(result.usage);
this.state.usage.totalPromptTokens += result.usage.promptTokens;
this.state.usage.totalCompletionTokens += result.usage.completionTokens;
this.state.usage.totalCost += result.usage.cost;
}
if (result.success) {
const cost = result.usage?.cost
? ` ($${result.usage.cost.toFixed(4)})`
: "";
const duration = result.usage?.durationMs
? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]`
: "";
console.log(`${stepName} complete${cost}${duration}`);
this.callbacks.onStepComplete?.(stepId, result);
} else {
console.error(`${stepName} failed: ${result.error}`);
this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
throw new Error(result.error);
}
} catch (err) {
const errorMsg = (err as Error).message;
this.callbacks.onStepError?.(stepId, errorMsg);
throw err;
}
}
/**
* Build the final Concept object.
*/
private buildProjectConcept(): ProjectConcept {
return {
domain: this.state.siteProfile?.domain || "unknown",
timestamp: new Date().toISOString(),
briefing: this.state.briefing,
auditedFacts: this.state.auditedFacts || {},
siteProfile: this.state.siteProfile,
siteAudit: this.state.siteAudit,
researchData: this.state.researchData,
strategy: {
briefingSummary: this.state.briefingSummary || "",
designVision: this.state.designVision || "",
},
architecture: {
websiteTopic: this.state.websiteTopic || "",
sitemap: this.state.sitemap || [],
},
usage: this.state.usage,
};
/**
* Build the final Concept object.
*/
private buildProjectConcept(): ProjectConcept {
return {
domain: this.state.siteProfile?.domain || "unknown",
timestamp: new Date().toISOString(),
briefing: this.state.briefing,
auditedFacts: this.state.auditedFacts || {},
siteProfile: this.state.siteProfile,
siteAudit: this.state.siteAudit,
researchData: this.state.researchData,
strategy: {
briefingSummary: this.state.briefingSummary || "",
designVision: this.state.designVision || "",
},
architecture: {
websiteTopic: this.state.websiteTopic || "",
sitemap: this.state.sitemap || [],
},
usage: this.state.usage,
};
}
/**
* Save the full concept generated state to disk.
*/
private async saveState(concept: ProjectConcept): Promise<void> {
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const companyName = this.state.auditedFacts?.companyName || "unknown";
const stateDir = path.join(this.config.outputDir, "concepts");
await fs.mkdir(stateDir, { recursive: true });
const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
console.log(`\n📦 Saved Project Concept to: ${statePath}`);
// Save debug trace
const debugPath = path.join(
stateDir,
`${companyName}_${timestamp}_debug.json`,
);
await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));
// Print usage summary
console.log("\n──────────────────────────────────────────────");
console.log("📊 PIPELINE USAGE SUMMARY");
console.log("──────────────────────────────────────────────");
for (const step of this.state.usage.perStep) {
if (step.cost > 0) {
console.log(
` ${step.step}: ${step.model}$${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`,
);
}
}
console.log("──────────────────────────────────────────────");
console.log(` TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
console.log(
` Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`,
);
console.log("──────────────────────────────────────────────\n");
}
/**
* Save the full concept generated state to disk.
*/
private async saveState(concept: ProjectConcept): Promise<void> {
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const companyName = this.state.auditedFacts?.companyName || "unknown";
const stateDir = path.join(this.config.outputDir, "concepts");
await fs.mkdir(stateDir, { recursive: true });
const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
console.log(`\n📦 Saved Project Concept to: ${statePath}`);
// Save debug trace
const debugPath = path.join(stateDir, `${companyName}_${timestamp}_debug.json`);
await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));
// Print usage summary
console.log("\n──────────────────────────────────────────────");
console.log("📊 PIPELINE USAGE SUMMARY");
console.log("──────────────────────────────────────────────");
for (const step of this.state.usage.perStep) {
if (step.cost > 0) {
console.log(` ${step.step}: ${step.model}$${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`);
}
}
console.log("──────────────────────────────────────────────");
console.log(` TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
console.log(` Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`);
console.log("──────────────────────────────────────────────\n");
}
/** Get the current internal state (for CLI inspection). */
getState(): ConceptState {
return this.state;
}
/** Get the current internal state (for CLI inspection). */
getState(): ConceptState {
return this.state;
}
}

View File

@@ -10,268 +10,289 @@ import { existsSync } from "node:fs";
import type { CrawledPage, PageType } from "./types.js";
interface ScraperConfig {
zyteApiKey?: string;
crawlDir: string;
maxPages?: number;
zyteApiKey?: string;
crawlDir: string;
maxPages?: number;
}
/**
* Classify a URL pathname into a page type.
*/
function classifyPage(pathname: string): PageType {
const p = pathname.toLowerCase();
if (p === "/" || p === "" || p === "/index.html") return "home";
if (p.includes("service") || p.includes("leistung") || p.includes("kompetenz"))
return "service";
if (p.includes("about") || p.includes("ueber") || p.includes("über") || p.includes("unternehmen"))
return "about";
if (p.includes("contact") || p.includes("kontakt")) return "contact";
if (p.includes("job") || p.includes("karriere") || p.includes("career") || p.includes("human-resources"))
return "career";
if (p.includes("portfolio") || p.includes("referenz") || p.includes("projekt") || p.includes("case-study"))
return "portfolio";
if (p.includes("blog") || p.includes("news") || p.includes("aktuelles") || p.includes("magazin"))
return "blog";
if (p.includes("legal") || p.includes("impressum") || p.includes("datenschutz") || p.includes("privacy") || p.includes("agb"))
return "legal";
return "other";
const p = pathname.toLowerCase();
if (p === "/" || p === "" || p === "/index.html") return "home";
if (
p.includes("service") ||
p.includes("leistung") ||
p.includes("kompetenz")
)
return "service";
if (
p.includes("about") ||
p.includes("ueber") ||
p.includes("über") ||
p.includes("unternehmen")
)
return "about";
if (p.includes("contact") || p.includes("kontakt")) return "contact";
if (
p.includes("job") ||
p.includes("karriere") ||
p.includes("career") ||
p.includes("human-resources")
)
return "career";
if (
p.includes("portfolio") ||
p.includes("referenz") ||
p.includes("projekt") ||
p.includes("case-study")
)
return "portfolio";
if (
p.includes("blog") ||
p.includes("news") ||
p.includes("aktuelles") ||
p.includes("magazin")
)
return "blog";
if (
p.includes("legal") ||
p.includes("impressum") ||
p.includes("datenschutz") ||
p.includes("privacy") ||
p.includes("agb")
)
return "legal";
return "other";
}
/**
* Detect interactive features present on a page.
*/
function detectFeatures($: cheerio.CheerioAPI): string[] {
const features: string[] = [];
const features: string[] = [];
// Search
if (
$('input[type="search"]').length > 0 ||
$('form[role="search"]').length > 0 ||
$(".search-form, .search-box, #search, .searchbar").length > 0 ||
$('input[name="q"], input[name="s"], input[name="search"]').length > 0
) {
features.push("search");
}
// Search
if (
$('input[type="search"]').length > 0 ||
$('form[role="search"]').length > 0 ||
$(".search-form, .search-box, #search, .searchbar").length > 0 ||
$('input[name="q"], input[name="s"], input[name="search"]').length > 0
) {
features.push("search");
}
// Forms (beyond search)
const formCount = $("form").length;
const searchForms = $('form[role="search"], .search-form').length;
if (formCount > searchForms) {
features.push("forms");
}
// Forms (beyond search)
const formCount = $("form").length;
const searchForms = $('form[role="search"], .search-form').length;
if (formCount > searchForms) {
features.push("forms");
}
// Maps
if (
$('iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]').length > 0
) {
features.push("maps");
}
// Maps
if (
$(
'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
).length > 0
) {
features.push("maps");
}
// Video
if (
$("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container").length > 0
) {
features.push("video");
}
// Video
if (
$("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")
.length > 0
) {
features.push("video");
}
// Calendar / Events
if ($(".calendar, .event, [data-calendar]").length > 0) {
features.push("calendar");
}
// Calendar / Events
if ($(".calendar, .event, [data-calendar]").length > 0) {
features.push("calendar");
}
// Cookie consent
if ($(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]").length > 0) {
features.push("cookie-consent");
}
// Cookie consent
if (
$(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]").length >
0
) {
features.push("cookie-consent");
}
return features;
return features;
}
/**
* Extract all internal links from a page.
*/
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
const links: string[] = [];
$("a[href]").each((_, el) => {
const href = $(el).attr("href");
if (!href) return;
try {
const url = new URL(href, origin);
if (url.origin === origin) {
// Skip assets
if (/\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i.test(url.pathname)) return;
// Skip anchors-only
if (url.pathname === "/" && url.hash) return;
links.push(url.pathname);
}
} catch {
// Invalid URL, skip
}
});
return [...new Set(links)];
const links: string[] = [];
$("a[href]").each((_, el) => {
const href = $(el).attr("href");
if (!href) return;
try {
const url = new URL(href, origin);
if (url.origin === origin) {
// Skip assets
if (
/\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i.test(
url.pathname,
)
)
return;
// Skip anchors-only
if (url.pathname === "/" && url.hash) return;
links.push(url.pathname);
}
} catch {
// Invalid URL, skip
}
});
return [...new Set(links)];
}
/**
* Extract all images from a page.
*/
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
const images: string[] = [];
const images: string[] = [];
// Regular img tags
$("img[src]").each((_, el) => {
const src = $(el).attr("src");
if (src) images.push(src);
});
// Regular img tags
$("img[src]").each((_, el) => {
const src = $(el).attr("src");
if (src) images.push(src);
});
// CSS background images (inline styles)
$("[style*='background-image']").each((_, el) => {
const style = $(el).attr("style");
const match = style?.match(/url\(['"]?(.*?)['"]?\)/);
if (match && match[1]) {
images.push(match[1]);
}
});
// Resolve URLs to absolute
const absoluteImages: string[] = [];
for (const img of images) {
if (img.startsWith("data:image")) continue; // Skip inline base64
try {
const url = new URL(img, origin);
// Ignore small tracking pixels or generic vectors
if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) continue;
absoluteImages.push(url.href);
} catch {
// Invalid URL
}
// CSS background images (inline styles)
$("[style*='background-image']").each((_, el) => {
const style = $(el).attr("style");
const match = style?.match(/url\(['"]?(.*?)['"]?\)/);
if (match && match[1]) {
images.push(match[1]);
}
});
return [...new Set(absoluteImages)];
}
/**
* Extract services/competencies from text content.
*/
function extractServices(text: string): string[] {
const services: string[] = [];
// Common pattern: bulleted or newline-separated service lists
const lines = text.split(/\n/).map((l) => l.trim()).filter((l) => l.length > 3 && l.length < 100);
for (const line of lines) {
// Skip generic boilerplate
if (/cookie|datenschutz|impressum|copyright|©/i.test(line)) continue;
if (/^(tel|fax|e-mail|mobil|web|http)/i.test(line)) continue;
services.push(line);
// Resolve URLs to absolute
const absoluteImages: string[] = [];
for (const img of images) {
if (img.startsWith("data:image")) continue; // Skip inline base64
try {
const url = new URL(img, origin);
// Ignore small tracking pixels or generic vectors
if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo"))
continue;
absoluteImages.push(url.href);
} catch {
// Invalid URL
}
return services;
}
return [...new Set(absoluteImages)];
}
/**
* Fetch a page via Zyte API with browser rendering.
*/
async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
try {
const auth = Buffer.from(`${apiKey}:`).toString("base64");
const resp = await fetch("https://api.zyte.com/v1/extract", {
method: "POST",
headers: {
"Authorization": `Basic ${auth}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
url,
browserHtml: true,
}),
signal: AbortSignal.timeout(60000),
});
const auth = Buffer.from(`${apiKey}:`).toString("base64");
const resp = await fetch("https://api.zyte.com/v1/extract", {
method: "POST",
headers: {
Authorization: `Basic ${auth}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
url,
browserHtml: true,
}),
signal: AbortSignal.timeout(60000),
});
if (!resp.ok) {
const errorText = await resp.text();
console.error(` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`);
// Rate limited — wait and retry once
if (resp.status === 429) {
console.log(" ⏳ Rate limited, waiting 5s and retrying...");
await new Promise((r) => setTimeout(r, 5000));
return fetchWithZyte(url, apiKey);
}
throw new Error(`HTTP ${resp.status}: ${errorText}`);
}
const data = await resp.json();
const html = data.browserHtml || "";
if (!html) {
console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
}
return html;
} catch (err: any) {
throw err;
if (!resp.ok) {
const errorText = await resp.text();
console.error(
` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`,
);
// Rate limited wait and retry once
if (resp.status === 429) {
console.log(" ⏳ Rate limited, waiting 5s and retrying...");
await new Promise((r) => setTimeout(r, 5000));
return fetchWithZyte(url, apiKey);
}
}
throw new Error(`HTTP ${resp.status}: ${errorText}`);
}
const data = await resp.json();
const html = data.browserHtml || "";
if (!html) {
console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
}
return html;
}
/**
* Fetch a page via simple HTTP GET (fallback).
*/
async function fetchDirect(url: string): Promise<string> {
try {
const resp = await fetch(url, {
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
},
signal: AbortSignal.timeout(30000),
});
if (!resp.ok) return "";
return await resp.text();
} catch {
return "";
}
const resp = await fetch(url, {
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
},
signal: AbortSignal.timeout(30000),
}).catch(() => null);
if (!resp || !resp.ok) return "";
return await resp.text();
}
/**
* Parse an HTML string into a CrawledPage.
*/
function parsePage(html: string, url: string): CrawledPage {
const $ = cheerio.load(html);
const urlObj = new URL(url);
const $ = cheerio.load(html);
const urlObj = new URL(url);
const title = $("title").text().trim();
const headings = $("h1, h2, h3")
.map((_, el) => $(el).text().trim())
.get()
.filter((h) => h.length > 0);
const title = $("title").text().trim();
const headings = $("h1, h2, h3")
.map((_, el) => $(el).text().trim())
.get()
.filter((h) => h.length > 0);
const navItems = $("nav a")
.map((_, el) => $(el).text().trim())
.get()
.filter((t) => t.length > 0 && t.length < 100);
const navItems = $("nav a")
.map((_, el) => $(el).text().trim())
.get()
.filter((t) => t.length > 0 && t.length < 100);
const bodyText = $("body")
.text()
.replace(/\s+/g, " ")
.substring(0, 50000)
.trim();
const bodyText = $("body")
.text()
.replace(/\s+/g, " ")
.substring(0, 50000)
.trim();
const features = detectFeatures($);
const links = extractInternalLinks($, urlObj.origin);
const images = extractImages($, urlObj.origin);
const features = detectFeatures($);
const links = extractInternalLinks($, urlObj.origin);
const images = extractImages($, urlObj.origin);
const description = $('meta[name="description"]').attr("content") || undefined;
const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
const description =
$('meta[name="description"]').attr("content") || undefined;
const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
return {
url,
pathname: urlObj.pathname,
title,
html,
text: bodyText,
headings,
navItems,
features,
type: classifyPage(urlObj.pathname),
links,
images,
meta: { description, ogTitle, ogImage },
};
return {
url,
pathname: urlObj.pathname,
title,
html,
text: bodyText,
headings,
navItems,
features,
type: classifyPage(urlObj.pathname),
links,
images,
meta: { description, ogTitle, ogImage },
};
}
/**
@@ -280,164 +301,178 @@ function parsePage(html: string, url: string): CrawledPage {
* Returns an array of CrawledPage objects.
*/
export async function crawlSite(
targetUrl: string,
config: ScraperConfig,
targetUrl: string,
config: ScraperConfig,
): Promise<CrawledPage[]> {
const urlObj = new URL(targetUrl);
const origin = urlObj.origin;
const domain = urlObj.hostname;
const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
const urlObj = new URL(targetUrl);
const origin = urlObj.origin;
const domain = urlObj.hostname;
const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
// Check for existing crawl
const metaFile = path.join(domainDir, "_crawl_meta.json");
if (existsSync(metaFile)) {
console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
return loadCrawlFromDisk(domainDir);
}
// Check for existing crawl
const metaFile = path.join(domainDir, "_crawl_meta.json");
if (existsSync(metaFile)) {
console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
return loadCrawlFromDisk(domainDir);
}
console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);
console.log(
`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`,
);
// Ensure output dir
await fs.mkdir(domainDir, { recursive: true });
// Ensure output dir
await fs.mkdir(domainDir, { recursive: true });
const maxPages = config.maxPages || 30;
const visited = new Set<string>();
const queue: string[] = [targetUrl];
const pages: CrawledPage[] = [];
const maxPages = config.maxPages || 30;
const visited = new Set<string>();
const queue: string[] = [targetUrl];
const pages: CrawledPage[] = [];
while (queue.length > 0 && visited.size < maxPages) {
const url = queue.shift()!;
const urlPath = new URL(url).pathname;
while (queue.length > 0 && visited.size < maxPages) {
const url = queue.shift()!;
const urlPath = new URL(url).pathname;
if (visited.has(urlPath)) continue;
visited.add(urlPath);
if (visited.has(urlPath)) continue;
visited.add(urlPath);
try {
console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
try {
console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
let html: string;
if (config.zyteApiKey) {
html = await fetchWithZyte(url, config.zyteApiKey);
} else {
html = await fetchDirect(url);
}
let html: string;
if (config.zyteApiKey) {
html = await fetchWithZyte(url, config.zyteApiKey);
} else {
html = await fetchDirect(url);
}
if (!html || html.length < 100) {
console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
continue;
}
if (!html || html.length < 100) {
console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
continue;
}
const page = parsePage(html, url);
pages.push(page);
const page = parsePage(html, url);
pages.push(page);
// Save HTML + metadata to disk
const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
await fs.writeFile(
path.join(domainDir, `${safeName}.meta.json`),
JSON.stringify(
{
url: page.url,
pathname: page.pathname,
title: page.title,
type: page.type,
headings: page.headings,
navItems: page.navItems,
features: page.features,
links: page.links,
images: page.images,
meta: page.meta,
},
null,
2,
),
);
// Discover new links
for (const link of page.links) {
if (!visited.has(link)) {
const fullUrl = `${origin}${link}`;
queue.push(fullUrl);
}
}
} catch (err) {
console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
}
}
// Save crawl metadata
await fs.writeFile(
metaFile,
// Save HTML + metadata to disk
const safeName =
urlPath === "/"
? "index"
: urlPath.replace(/\//g, "_").replace(/^_/, "");
await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
await fs.writeFile(
path.join(domainDir, `${safeName}.meta.json`),
JSON.stringify(
{
domain,
crawledAt: new Date().toISOString(),
totalPages: pages.length,
urls: pages.map((p) => p.url),
},
null,
2,
{
url: page.url,
pathname: page.pathname,
title: page.title,
type: page.type,
headings: page.headings,
navItems: page.navItems,
features: page.features,
links: page.links,
images: page.images,
meta: page.meta,
},
null,
2,
),
);
);
console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
return pages;
// Discover new links
for (const link of page.links) {
if (!visited.has(link)) {
const fullUrl = `${origin}${link}`;
queue.push(fullUrl);
}
}
} catch (err) {
console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
}
}
// Save crawl metadata
await fs.writeFile(
metaFile,
JSON.stringify(
{
domain,
crawledAt: new Date().toISOString(),
totalPages: pages.length,
urls: pages.map((p) => p.url),
},
null,
2,
),
);
console.log(
`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`,
);
return pages;
}
/**
* Load a previously crawled site from disk.
*/
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
const files = await fs.readdir(domainDir);
const metaFiles = files.filter((f) => f.endsWith(".meta.json") && f !== "_crawl_meta.json");
const files = await fs.readdir(domainDir);
const metaFiles = files.filter(
(f) => f.endsWith(".meta.json") && f !== "_crawl_meta.json",
);
const pages: CrawledPage[] = [];
for (const metaFile of metaFiles) {
const baseName = metaFile.replace(".meta.json", "");
const htmlFile = `${baseName}.html`;
const pages: CrawledPage[] = [];
for (const metaFile of metaFiles) {
const baseName = metaFile.replace(".meta.json", "");
const htmlFile = `${baseName}.html`;
const meta = JSON.parse(await fs.readFile(path.join(domainDir, metaFile), "utf8"));
let html = "";
if (files.includes(htmlFile)) {
html = await fs.readFile(path.join(domainDir, htmlFile), "utf8");
}
const text = html
? cheerio
.load(html)("body")
.text()
.replace(/\s+/g, " ")
.substring(0, 50000)
.trim()
: "";
pages.push({
url: meta.url,
pathname: meta.pathname,
title: meta.title,
html,
text,
headings: meta.headings || [],
navItems: meta.navItems || [],
features: meta.features || [],
type: meta.type || "other",
links: meta.links || [],
images: meta.images || [],
meta: meta.meta || {},
});
const meta = JSON.parse(
await fs.readFile(path.join(domainDir, metaFile), "utf8"),
);
let html = "";
if (files.includes(htmlFile)) {
html = await fs.readFile(path.join(domainDir, htmlFile), "utf8");
}
console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
return pages;
const text = html
? cheerio
.load(html)("body")
.text()
.replace(/\s+/g, " ")
.substring(0, 50000)
.trim()
: "";
pages.push({
url: meta.url,
pathname: meta.pathname,
title: meta.title,
html,
text,
headings: meta.headings || [],
navItems: meta.navItems || [],
features: meta.features || [],
type: meta.type || "other",
links: meta.links || [],
images: meta.images || [],
meta: meta.meta || {},
});
}
console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
return pages;
}
/**
* Delete a cached crawl to force re-crawl.
*/
export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
if (existsSync(domainDir)) {
await fs.rm(domainDir, { recursive: true, force: true });
console.log(`🧹 Cleared crawl cache for ${domain}`);
}
export async function clearCrawlCache(
crawlDir: string,
domain: string,
): Promise<void> {
const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
if (existsSync(domainDir)) {
await fs.rm(domainDir, { recursive: true, force: true });
console.log(`🧹 Cleared crawl cache for ${domain}`);
}
}