chore: remove Directus CMS and related dependencies
All checks were successful
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 3s
Monorepo Pipeline / 🧹 Lint (push) Successful in 1m19s
Monorepo Pipeline / 🧪 Test (push) Successful in 1m5s
Monorepo Pipeline / 🏗️ Build (push) Successful in 1m26s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
All checks were successful
Monorepo Pipeline / ⚡ Prioritize Release (push) Successful in 3s
Monorepo Pipeline / 🧹 Lint (push) Successful in 1m19s
Monorepo Pipeline / 🧪 Test (push) Successful in 1m5s
Monorepo Pipeline / 🏗️ Build (push) Successful in 1m26s
Monorepo Pipeline / 🚀 Release (push) Has been skipped
Monorepo Pipeline / 🐳 Build Image Processor (push) Has been skipped
Monorepo Pipeline / 🐳 Build Directus (Base) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Gatekeeper (Product) (push) Has been skipped
Monorepo Pipeline / 🐳 Build Build-Base (push) Has been skipped
Monorepo Pipeline / 🐳 Build Production Runtime (push) Has been skipped
This commit is contained in:
@@ -1,40 +1,39 @@
|
||||
import { config as dotenvConfig } from 'dotenv';
|
||||
import * as path from 'node:path';
|
||||
import * as fs from 'node:fs/promises';
|
||||
import { EstimationPipeline } from './pipeline.js';
|
||||
import { config as dotenvConfig } from "dotenv";
|
||||
import * as path from "node:path";
|
||||
import * as fs from "node:fs/promises";
|
||||
import { ConceptPipeline } from "./pipeline.js";
|
||||
|
||||
dotenvConfig({ path: path.resolve(process.cwd(), '../../.env') });
|
||||
dotenvConfig({ path: path.resolve(process.cwd(), "../../.env") });
|
||||
|
||||
const briefing = await fs.readFile(
|
||||
path.resolve(process.cwd(), '../../data/briefings/etib.txt'),
|
||||
'utf8',
|
||||
path.resolve(process.cwd(), "../../data/briefings/etib.txt"),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
console.log(`Briefing loaded: ${briefing.length} chars`);
|
||||
|
||||
const pipeline = new EstimationPipeline(
|
||||
{
|
||||
openrouterKey: process.env.OPENROUTER_API_KEY || '',
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
outputDir: path.resolve(process.cwd(), '../../out/estimations'),
|
||||
crawlDir: path.resolve(process.cwd(), '../../data/crawls'),
|
||||
},
|
||||
{
|
||||
onStepStart: (id, name) => console.log(`[CB] Starting: ${id}`),
|
||||
onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
|
||||
onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
|
||||
},
|
||||
const pipeline = new ConceptPipeline(
|
||||
{
|
||||
openrouterKey: process.env.OPENROUTER_API_KEY || "",
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
outputDir: path.resolve(process.cwd(), "../../out/estimations"),
|
||||
crawlDir: path.resolve(process.cwd(), "../../data/crawls"),
|
||||
},
|
||||
{
|
||||
onStepStart: (id, _name) => console.log(`[CB] Starting: ${id}`),
|
||||
onStepComplete: (id) => console.log(`[CB] Done: ${id}`),
|
||||
onStepError: (id, err) => console.error(`[CB] Error in ${id}: ${err}`),
|
||||
},
|
||||
);
|
||||
|
||||
try {
|
||||
const result = await pipeline.run({
|
||||
briefing,
|
||||
url: 'https://www.e-tib.com',
|
||||
});
|
||||
await pipeline.run({
|
||||
briefing,
|
||||
url: "https://www.e-tib.com",
|
||||
});
|
||||
|
||||
console.log('\n✨ Pipeline complete!');
|
||||
console.log('Validation:', result.validationResult?.passed ? 'PASSED' : 'FAILED');
|
||||
console.log("\n✨ Pipeline complete!");
|
||||
} catch (err: any) {
|
||||
console.error('\n❌ Pipeline failed:', err.message);
|
||||
console.error(err.stack);
|
||||
console.error("\n❌ Pipeline failed:", err.message);
|
||||
console.error(err.stack);
|
||||
}
|
||||
|
||||
@@ -18,132 +18,146 @@ dotenvConfig({ path: path.resolve(process.cwd(), ".env") });
|
||||
const program = new Command();
|
||||
|
||||
program
|
||||
.name("concept")
|
||||
.description("AI-powered project concept generator")
|
||||
.version("1.0.0");
|
||||
.name("concept")
|
||||
.description("AI-powered project concept generator")
|
||||
.version("1.0.0");
|
||||
|
||||
program
|
||||
.command("run")
|
||||
.description("Run the full concept pipeline")
|
||||
.argument("[briefing]", "Briefing text or @path/to/file.txt")
|
||||
.option("--url <url>", "Target website URL")
|
||||
.option("--comments <comments>", "Additional notes")
|
||||
.option("--clear-cache", "Clear crawl cache and re-crawl")
|
||||
.option("--output <dir>", "Output directory", "../../out/concepts")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.action(async (briefingArg: string | undefined, options: any) => {
|
||||
const openrouterKey = process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
|
||||
if (!openrouterKey) {
|
||||
console.error("❌ OPENROUTER_API_KEY not found in environment.");
|
||||
process.exit(1);
|
||||
}
|
||||
.command("run")
|
||||
.description("Run the full concept pipeline")
|
||||
.argument("[briefing]", "Briefing text or @path/to/file.txt")
|
||||
.option("--url <url>", "Target website URL")
|
||||
.option("--comments <comments>", "Additional notes")
|
||||
.option("--clear-cache", "Clear crawl cache and re-crawl")
|
||||
.option("--output <dir>", "Output directory", "../../out/concepts")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.action(async (briefingArg: string | undefined, options: any) => {
|
||||
const openrouterKey =
|
||||
process.env.OPENROUTER_API_KEY || process.env.OPENROUTER_KEY;
|
||||
if (!openrouterKey) {
|
||||
console.error("❌ OPENROUTER_API_KEY not found in environment.");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
let briefing = briefingArg || "";
|
||||
let briefing = briefingArg || "";
|
||||
|
||||
// Handle @file references
|
||||
if (briefing.startsWith("@")) {
|
||||
const rawPath = briefing.substring(1);
|
||||
const filePath = rawPath.startsWith("/")
|
||||
? rawPath
|
||||
: path.resolve(process.cwd(), rawPath);
|
||||
if (!existsSync(filePath)) {
|
||||
console.error(`❌ Briefing file not found: ${filePath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
briefing = await fs.readFile(filePath, "utf8");
|
||||
console.log(`📄 Loaded briefing from: ${filePath}`);
|
||||
}
|
||||
// Handle @file references
|
||||
if (briefing.startsWith("@")) {
|
||||
const rawPath = briefing.substring(1);
|
||||
const filePath = rawPath.startsWith("/")
|
||||
? rawPath
|
||||
: path.resolve(process.cwd(), rawPath);
|
||||
if (!existsSync(filePath)) {
|
||||
console.error(`❌ Briefing file not found: ${filePath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
briefing = await fs.readFile(filePath, "utf8");
|
||||
console.log(`📄 Loaded briefing from: ${filePath}`);
|
||||
}
|
||||
|
||||
// Auto-discover URL from briefing
|
||||
let url = options.url;
|
||||
if (!url && briefing) {
|
||||
const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
|
||||
if (urlMatch) {
|
||||
url = urlMatch[0];
|
||||
console.log(`🔗 Discovered URL in briefing: ${url}`);
|
||||
}
|
||||
}
|
||||
// Auto-discover URL from briefing
|
||||
let url = options.url;
|
||||
if (!url && briefing) {
|
||||
const urlMatch = briefing.match(/https?:\/\/[^\s]+/);
|
||||
if (urlMatch) {
|
||||
url = urlMatch[0];
|
||||
console.log(`🔗 Discovered URL in briefing: ${url}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (!briefing && !url) {
|
||||
console.error("❌ Provide a briefing text or --url");
|
||||
process.exit(1);
|
||||
}
|
||||
if (!briefing && !url) {
|
||||
console.error("❌ Provide a briefing text or --url");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const pipeline = new ConceptPipeline(
|
||||
{
|
||||
openrouterKey,
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
outputDir: path.resolve(process.cwd(), options.output),
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
},
|
||||
{
|
||||
onStepStart: (id, name) => {
|
||||
// Will be enhanced with Ink spinner later
|
||||
},
|
||||
onStepComplete: (id, result) => {
|
||||
// Will be enhanced with Ink UI later
|
||||
},
|
||||
},
|
||||
);
|
||||
const pipeline = new ConceptPipeline(
|
||||
{
|
||||
openrouterKey,
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
outputDir: path.resolve(process.cwd(), options.output),
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
},
|
||||
{
|
||||
onStepStart: (_id, _name) => {
|
||||
// Will be enhanced with Ink spinner later
|
||||
},
|
||||
onStepComplete: (_id, _result) => {
|
||||
// Will be enhanced with Ink UI later
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
try {
|
||||
await pipeline.run({
|
||||
briefing,
|
||||
url,
|
||||
comments: options.comments,
|
||||
clearCache: options.clearCache,
|
||||
});
|
||||
try {
|
||||
await pipeline.run({
|
||||
briefing,
|
||||
url,
|
||||
comments: options.comments,
|
||||
clearCache: options.clearCache,
|
||||
});
|
||||
|
||||
console.log("\n✨ Concept generation complete!");
|
||||
} catch (err) {
|
||||
console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
console.log("\n✨ Concept generation complete!");
|
||||
} catch (err) {
|
||||
console.error(`\n❌ Pipeline failed: ${(err as Error).message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
program
|
||||
.command("analyze")
|
||||
.description("Only crawl and analyze a website (no LLM)")
|
||||
.argument("<url>", "Website URL to analyze")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.option("--clear-cache", "Clear existing crawl cache")
|
||||
.action(async (url: string, options: any) => {
|
||||
const { crawlSite } = await import("./scraper.js");
|
||||
const { analyzeSite } = await import("./analyzer.js");
|
||||
.command("analyze")
|
||||
.description("Only crawl and analyze a website (no LLM)")
|
||||
.argument("<url>", "Website URL to analyze")
|
||||
.option("--crawl-dir <dir>", "Crawl data directory", "../../data/crawls")
|
||||
.option("--clear-cache", "Clear existing crawl cache")
|
||||
.action(async (url: string, options: any) => {
|
||||
const { crawlSite } = await import("./scraper.js");
|
||||
const { analyzeSite } = await import("./analyzer.js");
|
||||
|
||||
if (options.clearCache) {
|
||||
const { clearCrawlCache } = await import("./scraper.js");
|
||||
const domain = new URL(url).hostname;
|
||||
await clearCrawlCache(path.resolve(process.cwd(), options.crawlDir), domain);
|
||||
}
|
||||
if (options.clearCache) {
|
||||
const { clearCrawlCache } = await import("./scraper.js");
|
||||
const domain = new URL(url).hostname;
|
||||
await clearCrawlCache(
|
||||
path.resolve(process.cwd(), options.crawlDir),
|
||||
domain,
|
||||
);
|
||||
}
|
||||
|
||||
const pages = await crawlSite(url, {
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
});
|
||||
|
||||
const domain = new URL(url).hostname;
|
||||
const profile = analyzeSite(pages, domain);
|
||||
|
||||
console.log("\n📊 Site Profile:");
|
||||
console.log(` Domain: ${profile.domain}`);
|
||||
console.log(` Total Pages: ${profile.totalPages}`);
|
||||
console.log(` Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`);
|
||||
console.log(` Features: ${profile.existingFeatures.join(", ") || "none"}`);
|
||||
console.log(` Services: ${profile.services.join(", ") || "none"}`);
|
||||
console.log(` External Domains: ${profile.externalDomains.join(", ") || "none"}`);
|
||||
console.log(` Company: ${profile.companyInfo.name || "unbekannt"}`);
|
||||
console.log(` Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
|
||||
console.log(` Colors: ${profile.colors.join(", ")}`);
|
||||
console.log(` Images Found: ${profile.images.length}`);
|
||||
console.log(` Social: ${Object.entries(profile.socialLinks).map(([k, v]) => `${k}`).join(", ") || "none"}`);
|
||||
|
||||
const outputPath = path.join(
|
||||
path.resolve(process.cwd(), options.crawlDir),
|
||||
domain.replace(/\./g, "-"),
|
||||
"_site_profile.json",
|
||||
);
|
||||
console.log(`\n📦 Full profile saved to: ${outputPath}`);
|
||||
const pages = await crawlSite(url, {
|
||||
zyteApiKey: process.env.ZYTE_API_KEY,
|
||||
crawlDir: path.resolve(process.cwd(), options.crawlDir),
|
||||
});
|
||||
|
||||
const domain = new URL(url).hostname;
|
||||
const profile = analyzeSite(pages, domain);
|
||||
|
||||
console.log("\n📊 Site Profile:");
|
||||
console.log(` Domain: ${profile.domain}`);
|
||||
console.log(` Total Pages: ${profile.totalPages}`);
|
||||
console.log(
|
||||
` Navigation: ${profile.navigation.map((n) => n.label).join(", ")}`,
|
||||
);
|
||||
console.log(` Features: ${profile.existingFeatures.join(", ") || "none"}`);
|
||||
console.log(` Services: ${profile.services.join(", ") || "none"}`);
|
||||
console.log(
|
||||
` External Domains: ${profile.externalDomains.join(", ") || "none"}`,
|
||||
);
|
||||
console.log(` Company: ${profile.companyInfo.name || "unbekannt"}`);
|
||||
console.log(` Tax ID: ${profile.companyInfo.taxId || "unbekannt"}`);
|
||||
console.log(` Colors: ${profile.colors.join(", ")}`);
|
||||
console.log(` Images Found: ${profile.images.length}`);
|
||||
console.log(
|
||||
` Social: ${
|
||||
Object.entries(profile.socialLinks)
|
||||
.map(([_k, _v]) => `${_k}`)
|
||||
.join(", ") || "none"
|
||||
}`,
|
||||
);
|
||||
|
||||
const outputPath = path.join(
|
||||
path.resolve(process.cwd(), options.crawlDir),
|
||||
domain.replace(/\./g, "-"),
|
||||
"_site_profile.json",
|
||||
);
|
||||
console.log(`\n📦 Full profile saved to: ${outputPath}`);
|
||||
});
|
||||
|
||||
program.parse();
|
||||
|
||||
7
packages/concept-engine/src/dummy.test.ts
Normal file
7
packages/concept-engine/src/dummy.test.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
|
||||
describe("concept-engine", () => {
|
||||
it("should pass", () => {
|
||||
expect(true).toBe(true);
|
||||
});
|
||||
});
|
||||
@@ -5,20 +5,20 @@
|
||||
import axios from "axios";
|
||||
|
||||
interface LLMRequestOptions {
|
||||
model: string;
|
||||
systemPrompt: string;
|
||||
userPrompt: string;
|
||||
jsonMode?: boolean;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
systemPrompt: string;
|
||||
userPrompt: string;
|
||||
jsonMode?: boolean;
|
||||
apiKey: string;
|
||||
}
|
||||
|
||||
interface LLMResponse {
|
||||
content: string;
|
||||
usage: {
|
||||
promptTokens: number;
|
||||
completionTokens: number;
|
||||
cost: number;
|
||||
};
|
||||
content: string;
|
||||
usage: {
|
||||
promptTokens: number;
|
||||
completionTokens: number;
|
||||
cost: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -26,108 +26,117 @@ interface LLMResponse {
|
||||
* Handles markdown fences, control chars, trailing commas.
|
||||
*/
|
||||
export function cleanJson(str: string): string {
|
||||
let cleaned = str.replace(/```json\n?|```/g, "").trim();
|
||||
cleaned = cleaned.replace(
|
||||
/[\u0000-\u0009\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
|
||||
" ",
|
||||
);
|
||||
cleaned = cleaned.replace(/,\s*([\]}])/g, "$1");
|
||||
return cleaned;
|
||||
let cleaned = str.replace(/```json\n?|```/g, "").trim();
|
||||
// eslint-disable-next-line no-control-regex
|
||||
cleaned = cleaned.replace(/[\x00-\x1f\x7f-\x9f]/gi, " ");
|
||||
cleaned = cleaned.replace(/,\s*([\]}])/g, "$1");
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a request to an LLM via OpenRouter.
|
||||
*/
|
||||
export async function llmRequest(options: LLMRequestOptions): Promise<LLMResponse> {
|
||||
const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
|
||||
export async function llmRequest(
|
||||
options: LLMRequestOptions,
|
||||
): Promise<LLMResponse> {
|
||||
const { model, systemPrompt, userPrompt, jsonMode = true, apiKey } = options;
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
const resp = await axios.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
{
|
||||
model,
|
||||
messages: [
|
||||
{ role: "system", content: systemPrompt },
|
||||
{ role: "user", content: userPrompt },
|
||||
],
|
||||
...(jsonMode ? { response_format: { type: "json_object" } } : {}),
|
||||
const resp = await axios
|
||||
.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
{
|
||||
model,
|
||||
messages: [
|
||||
{ role: "system", content: systemPrompt },
|
||||
{ role: "user", content: userPrompt },
|
||||
],
|
||||
...(jsonMode ? { response_format: { type: "json_object" } } : {}),
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
timeout: 120000,
|
||||
},
|
||||
).catch(err => {
|
||||
if (err.response) {
|
||||
console.error("OpenRouter API Error:", JSON.stringify(err.response.data, null, 2));
|
||||
}
|
||||
throw err;
|
||||
timeout: 120000,
|
||||
},
|
||||
)
|
||||
.catch((err) => {
|
||||
if (err.response) {
|
||||
console.error(
|
||||
"OpenRouter API Error:",
|
||||
JSON.stringify(err.response.data, null, 2),
|
||||
);
|
||||
}
|
||||
throw err;
|
||||
});
|
||||
|
||||
const content = resp.data.choices?.[0]?.message?.content;
|
||||
if (!content) {
|
||||
throw new Error(`LLM returned no content. Model: ${model}`);
|
||||
}
|
||||
const content = resp.data.choices?.[0]?.message?.content;
|
||||
if (!content) {
|
||||
throw new Error(`LLM returned no content. Model: ${model}`);
|
||||
}
|
||||
|
||||
let cost = 0;
|
||||
const usage = resp.data.usage || {};
|
||||
if (usage.cost !== undefined) {
|
||||
cost = usage.cost;
|
||||
} else {
|
||||
// Fallback estimation
|
||||
cost =
|
||||
(usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
|
||||
(usage.completion_tokens || 0) * (0.4 / 1_000_000);
|
||||
}
|
||||
let cost = 0;
|
||||
const usage = resp.data.usage || {};
|
||||
if (usage.cost !== undefined) {
|
||||
cost = usage.cost;
|
||||
} else {
|
||||
// Fallback estimation
|
||||
cost =
|
||||
(usage.prompt_tokens || 0) * (0.1 / 1_000_000) +
|
||||
(usage.completion_tokens || 0) * (0.4 / 1_000_000);
|
||||
}
|
||||
|
||||
return {
|
||||
content,
|
||||
usage: {
|
||||
promptTokens: usage.prompt_tokens || 0,
|
||||
completionTokens: usage.completion_tokens || 0,
|
||||
cost,
|
||||
},
|
||||
};
|
||||
return {
|
||||
content,
|
||||
usage: {
|
||||
promptTokens: usage.prompt_tokens || 0,
|
||||
completionTokens: usage.completion_tokens || 0,
|
||||
cost,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a request and parse the response as JSON.
|
||||
*/
|
||||
export async function llmJsonRequest<T = any>(
|
||||
options: LLMRequestOptions,
|
||||
options: LLMRequestOptions,
|
||||
): Promise<{ data: T; usage: LLMResponse["usage"] }> {
|
||||
const response = await llmRequest({ ...options, jsonMode: true });
|
||||
const cleaned = cleanJson(response.content);
|
||||
const response = await llmRequest({ ...options, jsonMode: true });
|
||||
const cleaned = cleanJson(response.content);
|
||||
|
||||
let parsed: T;
|
||||
try {
|
||||
parsed = JSON.parse(cleaned);
|
||||
} catch (e) {
|
||||
throw new Error(
|
||||
`Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
|
||||
);
|
||||
}
|
||||
let parsed: T;
|
||||
try {
|
||||
parsed = JSON.parse(cleaned);
|
||||
} catch (e) {
|
||||
throw new Error(
|
||||
`Failed to parse LLM JSON response: ${(e as Error).message}\nRaw: ${cleaned.substring(0, 500)}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Unwrap common LLM artifacts: {"0": {...}}, {"state": {...}}, etc.
|
||||
const unwrapped = unwrapResponse(parsed);
|
||||
// Unwrap common LLM artifacts: {"0": {...}}, {"state": {...}}, etc.
|
||||
const unwrapped = unwrapResponse(parsed);
|
||||
|
||||
return { data: unwrapped as T, usage: response.usage };
|
||||
return { data: unwrapped as T, usage: response.usage };
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively unwrap common LLM wrapping patterns.
|
||||
*/
|
||||
function unwrapResponse(obj: any): any {
|
||||
if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
|
||||
const keys = Object.keys(obj);
|
||||
if (keys.length === 1) {
|
||||
const key = keys[0];
|
||||
if (key === "0" || key === "state" || key === "facts" || key === "result" || key === "data") {
|
||||
return unwrapResponse(obj[key]);
|
||||
}
|
||||
if (!obj || typeof obj !== "object" || Array.isArray(obj)) return obj;
|
||||
const keys = Object.keys(obj);
|
||||
if (keys.length === 1) {
|
||||
const key = keys[0];
|
||||
if (
|
||||
key === "0" ||
|
||||
key === "state" ||
|
||||
key === "facts" ||
|
||||
key === "result" ||
|
||||
key === "data"
|
||||
) {
|
||||
return unwrapResponse(obj[key]);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
import * as fs from "node:fs/promises";
|
||||
import * as path from "node:path";
|
||||
import { existsSync } from "node:fs";
|
||||
import { crawlSite, clearCrawlCache } from "./scraper.js";
|
||||
import { analyzeSite } from "./analyzer.js";
|
||||
import { executeResearch } from "./steps/00b-research.js";
|
||||
@@ -15,18 +14,17 @@ import { executeAudit } from "./steps/02-audit.js";
|
||||
import { executeStrategize } from "./steps/03-strategize.js";
|
||||
import { executeArchitect } from "./steps/04-architect.js";
|
||||
import type {
|
||||
PipelineConfig,
|
||||
PipelineInput,
|
||||
ConceptState,
|
||||
ProjectConcept,
|
||||
StepResult,
|
||||
StepUsage,
|
||||
PipelineConfig,
|
||||
PipelineInput,
|
||||
ConceptState,
|
||||
ProjectConcept,
|
||||
StepResult,
|
||||
} from "./types.js";
|
||||
|
||||
export interface PipelineCallbacks {
|
||||
onStepStart?: (stepId: string, stepName: string) => void;
|
||||
onStepComplete?: (stepId: string, result: StepResult) => void;
|
||||
onStepError?: (stepId: string, error: string) => void;
|
||||
onStepStart?: (stepId: string, stepName: string) => void;
|
||||
onStepComplete?: (stepId: string, result: StepResult) => void;
|
||||
onStepError?: (stepId: string, error: string) => void;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -34,224 +32,265 @@ export interface PipelineCallbacks {
|
||||
* Runs conceptual steps sequentially and builds the ProjectConcept.
|
||||
*/
|
||||
export class ConceptPipeline {
|
||||
private config: PipelineConfig;
|
||||
private state: ConceptState;
|
||||
private callbacks: PipelineCallbacks;
|
||||
private config: PipelineConfig;
|
||||
private state: ConceptState;
|
||||
private callbacks: PipelineCallbacks;
|
||||
|
||||
constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
|
||||
this.config = config;
|
||||
this.callbacks = callbacks;
|
||||
this.state = this.createInitialState();
|
||||
}
|
||||
constructor(config: PipelineConfig, callbacks: PipelineCallbacks = {}) {
|
||||
this.config = config;
|
||||
this.callbacks = callbacks;
|
||||
this.state = this.createInitialState();
|
||||
}
|
||||
|
||||
private createInitialState(): ConceptState {
|
||||
return {
|
||||
briefing: "",
|
||||
private createInitialState(): ConceptState {
|
||||
return {
|
||||
briefing: "",
|
||||
usage: {
|
||||
totalPromptTokens: 0,
|
||||
totalCompletionTokens: 0,
|
||||
totalCost: 0,
|
||||
perStep: [],
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the full concept pipeline from scratch.
|
||||
*/
|
||||
async run(input: PipelineInput): Promise<ProjectConcept> {
|
||||
this.state.briefing = input.briefing;
|
||||
this.state.url = input.url;
|
||||
this.state.comments = input.comments;
|
||||
|
||||
// Ensure output directories
|
||||
await fs.mkdir(this.config.outputDir, { recursive: true });
|
||||
await fs.mkdir(this.config.crawlDir, { recursive: true });
|
||||
|
||||
// Step 0: Scrape & Analyze (deterministic)
|
||||
if (input.url) {
|
||||
if (input.clearCache) {
|
||||
const domain = new URL(input.url).hostname;
|
||||
await clearCrawlCache(this.config.crawlDir, domain);
|
||||
}
|
||||
await this.runStep(
|
||||
"00-scrape",
|
||||
"Scraping & Analyzing Website",
|
||||
async () => {
|
||||
const pages = await crawlSite(input.url!, {
|
||||
zyteApiKey: this.config.zyteApiKey,
|
||||
crawlDir: this.config.crawlDir,
|
||||
});
|
||||
const domain = new URL(input.url!).hostname;
|
||||
const siteProfile = analyzeSite(pages, domain);
|
||||
this.state.siteProfile = siteProfile;
|
||||
this.state.crawlDir = path.join(
|
||||
this.config.crawlDir,
|
||||
domain.replace(/\./g, "-"),
|
||||
);
|
||||
|
||||
// Save site profile
|
||||
await fs.writeFile(
|
||||
path.join(this.state.crawlDir!, "_site_profile.json"),
|
||||
JSON.stringify(siteProfile, null, 2),
|
||||
);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: siteProfile,
|
||||
usage: {
|
||||
totalPromptTokens: 0,
|
||||
totalCompletionTokens: 0,
|
||||
totalCost: 0,
|
||||
perStep: [],
|
||||
step: "00-scrape",
|
||||
model: "none",
|
||||
promptTokens: 0,
|
||||
completionTokens: 0,
|
||||
cost: 0,
|
||||
durationMs: 0,
|
||||
},
|
||||
};
|
||||
};
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the full concept pipeline from scratch.
|
||||
*/
|
||||
async run(input: PipelineInput): Promise<ProjectConcept> {
|
||||
this.state.briefing = input.briefing;
|
||||
this.state.url = input.url;
|
||||
this.state.comments = input.comments;
|
||||
|
||||
// Ensure output directories
|
||||
await fs.mkdir(this.config.outputDir, { recursive: true });
|
||||
await fs.mkdir(this.config.crawlDir, { recursive: true });
|
||||
|
||||
// Step 0: Scrape & Analyze (deterministic)
|
||||
if (input.url) {
|
||||
if (input.clearCache) {
|
||||
const domain = new URL(input.url).hostname;
|
||||
await clearCrawlCache(this.config.crawlDir, domain);
|
||||
}
|
||||
await this.runStep("00-scrape", "Scraping & Analyzing Website", async () => {
|
||||
const pages = await crawlSite(input.url!, {
|
||||
zyteApiKey: this.config.zyteApiKey,
|
||||
crawlDir: this.config.crawlDir,
|
||||
});
|
||||
const domain = new URL(input.url!).hostname;
|
||||
const siteProfile = analyzeSite(pages, domain);
|
||||
this.state.siteProfile = siteProfile;
|
||||
this.state.crawlDir = path.join(this.config.crawlDir, domain.replace(/\./g, "-"));
|
||||
|
||||
// Save site profile
|
||||
await fs.writeFile(
|
||||
path.join(this.state.crawlDir!, "_site_profile.json"),
|
||||
JSON.stringify(siteProfile, null, 2),
|
||||
);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: siteProfile,
|
||||
usage: { step: "00-scrape", model: "none", promptTokens: 0, completionTokens: 0, cost: 0, durationMs: 0 },
|
||||
};
|
||||
});
|
||||
// Step 00a: Site Audit (DataForSEO)
|
||||
await this.runStep(
|
||||
"00a-site-audit",
|
||||
"IST-Analysis (DataForSEO)",
|
||||
async () => {
|
||||
const result = await executeSiteAudit(this.state, this.config);
|
||||
if (result.success && result.data) {
|
||||
this.state.siteAudit = result.data;
|
||||
}
|
||||
return result;
|
||||
},
|
||||
);
|
||||
|
||||
// Step 00a: Site Audit (DataForSEO)
|
||||
await this.runStep("00a-site-audit", "IST-Analysis (DataForSEO)", async () => {
|
||||
const result = await executeSiteAudit(this.state, this.config);
|
||||
if (result.success && result.data) {
|
||||
this.state.siteAudit = result.data;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 00b: Research (real web data via journaling)
|
||||
await this.runStep("00b-research", "Industry & Company Research", async () => {
|
||||
const result = await executeResearch(this.state);
|
||||
if (result.success && result.data) {
|
||||
this.state.researchData = result.data;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 1: Extract facts
|
||||
await this.runStep("01-extract", "Extracting Facts from Briefing", async () => {
|
||||
const result = await executeExtract(this.state, this.config);
|
||||
if (result.success) this.state.facts = result.data;
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 2: Audit features
|
||||
await this.runStep("02-audit", "Auditing Features (Skeptical Review)", async () => {
|
||||
const result = await executeAudit(this.state, this.config);
|
||||
if (result.success) this.state.auditedFacts = result.data;
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 3: Strategic analysis
|
||||
await this.runStep("03-strategize", "Strategic Analysis", async () => {
|
||||
const result = await executeStrategize(this.state, this.config);
|
||||
if (result.success) {
|
||||
this.state.briefingSummary = result.data.briefingSummary;
|
||||
this.state.designVision = result.data.designVision;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 4: Sitemap architecture
|
||||
await this.runStep("04-architect", "Information Architecture", async () => {
|
||||
const result = await executeArchitect(this.state, this.config);
|
||||
if (result.success) {
|
||||
this.state.sitemap = result.data.sitemap;
|
||||
this.state.websiteTopic = result.data.websiteTopic;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
const projectConcept = this.buildProjectConcept();
|
||||
await this.saveState(projectConcept);
|
||||
|
||||
return projectConcept;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a single step with callbacks and error handling.
|
||||
*/
|
||||
private async runStep(
|
||||
stepId: string,
|
||||
stepName: string,
|
||||
executor: () => Promise<StepResult>,
|
||||
): Promise<void> {
|
||||
this.callbacks.onStepStart?.(stepId, stepName);
|
||||
console.log(`\n📍 ${stepName}...`);
|
||||
|
||||
try {
|
||||
const result = await executor();
|
||||
if (result.usage) {
|
||||
this.state.usage.perStep.push(result.usage);
|
||||
this.state.usage.totalPromptTokens += result.usage.promptTokens;
|
||||
this.state.usage.totalCompletionTokens += result.usage.completionTokens;
|
||||
this.state.usage.totalCost += result.usage.cost;
|
||||
}
|
||||
|
||||
if (result.success) {
|
||||
const cost = result.usage?.cost ? ` ($${result.usage.cost.toFixed(4)})` : "";
|
||||
const duration = result.usage?.durationMs ? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]` : "";
|
||||
console.log(` ✅ ${stepName} complete${cost}${duration}`);
|
||||
this.callbacks.onStepComplete?.(stepId, result);
|
||||
} else {
|
||||
console.error(` ❌ ${stepName} failed: ${result.error}`);
|
||||
this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
|
||||
throw new Error(result.error);
|
||||
}
|
||||
} catch (err) {
|
||||
const errorMsg = (err as Error).message;
|
||||
this.callbacks.onStepError?.(stepId, errorMsg);
|
||||
throw err;
|
||||
// Step 00b: Research (real web data via journaling)
|
||||
await this.runStep(
|
||||
"00b-research",
|
||||
"Industry & Company Research",
|
||||
async () => {
|
||||
const result = await executeResearch(this.state);
|
||||
if (result.success && result.data) {
|
||||
this.state.researchData = result.data;
|
||||
}
|
||||
return result;
|
||||
},
|
||||
);
|
||||
|
||||
// Step 1: Extract facts
|
||||
await this.runStep(
|
||||
"01-extract",
|
||||
"Extracting Facts from Briefing",
|
||||
async () => {
|
||||
const result = await executeExtract(this.state, this.config);
|
||||
if (result.success) this.state.facts = result.data;
|
||||
return result;
|
||||
},
|
||||
);
|
||||
|
||||
// Step 2: Audit features
|
||||
await this.runStep(
|
||||
"02-audit",
|
||||
"Auditing Features (Skeptical Review)",
|
||||
async () => {
|
||||
const result = await executeAudit(this.state, this.config);
|
||||
if (result.success) this.state.auditedFacts = result.data;
|
||||
return result;
|
||||
},
|
||||
);
|
||||
|
||||
// Step 3: Strategic analysis
|
||||
await this.runStep("03-strategize", "Strategic Analysis", async () => {
|
||||
const result = await executeStrategize(this.state, this.config);
|
||||
if (result.success) {
|
||||
this.state.briefingSummary = result.data.briefingSummary;
|
||||
this.state.designVision = result.data.designVision;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
// Step 4: Sitemap architecture
|
||||
await this.runStep("04-architect", "Information Architecture", async () => {
|
||||
const result = await executeArchitect(this.state, this.config);
|
||||
if (result.success) {
|
||||
this.state.sitemap = result.data.sitemap;
|
||||
this.state.websiteTopic = result.data.websiteTopic;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
const projectConcept = this.buildProjectConcept();
|
||||
await this.saveState(projectConcept);
|
||||
|
||||
return projectConcept;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a single step with callbacks and error handling.
|
||||
*/
|
||||
private async runStep(
|
||||
stepId: string,
|
||||
stepName: string,
|
||||
executor: () => Promise<StepResult>,
|
||||
): Promise<void> {
|
||||
this.callbacks.onStepStart?.(stepId, stepName);
|
||||
console.log(`\n📍 ${stepName}...`);
|
||||
|
||||
try {
|
||||
const result = await executor();
|
||||
if (result.usage) {
|
||||
this.state.usage.perStep.push(result.usage);
|
||||
this.state.usage.totalPromptTokens += result.usage.promptTokens;
|
||||
this.state.usage.totalCompletionTokens += result.usage.completionTokens;
|
||||
this.state.usage.totalCost += result.usage.cost;
|
||||
}
|
||||
|
||||
if (result.success) {
|
||||
const cost = result.usage?.cost
|
||||
? ` ($${result.usage.cost.toFixed(4)})`
|
||||
: "";
|
||||
const duration = result.usage?.durationMs
|
||||
? ` [${(result.usage.durationMs / 1000).toFixed(1)}s]`
|
||||
: "";
|
||||
console.log(` ✅ ${stepName} complete${cost}${duration}`);
|
||||
this.callbacks.onStepComplete?.(stepId, result);
|
||||
} else {
|
||||
console.error(` ❌ ${stepName} failed: ${result.error}`);
|
||||
this.callbacks.onStepError?.(stepId, result.error || "Unknown error");
|
||||
throw new Error(result.error);
|
||||
}
|
||||
} catch (err) {
|
||||
const errorMsg = (err as Error).message;
|
||||
this.callbacks.onStepError?.(stepId, errorMsg);
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the final Concept object.
|
||||
*/
|
||||
private buildProjectConcept(): ProjectConcept {
|
||||
return {
|
||||
domain: this.state.siteProfile?.domain || "unknown",
|
||||
timestamp: new Date().toISOString(),
|
||||
briefing: this.state.briefing,
|
||||
auditedFacts: this.state.auditedFacts || {},
|
||||
siteProfile: this.state.siteProfile,
|
||||
siteAudit: this.state.siteAudit,
|
||||
researchData: this.state.researchData,
|
||||
strategy: {
|
||||
briefingSummary: this.state.briefingSummary || "",
|
||||
designVision: this.state.designVision || "",
|
||||
},
|
||||
architecture: {
|
||||
websiteTopic: this.state.websiteTopic || "",
|
||||
sitemap: this.state.sitemap || [],
|
||||
},
|
||||
usage: this.state.usage,
|
||||
};
|
||||
/**
|
||||
* Build the final Concept object.
|
||||
*/
|
||||
private buildProjectConcept(): ProjectConcept {
|
||||
return {
|
||||
domain: this.state.siteProfile?.domain || "unknown",
|
||||
timestamp: new Date().toISOString(),
|
||||
briefing: this.state.briefing,
|
||||
auditedFacts: this.state.auditedFacts || {},
|
||||
siteProfile: this.state.siteProfile,
|
||||
siteAudit: this.state.siteAudit,
|
||||
researchData: this.state.researchData,
|
||||
strategy: {
|
||||
briefingSummary: this.state.briefingSummary || "",
|
||||
designVision: this.state.designVision || "",
|
||||
},
|
||||
architecture: {
|
||||
websiteTopic: this.state.websiteTopic || "",
|
||||
sitemap: this.state.sitemap || [],
|
||||
},
|
||||
usage: this.state.usage,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Save the full concept generated state to disk.
|
||||
*/
|
||||
private async saveState(concept: ProjectConcept): Promise<void> {
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||
const companyName = this.state.auditedFacts?.companyName || "unknown";
|
||||
|
||||
const stateDir = path.join(this.config.outputDir, "concepts");
|
||||
await fs.mkdir(stateDir, { recursive: true });
|
||||
|
||||
const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
|
||||
await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
|
||||
console.log(`\n📦 Saved Project Concept to: ${statePath}`);
|
||||
|
||||
// Save debug trace
|
||||
const debugPath = path.join(
|
||||
stateDir,
|
||||
`${companyName}_${timestamp}_debug.json`,
|
||||
);
|
||||
await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));
|
||||
|
||||
// Print usage summary
|
||||
console.log("\n──────────────────────────────────────────────");
|
||||
console.log("📊 PIPELINE USAGE SUMMARY");
|
||||
console.log("──────────────────────────────────────────────");
|
||||
for (const step of this.state.usage.perStep) {
|
||||
if (step.cost > 0) {
|
||||
console.log(
|
||||
` ${step.step}: ${step.model} — $${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`,
|
||||
);
|
||||
}
|
||||
}
|
||||
console.log("──────────────────────────────────────────────");
|
||||
console.log(` TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
|
||||
console.log(
|
||||
` Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`,
|
||||
);
|
||||
console.log("──────────────────────────────────────────────\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Save the full concept generated state to disk.
|
||||
*/
|
||||
private async saveState(concept: ProjectConcept): Promise<void> {
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||
const companyName = this.state.auditedFacts?.companyName || "unknown";
|
||||
|
||||
const stateDir = path.join(this.config.outputDir, "concepts");
|
||||
await fs.mkdir(stateDir, { recursive: true });
|
||||
|
||||
const statePath = path.join(stateDir, `${companyName}_${timestamp}.json`);
|
||||
await fs.writeFile(statePath, JSON.stringify(concept, null, 2));
|
||||
console.log(`\n📦 Saved Project Concept to: ${statePath}`);
|
||||
|
||||
// Save debug trace
|
||||
const debugPath = path.join(stateDir, `${companyName}_${timestamp}_debug.json`);
|
||||
await fs.writeFile(debugPath, JSON.stringify(this.state, null, 2));
|
||||
|
||||
// Print usage summary
|
||||
console.log("\n──────────────────────────────────────────────");
|
||||
console.log("📊 PIPELINE USAGE SUMMARY");
|
||||
console.log("──────────────────────────────────────────────");
|
||||
for (const step of this.state.usage.perStep) {
|
||||
if (step.cost > 0) {
|
||||
console.log(` ${step.step}: ${step.model} — $${step.cost.toFixed(6)} (${(step.durationMs / 1000).toFixed(1)}s)`);
|
||||
}
|
||||
}
|
||||
console.log("──────────────────────────────────────────────");
|
||||
console.log(` TOTAL: $${this.state.usage.totalCost.toFixed(6)}`);
|
||||
console.log(` Tokens: ${(this.state.usage.totalPromptTokens + this.state.usage.totalCompletionTokens).toLocaleString()}`);
|
||||
console.log("──────────────────────────────────────────────\n");
|
||||
}
|
||||
|
||||
/** Get the current internal state (for CLI inspection). */
|
||||
getState(): ConceptState {
|
||||
return this.state;
|
||||
}
|
||||
/** Get the current internal state (for CLI inspection). */
|
||||
getState(): ConceptState {
|
||||
return this.state;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,268 +10,289 @@ import { existsSync } from "node:fs";
|
||||
import type { CrawledPage, PageType } from "./types.js";
|
||||
|
||||
interface ScraperConfig {
|
||||
zyteApiKey?: string;
|
||||
crawlDir: string;
|
||||
maxPages?: number;
|
||||
zyteApiKey?: string;
|
||||
crawlDir: string;
|
||||
maxPages?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Classify a URL pathname into a page type.
|
||||
*/
|
||||
function classifyPage(pathname: string): PageType {
|
||||
const p = pathname.toLowerCase();
|
||||
if (p === "/" || p === "" || p === "/index.html") return "home";
|
||||
if (p.includes("service") || p.includes("leistung") || p.includes("kompetenz"))
|
||||
return "service";
|
||||
if (p.includes("about") || p.includes("ueber") || p.includes("über") || p.includes("unternehmen"))
|
||||
return "about";
|
||||
if (p.includes("contact") || p.includes("kontakt")) return "contact";
|
||||
if (p.includes("job") || p.includes("karriere") || p.includes("career") || p.includes("human-resources"))
|
||||
return "career";
|
||||
if (p.includes("portfolio") || p.includes("referenz") || p.includes("projekt") || p.includes("case-study"))
|
||||
return "portfolio";
|
||||
if (p.includes("blog") || p.includes("news") || p.includes("aktuelles") || p.includes("magazin"))
|
||||
return "blog";
|
||||
if (p.includes("legal") || p.includes("impressum") || p.includes("datenschutz") || p.includes("privacy") || p.includes("agb"))
|
||||
return "legal";
|
||||
return "other";
|
||||
const p = pathname.toLowerCase();
|
||||
if (p === "/" || p === "" || p === "/index.html") return "home";
|
||||
if (
|
||||
p.includes("service") ||
|
||||
p.includes("leistung") ||
|
||||
p.includes("kompetenz")
|
||||
)
|
||||
return "service";
|
||||
if (
|
||||
p.includes("about") ||
|
||||
p.includes("ueber") ||
|
||||
p.includes("über") ||
|
||||
p.includes("unternehmen")
|
||||
)
|
||||
return "about";
|
||||
if (p.includes("contact") || p.includes("kontakt")) return "contact";
|
||||
if (
|
||||
p.includes("job") ||
|
||||
p.includes("karriere") ||
|
||||
p.includes("career") ||
|
||||
p.includes("human-resources")
|
||||
)
|
||||
return "career";
|
||||
if (
|
||||
p.includes("portfolio") ||
|
||||
p.includes("referenz") ||
|
||||
p.includes("projekt") ||
|
||||
p.includes("case-study")
|
||||
)
|
||||
return "portfolio";
|
||||
if (
|
||||
p.includes("blog") ||
|
||||
p.includes("news") ||
|
||||
p.includes("aktuelles") ||
|
||||
p.includes("magazin")
|
||||
)
|
||||
return "blog";
|
||||
if (
|
||||
p.includes("legal") ||
|
||||
p.includes("impressum") ||
|
||||
p.includes("datenschutz") ||
|
||||
p.includes("privacy") ||
|
||||
p.includes("agb")
|
||||
)
|
||||
return "legal";
|
||||
return "other";
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect interactive features present on a page.
|
||||
*/
|
||||
function detectFeatures($: cheerio.CheerioAPI): string[] {
|
||||
const features: string[] = [];
|
||||
const features: string[] = [];
|
||||
|
||||
// Search
|
||||
if (
|
||||
$('input[type="search"]').length > 0 ||
|
||||
$('form[role="search"]').length > 0 ||
|
||||
$(".search-form, .search-box, #search, .searchbar").length > 0 ||
|
||||
$('input[name="q"], input[name="s"], input[name="search"]').length > 0
|
||||
) {
|
||||
features.push("search");
|
||||
}
|
||||
// Search
|
||||
if (
|
||||
$('input[type="search"]').length > 0 ||
|
||||
$('form[role="search"]').length > 0 ||
|
||||
$(".search-form, .search-box, #search, .searchbar").length > 0 ||
|
||||
$('input[name="q"], input[name="s"], input[name="search"]').length > 0
|
||||
) {
|
||||
features.push("search");
|
||||
}
|
||||
|
||||
// Forms (beyond search)
|
||||
const formCount = $("form").length;
|
||||
const searchForms = $('form[role="search"], .search-form').length;
|
||||
if (formCount > searchForms) {
|
||||
features.push("forms");
|
||||
}
|
||||
// Forms (beyond search)
|
||||
const formCount = $("form").length;
|
||||
const searchForms = $('form[role="search"], .search-form').length;
|
||||
if (formCount > searchForms) {
|
||||
features.push("forms");
|
||||
}
|
||||
|
||||
// Maps
|
||||
if (
|
||||
$('iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]').length > 0
|
||||
) {
|
||||
features.push("maps");
|
||||
}
|
||||
// Maps
|
||||
if (
|
||||
$(
|
||||
'iframe[src*="google.com/maps"], iframe[src*="openstreetmap"], .map-container, #map, [data-map]',
|
||||
).length > 0
|
||||
) {
|
||||
features.push("maps");
|
||||
}
|
||||
|
||||
// Video
|
||||
if (
|
||||
$("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container").length > 0
|
||||
) {
|
||||
features.push("video");
|
||||
}
|
||||
// Video
|
||||
if (
|
||||
$("video, iframe[src*='youtube'], iframe[src*='vimeo'], .video-container")
|
||||
.length > 0
|
||||
) {
|
||||
features.push("video");
|
||||
}
|
||||
|
||||
// Calendar / Events
|
||||
if ($(".calendar, .event, [data-calendar]").length > 0) {
|
||||
features.push("calendar");
|
||||
}
|
||||
// Calendar / Events
|
||||
if ($(".calendar, .event, [data-calendar]").length > 0) {
|
||||
features.push("calendar");
|
||||
}
|
||||
|
||||
// Cookie consent
|
||||
if ($(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]").length > 0) {
|
||||
features.push("cookie-consent");
|
||||
}
|
||||
// Cookie consent
|
||||
if (
|
||||
$(".cookie-banner, .cookie-consent, #cookie-notice, [data-cookie]").length >
|
||||
0
|
||||
) {
|
||||
features.push("cookie-consent");
|
||||
}
|
||||
|
||||
return features;
|
||||
return features;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all internal links from a page.
|
||||
*/
|
||||
function extractInternalLinks($: cheerio.CheerioAPI, origin: string): string[] {
|
||||
const links: string[] = [];
|
||||
$("a[href]").each((_, el) => {
|
||||
const href = $(el).attr("href");
|
||||
if (!href) return;
|
||||
try {
|
||||
const url = new URL(href, origin);
|
||||
if (url.origin === origin) {
|
||||
// Skip assets
|
||||
if (/\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i.test(url.pathname)) return;
|
||||
// Skip anchors-only
|
||||
if (url.pathname === "/" && url.hash) return;
|
||||
links.push(url.pathname);
|
||||
}
|
||||
} catch {
|
||||
// Invalid URL, skip
|
||||
}
|
||||
});
|
||||
return [...new Set(links)];
|
||||
const links: string[] = [];
|
||||
$("a[href]").each((_, el) => {
|
||||
const href = $(el).attr("href");
|
||||
if (!href) return;
|
||||
try {
|
||||
const url = new URL(href, origin);
|
||||
if (url.origin === origin) {
|
||||
// Skip assets
|
||||
if (
|
||||
/\.(pdf|zip|jpg|jpeg|png|svg|webp|gif|css|js|ico|woff|woff2|ttf|eot)$/i.test(
|
||||
url.pathname,
|
||||
)
|
||||
)
|
||||
return;
|
||||
// Skip anchors-only
|
||||
if (url.pathname === "/" && url.hash) return;
|
||||
links.push(url.pathname);
|
||||
}
|
||||
} catch {
|
||||
// Invalid URL, skip
|
||||
}
|
||||
});
|
||||
return [...new Set(links)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all images from a page.
|
||||
*/
|
||||
function extractImages($: cheerio.CheerioAPI, origin: string): string[] {
|
||||
const images: string[] = [];
|
||||
const images: string[] = [];
|
||||
|
||||
// Regular img tags
|
||||
$("img[src]").each((_, el) => {
|
||||
const src = $(el).attr("src");
|
||||
if (src) images.push(src);
|
||||
});
|
||||
// Regular img tags
|
||||
$("img[src]").each((_, el) => {
|
||||
const src = $(el).attr("src");
|
||||
if (src) images.push(src);
|
||||
});
|
||||
|
||||
// CSS background images (inline styles)
|
||||
$("[style*='background-image']").each((_, el) => {
|
||||
const style = $(el).attr("style");
|
||||
const match = style?.match(/url\(['"]?(.*?)['"]?\)/);
|
||||
if (match && match[1]) {
|
||||
images.push(match[1]);
|
||||
}
|
||||
});
|
||||
|
||||
// Resolve URLs to absolute
|
||||
const absoluteImages: string[] = [];
|
||||
for (const img of images) {
|
||||
if (img.startsWith("data:image")) continue; // Skip inline base64
|
||||
try {
|
||||
const url = new URL(img, origin);
|
||||
// Ignore small tracking pixels or generic vectors
|
||||
if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo")) continue;
|
||||
absoluteImages.push(url.href);
|
||||
} catch {
|
||||
// Invalid URL
|
||||
}
|
||||
// CSS background images (inline styles)
|
||||
$("[style*='background-image']").each((_, el) => {
|
||||
const style = $(el).attr("style");
|
||||
const match = style?.match(/url\(['"]?(.*?)['"]?\)/);
|
||||
if (match && match[1]) {
|
||||
images.push(match[1]);
|
||||
}
|
||||
});
|
||||
|
||||
return [...new Set(absoluteImages)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract services/competencies from text content.
|
||||
*/
|
||||
function extractServices(text: string): string[] {
|
||||
const services: string[] = [];
|
||||
// Common pattern: bulleted or newline-separated service lists
|
||||
const lines = text.split(/\n/).map((l) => l.trim()).filter((l) => l.length > 3 && l.length < 100);
|
||||
for (const line of lines) {
|
||||
// Skip generic boilerplate
|
||||
if (/cookie|datenschutz|impressum|copyright|©/i.test(line)) continue;
|
||||
if (/^(tel|fax|e-mail|mobil|web|http)/i.test(line)) continue;
|
||||
services.push(line);
|
||||
// Resolve URLs to absolute
|
||||
const absoluteImages: string[] = [];
|
||||
for (const img of images) {
|
||||
if (img.startsWith("data:image")) continue; // Skip inline base64
|
||||
try {
|
||||
const url = new URL(img, origin);
|
||||
// Ignore small tracking pixels or generic vectors
|
||||
if (url.pathname.endsWith(".svg") && !url.pathname.includes("logo"))
|
||||
continue;
|
||||
absoluteImages.push(url.href);
|
||||
} catch {
|
||||
// Invalid URL
|
||||
}
|
||||
return services;
|
||||
}
|
||||
|
||||
return [...new Set(absoluteImages)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a page via Zyte API with browser rendering.
|
||||
*/
|
||||
async function fetchWithZyte(url: string, apiKey: string): Promise<string> {
|
||||
try {
|
||||
const auth = Buffer.from(`${apiKey}:`).toString("base64");
|
||||
const resp = await fetch("https://api.zyte.com/v1/extract", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Authorization": `Basic ${auth}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
url,
|
||||
browserHtml: true,
|
||||
}),
|
||||
signal: AbortSignal.timeout(60000),
|
||||
});
|
||||
const auth = Buffer.from(`${apiKey}:`).toString("base64");
|
||||
const resp = await fetch("https://api.zyte.com/v1/extract", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Basic ${auth}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
url,
|
||||
browserHtml: true,
|
||||
}),
|
||||
signal: AbortSignal.timeout(60000),
|
||||
});
|
||||
|
||||
if (!resp.ok) {
|
||||
const errorText = await resp.text();
|
||||
console.error(` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`);
|
||||
// Rate limited — wait and retry once
|
||||
if (resp.status === 429) {
|
||||
console.log(" ⏳ Rate limited, waiting 5s and retrying...");
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
return fetchWithZyte(url, apiKey);
|
||||
}
|
||||
throw new Error(`HTTP ${resp.status}: ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await resp.json();
|
||||
const html = data.browserHtml || "";
|
||||
if (!html) {
|
||||
console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
|
||||
}
|
||||
return html;
|
||||
} catch (err: any) {
|
||||
throw err;
|
||||
if (!resp.ok) {
|
||||
const errorText = await resp.text();
|
||||
console.error(
|
||||
` ❌ Zyte API error ${resp.status} for ${url}: ${errorText}`,
|
||||
);
|
||||
// Rate limited — wait and retry once
|
||||
if (resp.status === 429) {
|
||||
console.log(" ⏳ Rate limited, waiting 5s and retrying...");
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
return fetchWithZyte(url, apiKey);
|
||||
}
|
||||
}
|
||||
throw new Error(`HTTP ${resp.status}: ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await resp.json();
|
||||
const html = data.browserHtml || "";
|
||||
if (!html) {
|
||||
console.warn(` ⚠️ Zyte returned empty browserHtml for ${url}`);
|
||||
}
|
||||
return html;
|
||||
}
|
||||
/**
|
||||
* Fetch a page via simple HTTP GET (fallback).
|
||||
*/
|
||||
async function fetchDirect(url: string): Promise<string> {
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
||||
},
|
||||
signal: AbortSignal.timeout(30000),
|
||||
});
|
||||
if (!resp.ok) return "";
|
||||
return await resp.text();
|
||||
} catch {
|
||||
return "";
|
||||
}
|
||||
const resp = await fetch(url, {
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
||||
},
|
||||
signal: AbortSignal.timeout(30000),
|
||||
}).catch(() => null);
|
||||
|
||||
if (!resp || !resp.ok) return "";
|
||||
return await resp.text();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse an HTML string into a CrawledPage.
|
||||
*/
|
||||
function parsePage(html: string, url: string): CrawledPage {
|
||||
const $ = cheerio.load(html);
|
||||
const urlObj = new URL(url);
|
||||
const $ = cheerio.load(html);
|
||||
const urlObj = new URL(url);
|
||||
|
||||
const title = $("title").text().trim();
|
||||
const headings = $("h1, h2, h3")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((h) => h.length > 0);
|
||||
const title = $("title").text().trim();
|
||||
const headings = $("h1, h2, h3")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((h) => h.length > 0);
|
||||
|
||||
const navItems = $("nav a")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((t) => t.length > 0 && t.length < 100);
|
||||
const navItems = $("nav a")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get()
|
||||
.filter((t) => t.length > 0 && t.length < 100);
|
||||
|
||||
const bodyText = $("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim();
|
||||
const bodyText = $("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim();
|
||||
|
||||
const features = detectFeatures($);
|
||||
const links = extractInternalLinks($, urlObj.origin);
|
||||
const images = extractImages($, urlObj.origin);
|
||||
const features = detectFeatures($);
|
||||
const links = extractInternalLinks($, urlObj.origin);
|
||||
const images = extractImages($, urlObj.origin);
|
||||
|
||||
const description = $('meta[name="description"]').attr("content") || undefined;
|
||||
const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
|
||||
const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
|
||||
const description =
|
||||
$('meta[name="description"]').attr("content") || undefined;
|
||||
const ogTitle = $('meta[property="og:title"]').attr("content") || undefined;
|
||||
const ogImage = $('meta[property="og:image"]').attr("content") || undefined;
|
||||
|
||||
return {
|
||||
url,
|
||||
pathname: urlObj.pathname,
|
||||
title,
|
||||
html,
|
||||
text: bodyText,
|
||||
headings,
|
||||
navItems,
|
||||
features,
|
||||
type: classifyPage(urlObj.pathname),
|
||||
links,
|
||||
images,
|
||||
meta: { description, ogTitle, ogImage },
|
||||
};
|
||||
return {
|
||||
url,
|
||||
pathname: urlObj.pathname,
|
||||
title,
|
||||
html,
|
||||
text: bodyText,
|
||||
headings,
|
||||
navItems,
|
||||
features,
|
||||
type: classifyPage(urlObj.pathname),
|
||||
links,
|
||||
images,
|
||||
meta: { description, ogTitle, ogImage },
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -280,164 +301,178 @@ function parsePage(html: string, url: string): CrawledPage {
|
||||
* Returns an array of CrawledPage objects.
|
||||
*/
|
||||
export async function crawlSite(
|
||||
targetUrl: string,
|
||||
config: ScraperConfig,
|
||||
targetUrl: string,
|
||||
config: ScraperConfig,
|
||||
): Promise<CrawledPage[]> {
|
||||
const urlObj = new URL(targetUrl);
|
||||
const origin = urlObj.origin;
|
||||
const domain = urlObj.hostname;
|
||||
const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
|
||||
const urlObj = new URL(targetUrl);
|
||||
const origin = urlObj.origin;
|
||||
const domain = urlObj.hostname;
|
||||
const domainDir = path.join(config.crawlDir, domain.replace(/\./g, "-"));
|
||||
|
||||
// Check for existing crawl
|
||||
const metaFile = path.join(domainDir, "_crawl_meta.json");
|
||||
if (existsSync(metaFile)) {
|
||||
console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
|
||||
return loadCrawlFromDisk(domainDir);
|
||||
}
|
||||
// Check for existing crawl
|
||||
const metaFile = path.join(domainDir, "_crawl_meta.json");
|
||||
if (existsSync(metaFile)) {
|
||||
console.log(`📦 Found existing crawl for ${domain}. Loading from disk...`);
|
||||
return loadCrawlFromDisk(domainDir);
|
||||
}
|
||||
|
||||
console.log(`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`);
|
||||
console.log(
|
||||
`🔍 Crawling ${targetUrl} via ${config.zyteApiKey ? "Zyte API" : "direct HTTP"}...`,
|
||||
);
|
||||
|
||||
// Ensure output dir
|
||||
await fs.mkdir(domainDir, { recursive: true });
|
||||
// Ensure output dir
|
||||
await fs.mkdir(domainDir, { recursive: true });
|
||||
|
||||
const maxPages = config.maxPages || 30;
|
||||
const visited = new Set<string>();
|
||||
const queue: string[] = [targetUrl];
|
||||
const pages: CrawledPage[] = [];
|
||||
const maxPages = config.maxPages || 30;
|
||||
const visited = new Set<string>();
|
||||
const queue: string[] = [targetUrl];
|
||||
const pages: CrawledPage[] = [];
|
||||
|
||||
while (queue.length > 0 && visited.size < maxPages) {
|
||||
const url = queue.shift()!;
|
||||
const urlPath = new URL(url).pathname;
|
||||
while (queue.length > 0 && visited.size < maxPages) {
|
||||
const url = queue.shift()!;
|
||||
const urlPath = new URL(url).pathname;
|
||||
|
||||
if (visited.has(urlPath)) continue;
|
||||
visited.add(urlPath);
|
||||
if (visited.has(urlPath)) continue;
|
||||
visited.add(urlPath);
|
||||
|
||||
try {
|
||||
console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
|
||||
try {
|
||||
console.log(` ↳ Fetching ${url} (${visited.size}/${maxPages})...`);
|
||||
|
||||
let html: string;
|
||||
if (config.zyteApiKey) {
|
||||
html = await fetchWithZyte(url, config.zyteApiKey);
|
||||
} else {
|
||||
html = await fetchDirect(url);
|
||||
}
|
||||
let html: string;
|
||||
if (config.zyteApiKey) {
|
||||
html = await fetchWithZyte(url, config.zyteApiKey);
|
||||
} else {
|
||||
html = await fetchDirect(url);
|
||||
}
|
||||
|
||||
if (!html || html.length < 100) {
|
||||
console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
|
||||
continue;
|
||||
}
|
||||
if (!html || html.length < 100) {
|
||||
console.warn(` ⚠️ Empty/tiny response for ${url}, skipping.`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const page = parsePage(html, url);
|
||||
pages.push(page);
|
||||
const page = parsePage(html, url);
|
||||
pages.push(page);
|
||||
|
||||
// Save HTML + metadata to disk
|
||||
const safeName = urlPath === "/" ? "index" : urlPath.replace(/\//g, "_").replace(/^_/, "");
|
||||
await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
|
||||
await fs.writeFile(
|
||||
path.join(domainDir, `${safeName}.meta.json`),
|
||||
JSON.stringify(
|
||||
{
|
||||
url: page.url,
|
||||
pathname: page.pathname,
|
||||
title: page.title,
|
||||
type: page.type,
|
||||
headings: page.headings,
|
||||
navItems: page.navItems,
|
||||
features: page.features,
|
||||
links: page.links,
|
||||
images: page.images,
|
||||
meta: page.meta,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
|
||||
// Discover new links
|
||||
for (const link of page.links) {
|
||||
if (!visited.has(link)) {
|
||||
const fullUrl = `${origin}${link}`;
|
||||
queue.push(fullUrl);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Save crawl metadata
|
||||
await fs.writeFile(
|
||||
metaFile,
|
||||
// Save HTML + metadata to disk
|
||||
const safeName =
|
||||
urlPath === "/"
|
||||
? "index"
|
||||
: urlPath.replace(/\//g, "_").replace(/^_/, "");
|
||||
await fs.writeFile(path.join(domainDir, `${safeName}.html`), html);
|
||||
await fs.writeFile(
|
||||
path.join(domainDir, `${safeName}.meta.json`),
|
||||
JSON.stringify(
|
||||
{
|
||||
domain,
|
||||
crawledAt: new Date().toISOString(),
|
||||
totalPages: pages.length,
|
||||
urls: pages.map((p) => p.url),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
{
|
||||
url: page.url,
|
||||
pathname: page.pathname,
|
||||
title: page.title,
|
||||
type: page.type,
|
||||
headings: page.headings,
|
||||
navItems: page.navItems,
|
||||
features: page.features,
|
||||
links: page.links,
|
||||
images: page.images,
|
||||
meta: page.meta,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
);
|
||||
|
||||
console.log(`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`);
|
||||
return pages;
|
||||
// Discover new links
|
||||
for (const link of page.links) {
|
||||
if (!visited.has(link)) {
|
||||
const fullUrl = `${origin}${link}`;
|
||||
queue.push(fullUrl);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(` ⚠️ Failed to fetch ${url}: ${(err as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Save crawl metadata
|
||||
await fs.writeFile(
|
||||
metaFile,
|
||||
JSON.stringify(
|
||||
{
|
||||
domain,
|
||||
crawledAt: new Date().toISOString(),
|
||||
totalPages: pages.length,
|
||||
urls: pages.map((p) => p.url),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
|
||||
console.log(
|
||||
`✅ Crawled ${pages.length} pages for ${domain}. Saved to ${domainDir}`,
|
||||
);
|
||||
return pages;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a previously crawled site from disk.
|
||||
*/
|
||||
async function loadCrawlFromDisk(domainDir: string): Promise<CrawledPage[]> {
|
||||
const files = await fs.readdir(domainDir);
|
||||
const metaFiles = files.filter((f) => f.endsWith(".meta.json") && f !== "_crawl_meta.json");
|
||||
const files = await fs.readdir(domainDir);
|
||||
const metaFiles = files.filter(
|
||||
(f) => f.endsWith(".meta.json") && f !== "_crawl_meta.json",
|
||||
);
|
||||
|
||||
const pages: CrawledPage[] = [];
|
||||
for (const metaFile of metaFiles) {
|
||||
const baseName = metaFile.replace(".meta.json", "");
|
||||
const htmlFile = `${baseName}.html`;
|
||||
const pages: CrawledPage[] = [];
|
||||
for (const metaFile of metaFiles) {
|
||||
const baseName = metaFile.replace(".meta.json", "");
|
||||
const htmlFile = `${baseName}.html`;
|
||||
|
||||
const meta = JSON.parse(await fs.readFile(path.join(domainDir, metaFile), "utf8"));
|
||||
let html = "";
|
||||
if (files.includes(htmlFile)) {
|
||||
html = await fs.readFile(path.join(domainDir, htmlFile), "utf8");
|
||||
}
|
||||
|
||||
const text = html
|
||||
? cheerio
|
||||
.load(html)("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim()
|
||||
: "";
|
||||
|
||||
pages.push({
|
||||
url: meta.url,
|
||||
pathname: meta.pathname,
|
||||
title: meta.title,
|
||||
html,
|
||||
text,
|
||||
headings: meta.headings || [],
|
||||
navItems: meta.navItems || [],
|
||||
features: meta.features || [],
|
||||
type: meta.type || "other",
|
||||
links: meta.links || [],
|
||||
images: meta.images || [],
|
||||
meta: meta.meta || {},
|
||||
});
|
||||
const meta = JSON.parse(
|
||||
await fs.readFile(path.join(domainDir, metaFile), "utf8"),
|
||||
);
|
||||
let html = "";
|
||||
if (files.includes(htmlFile)) {
|
||||
html = await fs.readFile(path.join(domainDir, htmlFile), "utf8");
|
||||
}
|
||||
|
||||
console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
|
||||
return pages;
|
||||
const text = html
|
||||
? cheerio
|
||||
.load(html)("body")
|
||||
.text()
|
||||
.replace(/\s+/g, " ")
|
||||
.substring(0, 50000)
|
||||
.trim()
|
||||
: "";
|
||||
|
||||
pages.push({
|
||||
url: meta.url,
|
||||
pathname: meta.pathname,
|
||||
title: meta.title,
|
||||
html,
|
||||
text,
|
||||
headings: meta.headings || [],
|
||||
navItems: meta.navItems || [],
|
||||
features: meta.features || [],
|
||||
type: meta.type || "other",
|
||||
links: meta.links || [],
|
||||
images: meta.images || [],
|
||||
meta: meta.meta || {},
|
||||
});
|
||||
}
|
||||
|
||||
console.log(` 📂 Loaded ${pages.length} cached pages from disk.`);
|
||||
return pages;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a cached crawl to force re-crawl.
|
||||
*/
|
||||
export async function clearCrawlCache(crawlDir: string, domain: string): Promise<void> {
|
||||
const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
|
||||
if (existsSync(domainDir)) {
|
||||
await fs.rm(domainDir, { recursive: true, force: true });
|
||||
console.log(`🧹 Cleared crawl cache for ${domain}`);
|
||||
}
|
||||
export async function clearCrawlCache(
|
||||
crawlDir: string,
|
||||
domain: string,
|
||||
): Promise<void> {
|
||||
const domainDir = path.join(crawlDir, domain.replace(/\./g, "-"));
|
||||
if (existsSync(domainDir)) {
|
||||
await fs.rm(domainDir, { recursive: true, force: true });
|
||||
console.log(`🧹 Cleared crawl cache for ${domain}`);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user