pdf sheets from new excel
This commit is contained in:
@@ -2,48 +2,212 @@
|
||||
/**
 * PDF Datasheet Generator (React-PDF)
 *
 * Renders PDFs via `@react-pdf/renderer`.
 *
 * Source of truth:
 * - All technical data + cross-section tables: Excel files in `data/excel/`
 * - Product description text: MDX files in `data/products/{en,de}/*.mdx`
 */

import * as fs from 'fs';
import * as path from 'path';

import * as XLSX from 'xlsx';
const matter = require('gray-matter') as (src: string) => { data: unknown; content: string };

import type { ProductData } from './pdf/model/types';
import { generateDatasheetPdfBuffer } from './pdf/react-pdf/generate-datasheet-pdf';
import { generateFileName, normalizeValue, stripHtml } from './pdf/model/utils';

const CONFIG = {
|
||||
productsFile: path.join(process.cwd(), 'data/processed/products.json'),
|
||||
outputDir: path.join(process.cwd(), 'public/datasheets'),
|
||||
chunkSize: 10,
|
||||
} as const;
|
||||
|
||||
const EXCEL_FILES = [
|
||||
path.join(process.cwd(), 'data/excel/high-voltage.xlsx'),
|
||||
path.join(process.cwd(), 'data/excel/medium-voltage-KM.xlsx'),
|
||||
path.join(process.cwd(), 'data/excel/medium-voltage-KM 170126.xlsx'),
|
||||
path.join(process.cwd(), 'data/excel/low-voltage-KM.xlsx'),
|
||||
path.join(process.cwd(), 'data/excel/solar-cables.xlsx'),
|
||||
] as const;
|
||||
|
||||
type MdxProduct = {
|
||||
slug: string;
|
||||
title: string;
|
||||
sku: string;
|
||||
categories: string[];
|
||||
images: string[];
|
||||
descriptionHtml: string;
|
||||
};
|
||||
|
||||
type MdxIndex = Map<string, MdxProduct>; // key: normalized designation/title
|
||||
|
||||
function ensureOutputDir(): void {
|
||||
if (!fs.existsSync(CONFIG.outputDir)) {
|
||||
fs.mkdirSync(CONFIG.outputDir, { recursive: true });
|
||||
}
|
||||
}
|
||||
|
||||
async function readProductsStream(): Promise<ProductData[]> {
|
||||
console.log('Reading products.json...');
|
||||
return new Promise((resolve, reject) => {
|
||||
const stream = fs.createReadStream(CONFIG.productsFile, { encoding: 'utf8' });
|
||||
let data = '';
|
||||
stream.on('data', chunk => {
|
||||
data += chunk;
|
||||
function normalizeExcelKey(value: string): string {
|
||||
return String(value || '')
|
||||
.toUpperCase()
|
||||
.replace(/-\d+$/g, '')
|
||||
.replace(/[^A-Z0-9]+/g, '');
|
||||
}
|
||||
|
||||
function extractDescriptionHtmlFromMdxBody(body: string): string {
|
||||
const content = String(body || '').trim();
|
||||
if (!content) return '';
|
||||
|
||||
// MDX product files are wrapped like:
|
||||
// <ProductTabs technicalData={...}>
|
||||
// <section>...</section>
|
||||
// </ProductTabs>
|
||||
// For PDF, we only want the inner description content.
|
||||
const withoutOpen = content.replace(/^\s*<ProductTabs[\s\S]*?>\s*/i, '');
|
||||
const withoutClose = withoutOpen.replace(/\s*<\/ProductTabs>\s*$/i, '');
|
||||
return withoutClose.trim();
|
||||
}
|
||||
|
||||
function buildMdxIndex(locale: 'en' | 'de'): MdxIndex {
|
||||
const dir = path.join(process.cwd(), 'data/products', locale);
|
||||
const idx: MdxIndex = new Map();
|
||||
if (!fs.existsSync(dir)) return idx;
|
||||
|
||||
const files = fs
|
||||
.readdirSync(dir)
|
||||
.filter(f => f.endsWith('.mdx'))
|
||||
.sort();
|
||||
|
||||
for (const file of files) {
|
||||
const filePath = path.join(dir, file);
|
||||
const raw = fs.readFileSync(filePath, 'utf8');
|
||||
const parsed = matter(raw);
|
||||
const data = (parsed.data || {}) as any;
|
||||
|
||||
const title = normalizeValue(String(data.title || ''));
|
||||
if (!title) continue;
|
||||
|
||||
const sku = normalizeValue(String(data.sku || ''));
|
||||
const categories = Array.isArray(data.categories) ? data.categories.map((c: any) => normalizeValue(String(c))).filter(Boolean) : [];
|
||||
const images = Array.isArray(data.images) ? data.images.map((i: any) => normalizeValue(String(i))).filter(Boolean) : [];
|
||||
|
||||
const descriptionHtml = extractDescriptionHtmlFromMdxBody(parsed.content);
|
||||
|
||||
const slug = path.basename(file, '.mdx');
|
||||
idx.set(normalizeExcelKey(title), { slug, title, sku, categories, images, descriptionHtml });
|
||||
}
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
function findKeyByHeaderValue(headerRow: Record<string, unknown>, pattern: RegExp): string | null {
|
||||
for (const [k, v] of Object.entries(headerRow || {})) {
|
||||
const text = normalizeValue(String(v ?? ''));
|
||||
if (!text) continue;
|
||||
if (pattern.test(text)) return k;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function readExcelRows(filePath: string): Array<Record<string, unknown>> {
|
||||
if (!fs.existsSync(filePath)) return [];
|
||||
const workbook = XLSX.readFile(filePath, { cellDates: false, cellNF: false, cellText: false });
|
||||
const sheetName = workbook.SheetNames[0];
|
||||
if (!sheetName) return [];
|
||||
const sheet = workbook.Sheets[sheetName];
|
||||
if (!sheet) return [];
|
||||
|
||||
return XLSX.utils.sheet_to_json(sheet, {
|
||||
defval: '',
|
||||
raw: false,
|
||||
blankrows: false,
|
||||
}) as Array<Record<string, unknown>>;
|
||||
}
|
||||
|
||||
function readDesignationsFromExcelFile(filePath: string): Map<string, string> {
|
||||
const rows = readExcelRows(filePath);
|
||||
if (!rows.length) return new Map();
|
||||
|
||||
// Legacy sheets use "Part Number" as a column key.
|
||||
// The new MV sheet uses __EMPTY* keys and stores the human headers in row 0 values.
|
||||
const headerRow = rows[0] || {};
|
||||
const partNumberKey =
|
||||
(Object.prototype.hasOwnProperty.call(headerRow, 'Part Number') ? 'Part Number' : null) ||
|
||||
findKeyByHeaderValue(headerRow, /^part\s*number$/i) ||
|
||||
'__EMPTY';
|
||||
|
||||
const out = new Map<string, string>();
|
||||
for (const r of rows) {
|
||||
const pn = normalizeValue(String(r?.[partNumberKey] ?? ''));
|
||||
if (!pn || pn === 'Units' || pn === 'Part Number') continue;
|
||||
|
||||
const key = normalizeExcelKey(pn);
|
||||
if (!key) continue;
|
||||
|
||||
// Keep first-seen designation string (stable filenames from MDX slug).
|
||||
if (!out.has(key)) out.set(key, pn);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
function loadAllExcelDesignations(): Map<string, string> {
|
||||
const out = new Map<string, string>();
|
||||
for (const filePath of EXCEL_FILES) {
|
||||
const m = readDesignationsFromExcelFile(filePath);
|
||||
Array.from(m.entries()).forEach(([k, v]) => {
|
||||
if (!out.has(k)) out.set(k, v);
|
||||
});
|
||||
stream.on('end', () => {
|
||||
try {
|
||||
const products = JSON.parse(data) as ProductData[];
|
||||
console.log(`Loaded ${products.length} products`);
|
||||
resolve(products);
|
||||
} catch (error) {
|
||||
reject(new Error(`Failed to parse JSON: ${error}`));
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
async function loadProductsFromExcelAndMdx(locale: 'en' | 'de'): Promise<ProductData[]> {
|
||||
const mdxIndex = buildMdxIndex(locale);
|
||||
const excelDesignations = loadAllExcelDesignations();
|
||||
|
||||
const products: ProductData[] = [];
|
||||
let id = 1;
|
||||
|
||||
Array.from(excelDesignations.entries()).forEach(([key, designation]) => {
|
||||
const mdx = mdxIndex.get(key) || null;
|
||||
|
||||
const title = mdx?.title || designation;
|
||||
const slug =
|
||||
mdx?.slug ||
|
||||
title
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/-+/g, '-')
|
||||
.replace(/^-|-$/g, '');
|
||||
|
||||
// Only the product description comes from MDX. Everything else is Excel-driven
|
||||
// during model building (technicalItems + voltage tables).
|
||||
const descriptionHtml = mdx?.descriptionHtml || '';
|
||||
|
||||
products.push({
|
||||
id: id++,
|
||||
name: title,
|
||||
shortDescriptionHtml: '',
|
||||
descriptionHtml,
|
||||
images: mdx?.images || [],
|
||||
featuredImage: (mdx?.images && mdx.images[0]) || null,
|
||||
sku: mdx?.sku || title,
|
||||
slug,
|
||||
translationKey: slug,
|
||||
locale,
|
||||
categories: (mdx?.categories || []).map(name => ({ name })),
|
||||
attributes: [],
|
||||
});
|
||||
stream.on('error', error => reject(new Error(`Failed to read file: ${error}`)));
|
||||
});
|
||||
|
||||
// Deterministic order: by slug, then name.
|
||||
products.sort((a, b) => (a.slug || '').localeCompare(b.slug || '') || a.name.localeCompare(b.name));
|
||||
|
||||
// Drop products that have no readable name.
|
||||
return products.filter(p => stripHtml(p.name));
|
||||
}
|
||||
|
||||
async function processChunk(products: ProductData[], chunkIndex: number, totalChunks: number): Promise<void> {
|
||||
@@ -67,21 +231,24 @@ async function processProductsInChunks(): Promise<void> {
|
||||
console.log('Starting PDF generation (React-PDF)');
|
||||
ensureOutputDir();
|
||||
|
||||
const allProducts = await readProductsStream();
|
||||
const onlyLocale = normalizeValue(String(process.env.PDF_LOCALE || '')).toLowerCase();
|
||||
const locales: Array<'en' | 'de'> = onlyLocale === 'de' || onlyLocale === 'en' ? [onlyLocale] : ['en', 'de'];
|
||||
|
||||
const allProducts: ProductData[] = [];
|
||||
for (const locale of locales) {
|
||||
const products = await loadProductsFromExcelAndMdx(locale);
|
||||
allProducts.push(...products);
|
||||
}
|
||||
|
||||
if (allProducts.length === 0) {
|
||||
console.log('No products found');
|
||||
return;
|
||||
}
|
||||
|
||||
// Dev convenience: generate only one locale / one product subset.
|
||||
// Dev convenience: generate only one product subset.
|
||||
// IMPORTANT: apply filters BEFORE PDF_LIMIT so the limit works within the filtered set.
|
||||
let products = allProducts;
|
||||
|
||||
const onlyLocale = normalizeValue(String(process.env.PDF_LOCALE || '')).toLowerCase();
|
||||
if (onlyLocale === 'de' || onlyLocale === 'en') {
|
||||
products = products.filter(p => (p.locale || 'en') === onlyLocale);
|
||||
}
|
||||
|
||||
const match = normalizeValue(String(process.env.PDF_MATCH || '')).toLowerCase();
|
||||
if (match) {
|
||||
products = products.filter(p => {
|
||||
|
||||
Reference in New Issue
Block a user