migration wip

2025-12-30 16:19:42 +01:00
parent 65a7e9f24a
commit 4ae6b36da9
149 changed files with 32034 additions and 34406 deletions
--- a/scripts/fix-images.js
+++ b/scripts/fix-images.js
@@ -0,0 +1,230 @@
+#!/usr/bin/env node
+
+const fs = require('fs');
+const path = require('path');
+
+const PROCESSED_DIR = path.join(__dirname, '..', 'data', 'processed');
+const ASSET_MAP_PATH = path.join(PROCESSED_DIR, 'asset-map.json');
+
+// asset-map.json is iterated below as { wpUrl: localPath } pairs.
+const assetMap = JSON.parse(fs.readFileSync(ASSET_MAP_PATH, 'utf8'));
+
+// Places where a numeric media ID can sit inside a URL, in priority order:
+// "/<id>-name", "/<id>.ext" and "id=<id>".
+const ID_PATTERNS = [/\/(\d+)-/, /\/(\d+)\./, /id=(\d+)/];
+
+// Media ID -> local path, keyed by the first pattern that matches each URL.
+const idToPath = {};
+Object.entries(assetMap).forEach(([wpUrl, localPath]) => {
+  const hit = ID_PATTERNS.map(pattern => wpUrl.match(pattern)).find(Boolean);
+  if (hit) {
+    idToPath[hit[1]] = localPath;
+  }
+});
+
+// Hand-maintained entries; assigned last so they override derived ones.
+Object.assign(idToPath, {
+  '45569': '/media/45569-Still-2025-02-10-104337_1.1.1.webp',
+  '10648': '/media/10648-low-voltage-scaled.webp',
+  '6486': '/media/6486-Low-Voltage.svg',
+  '10649': '/media/10649-medium-voltage-scaled.webp',
+  '6487': '/media/6487-Medium-Voltage.svg',
+  '46786': '/media/46786-na2xsfl2y-rendered.webp',
+  '6485': '/media/6485-High-Voltage.svg',
+  '46359': '/media/46359-3.webp',
+  '6484': '/media/6484-Solar.svg',
+  '6527': '/media/6527-high-voltage-category.webp',
+  '6519': '/media/6519-solar-category.webp',
+  '6521': '/media/6521-low-voltage-category.webp',
+  '6517': '/media/6517-medium-voltage-category.webp',
+});
+
+console.log('Found', Object.keys(idToPath).length, 'media ID mappings');
+
+// HTML entity decoding - handles decimal, hex, and named entities.
+//
+// Named references are resolved through ENTITY_CHARS, keyed by the bare entity
+// name (no leading "&" or trailing ";"). Names that are not in the table, which
+// includes amp, lt, gt and quot, are left untouched, so escaped markup
+// characters in the surrounding HTML are never un-escaped here.
+const ENTITY_CHARS = {
+  nbsp: '\u00a0', lsquo: '\u2018',
+  rsquo: '\u2019', ldquo: '\u201c',
+  rdquo: '\u201d', Prime: '\u2033',
+  ndash: '\u2013', mdash: '\u2014',
+  hellip: '\u2026', bull: '\u2022',
+  euro: '\u20ac', copy: '\u00a9',
+  reg: '\u00ae', trade: '\u2122',
+  deg: '\u00b0', plusmn: '\u00b1',
+  times: '\u00d7', divide: '\u00f7',
+  minus: '\u2212', cent: '\u00a2',
+  pound: '\u00a3', yen: '\u00a5',
+  sect: '\u00a7', para: '\u00b6',
+  micro: '\u00b5', laquo: '\u00ab',
+  raquo: '\u00bb', middot: '\u00b7',
+};
+
+// Typographic characters folded to ASCII after decoding, so that attribute
+// values wrapped in curly quotes or a double prime end up in plain double quotes.
+const ASCII_EQUIVALENTS = {
+  '\u00a0': ' ',  // Non-breaking space
+  '\u2018': "'",  // Left single quote
+  '\u2019': "'",  // Right single quote
+  '\u201c': '"',  // Left double quote
+  '\u201d': '"',  // Right double quote
+  '\u2033': '"',  // Double prime (8243)
+  '\u2013': '-',  // En dash
+};
+
+/**
+ * Decode HTML character references in `text`.
+ *
+ * Decimal, hexadecimal and the named references listed in ENTITY_CHARS are
+ * resolved in a single pass, so text produced by one reference is never
+ * re-interpreted as part of another. Typographic quotes, the en dash and the
+ * non-breaking space are then folded to their ASCII equivalents.
+ *
+ * @param {string} text - HTML that may contain character references.
+ * @returns {string} Decoded text; '' when text is empty, null or undefined.
+ */
+function decodeHTMLEntities(text) {
+  if (!text) return '';
+
+  const decoded = text.replace(
+    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
+    (match, dec, hex, name) => {
+      if (name !== undefined) {
+        // Own-property check: names like "constructor" must not hit Object.prototype.
+        return Object.prototype.hasOwnProperty.call(ENTITY_CHARS, name)
+          ? ENTITY_CHARS[name]
+          : match;
+      }
+      // fromCodePoint (unlike fromCharCode) keeps code points above U+FFFF intact;
+      // values beyond the Unicode range are left as written instead of being mangled.
+      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex, 16);
+      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
+    }
+  );
+
+  return decoded.replace(
+    /[\u00a0\u2018\u2019\u201c\u201d\u2033\u2013]/g,
+    ch => ASCII_EQUIVALENTS[ch]
+  );
+}
+
+// Process files
+//
+// For every data file that exists, decode HTML entities in each item's
+// contentHtml / excerptHtml and swap numeric media IDs in image attributes for
+// local paths from idToPath. The file is rewritten only if something changed.
+const files = ['pages.json', 'posts.json', 'products.json'];
+
+// Fields of an item that carry HTML / shortcode markup.
+const HTML_FIELDS = ['contentHtml', 'excerptHtml'];
+
+// Matches <attribute>=<quote><digits><quote> for the image attributes handled by
+// this script. Besides the straight double quote, curly double quotes and the
+// double prime are accepted as delimiters, because the earlier patterns also
+// looked for those variants around the ID.
+const IMAGE_ID_ATTRIBUTE =
+  /(column_background_image|background_image|custom_icon_image|bg_image|image_url|poster)=["\u201c\u201d\u2033](\d+)["\u201c\u201d\u2033]/g;
+
+/**
+ * Replace numeric media IDs in image attributes with their local path.
+ * IDs without an entry in idToPath are left exactly as they were.
+ *
+ * @param {string} html - Markup to rewrite.
+ * @returns {{ html: string, replaced: number }} Rewritten markup and the number
+ *   of attributes that were actually changed.
+ */
+function replaceImageIds(html) {
+  let replaced = 0;
+  const rewritten = html.replace(IMAGE_ID_ATTRIBUTE, (match, attribute, id) => {
+    // id is digits only, so this lookup cannot reach inherited properties.
+    const localPath = idToPath[id];
+    if (localPath === undefined) return match;
+    replaced++;
+    return attribute + '="' + localPath + '"';
+  });
+  return { html: rewritten, replaced };
+}
+
+files.forEach(file => {
+  const filePath = path.join(PROCESSED_DIR, file);
+  if (!fs.existsSync(filePath)) return;
+
+  const items = JSON.parse(fs.readFileSync(filePath, 'utf8'));
+  if (!Array.isArray(items)) {
+    console.warn('⚠️  Skipping ' + file + ': expected a JSON array');
+    return;
+  }
+
+  let updateCount = 0; // attributes rewritten to a local path
+  let decodeCount = 0; // items in which entity decoding changed a field
+
+  items.forEach(item => {
+    let itemDecoded = false;
+
+    HTML_FIELDS.forEach(field => {
+      const original = item[field];
+      if (!original) return;
+
+      // Decode first so entity-encoded quotes around an ID become matchable.
+      const decoded = decodeHTMLEntities(original);
+      const { html, replaced } = replaceImageIds(decoded);
+
+      item[field] = html;
+      updateCount += replaced;
+      if (decoded !== original) itemDecoded = true;
+    });
+
+    if (itemDecoded) decodeCount++;
+  });
+
+  if (updateCount > 0 || decodeCount > 0) {
+    fs.writeFileSync(filePath, JSON.stringify(items, null, 2));
+    console.log('✅ Updated ' + file + ' (' + updateCount + ' replacements, ' + decodeCount + ' decoded)');
+  } else {
+    console.log('ℹ️  No changes for ' + file);
+  }
+});
+
+// Verify
+//
+// Prints a short report for the EN and DE home pages: how many /media/
+// references their content now holds, which bg_image attributes in the EN page
+// still carry a bare numeric ID, and a few sample bg_image attributes.
+// The processing loop treats each data file as optional, so a missing
+// pages.json skips the report instead of crashing after the work is done.
+const pagesPath = path.join(PROCESSED_DIR, 'pages.json');
+
+if (fs.existsSync(pagesPath)) {
+  const pages = JSON.parse(fs.readFileSync(pagesPath, 'utf8'));
+  const homeEn = pages.find(p => p.slug === 'corporate-3-landing-2' && p.locale === 'en');
+  const homeDe = pages.find(p => p.slug === 'start' && p.locale === 'de');
+
+  // Number of "/media/" occurrences in a page's content (0 if the page is absent).
+  const countMediaRefs = page => (page?.contentHtml?.match(/\/media\//g) || []).length;
+
+  console.log('\n✅ Verification:');
+  console.log('EN home images:', countMediaRefs(homeEn));
+  console.log('DE home images:', countMediaRefs(homeDe));
+
+  // Check for remaining IDs
+  const remainingIds = homeEn?.contentHtml?.match(/bg_image="\d+"/g) || [];
+  console.log('Remaining IDs in EN:', remainingIds.length > 0 ? remainingIds : 'None');
+
+  // Show examples
+  const examples = homeEn?.contentHtml?.match(/bg_image="[^"]+"/g);
+  if (examples) {
+    console.log('\nEN bg_image examples:', examples.slice(0, 3));
+  }
+} else {
+  console.log('\nℹ️  Skipping verification: pages.json not found');
+}
--- a/scripts/process-data.js
+++ b/scripts/process-data.js
@@ -27,52 +27,83 @@ function getLatestExportDir() {
  return path.join(RAW_DIR, dirs[0]);
 }

-// Decode HTML entities in text
+// Decode HTML entities in text - comprehensive handling
 function decodeHTMLEntities(text) {
  if (!text) return '';
  
-  return text
-    // Decode numeric HTML entities first
-    .replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec))
-    .replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => String.fromCharCode(parseInt(hex, 16)))
-    
-    // Handle common named entities
-    .replace(/ /g, ' ')
-    .replace(/&/g, '&')
-    .replace(/</g, '<')
-    .replace(/>/g, '>')
-    .replace(/"/g, '"')
-    .replace(/'/g, "'")
-    .replace(/‘/g, "'")
-    .replace(/’/g, "'")
-    .replace(/“/g, '"')
-    .replace(/”/g, '"')
-    .replace(/–/g, '-')
-    .replace(/—/g, '—')
-    .replace(/…/g, '…')
-    .replace(/•/g, '•')
-    .replace(/€/g, '€')
-    
-    // Handle Unicode characters that might appear
-    .replace(/"/g, '"')
-    .replace(/'/g, "'")
-    .replace(/‘/g, "'")
-    .replace(/’/g, "'")
-    .replace(/“/g, '"')
-    .replace(/”/g, '"')
-    .replace(/–/g, '-')  // En dash
-    .replace(/—/g, '—')  // Em dash
-    .replace(/…/g, '…')  // Ellipsis
-    .replace(/•/g, '•')  // Bullet
-    .replace(/€/g, '€'); // Euro
+  // First, handle numeric entities (decimal and hex)
+  let result = text
+    .replace(/&#(\d+);/g, (match, dec) => {
+      const char = String.fromCharCode(parseInt(dec, 10));
+      return char;
+    })
+    .replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => {
+      const char = String.fromCharCode(parseInt(hex, 16));
+      return char;
+    });
+  
+  // Handle common named entities and Unicode characters
+  const entityMap = {
+    ' ': ' ',
+    '‘': "'",
+    '’': "'",
+    '“': '"',
+    '”': '"',
+    '″': '"',  // Double prime (8243)
+    '–': '-',
+    '—': '—',
+    '…': '…',
+    '•': '•',
+    '€': '€',
+    '©': '©',
+    '®': '®',
+    '™': '™',
+    '°': '°',
+    '±': '±',
+    '×': '×',
+    '÷': '÷',
+    '−': '−',
+    '¢': '¢',
+    '£': '£',
+    '¥': '¥',
+    '§': '§',
+    '¶': '¶',
+    'µ': 'µ',
+    '«': '«',
+    '»': '»',
+    '·': '·'
+  };
+  
+  // Replace all named entities
+  for (const [entity, char] of Object.entries(entityMap)) {
+    result = result.replace(new RegExp(entity, 'g'), char);
+  }
+  
+  // Clean up any remaining ampersand patterns
+  result = result.replace(/&([a-zA-Z]+);/g, (match, name) => {
+    // If it's not in our map, try to decode it or leave as is
+    return entityMap[`&${name};`] || match;
+  });
+  
+  return result;
 }

 // HTML sanitization - preserve content but clean dangerous elements
+// Also preserves bg_image attributes for later processing by fix-images.js
 function sanitizeHTML(html) {
  if (!html) return '';
  
  let sanitized = html;
  
+  // Temporarily preserve bg_image attributes by replacing them with placeholders
+  // Handle both regular quotes and Unicode quotes
+  const bgImagePlaceholders = [];
+  sanitized = sanitized.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
+    const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
+    bgImagePlaceholders.push(match);
+    return placeholder;
+  });
+  
  // Remove script tags and inline handlers (security)
  sanitized = sanitized.replace(/<script.*?>.*?<\/script>/gis, '');
  sanitized = sanitized.replace(/\son\w+=".*?"/gi, '');
@@ -103,6 +134,11 @@ function sanitizeHTML(html) {
  // Normalize whitespace but preserve HTML structure
  sanitized = sanitized.replace(/\s+/g, ' ').trim();
  
+  // Restore bg_image placeholders
+  bgImagePlaceholders.forEach((placeholder, index) => {
+    sanitized = sanitized.replace(`__BG_IMAGE_${index}__`, placeholder);
+  });
+  
  return sanitized;
 }

@@ -115,15 +151,29 @@ function processExcerptShortcodes(excerptHtml) {
  // First, decode HTML entities to regular characters
  processed = decodeHTMLEntities(processed);
  
+  // Temporarily preserve bg_image attributes (handle both regular and Unicode quotes)
+  const bgImagePlaceholders = [];
+  processed = processed.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
+    const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
+    bgImagePlaceholders.push(match);
+    return placeholder;
+  });
+  
  // Process WPBakery shortcodes with HTML entities
  processed = processed
    // vc_row - convert to div with classes (handle both complete and truncated)
+    // Preserve any placeholders in the attributes
    .replace(/\[vc_row([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-row'];
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');
-      return `<div class="${classes.join(' ')}">`;
+      
+      // Extract and preserve placeholders from attrs
+      const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
+      const preservedAttrs = placeholderMatches.join(' ');
+      
+      return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
    })
    // Handle truncated vc_row (no closing bracket)
    .replace(/\[vc_row([^\]]*)$/gi, (match, attrs) => {
@@ -131,7 +181,12 @@ function processExcerptShortcodes(excerptHtml) {
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');
-      return `<div class="${classes.join(' ')}">`;
+      
+      // Extract and preserve placeholders from attrs
+      const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
+      const preservedAttrs = placeholderMatches.join(' ');
+      
+      return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
    })
    .replace(/\[\/vc_row\]/gi, '</div>')
    
@@ -172,15 +227,15 @@ function processExcerptShortcodes(excerptHtml) {
    .replace(/\[\/vc_column_text\]/gi, '</div>')
    
    // nectar_cta - convert to button
-    .replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi, 
+    .replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
      '<a href="$4" class="nectar-cta">$2</a>')
    
    // nectar_highlighted_text - convert to span
-    .replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi, 
+    .replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi,
      '<span class="nectar-highlighted">$2</span>')
    
    // nectar_responsive_text - convert to span
-    .replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi, 
+    .replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi,
      '<span class="nectar-responsive">$2</span>')
    
    // nectar_icon_list - convert to ul
@@ -188,15 +243,15 @@ function processExcerptShortcodes(excerptHtml) {
    .replace(/\[\/nectar_icon_list\]/gi, '</ul>')
    
    // nectar_icon_list_item - convert to li
-    .replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi, 
+    .replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi,
      '<li><strong>$2</strong>: $4</li>')
    
    // nectar_btn - convert to button
-    .replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi, 
+    .replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
      '<a href="$4" class="nectar-btn">$2</a>')
    
    // split_line_heading - convert to heading
-    .replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi, 
+    .replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi,
      '<h2 class="split-line-heading">$2</h2>')
    
    // vc_row_inner - convert to div
@@ -229,6 +284,11 @@ function processExcerptShortcodes(excerptHtml) {
  // Normalize whitespace
  processed = processed.replace(/\s+/g, ' ').trim();
  
+  // Restore bg_image placeholders
+  bgImagePlaceholders.forEach((placeholder, index) => {
+    processed = processed.replace(`__BG_IMAGE_${index}__`, placeholder);
+  });
+  
  return processed;
 }

@@ -498,7 +558,7 @@ function main() {
    }
  };
  
-  const translationMapping = loadJSON('translation-mapping-improved.json');
+  const translationMapping = loadJSON('translation-mapping.json');
  const pagesEN = loadJSON('pages.en.json');
  const pagesDE = loadJSON('pages.de.json');
  const postsEN = loadJSON('posts.en.json');
--- a/scripts/wordpress-export-enhanced.js
+++ b/scripts/wordpress-export-enhanced.js