#!/usr/bin/env node
/**
 * WordPress → Next.js Data Processing Pipeline
 * Transforms raw WordPress data into Next.js compatible format
 */

const fs = require('fs');
const path = require('path');

const DATA_DIR = path.join(__dirname, '..', 'data');
const RAW_DIR = path.join(DATA_DIR, 'raw');
const PROCESSED_DIR = path.join(DATA_DIR, 'processed');

// Create processed directory
if (!fs.existsSync(PROCESSED_DIR)) {
  fs.mkdirSync(PROCESSED_DIR, { recursive: true });
}

// Find latest export
/**
 * Return the absolute path of the most recent export directory under RAW_DIR.
 *
 * "Most recent" is the directory whose name sorts last lexicographically.
 * NOTE(review): this presumably relies on sortable (timestamp-like) directory
 * names - confirm against the export script.
 *
 * @returns {string} Path of the last-sorting sub-directory of RAW_DIR.
 * @throws {Error} When RAW_DIR contains no directories (or cannot be read).
 */
function getLatestExportDir() {
  const exportNames = fs
    .readdirSync(RAW_DIR)
    .filter((name) => fs.statSync(path.join(RAW_DIR, name)).isDirectory())
    .sort();

  // Without this guard path.join(RAW_DIR, undefined) fails with an opaque TypeError.
  if (exportNames.length === 0) {
    throw new Error(`No export directories found in ${RAW_DIR}`);
  }

  return path.join(RAW_DIR, exportNames[exportNames.length - 1]);
}

// Named entities understood by decodeHTMLEntities (name without "&" and ";").
// Typographic quotes and the en dash are deliberately mapped to ASCII.
// NOTE(review): &amp;, &lt;, &gt; and &quot; are not listed, so they are left
// encoded - confirm that this is intended for plain-text titles.
const NAMED_ENTITIES = new Map([
  ['nbsp', ' '],
  ['lsquo', "'"],
  ['rsquo', "'"],
  ['ldquo', '"'],
  ['rdquo', '"'],
  ['Prime', '"'], // Double prime (8243)
  ['ndash', '-'],
  ['mdash', '\u2014'], // —
  ['hellip', '\u2026'], // …
  ['bull', '\u2022'], // •
  ['euro', '\u20AC'], // €
  ['copy', '\u00A9'], // ©
  ['reg', '\u00AE'], // ®
  ['trade', '\u2122'], // ™
  ['deg', '\u00B0'], // °
  ['plusmn', '\u00B1'], // ±
  ['times', '\u00D7'], // ×
  ['divide', '\u00F7'], // ÷
  ['minus', '\u2212'], // −
  ['cent', '\u00A2'], // ¢
  ['pound', '\u00A3'], // £
  ['yen', '\u00A5'], // ¥
  ['sect', '\u00A7'], // §
  ['para', '\u00B6'], // ¶
  ['micro', '\u00B5'], // µ
  ['laquo', '\u00AB'], // «
  ['raquo', '\u00BB'], // »
  ['middot', '\u00B7'], // ·
]);

// Decode HTML entities in text - comprehensive handling
/**
 * Decode numeric (decimal/hex) and common named HTML entities, then normalise
 * typographic punctuation to ASCII.
 *
 * - Numeric entities are decoded with String.fromCodePoint, so code points
 *   above U+FFFF (emoji etc.) survive; out-of-range values are left untouched.
 * - Named entities are resolved through NAMED_ENTITIES; unknown names are left
 *   exactly as written.
 * - Every entity is decoded in a single pass, so text such as "&#38;nbsp;" is
 *   not decoded twice.
 * - Finally NBSP, curly quotes, the double prime and the en dash are replaced
 *   by their ASCII counterparts (same mapping as the named entities above).
 *   NOTE(review): the entity table in the reviewed copy was damaged (keys were
 *   already-decoded characters); this normalisation keeps what that copy
 *   visibly did - confirm against the version-controlled file.
 *
 * @param {string} text - Text or HTML that may contain entities.
 * @returns {string} Decoded text; '' for empty/nullish input.
 */
function decodeHTMLEntities(text) {
  if (!text) return '';

  const decoded = String(text).replace(
    /&(#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/g,
    (match, body) => {
      if (body[0] !== '#') {
        return NAMED_ENTITIES.has(body) ? NAMED_ENTITIES.get(body) : match;
      }

      const digits = body.slice(1);
      const isHex = digits[0] === 'x' || digits[0] === 'X';
      const codePoint = isHex
        ? Number.parseInt(digits.slice(1), 16)
        : Number.parseInt(digits, 10);

      // Only Unicode scalar values are valid; String.fromCodePoint throws a
      // RangeError above U+10FFFF and lone surrogates are not characters.
      const isScalarValue =
        Number.isInteger(codePoint) &&
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        (codePoint < 0xd800 || codePoint > 0xdfff);

      return isScalarValue ? String.fromCodePoint(codePoint) : match;
    }
  );

  return decoded
    .replace(/\u00A0/g, ' ') // no-break space
    .replace(/[\u2018\u2019]/g, "'") // ‘ ’
    .replace(/[\u201C\u201D\u2033]/g, '"') // “ ” ″
    .replace(/\u2013/g, '-'); // –
}

// HTML sanitization - preserve content but clean dangerous elements
// Also preserves bg_image attributes for later
// processing by fix-images.js
/**
 * Clean a WordPress/WPBakery HTML fragment for use in the Next.js site.
 *
 * Steps, in order:
 *  1. bg_image="…" attributes are swapped for placeholders so later steps
 *     cannot damage them.
 *  2. <script> elements and inline on*= handlers are removed.
 *  3. vc_row / vc_column / vc_column_text (and their _inner variants) become
 *     <div> wrappers; bg_image placeholders found in a shortcode's attributes
 *     are carried over onto the generated <div>.
 *  4. [nectar_cta]…[/nectar_cta] keeps only its inner text; every other
 *     shortcode is removed.
 *  5. Empty <p>/<div> elements are removed and whitespace is collapsed.
 *  6. Placeholders are restored to the original bg_image text.
 *
 * NOTE(review): the replacement markup (class names) is reconstructed - the
 * string literals were damaged in the reviewed copy. Confirm the class names
 * against the version-controlled file and the site's CSS.
 * NOTE(review): this is regex-based clean-up, not a full HTML sanitizer; do
 * not rely on it for untrusted third-party HTML.
 *
 * @param {string} html - Raw contentHtml / descriptionHtml from the export.
 * @returns {string} Cleaned HTML; '' for empty/nullish input.
 */
function sanitizeHTML(html) {
  if (!html) return '';

  const { text, saved } = protectBgImageAttributes(String(html));
  let sanitized = text;

  // Remove script tags and inline handlers (security).
  // The script pattern must start at "<script" - otherwise everything before
  // the first closing tag would be deleted as well.
  sanitized = sanitized.replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '');
  sanitized = sanitized.replace(/\son\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/gi, '');

  // Replace WPBakery layout shortcodes with divs to preserve structure.
  // The lookahead keeps "vc_row" from also matching "vc_row_inner" etc., and
  // opening and closing tags are handled together so the divs stay balanced.
  sanitized = sanitized.replace(
    /\[(\/?)(vc_row_inner|vc_row|vc_column_inner|vc_column_text|vc_column)(?![\w-])([^\]]*)\]/gi,
    (match, closing, name, attrs) =>
      closing ? '</div>' : openDiv([name.toLowerCase().replace(/_/g, '-')], attrs)
  );

  // Handle Nectar shortcodes - remove them but keep any text content
  // [nectar_cta] blocks often contain text we want to preserve
  sanitized = sanitized.replace(/\[nectar_cta[^\]]*\]([\s\S]*?)\[\/nectar_cta\]/gi, '$1');
  sanitized = sanitized.replace(/\[nectar.*?\]/gi, '');

  // Remove all remaining shortcodes (this also drops any other [bracketed] text)
  sanitized = sanitized.replace(/\[.*?\]/g, '');

  // Remove empty paragraphs and divs
  sanitized = removeEmptyBlocks(sanitized);

  // Normalize whitespace but preserve HTML structure
  sanitized = sanitized.replace(/\s+/g, ' ').trim();

  return restoreBgImageAttributes(sanitized, saved);
}

/**
 * Replace every bg_image="…" attribute (ASCII or typographic closing quote)
 * with a __BG_IMAGE_n__ placeholder.
 *
 * @param {string} text
 * @returns {{text: string, saved: string[]}} Rewritten text and the original
 *   attribute strings, indexed by placeholder number.
 */
function protectBgImageAttributes(text) {
  const saved = [];
  const rewritten = text.replace(/bg_image=["\u201D][^"\u201D]*["\u201D]/gi, (match) => {
    saved.push(match);
    return `__BG_IMAGE_${saved.length - 1}__`;
  });
  return { text: rewritten, saved };
}

/**
 * Put the original bg_image attributes back in place of their placeholders.
 * A replacer function is used so "$" sequences in the saved text are inserted
 * literally instead of being read as replacement patterns.
 *
 * @param {string} text
 * @param {string[]} saved - Array returned by protectBgImageAttributes.
 * @returns {string}
 */
function restoreBgImageAttributes(text, saved) {
  return text.replace(/__BG_IMAGE_(\d+)__/g, (token, index) => {
    const original = saved[Number(index)];
    return original === undefined ? token : original;
  });
}

/**
 * Build an opening <div> with the given classes, carrying over any bg_image
 * placeholders that appear in the shortcode's attribute string.
 *
 * @param {string[]} classNames
 * @param {string} attrs - Raw attribute text of the shortcode.
 * @returns {string}
 */
function openDiv(classNames, attrs) {
  const preserved = ((attrs || '').match(/__BG_IMAGE_\d+__/g) || []).join(' ');
  return `<div class="${classNames.join(' ')}"${preserved ? ` ${preserved}` : ''}>`;
}

/**
 * Remove empty <p> and <div> elements, repeating until nothing changes so
 * wrappers that only contained other empty wrappers disappear too.
 *
 * @param {string} html
 * @returns {string}
 */
function removeEmptyBlocks(html) {
  let current = html;
  let previous;
  do {
    previous = current;
    current = current.replace(/<(p|div)\b[^>]*>\s*<\/\1>/gi, '');
  } while (current !== previous);
  return current;
}

/**
 * Opening <div> for a vc_row / vc_row_inner shortcode in an excerpt.
 *
 * @param {string|undefined} inner - "_inner" when the shortcode is vc_row_inner.
 * @param {string} attrs - Raw attribute text of the shortcode.
 * @returns {string}
 */
function rowOpenTag(inner, attrs) {
  const classes = [inner ? 'vc-row-inner' : 'vc-row'];
  if (attrs.includes('full_width_background')) classes.push('full-width-bg');
  if (attrs.includes('in_container')) classes.push('in-container');
  if (attrs.includes('full_width_content')) classes.push('full-width-content');
  return openDiv(classes, attrs);
}

/**
 * Opening <div> for a vc_column / vc_column_inner shortcode in an excerpt.
 * Width fractions are matched as whole tokens, so "1/12" no longer counts as
 * "1/1".
 *
 * @param {string|undefined} inner - "_inner" when the shortcode is vc_column_inner.
 * @param {string} attrs - Raw attribute text of the shortcode.
 * @returns {string}
 */
function columnOpenTag(inner, attrs) {
  const widthClasses = [
    ['1/1', 'col-1-1'],
    ['1/2', 'col-1-2'],
    ['1/3', 'col-1-3'],
    ['2/3', 'col-2-3'],
    ['1/4', 'col-1-4'],
    ['3/4', 'col-3-4'],
    ['5/12', 'col-5-12'],
    ['7/12', 'col-7-12'],
  ];
  const fractions = new Set(attrs.match(/\d+\/\d+/g) || []);
  const classes = [inner ? 'vc-column-inner' : 'vc-column'];
  widthClasses.forEach(([fraction, className]) => {
    if (fractions.has(fraction)) classes.push(className);
  });
  return openDiv(classes, attrs);
}

// Process excerpts specifically to handle shortcodes comprehensively
/**
 * Convert the shortcodes found in an excerpt into simple HTML.
 *
 * Entities are decoded first (via decodeHTMLEntities), bg_image attributes are
 * protected, known WPBakery/Nectar shortcodes are converted, everything else
 * in square brackets is removed, and finally empty blocks and redundant
 * whitespace are cleaned up. Layout shortcodes that were cut off at the end of
 * the excerpt (no closing "]") are still converted.
 *
 * NOTE(review): the generated markup (tags and class names) is reconstructed -
 * the replacement literals were damaged in the reviewed copy. Confirm against
 * the version-controlled file.
 *
 * @param {string} excerptHtml - Raw excerpt HTML from the export.
 * @returns {string} Processed excerpt; '' for empty/nullish input.
 */
function processExcerptShortcodes(excerptHtml) {
  if (!excerptHtml) return '';

  // First, decode HTML entities to regular characters
  const decoded = decodeHTMLEntities(excerptHtml);

  // Temporarily preserve bg_image attributes
  const { text, saved } = protectBgImageAttributes(decoded);

  let processed = text
    // vc_row / vc_row_inner - complete, or truncated at the end of the excerpt
    .replace(/\[vc_row(_inner)?(?![\w-])([^\]]*)(?:\]|$)/gi, (match, inner, attrs) =>
      rowOpenTag(inner, attrs)
    )
    .replace(/\[\/vc_row(?:_inner)?\]/gi, '</div>')
    // vc_column / vc_column_inner - complete or truncated
    .replace(/\[vc_column(_inner)?(?![\w-])([^\]]*)(?:\]|$)/gi, (match, inner, attrs) =>
      columnOpenTag(inner, attrs)
    )
    .replace(/\[\/vc_column(?:_inner)?\]/gi, '</div>')
    // vc_column_text - complete or truncated
    .replace(/\[vc_column_text(?![\w-])[^\]]*(?:\]|$)/gi, '<div class="vc-column-text">')
    .replace(/\[\/vc_column_text\]/gi, '</div>')
    // nectar_cta - convert to link
    .replace(
      /\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
      '<a class="nectar-cta" href="$4">$2</a>'
    )
    // nectar_highlighted_text - convert to span
    .replace(
      /\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi,
      '<span class="nectar-highlighted-text">$2</span>'
    )
    // nectar_responsive_text - convert to span
    .replace(
      /\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi,
      '<span class="nectar-responsive-text">$2</span>'
    )
    // nectar_icon_list_item - convert to li
    .replace(
      /\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi,
      '<li>$2: $4</li>'
    )
    // nectar_icon_list - convert to ul (lookahead: must not match ..._item)
    .replace(/\[nectar_icon_list(?!\w)([^\]]*)\]/gi, '<ul class="nectar-icon-list">')
    .replace(/\[\/nectar_icon_list\]/gi, '</ul>')
    // nectar_btn - convert to link
    .replace(
      /\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
      '<a class="nectar-btn" href="$4">$2</a>'
    )
    // split_line_heading - convert to heading
    .replace(
      /\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi,
      '<h2 class="split-line-heading">$2</h2>'
    )
    // divider - convert to hr
    .replace(/\[divider(?![\w-])([^\]]*)\]/gi, '<hr>')
    // vc_raw_js - drop the shortcode together with its (encoded) script body.
    // A "[JavaScript]" placeholder would be removed by the catch-all below
    // anyway, so nothing is emitted here.
    .replace(/\[vc_raw_js[^\]]*\][\s\S]*?\[\/vc_raw_js\]/gi, '')
    // vc_gallery / nectar_gmap - no excerpt representation
    .replace(/\[vc_gallery([^\]]*)\]/gi, '')
    .replace(/\[nectar_gmap([^\]]*)\]/gi, '');

  // Remove any remaining shortcodes, including one cut off at the very end
  processed = processed.replace(/\[.*?\]/g, '');
  processed = processed.replace(/\[\/?(?:vc_|nectar_)[^\]]*$/i, '');

  // Clean up any HTML that might be broken
  processed = removeEmptyBlocks(processed);

  // Normalize whitespace
  processed = processed.replace(/\s+/g, ' ').trim();

  // Restore bg_image placeholders
  return restoreBgImageAttributes(processed, saved);
}

// Extract excerpt from content
/**
 * Build a plain-text excerpt by stripping tags and truncating.
 *
 * @param {string} content - HTML content.
 * @param {number} [maxLength=200] - Maximum length before "..." is appended.
 * @returns {string} Tag-free text, truncated with "..." when too long;
 *   '' for nullish content.
 */
function generateExcerpt(content, maxLength = 200) {
  const text = stripTags(content);
  if (text.length <= maxLength) return text;
  return text.substring(0, maxLength) + '...';
}

/**
 * Remove everything that looks like an HTML tag.
 *
 * @param {string|null|undefined} html
 * @returns {string}
 */
function stripTags(html) {
  return String(html == null ? '' : html).replace(/<[^>]*>/g, '');
}

/**
 * Return the value as an array, or [] when a source file failed to load or
 * did not contain a list.
 *
 * @param {*} value
 * @returns {Array}
 */
function asList(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Look up the counterpart of an item in the other locale.
 *
 * @param {object} translationMapping - Parsed translation-mapping.json.
 * @param {string} type - Section name, e.g. "pages" or "products".
 * @param {string} key - Translation key (the item's slug).
 * @param {string} targetLocale - Locale of the counterpart ("en" or "de").
 * @returns {{locale: string, id: *}|null} Link to the counterpart, or null
 *   when the mapping has no entry (or the section is missing altogether).
 */
function findTranslation(translationMapping, type, key, targetLocale) {
  const section = translationMapping && translationMapping[type];
  // hasOwnProperty: a slug such as "constructor" must not hit the prototype.
  if (!section || !Object.prototype.hasOwnProperty.call(section, key)) return null;
  const match = section[key];
  return match ? { locale: targetLocale, id: match[targetLocale] } : null;
}

/**
 * Shared builder for pages and posts.
 *
 * The fallback excerpt is generated from the sanitized content, so it contains
 * neither shortcode text nor script bodies.
 *
 * @param {object} item - Raw page/post record.
 * @param {object} options
 * @param {string} options.type - Mapping section ("pages" or "posts").
 * @param {string} options.locale - Locale of the item.
 * @param {string} options.otherLocale - Locale of its translation.
 * @param {string} options.pathPrefix - Prefix placed before "/<slug>".
 * @param {boolean} options.withPublishDate - Include datePublished (posts).
 * @param {object} translationMapping
 * @returns {object} Processed record.
 */
function buildArticle(item, options, translationMapping) {
  const contentHtml = sanitizeHTML(item.contentHtml);
  const entry = {
    id: item.id,
    translationKey: item.slug,
    locale: options.locale,
    slug: item.slug,
    path: `${options.pathPrefix}/${item.slug}`,
    // Extract title and decode HTML entities
    title: decodeHTMLEntities(stripTags(item.titleHtml)),
    titleHtml: item.titleHtml,
    contentHtml,
    excerptHtml: processExcerptShortcodes(item.excerptHtml) || generateExcerpt(contentHtml),
    featuredImage: item.featuredImage,
  };
  if (options.withPublishDate) {
    entry.datePublished = item.datePublished;
  }
  entry.updatedAt = item.updatedAt;
  entry.translation = findTranslation(
    translationMapping,
    options.type,
    item.slug,
    options.otherLocale
  );
  return entry;
}

// Process pages
/**
 * Process English and German pages (English first).
 *
 * @param {object[]} pagesEN
 * @param {object[]} pagesDE
 * @param {object} translationMapping - Uses the "pages" section.
 * @returns {object[]} Processed pages; paths are "/<slug>" and "/de/<slug>".
 */
function processPages(pagesEN, pagesDE, translationMapping) {
  const english = { type: 'pages', locale: 'en', otherLocale: 'de', pathPrefix: '', withPublishDate: false };
  const german = { type: 'pages', locale: 'de', otherLocale: 'en', pathPrefix: '/de', withPublishDate: false };
  return [
    ...asList(pagesEN).map((page) => buildArticle(page, english, translationMapping)),
    ...asList(pagesDE).map((page) => buildArticle(page, german, translationMapping)),
  ];
}

// Process posts
/**
 * Process English and German blog posts (English first).
 *
 * @param {object[]} postsEN
 * @param {object[]} postsDE
 * @param {object} translationMapping - Uses the "posts" section.
 * @returns {object[]} Processed posts; paths are "/blog/<slug>" and
 *   "/de/blog/<slug>".
 */
function processPosts(postsEN, postsDE, translationMapping) {
  const english = { type: 'posts', locale: 'en', otherLocale: 'de', pathPrefix: '/blog', withPublishDate: true };
  const german = { type: 'posts', locale: 'de', otherLocale: 'en', pathPrefix: '/de/blog', withPublishDate: true };
  return [
    ...asList(postsEN).map((post) => buildArticle(post, english, translationMapping)),
    ...asList(postsDE).map((post) => buildArticle(post, german, translationMapping)),
  ];
}

// Process products
/**
 * Process English and German products (English first).
 *
 * @param {object[]} productsEN
 * @param {object[]} productsDE
 * @param {object} translationMapping - Uses the "products" section.
 * @returns {object[]} Processed products; paths are "/product/<slug>" and
 *   "/de/product/<slug>". Only descriptionHtml is sanitized.
 */
function processProducts(productsEN, productsDE, translationMapping) {
  const build = (product, locale, otherLocale, pathPrefix) => ({
    id: product.id,
    translationKey: product.slug,
    locale,
    slug: product.slug,
    path: `${pathPrefix}/product/${product.slug}`,
    name: product.name,
    shortDescriptionHtml: product.shortDescriptionHtml,
    descriptionHtml: sanitizeHTML(product.descriptionHtml),
    images: product.images,
    featuredImage: product.featuredImage,
    sku: product.sku,
    regularPrice: product.regularPrice,
    salePrice: product.salePrice,
    currency: product.currency,
    stockStatus: product.stockStatus,
    categories: product.categories,
    attributes: product.attributes,
    variations: product.variations,
    updatedAt: product.updatedAt,
    translation: findTranslation(translationMapping, 'products', product.slug, otherLocale),
  });

  return [
    ...asList(productsEN).map((product) => build(product, 'en', 'de', '')),
    ...asList(productsDE).map((product) => build(product, 'de', 'en', '/de')),
  ];
}

// Process product categories
/**
 * Process English and German product categories (English first).
 *
 * @param {object[]} categoriesEN
 * @param {object[]} categoriesDE
 * @param {object} translationMapping - Uses the "productCategories" section.
 * @returns {object[]} Processed categories; paths are
 *   "/product-category/<slug>" and "/de/product-category/<slug>".
 */
function processProductCategories(categoriesEN, categoriesDE, translationMapping) {
  const build = (category, locale, otherLocale, pathPrefix) => ({
    id: category.id,
    translationKey: category.slug,
    locale,
    slug: category.slug,
    name: category.name,
    path: `${pathPrefix}/product-category/${category.slug}`,
    description: category.description,
    count: category.count,
    translation: findTranslation(
      translationMapping,
      'productCategories',
      category.slug,
      otherLocale
    ),
  });

  return [
    ...asList(categoriesEN).map((category) => build(category, 'en', 'de', '')),
    ...asList(categoriesDE).map((category) => build(category, 'de', 'en', '/de')),
  ];
}

// Process media manifest
/**
 * Map raw media records to the manifest format.
 *
 * @param {object[]} media - Raw media records (mime type in `mime_type`).
 * @returns {object[]} Manifest entries with localPath "/media/<filename>".
 */
function processMedia(media) {
  return asList(media).map((item) => ({
    id: item.id,
    filename: item.filename,
    url: item.url,
    localPath: `/media/${item.filename}`,
    alt: item.alt,
    width: item.width,
    height: item.height,
    mimeType: item.mime_type,
  }));
}

// Generate asset map for URL replacement
/**
 * Build a lookup from original media URL to local "/media/<filename>" path.
 * Records without a url are skipped.
 *
 * @param {object[]} media - Raw media records.
 * @returns {Object<string, string>}
 */
function generateAssetMap(media) {
  const map = {};
  asList(media).forEach((item) => {
    if (item.url) {
      map[item.url] = `/media/${item.filename}`;
    }
  });
  return map;
}

// Main processing function
/**
 * Load the latest raw export, process every content type and write the
 * combined and per-type JSON files into PROCESSED_DIR, logging a summary.
 *
 * A source file that cannot be read or parsed is reported and replaced by an
 * empty fallback, so one missing file does not abort the run.
 */
function main() {
  const exportDir = getLatestExportDir();

  console.log('🔄 Processing WordPress Data for Next.js');
  console.log('========================================\n');

  // Load raw data. `fallback` must match the file's shape: list files fall
  // back to [], object files (mapping, site info) to {}.
  const loadJSON = (file, fallback = []) => {
    try {
      return JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8'));
    } catch (e) {
      console.error(`❌ Failed to load ${file}:`, e.message);
      return fallback;
    }
  };

  const translationMapping = loadJSON('translation-mapping.json', {});
  const pagesEN = loadJSON('pages.en.json');
  const pagesDE = loadJSON('pages.de.json');
  const postsEN = loadJSON('posts.en.json');
  const postsDE = loadJSON('posts.de.json');
  const productsEN = loadJSON('products.en.json');
  const productsDE = loadJSON('products.de.json');
  const categoriesEN = loadJSON('product-categories.en.json');
  const categoriesDE = loadJSON('product-categories.de.json');
  const media = loadJSON('media.json');
  const redirects = loadJSON('redirects.json');
  const siteInfo = loadJSON('site-info.json', {});

  console.log('📊 Processing content types...\n');

  // Process each content type
  const pages = processPages(pagesEN, pagesDE, translationMapping);
  const posts = processPosts(postsEN, postsDE, translationMapping);
  const products = processProducts(productsEN, productsDE, translationMapping);
  const categories = processProductCategories(categoriesEN, categoriesDE, translationMapping);
  const processedMedia = processMedia(media);
  const assetMap = generateAssetMap(media);

  // Create processed data structure
  const processedData = {
    site: {
      title: siteInfo.siteTitle,
      description: siteInfo.siteDescription,
      baseUrl: siteInfo.baseUrl,
      defaultLocale: siteInfo.defaultLocale || 'en',
      locales: ['en', 'de'],
    },
    content: {
      pages,
      posts,
      products,
      categories,
    },
    assets: {
      media: processedMedia,
      map: assetMap,
    },
    redirects,
    exportDate: new Date().toISOString(),
  };

  // Save the combined file first, then individual files for easier access
  const outputs = [
    ['wordpress-data.json', processedData],
    ['pages.json', pages],
    ['posts.json', posts],
    ['products.json', products],
    ['categories.json', categories],
    ['media.json', processedMedia],
    ['asset-map.json', assetMap],
  ];
  const writtenPaths = outputs.map(([fileName, data]) => {
    const target = path.join(PROCESSED_DIR, fileName);
    fs.writeFileSync(target, JSON.stringify(data, null, 2));
    return target;
  });

  // Summary
  console.log('✅ Data Processing Complete\n');
  console.log('📦 Processed Content:');
  console.log(` Pages: ${pages.length} (with translations)`);
  console.log(` Posts: ${posts.length} (with translations)`);
  console.log(` Products: ${products.length} (with translations)`);
  console.log(` Categories: ${categories.length} (with translations)`);
  console.log(` Media: ${processedMedia.length} files`);
  console.log(` Redirects: ${redirects.length} rules\n`);

  console.log('📁 Output Files:');
  writtenPaths.forEach((target, index) => {
    const isLast = index === writtenPaths.length - 1;
    console.log(` ${target}${isLast ? '\n' : ''}`);
  });

  // Sample data
  if (pages.length > 0) {
    console.log('📄 Sample Page:');
    console.log(` Title: ${pages[0].title}`);
    console.log(` Path: ${pages[0].path}`);
    console.log(` Locale: ${pages[0].locale}`);
    console.log(` Translation: ${pages[0].translation ? 'Yes' : 'No'}\n`);
  }

  if (posts.length > 0) {
    console.log('📝 Sample Post:');
    console.log(` Title: ${posts[0].title}`);
    console.log(` Path: ${posts[0].path}`);
    console.log(` Locale: ${posts[0].locale}`);
    console.log(` Date: ${posts[0].datePublished}\n`);
  }

  console.log('💡 Next: Ready for Next.js project setup!');
}

// Run only when executed directly. The typeof check keeps the guard from
// throwing where CommonJS `require` is not defined.
if (typeof require !== 'undefined' && require.main === module) {
  main();
}