Files
klz-cables.com/scripts/process-data.js
2025-12-30 16:19:42 +01:00

660 lines
23 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* WordPress → Next.js Data Processing Pipeline
* Transforms raw WordPress data into Next.js compatible format
*/
const fs = require('fs');
const path = require('path');
const DATA_DIR = path.join(__dirname, '..', 'data');
const RAW_DIR = path.join(DATA_DIR, 'raw');
const PROCESSED_DIR = path.join(DATA_DIR, 'processed');
// Create processed directory
if (!fs.existsSync(PROCESSED_DIR)) {
fs.mkdirSync(PROCESSED_DIR, { recursive: true });
}
// Find latest export
function getLatestExportDir() {
const dirs = fs.readdirSync(RAW_DIR).filter(f => {
const stat = fs.statSync(path.join(RAW_DIR, f));
return stat.isDirectory();
});
dirs.sort().reverse();
return path.join(RAW_DIR, dirs[0]);
}
// Decode HTML entities in text - comprehensive handling
function decodeHTMLEntities(text) {
if (!text) return '';
// First, handle numeric entities (decimal and hex)
let result = text
.replace(/&#(\d+);/g, (match, dec) => {
const char = String.fromCharCode(parseInt(dec, 10));
return char;
})
.replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => {
const char = String.fromCharCode(parseInt(hex, 16));
return char;
});
// Handle common named entities and Unicode characters
const entityMap = {
' ': ' ',
'': "'",
'': "'",
'“': '"',
'”': '"',
'″': '"', // Double prime (8243)
'': '-',
'—': '—',
'…': '…',
'•': '•',
'€': '€',
'©': '©',
'®': '®',
'™': '™',
'°': '°',
'±': '±',
'×': '×',
'÷': '÷',
'': '',
'¢': '¢',
'£': '£',
'¥': '¥',
'§': '§',
'¶': '¶',
'µ': 'µ',
'«': '«',
'»': '»',
'·': '·'
};
// Replace all named entities
for (const [entity, char] of Object.entries(entityMap)) {
result = result.replace(new RegExp(entity, 'g'), char);
}
// Clean up any remaining ampersand patterns
result = result.replace(/&([a-zA-Z]+);/g, (match, name) => {
// If it's not in our map, try to decode it or leave as is
return entityMap[`&${name};`] || match;
});
return result;
}
// HTML sanitization - preserve content but clean dangerous elements
// Also preserves bg_image attributes for later processing by fix-images.js
function sanitizeHTML(html) {
if (!html) return '';
let sanitized = html;
// Temporarily preserve bg_image attributes by replacing them with placeholders
// Handle both regular quotes and Unicode quotes
const bgImagePlaceholders = [];
sanitized = sanitized.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
bgImagePlaceholders.push(match);
return placeholder;
});
// Remove script tags and inline handlers (security)
sanitized = sanitized.replace(/<script.*?>.*?<\/script>/gis, '');
sanitized = sanitized.replace(/\son\w+=".*?"/gi, '');
// Remove WPBakery shortcode wrappers but keep their content
// Replace vc_row/vc_column with divs to preserve structure
sanitized = sanitized.replace(/\[vc_row.*?\]/gi, '<div class="vc-row">');
sanitized = sanitized.replace(/\[\/vc_row\]/gi, '</div>');
sanitized = sanitized.replace(/\[vc_column.*?\]/gi, '<div class="vc-column">');
sanitized = sanitized.replace(/\[\/vc_column\]/gi, '</div>');
// Remove other shortcodes but keep text content
sanitized = sanitized.replace(/\[vc_column_text.*?\]/gi, '<div class="vc-text">');
sanitized = sanitized.replace(/\[\/vc_column_text\]/gi, '</div>');
// Handle Nectar shortcodes - remove them but keep any text content
// [nectar_cta] blocks often contain text we want to preserve
sanitized = sanitized.replace(/\[nectar_cta.*?\]([\s\S]*?)\[\/nectar_cta\]/gi, '$1');
sanitized = sanitized.replace(/\[nectar.*?\]/gi, '');
// Remove all remaining shortcodes
sanitized = sanitized.replace(/\[.*?\]/g, '');
// Remove empty paragraphs and divs
sanitized = sanitized.replace(/<p[^>]*>\s*<\/p>/gi, '');
sanitized = sanitized.replace(/<div[^>]*>\s*<\/div>/gi, '');
// Normalize whitespace but preserve HTML structure
sanitized = sanitized.replace(/\s+/g, ' ').trim();
// Restore bg_image placeholders
bgImagePlaceholders.forEach((placeholder, index) => {
sanitized = sanitized.replace(`__BG_IMAGE_${index}__`, placeholder);
});
return sanitized;
}
// Process excerpts specifically to handle shortcodes comprehensively
function processExcerptShortcodes(excerptHtml) {
if (!excerptHtml) return '';
let processed = excerptHtml;
// First, decode HTML entities to regular characters
processed = decodeHTMLEntities(processed);
// Temporarily preserve bg_image attributes (handle both regular and Unicode quotes)
const bgImagePlaceholders = [];
processed = processed.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
bgImagePlaceholders.push(match);
return placeholder;
});
// Process WPBakery shortcodes with HTML entities
processed = processed
// vc_row - convert to div with classes (handle both complete and truncated)
// Preserve any placeholders in the attributes
.replace(/\[vc_row([^\]]*)\]/gi, (match, attrs) => {
const classes = ['vc-row'];
if (attrs.includes('full_width_background')) classes.push('full-width-bg');
if (attrs.includes('in_container')) classes.push('in-container');
if (attrs.includes('full_width_content')) classes.push('full-width-content');
// Extract and preserve placeholders from attrs
const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
const preservedAttrs = placeholderMatches.join(' ');
return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
})
// Handle truncated vc_row (no closing bracket)
.replace(/\[vc_row([^\]]*)$/gi, (match, attrs) => {
const classes = ['vc-row'];
if (attrs.includes('full_width_background')) classes.push('full-width-bg');
if (attrs.includes('in_container')) classes.push('in-container');
if (attrs.includes('full_width_content')) classes.push('full-width-content');
// Extract and preserve placeholders from attrs
const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
const preservedAttrs = placeholderMatches.join(' ');
return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
})
.replace(/\[\/vc_row\]/gi, '</div>')
// vc_column - convert to div with classes
// Handle both complete and incomplete (truncated) shortcodes
.replace(/\[vc_column([^\]]*)\]/gi, (match, attrs) => {
const classes = ['vc-column'];
if (attrs.includes('1/1')) classes.push('col-1-1');
if (attrs.includes('1/2')) classes.push('col-1-2');
if (attrs.includes('1/3')) classes.push('col-1-3');
if (attrs.includes('2/3')) classes.push('col-2-3');
if (attrs.includes('1/4')) classes.push('col-1-4');
if (attrs.includes('3/4')) classes.push('col-3-4');
if (attrs.includes('5/12')) classes.push('col-5-12');
if (attrs.includes('7/12')) classes.push('col-7-12');
return `<div class="${classes.join(' ')}">`;
})
// Also handle incomplete vc_column shortcodes (truncated at end of excerpt)
.replace(/\[vc_column([^\]]*)$/gi, (match, attrs) => {
const classes = ['vc-column'];
if (attrs.includes('1/1')) classes.push('col-1-1');
if (attrs.includes('1/2')) classes.push('col-1-2');
if (attrs.includes('1/3')) classes.push('col-1-3');
if (attrs.includes('2/3')) classes.push('col-2-3');
if (attrs.includes('1/4')) classes.push('col-1-4');
if (attrs.includes('3/4')) classes.push('col-3-4');
if (attrs.includes('5/12')) classes.push('col-5-12');
if (attrs.includes('7/12')) classes.push('col-7-12');
return `<div class="${classes.join(' ')}">`;
})
.replace(/\[\/vc_column\]/gi, '</div>')
// Handle truncated vc_column_text
.replace(/\[vc_column_text([^\]]*)$/gi, '<div class="vc-column-text">')
// vc_column_text - convert to div
.replace(/\[vc_column_text([^\]]*)\]/gi, '<div class="vc-column-text">')
.replace(/\[\/vc_column_text\]/gi, '</div>')
// nectar_cta - convert to button
.replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
'<a href="$4" class="nectar-cta">$2</a>')
// nectar_highlighted_text - convert to span
.replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi,
'<span class="nectar-highlighted">$2</span>')
// nectar_responsive_text - convert to span
.replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi,
'<span class="nectar-responsive">$2</span>')
// nectar_icon_list - convert to ul
.replace(/\[nectar_icon_list([^\]]*)\]/gi, '<ul class="nectar-icon-list">')
.replace(/\[\/nectar_icon_list\]/gi, '</ul>')
// nectar_icon_list_item - convert to li
.replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi,
'<li><strong>$2</strong>: $4</li>')
// nectar_btn - convert to button
.replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
'<a href="$4" class="nectar-btn">$2</a>')
// split_line_heading - convert to heading
.replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi,
'<h2 class="split-line-heading">$2</h2>')
// vc_row_inner - convert to div
.replace(/\[vc_row_inner([^\]]*)\]/gi, '<div class="vc-row-inner">')
.replace(/\[\/vc_row_inner\]/gi, '</div>')
// vc_column_inner - convert to div
.replace(/\[vc_column_inner([^\]]*)\]/gi, '<div class="vc-column-inner">')
.replace(/\[\/vc_column_inner\]/gi, '</div>')
// divider - convert to hr
.replace(/\[divider([^\]]*)\]/gi, '<hr class="divider" />')
// vc_gallery - convert to div (placeholder)
.replace(/\[vc_gallery([^\]]*)\]/gi, '<div class="vc-gallery">[Gallery]</div>')
// vc_raw_js - remove or convert to div
.replace(/\[vc_raw_js\](.*?)\[\/vc_raw_js\]/gi, '<div class="vc-raw-js">[JavaScript]</div>')
// nectar_gmap - convert to div
.replace(/\[nectar_gmap([^\]]*)\]/gi, '<div class="nectar-gmap">[Google Map]</div>');
// Remove any remaining shortcodes
processed = processed.replace(/\[.*?\]/g, '');
// Clean up any HTML that might be broken
processed = processed.replace(/<p[^>]*>\s*<\/p>/gi, '');
processed = processed.replace(/<div[^>]*>\s*<\/div>/gi, '');
// Normalize whitespace
processed = processed.replace(/\s+/g, ' ').trim();
// Restore bg_image placeholders
bgImagePlaceholders.forEach((placeholder, index) => {
processed = processed.replace(`__BG_IMAGE_${index}__`, placeholder);
});
return processed;
}
// Extract excerpt from content
function generateExcerpt(content, maxLength = 200) {
const text = content.replace(/<[^>]*>/g, '');
if (text.length <= maxLength) return text;
return text.substring(0, maxLength) + '...';
}
// Process pages
function processPages(pagesEN, pagesDE, translationMapping) {
const processed = [];
// Process English pages
pagesEN.forEach(page => {
const translationKey = page.slug;
const deMatch = translationMapping.pages[translationKey];
// Extract title and decode HTML entities
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: page.id,
translationKey: translationKey,
locale: 'en',
slug: page.slug,
path: `/${page.slug}`,
title: decodedTitle,
titleHtml: page.titleHtml,
contentHtml: sanitizeHTML(page.contentHtml),
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
featuredImage: page.featuredImage,
updatedAt: page.updatedAt,
translation: deMatch ? { locale: 'de', id: deMatch.de } : null
});
});
// Process German pages
pagesDE.forEach(page => {
const translationKey = page.slug;
const enMatch = translationMapping.pages[translationKey];
// Extract title and decode HTML entities
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: page.id,
translationKey: translationKey,
locale: 'de',
slug: page.slug,
path: `/de/${page.slug}`,
title: decodedTitle,
titleHtml: page.titleHtml,
contentHtml: sanitizeHTML(page.contentHtml),
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
featuredImage: page.featuredImage,
updatedAt: page.updatedAt,
translation: enMatch ? { locale: 'en', id: enMatch.en } : null
});
});
return processed;
}
// Process posts
function processPosts(postsEN, postsDE, translationMapping) {
const processed = [];
postsEN.forEach(post => {
const translationKey = post.slug;
const deMatch = translationMapping.posts[translationKey];
// Extract title and decode HTML entities
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: post.id,
translationKey: translationKey,
locale: 'en',
slug: post.slug,
path: `/blog/${post.slug}`,
title: decodedTitle,
titleHtml: post.titleHtml,
contentHtml: sanitizeHTML(post.contentHtml),
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),
featuredImage: post.featuredImage,
datePublished: post.datePublished,
updatedAt: post.updatedAt,
translation: deMatch ? { locale: 'de', id: deMatch.de } : null
});
});
postsDE.forEach(post => {
const translationKey = post.slug;
const enMatch = translationMapping.posts[translationKey];
// Extract title and decode HTML entities
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: post.id,
translationKey: translationKey,
locale: 'de',
slug: post.slug,
path: `/de/blog/${post.slug}`,
title: decodedTitle,
titleHtml: post.titleHtml,
contentHtml: sanitizeHTML(post.contentHtml),
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),
featuredImage: post.featuredImage,
datePublished: post.datePublished,
updatedAt: post.updatedAt,
translation: enMatch ? { locale: 'en', id: enMatch.en } : null
});
});
return processed;
}
// Process products
function processProducts(productsEN, productsDE, translationMapping) {
const processed = [];
productsEN.forEach(product => {
const translationKey = product.slug;
const deMatch = translationMapping.products[translationKey];
processed.push({
id: product.id,
translationKey: translationKey,
locale: 'en',
slug: product.slug,
path: `/product/${product.slug}`,
name: product.name,
shortDescriptionHtml: product.shortDescriptionHtml,
descriptionHtml: sanitizeHTML(product.descriptionHtml),
images: product.images,
featuredImage: product.featuredImage,
sku: product.sku,
regularPrice: product.regularPrice,
salePrice: product.salePrice,
currency: product.currency,
stockStatus: product.stockStatus,
categories: product.categories,
attributes: product.attributes,
variations: product.variations,
updatedAt: product.updatedAt,
translation: deMatch ? { locale: 'de', id: deMatch.de } : null
});
});
productsDE.forEach(product => {
const translationKey = product.slug;
const enMatch = translationMapping.products[translationKey];
processed.push({
id: product.id,
translationKey: translationKey,
locale: 'de',
slug: product.slug,
path: `/de/product/${product.slug}`,
name: product.name,
shortDescriptionHtml: product.shortDescriptionHtml,
descriptionHtml: sanitizeHTML(product.descriptionHtml),
images: product.images,
featuredImage: product.featuredImage,
sku: product.sku,
regularPrice: product.regularPrice,
salePrice: product.salePrice,
currency: product.currency,
stockStatus: product.stockStatus,
categories: product.categories,
attributes: product.attributes,
variations: product.variations,
updatedAt: product.updatedAt,
translation: enMatch ? { locale: 'en', id: enMatch.en } : null
});
});
return processed;
}
// Process product categories
function processProductCategories(categoriesEN, categoriesDE, translationMapping) {
const processed = [];
categoriesEN.forEach(category => {
const translationKey = category.slug;
const deMatch = translationMapping.productCategories[translationKey];
processed.push({
id: category.id,
translationKey: translationKey,
locale: 'en',
slug: category.slug,
name: category.name,
path: `/product-category/${category.slug}`,
description: category.description,
count: category.count,
translation: deMatch ? { locale: 'de', id: deMatch.de } : null
});
});
categoriesDE.forEach(category => {
const translationKey = category.slug;
const enMatch = translationMapping.productCategories[translationKey];
processed.push({
id: category.id,
translationKey: translationKey,
locale: 'de',
slug: category.slug,
name: category.name,
path: `/de/product-category/${category.slug}`,
description: category.description,
count: category.count,
translation: enMatch ? { locale: 'en', id: enMatch.en } : null
});
});
return processed;
}
// Process media manifest
function processMedia(media) {
return media.map(item => ({
id: item.id,
filename: item.filename,
url: item.url,
localPath: `/media/${item.filename}`,
alt: item.alt,
width: item.width,
height: item.height,
mimeType: item.mime_type
}));
}
// Generate asset map for URL replacement
function generateAssetMap(media) {
const map = {};
media.forEach(item => {
if (item.url) {
map[item.url] = `/media/${item.filename}`;
}
});
return map;
}
// Main processing function
function main() {
const exportDir = getLatestExportDir();
console.log('🔄 Processing WordPress Data for Next.js');
console.log('========================================\n');
// Load raw data
const loadJSON = (file) => {
try {
return JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8'));
} catch (e) {
console.error(`❌ Failed to load ${file}:`, e.message);
return [];
}
};
const translationMapping = loadJSON('translation-mapping.json');
const pagesEN = loadJSON('pages.en.json');
const pagesDE = loadJSON('pages.de.json');
const postsEN = loadJSON('posts.en.json');
const postsDE = loadJSON('posts.de.json');
const productsEN = loadJSON('products.en.json');
const productsDE = loadJSON('products.de.json');
const categoriesEN = loadJSON('product-categories.en.json');
const categoriesDE = loadJSON('product-categories.de.json');
const media = loadJSON('media.json');
const redirects = loadJSON('redirects.json');
const siteInfo = loadJSON('site-info.json');
console.log('📊 Processing content types...\n');
// Process each content type
const pages = processPages(pagesEN, pagesDE, translationMapping);
const posts = processPosts(postsEN, postsDE, translationMapping);
const products = processProducts(productsEN, productsDE, translationMapping);
const categories = processProductCategories(categoriesEN, categoriesDE, translationMapping);
const processedMedia = processMedia(media);
const assetMap = generateAssetMap(media);
// Create processed data structure
const processedData = {
site: {
title: siteInfo.siteTitle,
description: siteInfo.siteDescription,
baseUrl: siteInfo.baseUrl,
defaultLocale: siteInfo.defaultLocale || 'en',
locales: ['en', 'de']
},
content: {
pages,
posts,
products,
categories
},
assets: {
media: processedMedia,
map: assetMap
},
redirects,
exportDate: new Date().toISOString()
};
// Save processed data
const outputPath = path.join(PROCESSED_DIR, 'wordpress-data.json');
fs.writeFileSync(outputPath, JSON.stringify(processedData, null, 2));
// Save individual files for easier access
fs.writeFileSync(path.join(PROCESSED_DIR, 'pages.json'), JSON.stringify(pages, null, 2));
fs.writeFileSync(path.join(PROCESSED_DIR, 'posts.json'), JSON.stringify(posts, null, 2));
fs.writeFileSync(path.join(PROCESSED_DIR, 'products.json'), JSON.stringify(products, null, 2));
fs.writeFileSync(path.join(PROCESSED_DIR, 'categories.json'), JSON.stringify(categories, null, 2));
fs.writeFileSync(path.join(PROCESSED_DIR, 'media.json'), JSON.stringify(processedMedia, null, 2));
fs.writeFileSync(path.join(PROCESSED_DIR, 'asset-map.json'), JSON.stringify(assetMap, null, 2));
// Summary
console.log('✅ Data Processing Complete\n');
console.log('📦 Processed Content:');
console.log(` Pages: ${pages.length} (with translations)`);
console.log(` Posts: ${posts.length} (with translations)`);
console.log(` Products: ${products.length} (with translations)`);
console.log(` Categories: ${categories.length} (with translations)`);
console.log(` Media: ${processedMedia.length} files`);
console.log(` Redirects: ${redirects.length} rules\n`);
console.log('📁 Output Files:');
console.log(` ${outputPath}`);
console.log(` ${path.join(PROCESSED_DIR, 'pages.json')}`);
console.log(` ${path.join(PROCESSED_DIR, 'posts.json')}`);
console.log(` ${path.join(PROCESSED_DIR, 'products.json')}`);
console.log(` ${path.join(PROCESSED_DIR, 'categories.json')}`);
console.log(` ${path.join(PROCESSED_DIR, 'media.json')}`);
console.log(` ${path.join(PROCESSED_DIR, 'asset-map.json')}\n`);
// Sample data
if (pages.length > 0) {
console.log('📄 Sample Page:');
console.log(` Title: ${pages[0].title}`);
console.log(` Path: ${pages[0].path}`);
console.log(` Locale: ${pages[0].locale}`);
console.log(` Translation: ${pages[0].translation ? 'Yes' : 'No'}\n`);
}
if (posts.length > 0) {
console.log('📝 Sample Post:');
console.log(` Title: ${posts[0].title}`);
console.log(` Path: ${posts[0].path}`);
console.log(` Locale: ${posts[0].locale}`);
console.log(` Date: ${posts[0].datePublished}\n`);
}
console.log('💡 Next: Ready for Next.js project setup!');
}
if (require.main === module) {
main();
}