Files
klz-cables.com/scripts/process-data-fixed.js
2025-12-30 00:06:54 +01:00

563 lines
20 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* WordPress → Next.js Data Processing Pipeline
* Transforms raw WordPress data into Next.js compatible format
*/
const fs = require('fs');
const path = require('path');
// Directory layout: the data folder sits one level above this script's directory.
const DATA_DIR = path.join(__dirname, '..', 'data');
// Raw export directories are read from here (see getLatestExportDir).
const RAW_DIR = path.join(DATA_DIR, 'raw');
// All generated JSON files are written here by main().
const PROCESSED_DIR = path.join(DATA_DIR, 'processed');
// Create processed directory
// Runs at module load time, so the output directory exists before main() writes to it.
if (!fs.existsSync(PROCESSED_DIR)) {
fs.mkdirSync(PROCESSED_DIR, { recursive: true });
}
// Find latest export
/**
 * Resolve the most recent export directory inside RAW_DIR.
 *
 * "Most recent" means the sub-directory whose name sorts last in the default
 * (lexicographic) string order; plain files in RAW_DIR are ignored.
 *
 * @returns {string} Path of the selected export directory.
 * @throws {Error} When RAW_DIR contains no sub-directories. Previously this
 *   case surfaced as an unrelated TypeError from path.join(RAW_DIR, undefined).
 */
function getLatestExportDir() {
  const exportNames = fs
    .readdirSync(RAW_DIR)
    .filter((name) => fs.statSync(path.join(RAW_DIR, name)).isDirectory())
    .sort();

  if (exportNames.length === 0) {
    throw new Error(`No export directories found in ${RAW_DIR}`);
  }

  return path.join(RAW_DIR, exportNames[exportNames.length - 1]);
}
// HTML sanitization - preserve content but clean dangerous elements
/**
 * Clean WordPress HTML for use outside WordPress.
 *
 * - Removes <script> elements and inline on* event-handler attributes
 *   (double-quoted, single-quoted and unquoted values).
 * - Converts WPBakery layout shortcodes (vc_row, vc_column, their *_inner
 *   variants and vc_column_text) into <div> wrappers. Openers and closers are
 *   always converted in pairs so the generated markup stays balanced.
 * - Unwraps [nectar_cta]...[/nectar_cta], then drops every remaining
 *   [shortcode] tag.
 * - Removes empty <p>/<div> elements (repeatedly, so nested empty wrappers
 *   disappear too) and collapses runs of whitespace to a single space.
 *
 * @param {string} html - Raw HTML; any falsy value yields ''.
 * @returns {string} The sanitized HTML.
 */
function sanitizeHTML(html) {
  if (!html) return '';
  let sanitized = html;

  // Remove script tags and inline handlers (security)
  sanitized = sanitized.replace(/<script.*?>.*?<\/script>/gis, '');
  sanitized = sanitized.replace(/\son\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']+)/gi, '');

  // vc_column_text gets its own class. The \b guards keep the shorter
  // vc_column/vc_row patterns from swallowing longer shortcode names.
  sanitized = sanitized.replace(/\[vc_column_text\b[^\]]*\]/gi, '<div class="vc-text">');
  sanitized = sanitized.replace(/\[\/vc_column_text\]/gi, '</div>');

  // Replace vc_row/vc_column (and their *_inner variants) with divs to
  // preserve structure; closers are matched with the same optional suffix.
  sanitized = sanitized.replace(/\[vc_row(?:_inner)?\b[^\]]*\]/gi, '<div class="vc-row">');
  sanitized = sanitized.replace(/\[\/vc_row(?:_inner)?\]/gi, '</div>');
  sanitized = sanitized.replace(/\[vc_column(?:_inner)?\b[^\]]*\]/gi, '<div class="vc-column">');
  sanitized = sanitized.replace(/\[\/vc_column(?:_inner)?\]/gi, '</div>');

  // Handle Nectar shortcodes - remove them but keep any text content
  // [nectar_cta] blocks often contain text we want to preserve
  sanitized = sanitized.replace(/\[nectar_cta.*?\]([\s\S]*?)\[\/nectar_cta\]/gi, '$1');
  sanitized = sanitized.replace(/\[nectar.*?\]/gi, '');

  // Remove all remaining shortcodes
  sanitized = sanitized.replace(/\[.*?\]/g, '');

  // Remove empty paragraphs and divs. Removing an inner empty element can
  // leave its parent empty, so repeat until nothing changes.
  let beforePass;
  do {
    beforePass = sanitized;
    sanitized = sanitized
      .replace(/<p[^>]*>\s*<\/p>/gi, '')
      .replace(/<div[^>]*>\s*<\/div>/gi, '');
  } while (sanitized !== beforePass);

  // Normalize whitespace but preserve HTML structure
  return sanitized.replace(/\s+/g, ' ').trim();
}
// Process excerpts specifically to handle shortcodes comprehensively
/**
 * Convert a WordPress excerpt containing WPBakery / Nectar shortcodes into
 * plain HTML.
 *
 * Steps, in order:
 *  1. Replace typographic HTML entities and Unicode punctuation with simple
 *     characters, so shortcode attributes use plain `"` quotes.
 *  2. Convert known shortcodes to HTML elements.
 *  3. Drop every remaining [shortcode] tag.
 *  4. Remove empty <p>/<div> elements and collapse whitespace.
 *
 * All special characters are written as explicit entity strings or \u escapes
 * so the patterns cannot be lost to editor or transport re-encoding.
 *
 * @param {string} excerptHtml - Raw excerpt HTML; any falsy value yields ''.
 * @returns {string} The processed excerpt HTML.
 */
function processExcerptShortcodes(excerptHtml) {
  if (!excerptHtml) return '';

  // Literal [search, replacement] pairs, applied in order.
  const literalReplacements = [
    // Numeric HTML entities commonly found in WordPress raw data
    ['&#8221;', '"'], // right double quote
    ['&#8220;', '"'], // left double quote
    // NOTE(review): low quotes are mapped to a comma, as in the original
    // table; confirm this is intended for German opening quotes.
    ['&#8222;', ','], // low double quote
    ['&#8223;', '"'], // high reversed double quote
    ['&#8216;', "'"], // left single quote
    ['&#8217;', "'"], // right single quote
    ['&#8211;', '-'], // en dash
    ['&#8212;', '\u2014'], // em dash
    ['&#8230;', '\u2026'], // ellipsis
    ['&#8243;', '"'], // inches / double prime
    ['&#8242;', "'"], // feet / prime
    ['&#8218;', ','], // single low quote
    ['&#8219;', '`'], // single high reversed quote
    ['&#8226;', '\u2022'], // bullet
    ['&#8364;', '\u20AC'], // euro
    // Unicode characters (from rendered content)
    ['\u201D', '"'], // right double quote
    ['\u201C', '"'], // left double quote
    ['\u201E', ','], // low double quote
    ['\u201F', '"'], // high reversed double quote
    ['\u2018', "'"], // left single quote
    ['\u2019', "'"], // right single quote
    ['\u2013', '-'], // en dash
    ['\u2033', '"'], // inches / double prime
    ['\u2032', "'"], // feet / prime
    // Named HTML entities (all common apostrophe spellings are accepted)
    ['&quot;', '"'],
    ['&#039;', "'"],
    ['&#39;', "'"],
    ['&apos;', "'"],
    ['&lsquo;', "'"],
    ['&rsquo;', "'"],
    ['&ldquo;', '"'],
    ['&rdquo;', '"'],
    ['&ndash;', '-'],
    ['&mdash;', '\u2014'],
    ['&hellip;', '\u2026'],
    ['&bull;', '\u2022'],
    ['&euro;', '\u20AC'],
  ];

  let processed = literalReplacements.reduce(
    (text, [search, replacement]) => text.split(search).join(replacement),
    excerptHtml
  );

  // Parse `name="value"` pairs so attribute order inside a shortcode is irrelevant.
  const parseAttributes = (rawAttrs) => {
    const attrs = Object.create(null);
    const attrPattern = /([\w-]+)="([^"]*)"/g;
    let found;
    while ((found = attrPattern.exec(rawAttrs)) !== null) {
      attrs[found[1]] = found[2];
    }
    return attrs;
  };

  // Build a link from a shortcode's label attribute and its url attribute.
  // Shortcodes missing either attribute are dropped.
  const toAnchor = (rawAttrs, labelKey, cssClass) => {
    const attrs = parseAttributes(rawAttrs);
    if (attrs[labelKey] === undefined || attrs.url === undefined) return '';
    return `<a href="${attrs.url}" class="${cssClass}">${attrs[labelKey]}</a>`;
  };

  // The \b after each shortcode name stops a short name (vc_row, vc_column,
  // nectar_icon_list) from also matching its longer siblings (vc_row_inner,
  // vc_column_text, nectar_icon_list_item, ...).
  processed = processed
    // vc_row - convert to div with classes
    .replace(/\[vc_row\b([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-row'];
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');
      return `<div class="${classes.join(' ')}">`;
    })
    .replace(/\[\/vc_row\]/gi, '</div>')
    // vc_column - convert to div with classes
    .replace(/\[vc_column\b([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-column'];
      if (attrs.includes('1/1')) classes.push('col-1-1');
      if (attrs.includes('1/2')) classes.push('col-1-2');
      if (attrs.includes('1/3')) classes.push('col-1-3');
      if (attrs.includes('2/3')) classes.push('col-2-3');
      if (attrs.includes('1/4')) classes.push('col-1-4');
      if (attrs.includes('3/4')) classes.push('col-3-4');
      if (attrs.includes('5/12')) classes.push('col-5-12');
      if (attrs.includes('7/12')) classes.push('col-7-12');
      return `<div class="${classes.join(' ')}">`;
    })
    .replace(/\[\/vc_column\]/gi, '</div>')
    // vc_column_text - convert to div
    .replace(/\[vc_column_text\b[^\]]*\]/gi, '<div class="vc-column-text">')
    .replace(/\[\/vc_column_text\]/gi, '</div>')
    // nectar_cta - convert to link
    .replace(/\[nectar_cta\b([^\]]*)\]/gi, (match, attrs) => toAnchor(attrs, 'link_text', 'nectar-cta'))
    // nectar_highlighted_text - convert to span (body may span several lines)
    .replace(/\[nectar_highlighted_text\b[^\]]*\]([\s\S]*?)\[\/nectar_highlighted_text\]/gi,
      '<span class="nectar-highlighted">$1</span>')
    // nectar_responsive_text - convert to span (body may span several lines)
    .replace(/\[nectar_responsive_text\b[^\]]*\]([\s\S]*?)\[\/nectar_responsive_text\]/gi,
      '<span class="nectar-responsive">$1</span>')
    // nectar_icon_list - convert to ul
    .replace(/\[nectar_icon_list\b[^\]]*\]/gi, '<ul class="nectar-icon-list">')
    .replace(/\[\/nectar_icon_list\]/gi, '</ul>')
    // nectar_icon_list_item - convert to li
    .replace(/\[nectar_icon_list_item\b([^\]]*)\]/gi, (match, rawAttrs) => {
      const attrs = parseAttributes(rawAttrs);
      if (attrs.header === undefined || attrs.text === undefined) return '';
      return `<li><strong>${attrs.header}</strong>: ${attrs.text}</li>`;
    })
    // nectar_btn - convert to link
    .replace(/\[nectar_btn\b([^\]]*)\]/gi, (match, attrs) => toAnchor(attrs, 'text', 'nectar-btn'))
    // split_line_heading - convert to heading
    .replace(/\[split_line_heading\b([^\]]*)\]/gi, (match, rawAttrs) => {
      const attrs = parseAttributes(rawAttrs);
      if (attrs.text_content === undefined) return '';
      return `<h2 class="split-line-heading">${attrs.text_content}</h2>`;
    })
    // vc_row_inner - convert to div
    .replace(/\[vc_row_inner\b[^\]]*\]/gi, '<div class="vc-row-inner">')
    .replace(/\[\/vc_row_inner\]/gi, '</div>')
    // vc_column_inner - convert to div
    .replace(/\[vc_column_inner\b[^\]]*\]/gi, '<div class="vc-column-inner">')
    .replace(/\[\/vc_column_inner\]/gi, '</div>')
    // divider - convert to hr
    .replace(/\[divider\b[^\]]*\]/gi, '<hr class="divider" />')
    // NOTE(review): the three bracketed placeholders below are themselves
    // removed by the catch-all shortcode strip further down, and the emptied
    // <div> is then dropped, so galleries, raw JS and maps currently vanish
    // from excerpts. Confirm whether the placeholders were meant to survive.
    // vc_gallery - convert to div (placeholder)
    .replace(/\[vc_gallery\b[^\]]*\]/gi, '<div class="vc-gallery">[Gallery]</div>')
    // vc_raw_js - replace tag and payload (payload may span several lines)
    .replace(/\[vc_raw_js\b[^\]]*\][\s\S]*?\[\/vc_raw_js\]/gi, '<div class="vc-raw-js">[JavaScript]</div>')
    // nectar_gmap - convert to div
    .replace(/\[nectar_gmap\b[^\]]*\]/gi, '<div class="nectar-gmap">[Google Map]</div>');

  // Remove any remaining shortcodes
  processed = processed.replace(/\[.*?\]/g, '');

  // Clean up empty wrappers; repeat because removing an inner empty element
  // can leave its parent empty.
  let beforePass;
  do {
    beforePass = processed;
    processed = processed
      .replace(/<p[^>]*>\s*<\/p>/gi, '')
      .replace(/<div[^>]*>\s*<\/div>/gi, '');
  } while (processed !== beforePass);

  // Normalize whitespace
  return processed.replace(/\s+/g, ' ').trim();
}
// Extract excerpt from content
/**
 * Build a plain-text excerpt by stripping HTML tags and truncating.
 *
 * @param {string} content - HTML content; null/undefined/'' yields ''
 *   instead of throwing, matching how sanitizeHTML treats missing input.
 * @param {number} [maxLength=200] - Maximum number of characters kept before
 *   '...' is appended.
 * @returns {string} The tag-free text, truncated with '...' when it is longer
 *   than maxLength.
 */
function generateExcerpt(content, maxLength = 200) {
  if (!content) return '';
  const plainText = String(content).replace(/<[^>]*>/g, '');
  return plainText.length <= maxLength
    ? plainText
    : `${plainText.substring(0, maxLength)}...`;
}
// Process pages
/**
 * Convert raw English and German page records into the processed page shape.
 *
 * English pages come first in the result, followed by German pages; German
 * paths are prefixed with /de.
 *
 * Differences from a naive field copy:
 *  - The fallback excerpt is generated from the sanitized content, so it
 *    cannot contain raw shortcode or script text.
 *  - A missing `translationMapping.pages` section is treated as "no
 *    translations" instead of throwing.
 *  - Only own keys of the mapping count as matches (a slug such as
 *    "constructor" no longer matches an inherited property).
 *
 * @param {Array<object>} pagesEN - Raw English pages.
 * @param {Array<object>} pagesDE - Raw German pages.
 * @param {object} translationMapping - Mapping whose `pages` section is keyed
 *   by slug; each entry is read as `{ en, de }` ids.
 * @returns {Array<object>} Processed page records.
 */
function processPages(pagesEN, pagesDE, translationMapping) {
  const pageTranslations = (translationMapping && translationMapping.pages) || {};

  // Build one processed record; `otherLocale` is the locale of the counterpart page.
  const toProcessedPage = (page, locale, pathPrefix, otherLocale) => {
    const translationKey = page.slug;
    const match = Object.prototype.hasOwnProperty.call(pageTranslations, translationKey)
      ? pageTranslations[translationKey]
      : null;
    const contentHtml = sanitizeHTML(page.contentHtml);

    return {
      id: page.id,
      translationKey,
      locale,
      slug: page.slug,
      path: `${pathPrefix}/${page.slug}`,
      title: (page.titleHtml || '').replace(/<[^>]*>/g, ''),
      titleHtml: page.titleHtml,
      contentHtml,
      excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(contentHtml),
      featuredImage: page.featuredImage,
      updatedAt: page.updatedAt,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null
    };
  };

  return [
    ...pagesEN.map((page) => toProcessedPage(page, 'en', '', 'de')),
    ...pagesDE.map((page) => toProcessedPage(page, 'de', '/de', 'en'))
  ];
}
// Process posts
/**
 * Convert raw English and German blog posts into the processed post shape.
 *
 * English posts come first (paths under /blog), followed by German posts
 * (paths under /de/blog).
 *
 * Differences from a naive field copy:
 *  - The fallback excerpt is generated from the sanitized content, so it
 *    cannot contain raw shortcode or script text.
 *  - A missing `translationMapping.posts` section is treated as "no
 *    translations" instead of throwing.
 *  - Only own keys of the mapping count as matches.
 *
 * @param {Array<object>} postsEN - Raw English posts.
 * @param {Array<object>} postsDE - Raw German posts.
 * @param {object} translationMapping - Mapping whose `posts` section is keyed
 *   by slug; each entry is read as `{ en, de }` ids.
 * @returns {Array<object>} Processed post records.
 */
function processPosts(postsEN, postsDE, translationMapping) {
  const postTranslations = (translationMapping && translationMapping.posts) || {};

  // Build one processed record; `otherLocale` is the locale of the counterpart post.
  const toProcessedPost = (post, locale, pathPrefix, otherLocale) => {
    const translationKey = post.slug;
    const match = Object.prototype.hasOwnProperty.call(postTranslations, translationKey)
      ? postTranslations[translationKey]
      : null;
    const contentHtml = sanitizeHTML(post.contentHtml);

    return {
      id: post.id,
      translationKey,
      locale,
      slug: post.slug,
      path: `${pathPrefix}/blog/${post.slug}`,
      title: (post.titleHtml || '').replace(/<[^>]*>/g, ''),
      titleHtml: post.titleHtml,
      contentHtml,
      excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(contentHtml),
      featuredImage: post.featuredImage,
      datePublished: post.datePublished,
      updatedAt: post.updatedAt,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null
    };
  };

  return [
    ...postsEN.map((post) => toProcessedPost(post, 'en', '', 'de')),
    ...postsDE.map((post) => toProcessedPost(post, 'de', '/de', 'en'))
  ];
}
// Process products
/**
 * Convert raw English and German products into the processed product shape.
 *
 * English products come first (paths under /product), followed by German
 * products (paths under /de/product). Only `descriptionHtml` is passed
 * through sanitizeHTML; every other field is copied as-is.
 * NOTE(review): `shortDescriptionHtml` is copied unsanitized, as before -
 * confirm that is intended.
 *
 * A missing `translationMapping.products` section is treated as "no
 * translations" instead of throwing, and only own keys of the mapping count
 * as matches.
 *
 * @param {Array<object>} productsEN - Raw English products.
 * @param {Array<object>} productsDE - Raw German products.
 * @param {object} translationMapping - Mapping whose `products` section is
 *   keyed by slug; each entry is read as `{ en, de }` ids.
 * @returns {Array<object>} Processed product records.
 */
function processProducts(productsEN, productsDE, translationMapping) {
  const productTranslations = (translationMapping && translationMapping.products) || {};

  // Build one processed record; `otherLocale` is the locale of the counterpart product.
  const toProcessedProduct = (product, locale, pathPrefix, otherLocale) => {
    const translationKey = product.slug;
    const match = Object.prototype.hasOwnProperty.call(productTranslations, translationKey)
      ? productTranslations[translationKey]
      : null;

    return {
      id: product.id,
      translationKey,
      locale,
      slug: product.slug,
      path: `${pathPrefix}/product/${product.slug}`,
      name: product.name,
      shortDescriptionHtml: product.shortDescriptionHtml,
      descriptionHtml: sanitizeHTML(product.descriptionHtml),
      images: product.images,
      featuredImage: product.featuredImage,
      sku: product.sku,
      regularPrice: product.regularPrice,
      salePrice: product.salePrice,
      currency: product.currency,
      stockStatus: product.stockStatus,
      categories: product.categories,
      attributes: product.attributes,
      variations: product.variations,
      updatedAt: product.updatedAt,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null
    };
  };

  return [
    ...productsEN.map((product) => toProcessedProduct(product, 'en', '', 'de')),
    ...productsDE.map((product) => toProcessedProduct(product, 'de', '/de', 'en'))
  ];
}
// Process product categories
/**
 * Convert raw English and German product categories into the processed shape.
 *
 * English categories come first (paths under /product-category), followed by
 * German categories (paths under /de/product-category).
 *
 * A missing `translationMapping.productCategories` section is treated as "no
 * translations" instead of throwing, and only own keys of the mapping count
 * as matches.
 *
 * @param {Array<object>} categoriesEN - Raw English categories.
 * @param {Array<object>} categoriesDE - Raw German categories.
 * @param {object} translationMapping - Mapping whose `productCategories`
 *   section is keyed by slug; each entry is read as `{ en, de }` ids.
 * @returns {Array<object>} Processed category records.
 */
function processProductCategories(categoriesEN, categoriesDE, translationMapping) {
  const categoryTranslations =
    (translationMapping && translationMapping.productCategories) || {};

  // Build one processed record; `otherLocale` is the locale of the counterpart category.
  const toProcessedCategory = (category, locale, pathPrefix, otherLocale) => {
    const translationKey = category.slug;
    const match = Object.prototype.hasOwnProperty.call(categoryTranslations, translationKey)
      ? categoryTranslations[translationKey]
      : null;

    return {
      id: category.id,
      translationKey,
      locale,
      slug: category.slug,
      name: category.name,
      path: `${pathPrefix}/product-category/${category.slug}`,
      description: category.description,
      count: category.count,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null
    };
  };

  return [
    ...categoriesEN.map((category) => toProcessedCategory(category, 'en', '', 'de')),
    ...categoriesDE.map((category) => toProcessedCategory(category, 'de', '/de', 'en'))
  ];
}
// Process media manifest
/**
 * Map raw media records to the manifest shape used by the processed output.
 *
 * Each entry keeps id, filename, url, alt, width and height, gains a
 * `localPath` of `/media/<filename>`, and exposes the raw `mime_type`
 * field as `mimeType`.
 *
 * @param {Array<object>} media - Raw media records.
 * @returns {Array<object>} One manifest entry per input record, in order.
 */
function processMedia(media) {
  return media.map((entry) => {
    const { id, filename, url, alt, width, height } = entry;
    const localPath = `/media/${filename}`;
    return { id, filename, url, localPath, alt, width, height, mimeType: entry.mime_type };
  });
}
// Generate asset map for URL replacement
/**
 * Build a lookup from each media item's original URL to its local path
 * (`/media/<filename>`). Items with a falsy `url` are left out.
 *
 * @param {Array<object>} media - Raw media records.
 * @returns {object} Plain object keyed by original URL.
 */
function generateAssetMap(media) {
  return media.reduce((lookup, { url, filename }) => {
    if (url) {
      lookup[url] = `/media/${filename}`;
    }
    return lookup;
  }, {});
}
// Main processing function
/**
 * Run the whole pipeline: load the raw JSON files from the latest export
 * directory, process every content type, write the combined and per-type
 * JSON files into PROCESSED_DIR, and print a summary.
 *
 * Loading is best-effort: a file that cannot be read or parsed is reported
 * on stderr and replaced by a fallback of the right shape (an empty list for
 * list files, an empty object for site info, empty sections for the
 * translation mapping), so a missing file no longer surfaces later as an
 * unrelated TypeError.
 */
function main() {
  const exportDir = getLatestExportDir();
  console.log('🔄 Processing WordPress Data for Next.js');
  console.log('========================================\n');

  // Load raw data
  const loadJSON = (file, fallback = []) => {
    try {
      return JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8'));
    } catch (e) {
      console.error(`❌ Failed to load ${file}:`, e.message);
      return fallback;
    }
  };

  // Guarantee every mapping section exists, even when the file is missing
  // or only contains some of the sections.
  const translationMapping = Object.assign(
    { pages: {}, posts: {}, products: {}, productCategories: {} },
    loadJSON('translation-mapping-improved.json', {})
  );
  const pagesEN = loadJSON('pages.en.json');
  const pagesDE = loadJSON('pages.de.json');
  const postsEN = loadJSON('posts.en.json');
  const postsDE = loadJSON('posts.de.json');
  const productsEN = loadJSON('products.en.json');
  const productsDE = loadJSON('products.de.json');
  const categoriesEN = loadJSON('product-categories.en.json');
  const categoriesDE = loadJSON('product-categories.de.json');
  const media = loadJSON('media.json');
  const redirects = loadJSON('redirects.json');
  const siteInfo = loadJSON('site-info.json', {});

  console.log('📊 Processing content types...\n');

  // Process each content type
  const pages = processPages(pagesEN, pagesDE, translationMapping);
  const posts = processPosts(postsEN, postsDE, translationMapping);
  const products = processProducts(productsEN, productsDE, translationMapping);
  const categories = processProductCategories(categoriesEN, categoriesDE, translationMapping);
  const processedMedia = processMedia(media);
  const assetMap = generateAssetMap(media);

  // Create processed data structure
  const processedData = {
    site: {
      title: siteInfo.siteTitle,
      description: siteInfo.siteDescription,
      baseUrl: siteInfo.baseUrl,
      defaultLocale: siteInfo.defaultLocale || 'en',
      locales: ['en', 'de']
    },
    content: {
      pages,
      posts,
      products,
      categories
    },
    assets: {
      media: processedMedia,
      map: assetMap
    },
    redirects,
    exportDate: new Date().toISOString()
  };

  // Save processed data
  const outputPath = path.join(PROCESSED_DIR, 'wordpress-data.json');
  fs.writeFileSync(outputPath, JSON.stringify(processedData, null, 2));

  // Save individual files for easier access
  fs.writeFileSync(path.join(PROCESSED_DIR, 'pages.json'), JSON.stringify(pages, null, 2));
  fs.writeFileSync(path.join(PROCESSED_DIR, 'posts.json'), JSON.stringify(posts, null, 2));
  fs.writeFileSync(path.join(PROCESSED_DIR, 'products.json'), JSON.stringify(products, null, 2));
  fs.writeFileSync(path.join(PROCESSED_DIR, 'categories.json'), JSON.stringify(categories, null, 2));
  fs.writeFileSync(path.join(PROCESSED_DIR, 'media.json'), JSON.stringify(processedMedia, null, 2));
  fs.writeFileSync(path.join(PROCESSED_DIR, 'asset-map.json'), JSON.stringify(assetMap, null, 2));

  // Summary
  console.log('✅ Data Processing Complete\n');
  console.log('📦 Processed Content:');
  console.log(` Pages: ${pages.length} (with translations)`);
  console.log(` Posts: ${posts.length} (with translations)`);
  console.log(` Products: ${products.length} (with translations)`);
  console.log(` Categories: ${categories.length} (with translations)`);
  console.log(` Media: ${processedMedia.length} files`);
  console.log(` Redirects: ${redirects.length} rules\n`);
  console.log('📁 Output Files:');
  console.log(` ${outputPath}`);
  console.log(` ${path.join(PROCESSED_DIR, 'pages.json')}`);
  console.log(` ${path.join(PROCESSED_DIR, 'posts.json')}`);
  console.log(` ${path.join(PROCESSED_DIR, 'products.json')}`);
  console.log(` ${path.join(PROCESSED_DIR, 'categories.json')}`);
  console.log(` ${path.join(PROCESSED_DIR, 'media.json')}`);
  console.log(` ${path.join(PROCESSED_DIR, 'asset-map.json')}\n`);

  // Sample data
  if (pages.length > 0) {
    console.log('📄 Sample Page:');
    console.log(` Title: ${pages[0].title}`);
    console.log(` Path: ${pages[0].path}`);
    console.log(` Locale: ${pages[0].locale}`);
    console.log(` Translation: ${pages[0].translation ? 'Yes' : 'No'}\n`);
  }
  if (posts.length > 0) {
    console.log('📝 Sample Post:');
    console.log(` Title: ${posts[0].title}`);
    console.log(` Path: ${posts[0].path}`);
    console.log(` Locale: ${posts[0].locale}`);
    console.log(` Date: ${posts[0].datePublished}\n`);
  }
  console.log('💡 Next: Ready for Next.js project setup!');
}
// Run the pipeline only when this file is executed directly (e.g. `node process-data-fixed.js`),
// not when it is loaded via require() from another module.
if (require.main === module) {
main();
}