migration wip

This commit is contained in:
2025-12-30 00:06:54 +01:00
parent 3efbac78cb
commit 89dbf8af87
94 changed files with 5674 additions and 308 deletions

View File

@@ -0,0 +1,563 @@
#!/usr/bin/env node
/**
* WordPress → Next.js Data Processing Pipeline
* Transforms raw WordPress data into Next.js compatible format
*/
const fs = require('fs');
const path = require('path');
const DATA_DIR = path.join(__dirname, '..', 'data');
const RAW_DIR = path.join(DATA_DIR, 'raw');
const PROCESSED_DIR = path.join(DATA_DIR, 'processed');
// Ensure the output directory for processed data exists before anything is written to it.
// This runs at module load time (it is outside main()), so it also happens when the
// file is loaded via require(). `recursive: true` creates missing parent directories too.
if (!fs.existsSync(PROCESSED_DIR)) {
fs.mkdirSync(PROCESSED_DIR, { recursive: true });
}
/**
 * Locate the most recent raw export directory inside RAW_DIR.
 *
 * Directories are compared by name, so "latest" means the entry that sorts
 * last lexicographically (assumes timestamp-like directory names - confirm
 * against the export script).
 *
 * @returns {string} Path of the last-sorting sub-directory of RAW_DIR.
 * @throws {Error} If RAW_DIR contains no sub-directories. Errors from
 *   fs.readdirSync / fs.statSync (e.g. RAW_DIR missing) propagate unchanged.
 */
function getLatestExportDir() {
  const exportNames = fs
    .readdirSync(RAW_DIR)
    .filter((entry) => fs.statSync(path.join(RAW_DIR, entry)).isDirectory())
    .sort();

  // Without this guard path.join(RAW_DIR, undefined) fails with an opaque
  // "path argument must be of type string" TypeError.
  if (exportNames.length === 0) {
    throw new Error(`No export directories found in ${RAW_DIR}`);
  }

  return path.join(RAW_DIR, exportNames[exportNames.length - 1]);
}
/**
 * Clean WordPress/WPBakery HTML: strip scripts and inline event handlers,
 * turn layout shortcodes into <div> wrappers, drop every other shortcode and
 * collapse whitespace.
 *
 * NOTE(review): this is regex-based, best-effort cleaning, not a full HTML
 * sanitizer. Do not rely on it alone for untrusted input.
 *
 * @param {string|null|undefined} html Raw HTML, possibly containing shortcodes.
 * @returns {string} Cleaned HTML ('' for empty/missing input).
 */
function sanitizeHTML(html) {
  if (!html) return '';

  return html
    // Security: remove <script> elements (content may span lines) ...
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    // ... and inline event handlers, whether the value is double-quoted,
    // single-quoted or unquoted.
    .replace(/\son\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/gi, '')
    // Layout shortcodes become <div> wrappers. Each pattern requires the
    // shortcode name to end right there (whitespace or "]"), so "[vc_row" can
    // no longer swallow "[vc_row_inner" and "[vc_column" can no longer swallow
    // "[vc_column_text" / "[vc_column_inner". The *_inner variants are handled
    // explicitly so every opening <div> gets its matching </div>.
    .replace(/\[vc_column_text(?:\s[^\]]*)?\]/gi, '<div class="vc-text">')
    .replace(/\[\/vc_column_text\]/gi, '</div>')
    .replace(
      /\[vc_row(_inner)?(?:\s[^\]]*)?\]/gi,
      (match, inner) => `<div class="vc-row${inner ? '-inner' : ''}">`
    )
    .replace(/\[\/vc_row(?:_inner)?\]/gi, '</div>')
    .replace(
      /\[vc_column(_inner)?(?:\s[^\]]*)?\]/gi,
      (match, inner) => `<div class="vc-column${inner ? '-inner' : ''}">`
    )
    .replace(/\[\/vc_column(?:_inner)?\]/gi, '</div>')
    // Nectar shortcodes: keep the text wrapped by [nectar_cta], drop the rest.
    .replace(/\[nectar_cta.*?\]([\s\S]*?)\[\/nectar_cta\]/gi, '$1')
    .replace(/\[nectar.*?\]/gi, '')
    // Remove all remaining shortcodes.
    .replace(/\[.*?\]/g, '')
    // Remove paragraphs and divs that ended up empty.
    .replace(/<p[^>]*>\s*<\/p>/gi, '')
    .replace(/<div[^>]*>\s*<\/div>/gi, '')
    // Normalize whitespace but preserve HTML structure.
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Convert an excerpt containing WPBakery / Nectar shortcodes and typographic
 * HTML entities into plain HTML.
 *
 * Steps:
 *  1. Decode a whitelist of HTML entities (numeric, hex and named) for
 *     quotes, dashes, ellipsis, bullet and euro. Other entities (&amp;, &lt;,
 *     ...) are deliberately left untouched.
 *  2. Normalize typographic quotes, primes and the en dash to ASCII so that
 *     shortcode attributes written with curly quotes can be parsed.
 *     Low quotes (U+201E, U+201A) map to " and ' - they are opening quotation
 *     marks (e.g. in German), not commas.
 *  3. Translate known shortcodes to HTML, then strip whatever is left.
 *  4. Drop empty <p>/<div> elements and collapse whitespace.
 *
 * @param {string|null|undefined} excerptHtml Raw excerpt HTML.
 * @returns {string} Processed excerpt ('' for empty/missing input).
 */
function processExcerptShortcodes(excerptHtml) {
  if (!excerptHtml) return '';

  // Named entities that are decoded (Map avoids prototype-key lookups).
  const namedEntities = new Map([
    ['quot', '"'],
    ['apos', "'"],
    ['lsquo', '\u2018'],
    ['rsquo', '\u2019'],
    ['ldquo', '\u201C'],
    ['rdquo', '\u201D'],
    ['ndash', '\u2013'],
    ['mdash', '\u2014'],
    ['hellip', '\u2026'],
    ['bull', '\u2022'],
    ['euro', '\u20AC'],
  ]);

  // Code points that numeric entities (&#8221; / &#x201D;) may decode to.
  const decodableCodePoints = new Set([
    0x27, // apostrophe (&#039;)
    0x2013, // en dash
    0x2014, // em dash
    0x2018, // left single quote
    0x2019, // right single quote
    0x201a, // single low quote
    0x201b, // single high reversed quote
    0x201c, // left double quote
    0x201d, // right double quote
    0x201e, // low double quote
    0x201f, // high double quote
    0x2022, // bullet
    0x2026, // ellipsis
    0x2032, // prime (feet)
    0x2033, // double prime (inches)
    0x20ac, // euro
  ]);

  // Typographic characters that are flattened to ASCII. Em dash, ellipsis,
  // bullet and euro are intentionally kept as-is.
  const asciiFor = new Map([
    ['\u2013', '-'],
    ['\u2018', "'"],
    ['\u2019', "'"],
    ['\u201A', "'"],
    ['\u201B', '`'],
    ['\u201C', '"'],
    ['\u201D', '"'],
    ['\u201E', '"'],
    ['\u201F', '"'],
    ['\u2032', "'"],
    ['\u2033', '"'],
  ]);

  // Placeholder texts emitted below that must survive the catch-all
  // "remove remaining shortcodes" pass.
  const placeholders = new Set(['[Gallery]', '[JavaScript]', '[Google Map]']);

  // Column-width fragments looked for in [vc_column] attributes.
  const columnWidths = [
    ['1/1', 'col-1-1'],
    ['1/2', 'col-1-2'],
    ['1/3', 'col-1-3'],
    ['2/3', 'col-2-3'],
    ['1/4', 'col-1-4'],
    ['3/4', 'col-3-4'],
    ['5/12', 'col-5-12'],
    ['7/12', 'col-7-12'],
  ];

  // 1. Decode whitelisted entities; unknown entities are returned unchanged.
  let processed = excerptHtml.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z]+));/g,
    (entity, decimal, hex, name) => {
      if (name !== undefined) {
        return namedEntities.has(name) ? namedEntities.get(name) : entity;
      }
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      return decodableCodePoints.has(codePoint) ? String.fromCodePoint(codePoint) : entity;
    }
  );

  // 2. Flatten typographic characters (decoded above or already present).
  processed = processed.replace(/[\u2013\u2018-\u201F\u2032\u2033]/g, (ch) => asciiFor.get(ch));

  // 3. Translate shortcodes. `(?=[\s\]])` pins the end of the shortcode name so
  //    that e.g. "[vc_row" does not also match "[vc_row_inner" and
  //    "[nectar_icon_list" does not also match "[nectar_icon_list_item".
  //    Attribute gaps use [^\]]*? so a match never runs past the closing "]".
  processed = processed
    // vc_row - convert to div with classes
    .replace(/\[vc_row(?=[\s\]])([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-row'];
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');
      return `<div class="${classes.join(' ')}">`;
    })
    .replace(/\[\/vc_row\]/gi, '</div>')
    // vc_column - convert to div with width classes
    .replace(/\[vc_column(?=[\s\]])([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-column'];
      for (const [fraction, className] of columnWidths) {
        if (attrs.includes(fraction)) classes.push(className);
      }
      return `<div class="${classes.join(' ')}">`;
    })
    .replace(/\[\/vc_column\]/gi, '</div>')
    // vc_column_text - convert to div
    .replace(/\[vc_column_text(?=[\s\]])([^\]]*)\]/gi, '<div class="vc-column-text">')
    .replace(/\[\/vc_column_text\]/gi, '</div>')
    // nectar_cta - convert to link (link_text must precede url)
    .replace(
      /\[nectar_cta([^\]]*)link_text="([^"]*)"([^\]]*?)url="([^"]*)"([^\]]*?)\]/gi,
      '<a href="$4" class="nectar-cta">$2</a>'
    )
    // nectar_highlighted_text - convert to span (content may span lines)
    .replace(
      /\[nectar_highlighted_text([^\]]*)\]([\s\S]*?)\[\/nectar_highlighted_text\]/gi,
      '<span class="nectar-highlighted">$2</span>'
    )
    // nectar_responsive_text - convert to span
    .replace(
      /\[nectar_responsive_text([^\]]*)\]([\s\S]*?)\[\/nectar_responsive_text\]/gi,
      '<span class="nectar-responsive">$2</span>'
    )
    // nectar_icon_list - convert to ul
    .replace(/\[nectar_icon_list(?=[\s\]])([^\]]*)\]/gi, '<ul class="nectar-icon-list">')
    .replace(/\[\/nectar_icon_list\]/gi, '</ul>')
    // nectar_icon_list_item - convert to li (header must precede text);
    // a trailing [/nectar_icon_list_item] is removed by the catch-all below
    .replace(
      /\[nectar_icon_list_item([^\]]*)header="([^"]*)"([^\]]*?)text="([^"]*)"([^\]]*?)\]/gi,
      '<li><strong>$2</strong>: $4</li>'
    )
    // nectar_btn - convert to link (text must precede url)
    .replace(
      /\[nectar_btn([^\]]*)text="([^"]*)"([^\]]*?)url="([^"]*)"([^\]]*?)\]/gi,
      '<a href="$4" class="nectar-btn">$2</a>'
    )
    // split_line_heading - convert to heading
    .replace(
      /\[split_line_heading([^\]]*)text_content="([^"]*)"([^\]]*?)\]/gi,
      '<h2 class="split-line-heading">$2</h2>'
    )
    // vc_row_inner / vc_column_inner - convert to div
    .replace(/\[vc_row_inner([^\]]*)\]/gi, '<div class="vc-row-inner">')
    .replace(/\[\/vc_row_inner\]/gi, '</div>')
    .replace(/\[vc_column_inner([^\]]*)\]/gi, '<div class="vc-column-inner">')
    .replace(/\[\/vc_column_inner\]/gi, '</div>')
    // divider - convert to hr
    .replace(/\[divider([^\]]*)\]/gi, '<hr class="divider" />')
    // vc_gallery - placeholder
    .replace(/\[vc_gallery([^\]]*)\]/gi, '<div class="vc-gallery">[Gallery]</div>')
    // vc_raw_js - replace the embedded script with a placeholder
    .replace(/\[vc_raw_js\]([\s\S]*?)\[\/vc_raw_js\]/gi, '<div class="vc-raw-js">[JavaScript]</div>')
    // nectar_gmap - placeholder
    .replace(/\[nectar_gmap([^\]]*)\]/gi, '<div class="nectar-gmap">[Google Map]</div>');

  // Remove any remaining shortcodes, but keep the placeholders inserted above
  // (they look like shortcodes and would otherwise be erased immediately).
  processed = processed.replace(/\[.*?\]/g, (token) => (placeholders.has(token) ? token : ''));

  // 4. Remove elements that ended up empty, then normalize whitespace.
  processed = processed.replace(/<p[^>]*>\s*<\/p>/gi, '');
  processed = processed.replace(/<div[^>]*>\s*<\/div>/gi, '');
  return processed.replace(/\s+/g, ' ').trim();
}
/**
 * Build a plain-text excerpt by stripping HTML tags and truncating.
 *
 * @param {string|null|undefined} content HTML content.
 * @param {number} [maxLength=200] Maximum number of characters kept before
 *   '...' is appended.
 * @returns {string} Tag-free text, truncated to maxLength plus '...' when
 *   longer; '' for empty/missing content.
 */
function generateExcerpt(content, maxLength = 200) {
  // Missing content is common in exported data; treat it like empty content
  // instead of throwing on `.replace`.
  if (!content) return '';

  const text = String(content).replace(/<[^>]*>/g, '');
  return text.length <= maxLength ? text : `${text.substring(0, maxLength)}...`;
}
/**
 * Convert raw English and German pages into the processed page format.
 *
 * English entries come first, followed by German entries (under /de).
 * The translation link is looked up in translationMapping.pages by the page's
 * own slug; a missing mapping simply yields `translation: null`.
 *
 * @param {Array<object>} pagesEN Raw English pages.
 * @param {Array<object>} pagesDE Raw German pages.
 * @param {{pages?: object}} translationMapping Mapping with a `pages` table
 *   keyed by slug whose values hold `en` / `de` ids.
 * @returns {Array<object>} Processed pages for both locales.
 */
function processPages(pagesEN, pagesDE, translationMapping) {
  const pageTranslations = (translationMapping && translationMapping.pages) || {};

  // Own-property lookup so slugs such as "constructor" never hit the prototype.
  const findTranslation = (key) =>
    Object.prototype.hasOwnProperty.call(pageTranslations, key) ? pageTranslations[key] : undefined;

  // Build one processed entry; `otherLocale` is the locale of the counterpart.
  const toEntry = (page, locale, pathPrefix, otherLocale) => {
    const translationKey = page.slug;
    const match = findTranslation(translationKey);
    const contentHtml = sanitizeHTML(page.contentHtml);

    return {
      id: page.id,
      translationKey,
      locale,
      slug: page.slug,
      path: `${pathPrefix}/${page.slug}`,
      title: (page.titleHtml || '').replace(/<[^>]*>/g, ''),
      titleHtml: page.titleHtml,
      contentHtml,
      // Fall back to an excerpt of the *sanitized* content so raw shortcode
      // text such as "[vc_row ...]" never ends up in the excerpt.
      excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(contentHtml),
      featuredImage: page.featuredImage,
      updatedAt: page.updatedAt,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null,
    };
  };

  return [
    ...pagesEN.map((page) => toEntry(page, 'en', '', 'de')),
    ...pagesDE.map((page) => toEntry(page, 'de', '/de', 'en')),
  ];
}
/**
 * Convert raw English and German blog posts into the processed post format.
 *
 * English entries come first (/blog/...), followed by German entries
 * (/de/blog/...). The translation link is looked up in
 * translationMapping.posts by the post's own slug; a missing mapping yields
 * `translation: null`.
 *
 * @param {Array<object>} postsEN Raw English posts.
 * @param {Array<object>} postsDE Raw German posts.
 * @param {{posts?: object}} translationMapping Mapping with a `posts` table
 *   keyed by slug whose values hold `en` / `de` ids.
 * @returns {Array<object>} Processed posts for both locales.
 */
function processPosts(postsEN, postsDE, translationMapping) {
  const postTranslations = (translationMapping && translationMapping.posts) || {};

  // Own-property lookup so slugs such as "constructor" never hit the prototype.
  const findTranslation = (key) =>
    Object.prototype.hasOwnProperty.call(postTranslations, key) ? postTranslations[key] : undefined;

  // Build one processed entry; `otherLocale` is the locale of the counterpart.
  const toEntry = (post, locale, pathPrefix, otherLocale) => {
    const translationKey = post.slug;
    const match = findTranslation(translationKey);
    const contentHtml = sanitizeHTML(post.contentHtml);

    return {
      id: post.id,
      translationKey,
      locale,
      slug: post.slug,
      path: `${pathPrefix}/${post.slug}`,
      title: (post.titleHtml || '').replace(/<[^>]*>/g, ''),
      titleHtml: post.titleHtml,
      contentHtml,
      // Fall back to an excerpt of the *sanitized* content so raw shortcode
      // text such as "[vc_row ...]" never ends up in the excerpt.
      excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(contentHtml),
      featuredImage: post.featuredImage,
      datePublished: post.datePublished,
      updatedAt: post.updatedAt,
      translation: match ? { locale: otherLocale, id: match[otherLocale] } : null,
    };
  };

  return [
    ...postsEN.map((post) => toEntry(post, 'en', '/blog', 'de')),
    ...postsDE.map((post) => toEntry(post, 'de', '/de/blog', 'en')),
  ];
}
/**
 * Map raw English and German products to the processed product shape.
 *
 * English products are emitted first (/product/...), then German ones
 * (/de/product/...). Only descriptionHtml is passed through sanitizeHTML;
 * all other fields are copied as-is. The translation link is read from
 * translationMapping.products using the product's own slug.
 *
 * @param {Array<object>} productsEN Raw English products.
 * @param {Array<object>} productsDE Raw German products.
 * @param {{products: object}} translationMapping Slug-keyed translation table.
 * @returns {Array<object>} Processed products for both locales.
 */
function processProducts(productsEN, productsDE, translationMapping) {
  const result = [];

  // Appends every item of one locale; `counterpartLocale` names the other one.
  const appendLocalized = (items, locale, counterpartLocale, pathPrefix) => {
    items.forEach((item) => {
      const key = item.slug;
      const counterpart = translationMapping.products[key];

      result.push({
        id: item.id,
        translationKey: key,
        locale: locale,
        slug: item.slug,
        path: `${pathPrefix}/${item.slug}`,
        name: item.name,
        shortDescriptionHtml: item.shortDescriptionHtml,
        descriptionHtml: sanitizeHTML(item.descriptionHtml),
        images: item.images,
        featuredImage: item.featuredImage,
        sku: item.sku,
        regularPrice: item.regularPrice,
        salePrice: item.salePrice,
        currency: item.currency,
        stockStatus: item.stockStatus,
        categories: item.categories,
        attributes: item.attributes,
        variations: item.variations,
        updatedAt: item.updatedAt,
        translation: counterpart
          ? { locale: counterpartLocale, id: counterpart[counterpartLocale] }
          : null,
      });
    });
  };

  appendLocalized(productsEN, 'en', 'de', '/product');
  appendLocalized(productsDE, 'de', 'en', '/de/product');

  return result;
}
/**
 * Map raw English and German product categories to the processed shape.
 *
 * English categories are emitted first (/product-category/...), then German
 * ones (/de/product-category/...). The translation link is read from
 * translationMapping.productCategories using the category's own slug.
 *
 * @param {Array<object>} categoriesEN Raw English categories.
 * @param {Array<object>} categoriesDE Raw German categories.
 * @param {{productCategories: object}} translationMapping Slug-keyed table.
 * @returns {Array<object>} Processed categories for both locales.
 */
function processProductCategories(categoriesEN, categoriesDE, translationMapping) {
  const result = [];

  // Appends every item of one locale; `counterpartLocale` names the other one.
  const appendLocalized = (items, locale, counterpartLocale, pathPrefix) => {
    items.forEach((item) => {
      const key = item.slug;
      const counterpart = translationMapping.productCategories[key];

      result.push({
        id: item.id,
        translationKey: key,
        locale: locale,
        slug: item.slug,
        name: item.name,
        path: `${pathPrefix}/${item.slug}`,
        description: item.description,
        count: item.count,
        translation: counterpart
          ? { locale: counterpartLocale, id: counterpart[counterpartLocale] }
          : null,
      });
    });
  };

  appendLocalized(categoriesEN, 'en', 'de', '/product-category');
  appendLocalized(categoriesDE, 'de', 'en', '/de/product-category');

  return result;
}
/**
 * Build the media manifest: copy the descriptive fields of each raw media
 * item, expose `mime_type` as `mimeType`, and add the local path under /media.
 *
 * @param {Array<object>} media Raw media items.
 * @returns {Array<object>} One manifest entry per media item, in input order.
 */
function processMedia(media) {
  return media.map(({ id, filename, url, alt, width, height, mime_type: mimeType }) => ({
    id,
    filename,
    url,
    localPath: `/media/${filename}`,
    alt,
    width,
    height,
    mimeType,
  }));
}
/**
 * Build a lookup from original media URL to local /media path, used for URL
 * replacement. Items without a URL are skipped; if a URL occurs more than
 * once, the last item wins.
 *
 * @param {Array<object>} media Raw media items.
 * @returns {Object<string, string>} Map of original URL -> `/media/<filename>`.
 */
function generateAssetMap(media) {
  return media.reduce((urlToLocalPath, { url, filename }) => {
    if (url) {
      urlToLocalPath[url] = `/media/${filename}`;
    }
    return urlToLocalPath;
  }, {});
}
/**
 * Run the whole pipeline: load the latest raw export, process every content
 * type, write the combined and per-type JSON files into PROCESSED_DIR and
 * print a summary.
 *
 * A raw file that cannot be read or parsed is reported on stderr and replaced
 * by a fallback of the right shape (empty list for collections, empty object
 * for the translation mapping and site info), so one missing file does not
 * crash the later processing steps.
 *
 * @throws Propagates errors from getLatestExportDir() and from writing the
 *   output files.
 */
function main() {
  const exportDir = getLatestExportDir();
  console.log('🔄 Processing WordPress Data for Next.js');
  console.log('========================================\n');

  // Load one raw JSON file; `fallback` must match the shape callers expect.
  const loadJSON = (file, fallback = []) => {
    try {
      return JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8'));
    } catch (e) {
      console.error(`❌ Failed to load ${file}:`, e.message);
      return fallback;
    }
  };

  // The processors index into these four tables, so make sure each exists
  // even when the mapping file is missing or incomplete.
  const translationMapping = {
    pages: {},
    posts: {},
    products: {},
    productCategories: {},
    ...loadJSON('translation-mapping-improved.json', {}),
  };
  const pagesEN = loadJSON('pages.en.json');
  const pagesDE = loadJSON('pages.de.json');
  const postsEN = loadJSON('posts.en.json');
  const postsDE = loadJSON('posts.de.json');
  const productsEN = loadJSON('products.en.json');
  const productsDE = loadJSON('products.de.json');
  const categoriesEN = loadJSON('product-categories.en.json');
  const categoriesDE = loadJSON('product-categories.de.json');
  const media = loadJSON('media.json');
  const redirects = loadJSON('redirects.json');
  const siteInfo = loadJSON('site-info.json', {});

  console.log('📊 Processing content types...\n');

  // Process each content type
  const pages = processPages(pagesEN, pagesDE, translationMapping);
  const posts = processPosts(postsEN, postsDE, translationMapping);
  const products = processProducts(productsEN, productsDE, translationMapping);
  const categories = processProductCategories(categoriesEN, categoriesDE, translationMapping);
  const processedMedia = processMedia(media);
  const assetMap = generateAssetMap(media);

  // Combined data structure
  const processedData = {
    site: {
      title: siteInfo.siteTitle,
      description: siteInfo.siteDescription,
      baseUrl: siteInfo.baseUrl,
      defaultLocale: siteInfo.defaultLocale || 'en',
      locales: ['en', 'de'],
    },
    content: {
      pages,
      posts,
      products,
      categories,
    },
    assets: {
      media: processedMedia,
      map: assetMap,
    },
    redirects,
    exportDate: new Date().toISOString(),
  };

  // Combined file first, then one file per content type for easier access.
  const outputPath = path.join(PROCESSED_DIR, 'wordpress-data.json');
  const outputs = [
    [outputPath, processedData],
    [path.join(PROCESSED_DIR, 'pages.json'), pages],
    [path.join(PROCESSED_DIR, 'posts.json'), posts],
    [path.join(PROCESSED_DIR, 'products.json'), products],
    [path.join(PROCESSED_DIR, 'categories.json'), categories],
    [path.join(PROCESSED_DIR, 'media.json'), processedMedia],
    [path.join(PROCESSED_DIR, 'asset-map.json'), assetMap],
  ];
  for (const [filePath, data] of outputs) {
    fs.writeFileSync(filePath, JSON.stringify(data, null, 2));
  }

  // Summary
  console.log('✅ Data Processing Complete\n');
  console.log('📦 Processed Content:');
  console.log(` Pages: ${pages.length} (with translations)`);
  console.log(` Posts: ${posts.length} (with translations)`);
  console.log(` Products: ${products.length} (with translations)`);
  console.log(` Categories: ${categories.length} (with translations)`);
  console.log(` Media: ${processedMedia.length} files`);
  console.log(` Redirects: ${redirects.length} rules\n`);

  console.log('📁 Output Files:');
  outputs.forEach(([filePath], index) => {
    // The last entry carries the blank line that separates the sections.
    const suffix = index === outputs.length - 1 ? '\n' : '';
    console.log(` ${filePath}${suffix}`);
  });

  // Sample data
  if (pages.length > 0) {
    const [samplePage] = pages;
    console.log('📄 Sample Page:');
    console.log(` Title: ${samplePage.title}`);
    console.log(` Path: ${samplePage.path}`);
    console.log(` Locale: ${samplePage.locale}`);
    console.log(` Translation: ${samplePage.translation ? 'Yes' : 'No'}\n`);
  }
  if (posts.length > 0) {
    const [samplePost] = posts;
    console.log('📝 Sample Post:');
    console.log(` Title: ${samplePost.title}`);
    console.log(` Path: ${samplePost.path}`);
    console.log(` Locale: ${samplePost.locale}`);
    console.log(` Date: ${samplePost.datePublished}\n`);
  }
  console.log('💡 Next: Ready for Next.js project setup!');
}
// Run the pipeline only when this file is executed directly with node,
// not when it is loaded via require() from another module.
if (require.main === module) {
main();
}