migration wip

This commit is contained in:
2025-12-30 16:19:42 +01:00
parent 65a7e9f24a
commit 4ae6b36da9
149 changed files with 32034 additions and 34406 deletions

View File

@@ -27,52 +27,83 @@ function getLatestExportDir() {
return path.join(RAW_DIR, dirs[0]);
}
// Named entities this script decodes, keyed by entity name (without "&" / ";").
// Non-ASCII values are written as \u escapes so the table survives tooling
// that re-encodes or strips literal non-ASCII characters.
// Typographic quotes and the en dash are folded to ASCII on purpose, so that
// later shortcode matching on attr="value" works with plain quotes.
const HTML_NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['lsquo', "'"],
  ['rsquo', "'"],
  ['ldquo', '"'],
  ['rdquo', '"'],
  ['Prime', '"'], // Double prime (U+2033)
  ['ndash', '-'],
  ['mdash', '\u2014'],
  ['hellip', '\u2026'],
  ['bull', '\u2022'],
  ['euro', '\u20AC'],
  ['copy', '\u00A9'],
  ['reg', '\u00AE'],
  ['trade', '\u2122'],
  ['deg', '\u00B0'],
  ['plusmn', '\u00B1'],
  ['times', '\u00D7'],
  ['divide', '\u00F7'],
  ['cent', '\u00A2'],
  ['pound', '\u00A3'],
  ['yen', '\u00A5'],
  ['sect', '\u00A7'],
  ['para', '\u00B6'],
  ['micro', '\u00B5'],
  ['laquo', '\u00AB'],
  ['raquo', '\u00BB'],
  ['middot', '\u00B7'],
]);

// Literal typographic characters (already present in the text, or produced by
// numeric entities such as &#8221;) that are normalised to ASCII.
const TYPOGRAPHIC_TO_ASCII = new Map([
  ['\u2018', "'"], // Left single quote
  ['\u2019', "'"], // Right single quote
  ['\u201C', '"'], // Left double quote
  ['\u201D', '"'], // Right double quote
  ['\u2033', '"'], // Double prime
  ['\u2013', '-'], // En dash
]);

// One alternation for decimal, hex and named references, so every reference is
// decoded exactly once ("&amp;lt;" becomes "&lt;", not "<").
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/g;
const TYPOGRAPHIC_PATTERN = /[\u2018\u2019\u201C\u201D\u2033\u2013]/g;
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode HTML entities in text and normalise typographic quotes/dashes.
 *
 * Handles decimal (`&#8217;`) and hexadecimal (`&#x2019;`) numeric references,
 * including code points above U+FFFF, plus the named entities listed in
 * HTML_NAMED_ENTITIES. Unknown named entities and out-of-range numeric
 * references are left untouched.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} Decoded text; an empty string when `text` is falsy.
 */
function decodeHTMLEntities(text) {
  if (!text) return '';

  const decoded = String(text).replace(HTML_ENTITY_PATTERN, (match, dec, hex, name) => {
    if (name !== undefined) {
      // Map lookup (not a plain object) so names like "constructor" never resolve.
      return HTML_NAMED_ENTITIES.has(name) ? HTML_NAMED_ENTITIES.get(name) : match;
    }

    const codePoint = dec !== undefined
      ? Number.parseInt(dec, 10)
      : Number.parseInt(hex, 16);

    // String.fromCodePoint throws on values past U+10FFFF; keep those verbatim.
    if (!Number.isFinite(codePoint) || codePoint > MAX_CODE_POINT) {
      return match;
    }
    return String.fromCodePoint(codePoint);
  });

  return decoded.replace(TYPOGRAPHIC_PATTERN, (char) => TYPOGRAPHIC_TO_ASCII.get(char));
}
// HTML sanitization - preserve content but clean dangerous elements
// Also preserves bg_image attributes for later processing by fix-images.js
function sanitizeHTML(html) {
if (!html) return '';
let sanitized = html;
// Temporarily preserve bg_image attributes by replacing them with placeholders
// Handle both regular quotes and Unicode quotes
const bgImagePlaceholders = [];
sanitized = sanitized.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
bgImagePlaceholders.push(match);
return placeholder;
});
// Remove script tags and inline handlers (security)
sanitized = sanitized.replace(/<script.*?>.*?<\/script>/gis, '');
sanitized = sanitized.replace(/\son\w+=".*?"/gi, '');
@@ -103,6 +134,11 @@ function sanitizeHTML(html) {
// Normalize whitespace but preserve HTML structure
sanitized = sanitized.replace(/\s+/g, ' ').trim();
// Restore bg_image placeholders
bgImagePlaceholders.forEach((placeholder, index) => {
sanitized = sanitized.replace(`__BG_IMAGE_${index}__`, placeholder);
});
return sanitized;
}
@@ -115,15 +151,29 @@ function processExcerptShortcodes(excerptHtml) {
// First, decode HTML entities to regular characters
processed = decodeHTMLEntities(processed);
// Temporarily preserve bg_image attributes (handle both regular and Unicode quotes)
const bgImagePlaceholders = [];
processed = processed.replace(/(bg_image=)(["”])([^"”]*?)["”]/gi, (match) => {
const placeholder = `__BG_IMAGE_${bgImagePlaceholders.length}__`;
bgImagePlaceholders.push(match);
return placeholder;
});
// Process WPBakery shortcodes with HTML entities
processed = processed
// vc_row - convert to div with classes (handle both complete and truncated)
// Preserve any placeholders in the attributes
.replace(/\[vc_row([^\]]*)\]/gi, (match, attrs) => {
const classes = ['vc-row'];
if (attrs.includes('full_width_background')) classes.push('full-width-bg');
if (attrs.includes('in_container')) classes.push('in-container');
if (attrs.includes('full_width_content')) classes.push('full-width-content');
return `<div class="${classes.join(' ')}">`;
// Extract and preserve placeholders from attrs
const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
const preservedAttrs = placeholderMatches.join(' ');
return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
})
// Handle truncated vc_row (no closing bracket)
.replace(/\[vc_row([^\]]*)$/gi, (match, attrs) => {
@@ -131,7 +181,12 @@ function processExcerptShortcodes(excerptHtml) {
if (attrs.includes('full_width_background')) classes.push('full-width-bg');
if (attrs.includes('in_container')) classes.push('in-container');
if (attrs.includes('full_width_content')) classes.push('full-width-content');
return `<div class="${classes.join(' ')}">`;
// Extract and preserve placeholders from attrs
const placeholderMatches = attrs.match(/__BG_IMAGE_\d+__/g) || [];
const preservedAttrs = placeholderMatches.join(' ');
return `<div class="${classes.join(' ')}" ${preservedAttrs}>`;
})
.replace(/\[\/vc_row\]/gi, '</div>')
@@ -172,15 +227,15 @@ function processExcerptShortcodes(excerptHtml) {
.replace(/\[\/vc_column_text\]/gi, '</div>')
// nectar_cta - convert to button
.replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
.replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
'<a href="$4" class="nectar-cta">$2</a>')
// nectar_highlighted_text - convert to span
.replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi,
.replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi,
'<span class="nectar-highlighted">$2</span>')
// nectar_responsive_text - convert to span
.replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi,
.replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi,
'<span class="nectar-responsive">$2</span>')
// nectar_icon_list - convert to ul
@@ -188,15 +243,15 @@ function processExcerptShortcodes(excerptHtml) {
.replace(/\[\/nectar_icon_list\]/gi, '</ul>')
// nectar_icon_list_item - convert to li
.replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi,
.replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi,
'<li><strong>$2</strong>: $4</li>')
// nectar_btn - convert to button
.replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
.replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi,
'<a href="$4" class="nectar-btn">$2</a>')
// split_line_heading - convert to heading
.replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi,
.replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi,
'<h2 class="split-line-heading">$2</h2>')
// vc_row_inner - convert to div
@@ -229,6 +284,11 @@ function processExcerptShortcodes(excerptHtml) {
// Normalize whitespace
processed = processed.replace(/\s+/g, ' ').trim();
// Restore bg_image placeholders
bgImagePlaceholders.forEach((placeholder, index) => {
processed = processed.replace(`__BG_IMAGE_${index}__`, placeholder);
});
return processed;
}
@@ -498,7 +558,7 @@ function main() {
}
};
const translationMapping = loadJSON('translation-mapping-improved.json');
const translationMapping = loadJSON('translation-mapping.json');
const pagesEN = loadJSON('pages.en.json');
const pagesDE = loadJSON('pages.de.json');
const postsEN = loadJSON('posts.en.json');