migration wip

This commit is contained in:
2025-12-30 12:10:13 +01:00
parent 89dbf8af87
commit 65a7e9f24a
203 changed files with 192475 additions and 1562 deletions

View File

@@ -27,6 +27,46 @@ function getLatestExportDir() {
return path.join(RAW_DIR, dirs[0]);
}
/**
 * Decode HTML entities in text.
 *
 * Numeric character references (decimal `&#8220;` and hexadecimal
 * `&#x201C;`) and a small table of common named entities are decoded in a
 * single pass, so text produced by decoding is never decoded a second time
 * (`&amp;lt;` becomes the literal text `&lt;`, not `<`). Afterwards, curly
 * quotes and en dashes are normalized to their plain ASCII equivalents.
 *
 * Unknown named entities and numeric references outside the Unicode range
 * are left untouched.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The decoded text, or an empty string when `text` is falsy.
 */
function decodeHTMLEntities(text) {
  if (!text) return '';

  // Common named entities. Typographic quotes and the en dash map straight
  // to ASCII so the result matches the normalization step below.
  const namedEntities = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['lsquo', "'"],
    ['rsquo', "'"],
    ['ldquo', '"'],
    ['rdquo', '"'],
    ['ndash', '-'],
    ['mdash', '—'],
    ['hellip', '…'],
    ['bull', '•'],
    ['euro', '€'],
  ]);

  // String.fromCodePoint throws a RangeError above this value, so such
  // references are kept verbatim instead.
  const maxCodePoint = 0x10ffff;
  const fromCodePoint = (codePoint, fallback) =>
    codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : fallback;

  return text
    // One pass over every `&...;` reference: decimal, hexadecimal or named.
    .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g, (match, dec, hex, name) => {
      if (dec !== undefined) return fromCodePoint(Number.parseInt(dec, 10), match);
      if (hex !== undefined) return fromCodePoint(Number.parseInt(hex, 16), match);
      return namedEntities.has(name) ? namedEntities.get(name) : match;
    })
    // Normalize typographic characters, whether they were present literally
    // or were just produced by a numeric reference such as `&#8217;`.
    .replace(/[\u2018\u2019]/g, "'") // Left / right single quote
    .replace(/[\u201C\u201D]/g, '"') // Left / right double quote
    .replace(/\u2013/g, '-'); // En dash
}
// HTML sanitization - preserve content but clean dangerous elements
function sanitizeHTML(html) {
if (!html) return '';
@@ -73,40 +113,7 @@ function processExcerptShortcodes(excerptHtml) {
let processed = excerptHtml;
// First, decode HTML entities to regular characters
// Handle both numeric entities (e.g. &#8221;) and named entities (e.g. &quot;)
processed = processed
// Decode numeric HTML entities first
.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec))
// Then handle any remaining Unicode characters
.replace(/”/g, '"') // ” - Right double quote
.replace(/“/g, '"') // “ - Left double quote
.replace(/„/g, ',') // „ - Low double quote
.replace(/‟/g, '"') // ‟ - High double quote
.replace(//g, "'") // - Left single quote
.replace(//g, "'") // - Right single quote
.replace(//g, '-') // - En dash
.replace(/—/g, '—') // — - Em dash
.replace(/…/g, '…') // … - Ellipsis
.replace(/″/g, '"') // ″ - Inches/Prime
.replace(//g, "'") // - Feet/Prime
.replace(//g, ',') // - Single low quote
.replace(//g, '`') // - Single high reversed quote
.replace(/•/g, '•') // • - Bullet
.replace(/€/g, '€') // € - Euro
// Named HTML entities
.replace(/"/g, '"')
.replace(/'/g, "'")
.replace(//g, "'")
.replace(//g, "'")
.replace(/“/g, '"')
.replace(/”/g, '"')
.replace(//g, '-')
.replace(/—/g, '—')
.replace(/…/g, '…')
.replace(/•/g, '•')
.replace(/€/g, '€');
processed = decodeHTMLEntities(processed);
// Process WPBakery shortcodes with HTML entities
processed = processed
@@ -241,13 +248,17 @@ function processPages(pagesEN, pagesDE, translationMapping) {
const translationKey = page.slug;
const deMatch = translationMapping.pages[translationKey];
// Extract title and decode HTML entities
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: page.id,
translationKey: translationKey,
locale: 'en',
slug: page.slug,
path: `/${page.slug}`,
title: page.titleHtml.replace(/<[^>]*>/g, ''),
title: decodedTitle,
titleHtml: page.titleHtml,
contentHtml: sanitizeHTML(page.contentHtml),
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
@@ -262,13 +273,17 @@ function processPages(pagesEN, pagesDE, translationMapping) {
const translationKey = page.slug;
const enMatch = translationMapping.pages[translationKey];
// Extract title and decode HTML entities
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: page.id,
translationKey: translationKey,
locale: 'de',
slug: page.slug,
path: `/de/${page.slug}`,
title: page.titleHtml.replace(/<[^>]*>/g, ''),
title: decodedTitle,
titleHtml: page.titleHtml,
contentHtml: sanitizeHTML(page.contentHtml),
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
@@ -289,13 +304,17 @@ function processPosts(postsEN, postsDE, translationMapping) {
const translationKey = post.slug;
const deMatch = translationMapping.posts[translationKey];
// Extract title and decode HTML entities
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: post.id,
translationKey: translationKey,
locale: 'en',
slug: post.slug,
path: `/blog/${post.slug}`,
title: post.titleHtml.replace(/<[^>]*>/g, ''),
title: decodedTitle,
titleHtml: post.titleHtml,
contentHtml: sanitizeHTML(post.contentHtml),
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),
@@ -310,13 +329,17 @@ function processPosts(postsEN, postsDE, translationMapping) {
const translationKey = post.slug;
const enMatch = translationMapping.posts[translationKey];
// Extract title and decode HTML entities
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
const decodedTitle = decodeHTMLEntities(rawTitle);
processed.push({
id: post.id,
translationKey: translationKey,
locale: 'de',
slug: post.slug,
path: `/de/blog/${post.slug}`,
title: post.titleHtml.replace(/<[^>]*>/g, ''),
title: decodedTitle,
titleHtml: post.titleHtml,
contentHtml: sanitizeHTML(post.contentHtml),
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),