migration wip
This commit is contained in:
@@ -27,6 +27,46 @@ function getLatestExportDir() {
|
||||
return path.join(RAW_DIR, dirs[0]);
|
||||
}
|
||||
|
||||
// Decode HTML entities in text
|
||||
function decodeHTMLEntities(text) {
|
||||
if (!text) return '';
|
||||
|
||||
return text
|
||||
// Decode numeric HTML entities first
|
||||
.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec))
|
||||
.replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => String.fromCharCode(parseInt(hex, 16)))
|
||||
|
||||
// Handle common named entities
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/‘/g, "'")
|
||||
.replace(/’/g, "'")
|
||||
.replace(/“/g, '"')
|
||||
.replace(/”/g, '"')
|
||||
.replace(/–/g, '-')
|
||||
.replace(/—/g, '—')
|
||||
.replace(/…/g, '…')
|
||||
.replace(/•/g, '•')
|
||||
.replace(/€/g, '€')
|
||||
|
||||
// Handle Unicode characters that might appear
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/‘/g, "'")
|
||||
.replace(/’/g, "'")
|
||||
.replace(/“/g, '"')
|
||||
.replace(/”/g, '"')
|
||||
.replace(/–/g, '-') // En dash
|
||||
.replace(/—/g, '—') // Em dash
|
||||
.replace(/…/g, '…') // Ellipsis
|
||||
.replace(/•/g, '•') // Bullet
|
||||
.replace(/€/g, '€'); // Euro
|
||||
}
|
||||
|
||||
// HTML sanitization - preserve content but clean dangerous elements
|
||||
function sanitizeHTML(html) {
|
||||
if (!html) return '';
|
||||
@@ -73,40 +113,7 @@ function processExcerptShortcodes(excerptHtml) {
|
||||
let processed = excerptHtml;
|
||||
|
||||
// First, decode HTML entities to regular characters
|
||||
// Handle both numeric entities (”) and named entities (")
|
||||
processed = processed
|
||||
// Decode numeric HTML entities first
|
||||
.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec))
|
||||
|
||||
// Then handle any remaining Unicode characters
|
||||
.replace(/”/g, '"') // ” - Right double quote
|
||||
.replace(/“/g, '"') // “ - Left double quote
|
||||
.replace(/„/g, ',') // „ - Low double quote
|
||||
.replace(/‟/g, '"') // ‟ - High double quote
|
||||
.replace(/‘/g, "'") // ‘ - Left single quote
|
||||
.replace(/’/g, "'") // ’ - Right single quote
|
||||
.replace(/–/g, '-') // – - En dash
|
||||
.replace(/—/g, '—') // — - Em dash
|
||||
.replace(/…/g, '…') // … - Ellipsis
|
||||
.replace(/″/g, '"') // ″ - Inches/Prime
|
||||
.replace(/′/g, "'") // ′ - Feet/Prime
|
||||
.replace(/‚/g, ',') // ‚ - Single low quote
|
||||
.replace(/‛/g, '`') // ‛ - Single high reversed quote
|
||||
.replace(/•/g, '•') // • - Bullet
|
||||
.replace(/€/g, '€') // € - Euro
|
||||
|
||||
// Named HTML entities
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/‘/g, "'")
|
||||
.replace(/’/g, "'")
|
||||
.replace(/“/g, '"')
|
||||
.replace(/”/g, '"')
|
||||
.replace(/–/g, '-')
|
||||
.replace(/—/g, '—')
|
||||
.replace(/…/g, '…')
|
||||
.replace(/•/g, '•')
|
||||
.replace(/€/g, '€');
|
||||
processed = decodeHTMLEntities(processed);
|
||||
|
||||
// Process WPBakery shortcodes with HTML entities
|
||||
processed = processed
|
||||
@@ -241,13 +248,17 @@ function processPages(pagesEN, pagesDE, translationMapping) {
|
||||
const translationKey = page.slug;
|
||||
const deMatch = translationMapping.pages[translationKey];
|
||||
|
||||
// Extract title and decode HTML entities
|
||||
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
|
||||
const decodedTitle = decodeHTMLEntities(rawTitle);
|
||||
|
||||
processed.push({
|
||||
id: page.id,
|
||||
translationKey: translationKey,
|
||||
locale: 'en',
|
||||
slug: page.slug,
|
||||
path: `/${page.slug}`,
|
||||
title: page.titleHtml.replace(/<[^>]*>/g, ''),
|
||||
title: decodedTitle,
|
||||
titleHtml: page.titleHtml,
|
||||
contentHtml: sanitizeHTML(page.contentHtml),
|
||||
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
|
||||
@@ -262,13 +273,17 @@ function processPages(pagesEN, pagesDE, translationMapping) {
|
||||
const translationKey = page.slug;
|
||||
const enMatch = translationMapping.pages[translationKey];
|
||||
|
||||
// Extract title and decode HTML entities
|
||||
const rawTitle = page.titleHtml.replace(/<[^>]*>/g, '');
|
||||
const decodedTitle = decodeHTMLEntities(rawTitle);
|
||||
|
||||
processed.push({
|
||||
id: page.id,
|
||||
translationKey: translationKey,
|
||||
locale: 'de',
|
||||
slug: page.slug,
|
||||
path: `/de/${page.slug}`,
|
||||
title: page.titleHtml.replace(/<[^>]*>/g, ''),
|
||||
title: decodedTitle,
|
||||
titleHtml: page.titleHtml,
|
||||
contentHtml: sanitizeHTML(page.contentHtml),
|
||||
excerptHtml: processExcerptShortcodes(page.excerptHtml) || generateExcerpt(page.contentHtml),
|
||||
@@ -289,13 +304,17 @@ function processPosts(postsEN, postsDE, translationMapping) {
|
||||
const translationKey = post.slug;
|
||||
const deMatch = translationMapping.posts[translationKey];
|
||||
|
||||
// Extract title and decode HTML entities
|
||||
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
|
||||
const decodedTitle = decodeHTMLEntities(rawTitle);
|
||||
|
||||
processed.push({
|
||||
id: post.id,
|
||||
translationKey: translationKey,
|
||||
locale: 'en',
|
||||
slug: post.slug,
|
||||
path: `/blog/${post.slug}`,
|
||||
title: post.titleHtml.replace(/<[^>]*>/g, ''),
|
||||
title: decodedTitle,
|
||||
titleHtml: post.titleHtml,
|
||||
contentHtml: sanitizeHTML(post.contentHtml),
|
||||
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),
|
||||
@@ -310,13 +329,17 @@ function processPosts(postsEN, postsDE, translationMapping) {
|
||||
const translationKey = post.slug;
|
||||
const enMatch = translationMapping.posts[translationKey];
|
||||
|
||||
// Extract title and decode HTML entities
|
||||
const rawTitle = post.titleHtml.replace(/<[^>]*>/g, '');
|
||||
const decodedTitle = decodeHTMLEntities(rawTitle);
|
||||
|
||||
processed.push({
|
||||
id: post.id,
|
||||
translationKey: translationKey,
|
||||
locale: 'de',
|
||||
slug: post.slug,
|
||||
path: `/de/blog/${post.slug}`,
|
||||
title: post.titleHtml.replace(/<[^>]*>/g, ''),
|
||||
title: decodedTitle,
|
||||
titleHtml: post.titleHtml,
|
||||
contentHtml: sanitizeHTML(post.contentHtml),
|
||||
excerptHtml: processExcerptShortcodes(post.excerptHtml) || generateExcerpt(post.contentHtml),
|
||||
|
||||
Reference in New Issue
Block a user