/**
* HTML Compatibility Layer
* Handles HTML entities, formatting, and class conversions from WordPress exports
*/
import { getMediaById } from './data';
/**
 * Run an HTML fragment from a WordPress export through the clean-up pipeline.
 *
 * The stages run strictly in this order, each receiving the previous stage's output:
 *   1. `replaceHTMLEntities` - entity replacement
 *   2. `sanitizeHTML`        - removal of dangerous content
 *   3. `processShortcodes`   - WordPress shortcode handling
 *   4. `cleanWhitespace`     - whitespace clean-up
 *
 * @param html - Raw markup. `null`, `undefined` and the empty string are accepted.
 * @returns The output of the final stage, or `''` when `html` is falsy.
 */
export function processHTML(html: string | null | undefined): string {
  // Nothing to process: covers null, undefined and the empty string.
  if (html === null || html === undefined || html === '') {
    return '';
  }

  // Order matters: every stage consumes the result of the one before it.
  const stages: ReadonlyArray<(input: string) => string> = [
    replaceHTMLEntities,
    sanitizeHTML,
    processShortcodes,
    cleanWhitespace,
  ];

  return stages.reduce((current, stage) => stage(current), html);
}
/**
 * Named character references understood by `replaceHTMLEntities`, keyed by the
 * name that appears between `&` and `;`. Values are the real characters; any
 * typographic simplification happens afterwards via `TYPOGRAPHIC_FOLDS`.
 *
 * A Map (not a plain object) is used so that names such as `constructor` can
 * never resolve to something inherited from `Object.prototype`.
 */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
  ['nbsp', '\u00A0'],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['cent', '\u00A2'], // ¢
  ['pound', '\u00A3'], // £
  ['yen', '\u00A5'], // ¥
  ['euro', '\u20AC'], // €
  ['copy', '\u00A9'], // ©
  ['reg', '\u00AE'], // ®
  ['trade', '\u2122'], // ™
  ['deg', '\u00B0'], // °
  ['plusmn', '\u00B1'], // ±
  ['times', '\u00D7'], // ×
  ['divide', '\u00F7'], // ÷
  ['micro', '\u00B5'], // µ
  ['para', '\u00B6'], // ¶
  ['sect', '\u00A7'], // §
  ['aacute', '\u00E1'], // á
  ['eacute', '\u00E9'], // é
  ['iacute', '\u00ED'], // í
  ['oacute', '\u00F3'], // ó
  ['uacute', '\u00FA'], // ú
  ['Aacute', '\u00C1'], // Á
  ['Eacute', '\u00C9'], // É
  ['Iacute', '\u00CD'], // Í
  ['Oacute', '\u00D3'], // Ó
  ['Uacute', '\u00DA'], // Ú
  ['ntilde', '\u00F1'], // ñ
  ['Ntilde', '\u00D1'], // Ñ
  ['uuml', '\u00FC'], // ü
  ['Uuml', '\u00DC'], // Ü
  ['ouml', '\u00F6'], // ö
  ['Ouml', '\u00D6'], // Ö
  ['auml', '\u00E4'], // ä
  ['Auml', '\u00C4'], // Ä
  ['szlig', '\u00DF'], // ß
  ['mdash', '\u2014'], // em dash
  ['ndash', '\u2013'], // en dash
  ['hellip', '\u2026'], // horizontal ellipsis
  ['laquo', '\u00AB'], // «
  ['raquo', '\u00BB'], // »
  ['lsquo', '\u2018'], // left single quotation mark
  ['rsquo', '\u2019'], // right single quotation mark
  ['ldquo', '\u201C'], // left double quotation mark
  ['rdquo', '\u201D'], // right double quotation mark
  ['bull', '\u2022'], // bullet
  ['middot', '\u00B7'], // middle dot
  ['bdquo', '\u201E'], // double low-9 quotation mark
  ['prime', '\u2032'], // prime
  ['Prime', '\u2033'], // double prime
  ['lsaquo', '\u2039'], // single left-pointing angle quotation mark
  ['rsaquo', '\u203A'], // single right-pointing angle quotation mark
  ['dagger', '\u2020'], // dagger
  ['Dagger', '\u2021'], // double dagger
  ['permil', '\u2030'], // per mille
]);

/**
 * Typographic characters that are simplified to an ASCII equivalent, whether
 * they arrive as literal characters or as the result of decoding a reference.
 */
const TYPOGRAPHIC_FOLDS: ReadonlyMap<string, string> = new Map<string, string>([
  ['\u00A0', ' '], // non-breaking space
  ['\u2018', "'"], // left single quotation mark
  ['\u2019', "'"], // right single quotation mark
  ['\u201C', '"'], // left double quotation mark
  ['\u201D', '"'], // right double quotation mark
  ['\u201E', '"'], // double low-9 quotation mark
  ['\u201F', '"'], // double high-reversed-9 quotation mark
  ['\u2032', "'"], // prime
  ['\u2033', '"'], // double prime
  ['\u2039', '<'], // single left-pointing angle quotation mark
  ['\u203A', '>'], // single right-pointing angle quotation mark
]);

/**
 * Matches either a character reference (`&name;`, `&#123;`, `&#x7B;`), whose
 * body is captured in group 1, or one literal character listed in
 * `TYPOGRAPHIC_FOLDS` (group 1 is then undefined).
 */
const HTML_ENTITY_OR_FOLD_PATTERN =
  /&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);|[\u00A0\u2018\u2019\u201C\u201D\u201E\u201F\u2032\u2033\u2039\u203A]/g;

/**
 * Resolve the body of a character reference (the text between `&` and `;`).
 *
 * @param reference - A name such as `amp`, or a numeric form such as `#39` / `#x27`.
 * @returns The referenced character, or `undefined` when the name is not in
 *   `HTML_NAMED_ENTITIES` or the number is NUL, a surrogate, or above U+10FFFF.
 */
function decodeCharacterReference(reference: string): string | undefined {
  if (reference.charAt(0) !== '#') {
    return HTML_NAMED_ENTITIES.get(reference);
  }

  const marker = reference.charAt(1);
  const isHex = marker === 'x' || marker === 'X';
  const codePoint = isHex
    ? Number.parseInt(reference.slice(2), 16)
    : Number.parseInt(reference.slice(1), 10);

  // String.fromCodePoint throws on out-of-range values, and lone surrogates or
  // NUL would corrupt the output, so such references are reported as unknown.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
    return undefined;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Replace common HTML entities with their actual characters.
 *
 * - Named references listed in `HTML_NAMED_ENTITIES` and numeric references
 *   (decimal or hexadecimal) are decoded.
 * - Characters listed in `TYPOGRAPHIC_FOLDS` (non-breaking space, curly quotes,
 *   primes, single angle quotes) are simplified to ASCII, both when they appear
 *   literally and when they are produced by decoding a reference.
 * - Unknown or invalid references are left exactly as written.
 *
 * The input is scanned once, so text produced by a replacement is never decoded
 * again: `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param html - Markup or text that may contain character references.
 * @returns `html` with the replacements above applied.
 */
function replaceHTMLEntities(html: string): string {
  return html.replace(
    HTML_ENTITY_OR_FOLD_PATTERN,
    (match: string, reference: string | undefined): string => {
      // Group 1 is undefined when a literal foldable character matched.
      const decoded = reference === undefined ? match : decodeCharacterReference(reference);
      if (decoded === undefined) {
        return match;
      }
      return TYPOGRAPHIC_FOLDS.get(decoded) ?? decoded;
    },
  );
}
/**
* Sanitize HTML by removing dangerous tags and attributes
*/
function sanitizeHTML(html: string): string {
let processed = html;
// Remove script tags
processed = processed.replace(/