#!/usr/bin/env node // Test the final function with actual raw data const fs = require('fs'); const path = require('path'); // Load the actual raw data const rawData = JSON.parse(fs.readFileSync('data/raw/2025-12-27T21-26-12-521Z/pages.en.json', 'utf8')); const testExcerpt = rawData[0].excerptHtml; console.log('=== Testing Final Function ==='); console.log('Raw excerpt (first 200 chars):'); console.log(testExcerpt.substring(0, 200)); console.log(''); // The function from process-data.js function processExcerptShortcodes(excerptHtml) { if (!excerptHtml) return ''; let processed = excerptHtml; // First, decode HTML entities to regular characters // Handle both numeric entities (”) and named entities (") processed = processed // Decode numeric HTML entities first .replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec)) // Then handle any remaining Unicode characters .replace(/”/g, '"') // ” - Right double quote .replace(/“/g, '"') // “ - Left double quote .replace(/„/g, ',') // „ - Low double quote .replace(/‟/g, '"') // ‟ - High double quote .replace(/‘/g, "'") // ‘ - Left single quote .replace(/’/g, "'") // ’ - Right single quote .replace(/–/g, '-') // – - En dash .replace(/—/g, '—') // — - Em dash .replace(/…/g, '…') // … - Ellipsis .replace(/″/g, '"') // ″ - Inches/Prime .replace(/′/g, "'") // ′ - Feet/Prime .replace(/‚/g, ',') // ‚ - Single low quote .replace(/‛/g, '`') // ‛ - Single high reversed quote .replace(/•/g, '•') // • - Bullet .replace(/€/g, '€') // € - Euro // Named HTML entities .replace(/"/g, '"') .replace(/'/g, "'") .replace(/‘/g, "'") .replace(/’/g, "'") .replace(/“/g, '"') .replace(/”/g, '"') .replace(/–/g, '-') .replace(/—/g, '—') .replace(/…/g, '…') .replace(/•/g, '•') .replace(/€/g, '€'); // Process WPBakery shortcodes with HTML entities processed = processed // vc_row - convert to div with classes .replace(/\[vc_row([^\]]*)\]/gi, (match, attrs) => { const classes = ['vc-row']; if (attrs.includes('full_width_background')) classes.push('full-width-bg'); if (attrs.includes('in_container')) classes.push('in-container'); if (attrs.includes('full_width_content')) classes.push('full-width-content'); return `
`; }) .replace(/\[\/vc_row\]/gi, '
') // vc_column - convert to div with classes .replace(/\[vc_column([^\]]*)\]/gi, (match, attrs) => { const classes = ['vc-column']; if (attrs.includes('1/1')) classes.push('col-1-1'); if (attrs.includes('1/2')) classes.push('col-1-2'); if (attrs.includes('1/3')) classes.push('col-1-3'); if (attrs.includes('2/3')) classes.push('col-2-3'); if (attrs.includes('1/4')) classes.push('col-1-4'); if (attrs.includes('3/4')) classes.push('col-3-4'); if (attrs.includes('5/12')) classes.push('col-5-12'); if (attrs.includes('7/12')) classes.push('col-7-12'); return `
`; }) .replace(/\[\/vc_column\]/gi, '
') // vc_column_text - convert to div .replace(/\[vc_column_text([^\]]*)\]/gi, '
') .replace(/\[\/vc_column_text\]/gi, '
'); // Remove any remaining shortcodes processed = processed.replace(/\[.*?\]/g, ''); // Clean up any HTML that might be broken processed = processed.replace(/]*>\s*<\/p>/gi, ''); processed = processed.replace(/]*>\s*<\/div>/gi, ''); // Normalize whitespace processed = processed.replace(/\s+/g, ' ').trim(); return processed; } const result = processExcerptShortcodes(testExcerpt); console.log('After processing:'); console.log(result); console.log(''); // Check for entities const hasEntities = /[”“‘’–—]/.test(result); const hasNumericEntities = /&#\d+;/.test(result); const hasShortcodes = /\[vc_row|\[vc_column/.test(result); console.log('=== Verification ==='); console.log('Has Unicode entities:', hasEntities); console.log('Has numeric entities:', hasNumericEntities); console.log('Has shortcodes:', hasShortcodes); console.log('Has proper HTML:', result.includes('