#!/usr/bin/env node /** * WordPress → Next.js Data Processing Pipeline (Enhanced) * Transforms raw WordPress data into Next.js compatible format * Includes bg_image ID resolution */ const fs = require('fs'); const path = require('path'); const DATA_DIR = path.join(__dirname, '..', 'data'); const RAW_DIR = path.join(DATA_DIR, 'raw'); const PROCESSED_DIR = path.join(DATA_DIR, 'processed'); // Create processed directory if (!fs.existsSync(PROCESSED_DIR)) { fs.mkdirSync(PROCESSED_DIR, { recursive: true }); } // Find latest export function getLatestExportDir() { const dirs = fs.readdirSync(RAW_DIR).filter(f => { const stat = fs.statSync(path.join(RAW_DIR, f)); return stat.isDirectory(); }); dirs.sort().reverse(); return path.join(RAW_DIR, dirs[0]); } // Load media mapping for ID resolution function loadMediaMapping() { const exportDir = getLatestExportDir(); const mediaJsonPath = path.join(exportDir, 'media.json'); if (!fs.existsSync(mediaJsonPath)) { console.warn('⚠️ No media.json found for ID resolution'); return {}; } const mediaData = JSON.parse(fs.readFileSync(mediaJsonPath, 'utf8')); const mapping = {}; mediaData.forEach(item => { if (item.id) { mapping[item.id] = `/media/${item.filename}`; } }); return mapping; } // Load asset map for URL replacement function loadAssetMap() { const assetMapPath = path.join(PROCESSED_DIR, 'asset-map.json'); if (!fs.existsSync(assetMapPath)) { return {}; } return JSON.parse(fs.readFileSync(assetMapPath, 'utf8')); } // Replace bg_image IDs with local paths function replaceBgImageIds(html, mediaMapping) { if (!html) return html; let processed = html; // Helper function to replace a single bg_image attribute const replaceBgImage = (match, id) => { const localPath = mediaMapping[id]; if (localPath) { return `bg_image="${localPath}"`; } return match; }; // Pattern 1: bg_image="ID" (regular quotes) processed = processed.replace(/bg_image="(\d+)"/gi, replaceBgImage); // Pattern 2: bg_image=”ID” (HTML entities for quotes) processed = 
processed.replace(/bg_image=”(\d+)″/gi, replaceBgImage); // Pattern 3: bg_image='ID' (single quotes) processed = processed.replace(/bg_image='(\d+)'/gi, replaceBgImage); // Pattern 4: layer_one_image="ID" processed = processed.replace(/layer_one_image="(\d+)"/gi, (match, id) => { const localPath = mediaMapping[id]; if (localPath) { return `layer_one_image="${localPath}"`; } return match; }); // Pattern 5: layer_one_image with HTML entities processed = processed.replace(/layer_one_image=”(\d+)″/gi, (match, id) => { const localPath = mediaMapping[id]; if (localPath) { return `layer_one_image="${localPath}"`; } return match; }); // Pattern 6: image_url="ID" (for image_with_animation) processed = processed.replace(/image_url="(\d+)"/gi, (match, id) => { const localPath = mediaMapping[id]; if (localPath) { return `image_url="${localPath}"`; } return match; }); // Pattern 7: image_url with HTML entities processed = processed.replace(/image_url=”(\d+)″/gi, (match, id) => { const localPath = mediaMapping[id]; if (localPath) { return `image_url="${localPath}"`; } return match; }); // Pattern 8: images="ID,ID,ID" (for vc_gallery) processed = processed.replace(/images="([^"]+)"/gi, (match, idList) => { const ids = idList.split(',').map(id => id.trim()); const localPaths = ids.map(id => { // Check if it's a numeric ID if (/^\d+$/.test(id)) { return mediaMapping[id] || id; } return id; }); return `images="${localPaths.join(',')}"`; }); // Pattern 9: images with HTML entities processed = processed.replace(/images=”([^&#]+)″/gi, (match, idList) => { const ids = idList.split(',').map(id => id.trim()); const localPaths = ids.map(id => { // Check if it's a numeric ID if (/^\d+$/.test(id)) { return mediaMapping[id] || id; } return id; }); return `images="${localPaths.join(',')}"`; }); return processed; } // Replace URLs with local paths using asset map function replaceUrlsWithLocalPaths(html, assetMap) { if (!html) return html; let processed = html; // Replace URLs in various 
attributes Object.keys(assetMap).forEach(url => { const localPath = assetMap[url]; // Escape special regex characters in URL const escapedUrl = url.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const regex = new RegExp(escapedUrl, 'gi'); processed = processed.replace(regex, localPath); }); return processed; } // Download video files from external URLs async function downloadVideoFile(url, filename) { const https = require('https'); const fs = require('fs'); const path = require('path'); const videoDir = path.join(__dirname, '..', 'public', 'media', 'videos'); if (!fs.existsSync(videoDir)) { fs.mkdirSync(videoDir, { recursive: true }); } const filePath = path.join(videoDir, filename); // Check if file already exists if (fs.existsSync(filePath)) { console.log(`✅ Video already exists: ${filename}`); return `/media/videos/${filename}`; } return new Promise((resolve, reject) => { const file = fs.createWriteStream(filePath); https.get(url, (res) => { if (res.statusCode === 200) { res.pipe(file); file.on('finish', () => { console.log(`✅ Downloaded video: ${filename}`); resolve(`/media/videos/${filename}`); }); } else { reject(new Error(`Failed to download video: ${res.statusCode}`)); } }).on('error', (err) => { fs.unlink(filePath, () => {}); reject(err); }); }); } // Extract and download video files from vc_row attributes async function processVideoAttributes(html) { if (!html) return { html, videoMap: {} }; const videoMap = {}; let processed = html; // Find all vc_row with video attributes const videoRowRegex = /\[vc_row[^\]]*video_bg="use_video"[^\]]*video_mp4="([^"]*)"[^\]]*video_webm="([^"]*)"[^\]]*\]/gi; let match; while ((match = videoRowRegex.exec(html)) !== null) { const videoMp4 = match[1]; const videoWebm = match[2]; // Generate filenames const mp4Filename = `video-${Date.now()}-${Math.random().toString(36).substring(7)}.mp4`; const webmFilename = `video-${Date.now()}-${Math.random().toString(36).substring(7)}.webm`; try { // Download files const mp4Path = await 
downloadVideoFile(videoMp4, mp4Filename); const webmPath = await downloadVideoFile(videoWebm, webmFilename); // Store in map for replacement videoMap[videoMp4] = mp4Path; videoMap[videoWebm] = webmPath; } catch (error) { console.warn(`⚠️ Failed to download video files: ${error.message}`); } } // Replace URLs in the HTML Object.keys(videoMap).forEach(url => { const localPath = videoMap[url]; const escapedUrl = url.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const regex = new RegExp(escapedUrl, 'gi'); processed = processed.replace(regex, localPath); }); return { html: processed, videoMap }; } // HTML sanitization - preserve content but clean dangerous elements function sanitizeHTML(html) { if (!html) return ''; let sanitized = html; // Remove script tags and inline handlers (security) sanitized = sanitized.replace(/.*?<\/script>/gis, ''); sanitized = sanitized.replace(/\son\w+=".*?"/gi, ''); // Extract and preserve ALL vc_row attributes before removing shortcodes // This includes bg_image, video_bg, video_mp4, video_webm, etc. 
// sanitizeHTML, continued: record the attributes of every [vc_row …] opening tag, then swap
// the WPBakery / Nectar shortcodes for replacement strings and tidy the result.
// NOTE(review): several string/template literals below contain only a line break where
// replacement markup would be expected, and two patterns begin with "]*>". This looks like
// markup was lost before this copy of the file was produced — TODO confirm against version
// control before relying on the literal values shown here.
  const vcRowMatches = sanitized.match(/\[vc_row[^\]]*\]/gi) || [];
  const vcRowAttributes = [];

  // One attrs record per matched tag, in document order. A field stays null unless the tag
  // carries the corresponding attribute in double quotes.
  vcRowMatches.forEach(match => {
    const attrs = {
      bgImage: null,
      videoBg: null,
      videoMp4: null,
      videoWebm: null,
      bgColor: null,
      colorOverlay: null,
      overlayStrength: null,
      enableGradient: null,
      gradientDirection: null,
      colorOverlay2: null,
      parallaxBg: null,
      parallaxBgSpeed: null,
      bgImageAnimation: null,
      topPadding: null,
      bottomPadding: null,
      textAlignment: null,
      textColor: null,
      shapeType: null,
      scenePosition: null,
      fullScreenRowPosition: null,
      fullScreen: null,
      equalHeight: null,
      contentPlacement: null,
      columnDirection: null,
      rowBorderRadius: null,
      rowBorderRadiusApplies: null
    };

    // Extract all relevant attributes
    // The patterns are not anchored at a word boundary, so an attribute whose name merely
    // ends with the same text (e.g. "…_bg_color") would also satisfy them.
    const bgImageMatch = match.match(/bg_image="([^"]*)"/i);
    if (bgImageMatch) attrs.bgImage = bgImageMatch[1];
    const videoBgMatch = match.match(/video_bg="([^"]*)"/i);
    if (videoBgMatch) attrs.videoBg = videoBgMatch[1];
    const videoMp4Match = match.match(/video_mp4="([^"]*)"/i);
    if (videoMp4Match) attrs.videoMp4 = videoMp4Match[1];
    const videoWebmMatch = match.match(/video_webm="([^"]*)"/i);
    if (videoWebmMatch) attrs.videoWebm = videoWebmMatch[1];
    const bgColorMatch = match.match(/bg_color="([^"]*)"/i);
    if (bgColorMatch) attrs.bgColor = bgColorMatch[1];
    const colorOverlayMatch = match.match(/color_overlay="([^"]*)"/i);
    if (colorOverlayMatch) attrs.colorOverlay = colorOverlayMatch[1];
    const overlayStrengthMatch = match.match(/overlay_strength="([^"]*)"/i);
    if (overlayStrengthMatch) attrs.overlayStrength = overlayStrengthMatch[1];
    const enableGradientMatch = match.match(/enable_gradient="([^"]*)"/i);
    if (enableGradientMatch) attrs.enableGradient = enableGradientMatch[1];
    const gradientDirectionMatch = match.match(/gradient_direction="([^"]*)"/i);
    if (gradientDirectionMatch) attrs.gradientDirection = gradientDirectionMatch[1];
    const colorOverlay2Match = match.match(/color_overlay_2="([^"]*)"/i);
    if (colorOverlay2Match) attrs.colorOverlay2 = colorOverlay2Match[1];
    const parallaxBgMatch = match.match(/parallax_bg="([^"]*)"/i);
    if (parallaxBgMatch) attrs.parallaxBg = parallaxBgMatch[1];
    const parallaxBgSpeedMatch = match.match(/parallax_bg_speed="([^"]*)"/i);
    if (parallaxBgSpeedMatch) attrs.parallaxBgSpeed = parallaxBgSpeedMatch[1];
    const bgImageAnimationMatch = match.match(/bg_image_animation="([^"]*)"/i);
    if (bgImageAnimationMatch) attrs.bgImageAnimation = bgImageAnimationMatch[1];
    const topPaddingMatch = match.match(/top_padding="([^"]*)"/i);
    if (topPaddingMatch) attrs.topPadding = topPaddingMatch[1];
    const bottomPaddingMatch = match.match(/bottom_padding="([^"]*)"/i);
    if (bottomPaddingMatch) attrs.bottomPadding = bottomPaddingMatch[1];
    const textAlignmentMatch = match.match(/text_align="([^"]*)"/i);
    if (textAlignmentMatch) attrs.textAlignment = textAlignmentMatch[1];
    const textColorMatch = match.match(/text_color="([^"]*)"/i);
    if (textColorMatch) attrs.textColor = textColorMatch[1];
    const shapeTypeMatch = match.match(/shape_type="([^"]*)"/i);
    if (shapeTypeMatch) attrs.shapeType = shapeTypeMatch[1];
    const scenePositionMatch = match.match(/scene_position="([^"]*)"/i);
    if (scenePositionMatch) attrs.scenePosition = scenePositionMatch[1];
    const fullScreenRowPositionMatch = match.match(/full_screen_row_position="([^"]*)"/i);
    if (fullScreenRowPositionMatch) attrs.fullScreenRowPosition = fullScreenRowPositionMatch[1];
    const fullScreenMatch = match.match(/full_screen="([^"]*)"/i);
    if (fullScreenMatch) attrs.fullScreen = fullScreenMatch[1];
    const equalHeightMatch = match.match(/equal_height="([^"]*)"/i);
    if (equalHeightMatch) attrs.equalHeight = equalHeightMatch[1];
    const contentPlacementMatch = match.match(/content_placement="([^"]*)"/i);
    if (contentPlacementMatch) attrs.contentPlacement = contentPlacementMatch[1];
    const columnDirectionMatch = match.match(/column_direction="([^"]*)"/i);
    if (columnDirectionMatch) attrs.columnDirection = columnDirectionMatch[1];
    const rowBorderRadiusMatch = match.match(/row_border_radius="([^"]*)"/i);
    if (rowBorderRadiusMatch) attrs.rowBorderRadius = rowBorderRadiusMatch[1];
    const rowBorderRadiusAppliesMatch = match.match(/row_border_radius_applies="([^"]*)"/i);
    if (rowBorderRadiusAppliesMatch) attrs.rowBorderRadiusApplies = rowBorderRadiusAppliesMatch[1];

    vcRowAttributes.push(attrs);
  });

  // Remove WPBakery shortcode wrappers but keep their content with preserved attributes
  // The callback runs once per match in document order, using the same pattern on the same
  // string as the collection pass above, so shortcodeIndex lines up with vcRowAttributes.
  // The pattern also matches [vc_row_inner …] tags, since "_inner …" satisfies [^\]]*.
  let shortcodeIndex = 0;
  sanitized = sanitized.replace(/\[vc_row[^\]]*\]/gi, (match) => {
    const attrs = vcRowAttributes[shortcodeIndex] || {};
    shortcodeIndex++;

    // Build data attributes string
    // Only truthy values are emitted, so an attribute present with an empty value is dropped.
    const dataAttrs = [];
    if (attrs.bgImage) dataAttrs.push(`data-bg-image="${attrs.bgImage}"`);
    if (attrs.videoBg) dataAttrs.push(`data-video-bg="${attrs.videoBg}"`);
    if (attrs.videoMp4) dataAttrs.push(`data-video-mp4="${attrs.videoMp4}"`);
    if (attrs.videoWebm) dataAttrs.push(`data-video-webm="${attrs.videoWebm}"`);
    if (attrs.bgColor) dataAttrs.push(`data-bg-color="${attrs.bgColor}"`);
    if (attrs.colorOverlay) dataAttrs.push(`data-color-overlay="${attrs.colorOverlay}"`);
    if (attrs.overlayStrength) dataAttrs.push(`data-overlay-strength="${attrs.overlayStrength}"`);
    if (attrs.enableGradient) dataAttrs.push(`data-enable-gradient="${attrs.enableGradient}"`);
    if (attrs.gradientDirection) dataAttrs.push(`data-gradient-direction="${attrs.gradientDirection}"`);
    if (attrs.colorOverlay2) dataAttrs.push(`data-color-overlay-2="${attrs.colorOverlay2}"`);
    if (attrs.parallaxBg) dataAttrs.push(`data-parallax-bg="${attrs.parallaxBg}"`);
    if (attrs.parallaxBgSpeed) dataAttrs.push(`data-parallax-bg-speed="${attrs.parallaxBgSpeed}"`);
    if (attrs.bgImageAnimation) dataAttrs.push(`data-bg-image-animation="${attrs.bgImageAnimation}"`);
    if (attrs.topPadding) dataAttrs.push(`data-top-padding="${attrs.topPadding}"`);
    if (attrs.bottomPadding) dataAttrs.push(`data-bottom-padding="${attrs.bottomPadding}"`);
    if (attrs.textAlignment) dataAttrs.push(`data-text-align="${attrs.textAlignment}"`);
    if (attrs.textColor) dataAttrs.push(`data-text-color="${attrs.textColor}"`);
    if (attrs.shapeType) dataAttrs.push(`data-shape-type="${attrs.shapeType}"`);
    if (attrs.scenePosition) dataAttrs.push(`data-scene-position="${attrs.scenePosition}"`);
    if (attrs.fullScreenRowPosition) dataAttrs.push(`data-full-screen-row-position="${attrs.fullScreenRowPosition}"`);
    if (attrs.fullScreen) dataAttrs.push(`data-full-screen="${attrs.fullScreen}"`);
    if (attrs.equalHeight) dataAttrs.push(`data-equal-height="${attrs.equalHeight}"`);
    if (attrs.contentPlacement) dataAttrs.push(`data-content-placement="${attrs.contentPlacement}"`);
    if (attrs.columnDirection) dataAttrs.push(`data-column-direction="${attrs.columnDirection}"`);
    if (attrs.rowBorderRadius) dataAttrs.push(`data-row-border-radius="${attrs.rowBorderRadius}"`);
    if (attrs.rowBorderRadiusApplies) dataAttrs.push(`data-row-border-radius-applies="${attrs.rowBorderRadiusApplies}"`);

    const dataAttrsString = dataAttrs.length > 0 ? ' ' + dataAttrs.join(' ') : '';

    // Check for full-width classes
    const classes = ['vc-row'];
    if (match.includes('full_width_background')) classes.push('full-width-bg');
    if (match.includes('in_container')) classes.push('in-container');
    if (match.includes('full_width_content')) classes.push('full-width-content');

    // NOTE(review): classes and dataAttrsString are computed above, but the template shown
    // here interpolates neither of them.
    return `
`;
  });

  // Reset index for closing tags
  shortcodeIndex = 0;

  // Handle vc_row with bgImage attribute (for excerptHtml)
  // The replace above already consumed every tag that this pattern and the next one can
  // match, so with the replacement text shown they appear to have nothing left to act on.
  sanitized = sanitized.replace(/\[vc_row[^\]]*bgImage="([^"]*)"[^\]]*\]/gi, (match, bgImage) => {
    return `
`;
  });

  // Handle vc_row without bg_image
  sanitized = sanitized.replace(/\[vc_row[^\]]*\]/gi, '
');
  sanitized = sanitized.replace(/\[\/vc_row\]/gi, '
');
  // This pattern also matches [vc_column_text …] and [vc_column_inner …] opening tags.
  sanitized = sanitized.replace(/\[vc_column[^\]]*\]/gi, '
');
  sanitized = sanitized.replace(/\[\/vc_column\]/gi, '
');

  // Remove other shortcodes but keep text content
  sanitized = sanitized.replace(/\[vc_column_text[^\]]*\]/gi, '
');
  sanitized = sanitized.replace(/\[\/vc_column_text\]/gi, '
');

  // Handle Nectar shortcodes - remove them but keep any text content
  // [nectar_cta] blocks often contain text we want to preserve
  sanitized = sanitized.replace(/\[nectar_cta[^\]]*\]([\s\S]*?)\[\/nectar_cta\]/gi, '$1');
  sanitized = sanitized.replace(/\[nectar[^\]]*\]/gi, '');

  // Remove all remaining shortcodes
  // Any bracketed run on a single line is removed, not only known shortcode names.
  sanitized = sanitized.replace(/\[.*?\]/g, '');

  // Remove empty paragraphs and divs
  // NOTE(review): both patterns start with "]*>", which suggests the opening-tag part
  // is missing — confirm.
  sanitized = sanitized.replace(/]*>\s*<\/p>/gi, '');
  sanitized = sanitized.replace(/]*>\s*<\/div>/gi, '');

  // Normalize whitespace but preserve HTML structure
  // Every whitespace run (including the line breaks inserted above) becomes one space.
  sanitized = sanitized.replace(/\s+/g, ' ').trim();

  return sanitized;
}

// Process excerpts specifically to handle shortcodes comprehensively
// Returns '' for falsy input; otherwise the excerpt with typographic characters normalised
// and shortcodes replaced or removed.
function processExcerptShortcodes(excerptHtml) {
  if (!excerptHtml) return '';

  let processed = excerptHtml;

  // First, decode HTML entities to regular characters
  // Handle both numeric entities (”) and named entities (")
  // NOTE(review): the "numeric" and "Unicode" groups below show identical patterns, and some
  // calls replace a character with itself; the patterns presumably held entity text
  // originally — TODO confirm against version control.
  processed = processed
    // Numeric HTML entities commonly found in WordPress raw data
    .replace(/”/g, '"') // ” - Right double quote
    .replace(/“/g, '"') // “ - Left double quote
    .replace(/„/g, ',') // „ - Low double quote
    .replace(/‟/g, '"') // ‟ - High double quote
    .replace(/‘/g, "'") // ‘ - Left single quote
    .replace(/’/g, "'") // ’ - Right single quote
    .replace(/–/g, '-') // – - En dash
    .replace(/—/g, '—') // — - Em dash
    .replace(/…/g, '…') // … - Ellipsis
    .replace(/″/g, '"') // ″ - Inches/Prime
    .replace(/′/g, "'") // ′ - Feet/Prime
    .replace(/‚/g, ',') // ‚ - Single low quote
    .replace(/‛/g, '`') // ‛ - Single high reversed quote
    .replace(/•/g, '•') // • - Bullet
    .replace(/€/g, '€') // € - Euro
    // Unicode characters (from rendered content)
    .replace(/”/g, '"') // Right double quote
    .replace(/“/g, '"') // Left double quote
    .replace(/„/g, ',') // Low double quote
    .replace(/‟/g, '"') // High double quote
    .replace(/‘/g, "'") // Left single quote
    .replace(/’/g, "'") // Right single quote
    .replace(/–/g, '-') // En dash
    .replace(/—/g, '—') // Em dash
    .replace(/…/g, '…') //
// Ellipsis
    .replace(/″/g, '"') // Inches/Prime
    .replace(/′/g, "'") // Feet/Prime
    .replace(/•/g, '•') // Bullet
    // Named HTML entities
    // NOTE(review): the patterns in this group show the decoded characters rather than
    // entity names, so as written each call is a repeat of an earlier one or a no-op.
    // Presumably the entity text was lost — TODO confirm against version control.
    .replace(/"/g, '"')
    .replace(/'/g, "'")
    .replace(/‘/g, "'")
    .replace(/’/g, "'")
    .replace(/“/g, '"')
    .replace(/”/g, '"')
    .replace(/–/g, '-')
    .replace(/—/g, '—')
    .replace(/…/g, '…')
    .replace(/•/g, '•')
    .replace(/€/g, '€');

  // Process WPBakery shortcodes with HTML entities
  // NOTE(review): many replacement literals in this chain contain only line breaks/spaces
  // where markup would be expected (the comments speak of div, ul, li, hr, …). The literal
  // values are reproduced exactly as found — TODO restore from version control.
  processed = processed
    // vc_row - convert to div with classes and handle bg_image/bgImage and video attributes
    // This pattern also matches [vc_row_inner …] opening tags, because "_inner …" satisfies
    // [^\]]*; the dedicated vc_row_inner step further down therefore never sees them.
    .replace(/\[vc_row([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-row'];
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');

      // Build data attributes string
      const dataAttrs = [];

      // Extract bg_image attribute
      const bgImageMatch = attrs.match(/bg_image="([^"]*)"/i);
      if (bgImageMatch && bgImageMatch[1]) {
        const bgImage = bgImageMatch[1];
        // If it's already a local path, use it directly
        // (Both branches push the same string; the test only documents the two cases.)
        if (bgImage.startsWith('/media/')) {
          dataAttrs.push(`data-bg-image="${bgImage}"`);
        } else {
          // If it's a numeric ID, keep it as-is for now (will be resolved later)
          dataAttrs.push(`data-bg-image="${bgImage}"`);
        }
      }

      // Extract bgImage attribute (alternative format)
      // If both spellings are present, data-bg-image is pushed twice.
      const bgImageMatch2 = attrs.match(/bgImage="([^"]*)"/i);
      if (bgImageMatch2 && bgImageMatch2[1]) {
        const bgImage = bgImageMatch2[1];
        if (bgImage.startsWith('/media/')) {
          dataAttrs.push(`data-bg-image="${bgImage}"`);
        } else {
          dataAttrs.push(`data-bg-image="${bgImage}"`);
        }
      }

      // Extract video attributes
      // Video attributes are emitted only when non-empty.
      const videoBgMatch = attrs.match(/video_bg="([^"]*)"/i);
      if (videoBgMatch && videoBgMatch[1]) {
        dataAttrs.push(`data-video-bg="${videoBgMatch[1]}"`);
      }
      const videoMp4Match = attrs.match(/video_mp4="([^"]*)"/i);
      if (videoMp4Match && videoMp4Match[1]) {
        dataAttrs.push(`data-video-mp4="${videoMp4Match[1]}"`);
      }
      const videoWebmMatch = attrs.match(/video_webm="([^"]*)"/i);
      if (videoWebmMatch && videoWebmMatch[1]) {
        dataAttrs.push(`data-video-webm="${videoWebmMatch[1]}"`);
      }

      // Extract other styling attributes
      // Unlike the attributes above, these are emitted even when the value is empty.
      const bgColorMatch = attrs.match(/bg_color="([^"]*)"/i);
      if (bgColorMatch) dataAttrs.push(`data-bg-color="${bgColorMatch[1]}"`);
      const colorOverlayMatch = attrs.match(/color_overlay="([^"]*)"/i);
      if (colorOverlayMatch) dataAttrs.push(`data-color-overlay="${colorOverlayMatch[1]}"`);
      const overlayStrengthMatch = attrs.match(/overlay_strength="([^"]*)"/i);
      if (overlayStrengthMatch) dataAttrs.push(`data-overlay-strength="${overlayStrengthMatch[1]}"`);
      const enableGradientMatch = attrs.match(/enable_gradient="([^"]*)"/i);
      if (enableGradientMatch) dataAttrs.push(`data-enable-gradient="${enableGradientMatch[1]}"`);
      const gradientDirectionMatch = attrs.match(/gradient_direction="([^"]*)"/i);
      if (gradientDirectionMatch) dataAttrs.push(`data-gradient-direction="${gradientDirectionMatch[1]}"`);
      const colorOverlay2Match = attrs.match(/color_overlay_2="([^"]*)"/i);
      if (colorOverlay2Match) dataAttrs.push(`data-color-overlay-2="${colorOverlay2Match[1]}"`);
      const parallaxBgMatch = attrs.match(/parallax_bg="([^"]*)"/i);
      if (parallaxBgMatch) dataAttrs.push(`data-parallax-bg="${parallaxBgMatch[1]}"`);
      const parallaxBgSpeedMatch = attrs.match(/parallax_bg_speed="([^"]*)"/i);
      if (parallaxBgSpeedMatch) dataAttrs.push(`data-parallax-bg-speed="${parallaxBgSpeedMatch[1]}"`);
      const bgImageAnimationMatch = attrs.match(/bg_image_animation="([^"]*)"/i);
      if (bgImageAnimationMatch) dataAttrs.push(`data-bg-image-animation="${bgImageAnimationMatch[1]}"`);
      const topPaddingMatch = attrs.match(/top_padding="([^"]*)"/i);
      if (topPaddingMatch) dataAttrs.push(`data-top-padding="${topPaddingMatch[1]}"`);
      const bottomPaddingMatch = attrs.match(/bottom_padding="([^"]*)"/i);
      if (bottomPaddingMatch) dataAttrs.push(`data-bottom-padding="${bottomPaddingMatch[1]}"`);
      const textAlignmentMatch = attrs.match(/text_align="([^"]*)"/i);
      if (textAlignmentMatch) dataAttrs.push(`data-text-align="${textAlignmentMatch[1]}"`);
      const textColorMatch = attrs.match(/text_color="([^"]*)"/i);
      if (textColorMatch) dataAttrs.push(`data-text-color="${textColorMatch[1]}"`);
      const shapeTypeMatch = attrs.match(/shape_type="([^"]*)"/i);
      if (shapeTypeMatch) dataAttrs.push(`data-shape-type="${shapeTypeMatch[1]}"`);
      const scenePositionMatch = attrs.match(/scene_position="([^"]*)"/i);
      if (scenePositionMatch) dataAttrs.push(`data-scene-position="${scenePositionMatch[1]}"`);
      const fullScreenRowPositionMatch = attrs.match(/full_screen_row_position="([^"]*)"/i);
      if (fullScreenRowPositionMatch) dataAttrs.push(`data-full-screen-row-position="${fullScreenRowPositionMatch[1]}"`);
      const fullScreenMatch = attrs.match(/full_screen="([^"]*)"/i);
      if (fullScreenMatch) dataAttrs.push(`data-full-screen="${fullScreenMatch[1]}"`);
      const equalHeightMatch = attrs.match(/equal_height="([^"]*)"/i);
      if (equalHeightMatch) dataAttrs.push(`data-equal-height="${equalHeightMatch[1]}"`);
      const contentPlacementMatch = attrs.match(/content_placement="([^"]*)"/i);
      if (contentPlacementMatch) dataAttrs.push(`data-content-placement="${contentPlacementMatch[1]}"`);
      const columnDirectionMatch = attrs.match(/column_direction="([^"]*)"/i);
      if (columnDirectionMatch) dataAttrs.push(`data-column-direction="${columnDirectionMatch[1]}"`);
      const rowBorderRadiusMatch = attrs.match(/row_border_radius="([^"]*)"/i);
      if (rowBorderRadiusMatch) dataAttrs.push(`data-row-border-radius="${rowBorderRadiusMatch[1]}"`);
      const rowBorderRadiusAppliesMatch = attrs.match(/row_border_radius_applies="([^"]*)"/i);
      if (rowBorderRadiusAppliesMatch) dataAttrs.push(`data-row-border-radius-applies="${rowBorderRadiusAppliesMatch[1]}"`);

      const dataAttrsString = dataAttrs.length > 0 ? ' ' + dataAttrs.join(' ') : '';

      // NOTE(review): classes and dataAttrsString are not interpolated by the template
      // shown here.
      return `
`;
    })
    .replace(/\[\/vc_row\]/gi, '
')
    // Also handle vc_row that's wrapped in paragraph tags
    // This step runs after the generic vc_row step above, which has already replaced every
    // tag it could match. The pattern is reproduced exactly as found, including the line
    // breaks at its start.
    .replace(/

\[vc_row([^\]]*)\]<\/p>/gi, (match, attrs) => {
      const classes = ['vc-row'];
      if (attrs.includes('full_width_background')) classes.push('full-width-bg');
      if (attrs.includes('in_container')) classes.push('in-container');
      if (attrs.includes('full_width_content')) classes.push('full-width-content');

      // Extract bg_image attribute
      const bgImageMatch = attrs.match(/bg_image="([^"]*)"/i);
      if (bgImageMatch && bgImageMatch[1]) {
        const bgImage = bgImageMatch[1];
        if (bgImage.startsWith('/media/')) {
          return `

`;
        }
        return `
`;
      }
      return `
`;
    })
    // vc_column - convert to div with classes
    // The width tests are plain substring checks on the whole attribute text. The pattern
    // also matches [vc_column_text …] and [vc_column_inner …] opening tags, so the dedicated
    // steps for those further down never see them.
    .replace(/\[vc_column([^\]]*)\]/gi, (match, attrs) => {
      const classes = ['vc-column'];
      if (attrs.includes('1/1')) classes.push('col-1-1');
      if (attrs.includes('1/2')) classes.push('col-1-2');
      if (attrs.includes('1/3')) classes.push('col-1-3');
      if (attrs.includes('2/3')) classes.push('col-2-3');
      if (attrs.includes('1/4')) classes.push('col-1-4');
      if (attrs.includes('3/4')) classes.push('col-3-4');
      if (attrs.includes('5/12')) classes.push('col-5-12');
      if (attrs.includes('7/12')) classes.push('col-7-12');
      return `
`;
    })
    .replace(/\[\/vc_column\]/gi, '
')
    // vc_column_text - convert to div
    .replace(/\[vc_column_text([^\]]*)\]/gi, '
')
    .replace(/\[\/vc_column_text\]/gi, '
')
    // nectar_cta - convert to button
    // Requires link_text to appear before url inside the tag; $2 is the link text.
    .replace(/\[nectar_cta([^\]]*)link_text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi, '$2')
    // nectar_highlighted_text - convert to span
    // (.*?) does not cross line breaks, so only single-line contents are matched.
    .replace(/\[nectar_highlighted_text([^\]]*)\](.*?)\[\/nectar_highlighted_text\]/gi, '$2')
    // nectar_responsive_text - convert to span
    .replace(/\[nectar_responsive_text([^\]]*)\](.*?)\[\/nectar_responsive_text\]/gi, '$2')
    // nectar_icon_list - convert to ul
    .replace(/\[nectar_icon_list([^\]]*)\]/gi, '
    ')
    .replace(/\[\/nectar_icon_list\]/gi, '
')
    // nectar_icon_list_item - convert to li
    // Requires header to appear before text inside the tag; $2 is the header, $4 the text.
    .replace(/\[nectar_icon_list_item([^\]]*)header="([^"]*)"(.*?)text="([^"]*)"(.*?)\]/gi, '
  • $2: $4
  • ')
    // nectar_btn - convert to button
    .replace(/\[nectar_btn([^\]]*)text="([^"]*)"(.*?)url="([^"]*)"(.*?)\]/gi, '$2')
    // split_line_heading - convert to heading
    .replace(/\[split_line_heading([^\]]*)text_content="([^"]*)"(.*?)\]/gi, '

    $2

    ')
    // vc_row_inner - convert to div
    .replace(/\[vc_row_inner([^\]]*)\]/gi, '
    ')
    .replace(/\[\/vc_row_inner\]/gi, '
    ')
    // vc_column_inner - convert to div
    .replace(/\[vc_column_inner([^\]]*)\]/gi, '
    ')
    .replace(/\[\/vc_column_inner\]/gi, '
    ')
    // divider - convert to hr
    .replace(/\[divider([^\]]*)\]/gi, '
    ')
    // vc_gallery - convert to div (placeholder)
    .replace(/\[vc_gallery([^\]]*)\]/gi, '')
    // vc_raw_js - remove or convert to div
    // The captured script text is discarded; only the placeholder literal is emitted.
    .replace(/\[vc_raw_js\](.*?)\[\/vc_raw_js\]/gi, '
    [JavaScript]
    ')
    // nectar_gmap - convert to div
    .replace(/\[nectar_gmap([^\]]*)\]/gi, '
    [Google Map]
    '); // Remove any remaining shortcodes processed = processed.replace(/\[.*?\]/g, ''); // Clean up any HTML that might be broken processed = processed.replace(/]*>\s*<\/p>/gi, ''); processed = processed.replace(/]*>\s*<\/div>/gi, ''); // Normalize whitespace processed = processed.replace(/\s+/g, ' ').trim(); return processed; } // Extract excerpt from content function generateExcerpt(content, maxLength = 200) { const text = content.replace(/<[^>]*>/g, ''); if (text.length <= maxLength) return text; return text.substring(0, maxLength) + '...'; } // Process pages with bg_image resolution async function processPages(pagesEN, pagesDE, translationMapping, mediaMapping, assetMap) { const processed = []; // Helper to decode HTML entities in content const decodeContent = (html) => { if (!html) return html; // Decode numeric HTML entities first let decoded = html .replace(/"/g, '"') // ” - Right double quote .replace(/"/g, '"') // “ - Left double quote .replace(/'/g, "'") // ’ - Right single quote .replace(/'/g, "'") // ‘ - Left single quote .replace(/–/g, '-') // – - En dash .replace(/—/g, '—') // — - Em dash .replace(/…/g, '…') // … - Ellipsis .replace(/"/g, '"') // ″ - Double quote .replace(/'/g, "'") // ′ - Single quote // Decode Unicode characters .replace(/”/g, '"') // Right double quote .replace(/“/g, '"') // Left double quote .replace(/‘/g, "'") // Left single quote .replace(/’/g, "'") // Right single quote .replace(/–/g, '-') // En dash .replace(/—/g, '—') // Em dash .replace(/…/g, '…') // Ellipsis .replace(/″/g, '"') // Double quote .replace(/′/g, "'") // Single quote // Decode named HTML entities .replace(/"/g, '"') .replace(/'/g, "'"); // Also handle any remaining numeric entities decoded = decoded.replace(/&#(\d+);/g, (match, code) => { return String.fromCharCode(parseInt(code, 10)); }); return decoded; }; // Process English pages for (const page of pagesEN) { const translationKey = page.slug; const deMatch = translationMapping.pages[translationKey]; // 
Decode HTML entities first, then replace bg_image IDs and URLs let contentHtml = decodeContent(page.contentHtml); contentHtml = replaceBgImageIds(contentHtml, mediaMapping); contentHtml = replaceUrlsWithLocalPaths(contentHtml, assetMap); // Process video attributes and download videos const videoResult = await processVideoAttributes(contentHtml); contentHtml = videoResult.html; let excerptHtml = decodeContent(page.excerptHtml); excerptHtml = replaceBgImageIds(excerptHtml, mediaMapping); excerptHtml = replaceUrlsWithLocalPaths(excerptHtml, assetMap); processed.push({ id: page.id, translationKey: translationKey, locale: 'en', slug: page.slug, path: `/${page.slug}`, title: page.titleHtml.replace(/<[^>]*>/g, ''), titleHtml: page.titleHtml, contentHtml: sanitizeHTML(contentHtml), excerptHtml: processExcerptShortcodes(excerptHtml) || generateExcerpt(contentHtml), featuredImage: page.featuredImage, updatedAt: page.updatedAt, translation: deMatch ? { locale: 'de', id: deMatch.de } : null }); } // Process German pages for (const page of pagesDE) { const translationKey = page.slug; const enMatch = translationMapping.pages[translationKey]; // Decode HTML entities first, then replace bg_image IDs and URLs let contentHtml = decodeContent(page.contentHtml); contentHtml = replaceBgImageIds(contentHtml, mediaMapping); contentHtml = replaceUrlsWithLocalPaths(contentHtml, assetMap); // Process video attributes and download videos const videoResult = await processVideoAttributes(contentHtml); contentHtml = videoResult.html; let excerptHtml = decodeContent(page.excerptHtml); excerptHtml = replaceBgImageIds(excerptHtml, mediaMapping); excerptHtml = replaceUrlsWithLocalPaths(excerptHtml, assetMap); processed.push({ id: page.id, translationKey: translationKey, locale: 'de', slug: page.slug, path: `/de/${page.slug}`, title: page.titleHtml.replace(/<[^>]*>/g, ''), titleHtml: page.titleHtml, contentHtml: sanitizeHTML(contentHtml), excerptHtml: processExcerptShortcodes(excerptHtml) || 
generateExcerpt(contentHtml), featuredImage: page.featuredImage, updatedAt: page.updatedAt, translation: enMatch ? { locale: 'en', id: enMatch.en } : null }); } return processed; } // Process posts with bg_image resolution async function processPosts(postsEN, postsDE, translationMapping, mediaMapping, assetMap) { const processed = []; // Helper to decode HTML entities in content const decodeContent = (html) => { if (!html) return html; return html // Numeric HTML entities .replace(/”/g, '"') // Right double quote .replace(/″/g, '"') // Right double quote .replace(/“/g, '"') // Left double quote .replace(/„/g, '"') // Left double quote .replace(/‘/g, "'") // Left single quote .replace(/’/g, "'") // Right single quote .replace(/–/g, '-') // En dash .replace(/—/g, '—') // Em dash .replace(/…/g, '…') // Ellipsis .replace(/”/g, '"') // Right double quote .replace(/“/g, '"') // Left double quote .replace(/‘/g, "'") // Left single quote .replace(/’/g, "'") // Right single quote // Unicode characters .replace(/”/g, '"') // Right double quote .replace(/“/g, '"') // Left double quote .replace(/‘/g, "'") // Left single quote .replace(/’/g, "'") // Right single quote .replace(/–/g, '-') // En dash .replace(/—/g, '—') // Em dash .replace(/…/g, '…') // Ellipsis .replace(/″/g, '"') // Double quote .replace(/′/g, "'") // Single quote // Named HTML entities .replace(/"/g, '"') .replace(/'/g, "'"); }; for (const post of postsEN) { const translationKey = post.slug; const deMatch = translationMapping.posts[translationKey]; // Decode HTML entities first, then replace bg_image IDs and URLs let contentHtml = decodeContent(post.contentHtml); contentHtml = replaceBgImageIds(contentHtml, mediaMapping); contentHtml = replaceUrlsWithLocalPaths(contentHtml, assetMap); // Process video attributes and download videos const videoResult = await processVideoAttributes(contentHtml); contentHtml = videoResult.html; let excerptHtml = decodeContent(post.excerptHtml); excerptHtml = 
replaceBgImageIds(excerptHtml, mediaMapping); excerptHtml = replaceUrlsWithLocalPaths(excerptHtml, assetMap); processed.push({ id: post.id, translationKey: translationKey, locale: 'en', slug: post.slug, path: `/blog/${post.slug}`, title: post.titleHtml.replace(/<[^>]*>/g, ''), titleHtml: post.titleHtml, contentHtml: sanitizeHTML(contentHtml), excerptHtml: processExcerptShortcodes(excerptHtml) || generateExcerpt(contentHtml), featuredImage: post.featuredImage, datePublished: post.datePublished, updatedAt: post.updatedAt, translation: deMatch ? { locale: 'de', id: deMatch.de } : null }); } for (const post of postsDE) { const translationKey = post.slug; const enMatch = translationMapping.posts[translationKey]; // Decode HTML entities first, then replace bg_image IDs and URLs let contentHtml = decodeContent(post.contentHtml); contentHtml = replaceBgImageIds(contentHtml, mediaMapping); contentHtml = replaceUrlsWithLocalPaths(contentHtml, assetMap); // Process video attributes and download videos const videoResult = await processVideoAttributes(contentHtml); contentHtml = videoResult.html; let excerptHtml = decodeContent(post.excerptHtml); excerptHtml = replaceBgImageIds(excerptHtml, mediaMapping); excerptHtml = replaceUrlsWithLocalPaths(excerptHtml, assetMap); processed.push({ id: post.id, translationKey: translationKey, locale: 'de', slug: post.slug, path: `/de/blog/${post.slug}`, title: post.titleHtml.replace(/<[^>]*>/g, ''), titleHtml: post.titleHtml, contentHtml: sanitizeHTML(contentHtml), excerptHtml: processExcerptShortcodes(excerptHtml) || generateExcerpt(contentHtml), featuredImage: post.featuredImage, datePublished: post.datePublished, updatedAt: post.updatedAt, translation: enMatch ? 
{ locale: 'en', id: enMatch.en } : null }); } return processed; } // Process products function processProducts(productsEN, productsDE, translationMapping) { const processed = []; productsEN.forEach(product => { const translationKey = product.slug; const deMatch = translationMapping.products[translationKey]; processed.push({ id: product.id, translationKey: translationKey, locale: 'en', slug: product.slug, path: `/product/${product.slug}`, name: product.name, shortDescriptionHtml: product.shortDescriptionHtml, descriptionHtml: sanitizeHTML(product.descriptionHtml), images: product.images, featuredImage: product.featuredImage, sku: product.sku, regularPrice: product.regularPrice, salePrice: product.salePrice, currency: product.currency, stockStatus: product.stockStatus, categories: product.categories, attributes: product.attributes, variations: product.variations, updatedAt: product.updatedAt, translation: deMatch ? { locale: 'de', id: deMatch.de } : null }); }); productsDE.forEach(product => { const translationKey = product.slug; const enMatch = translationMapping.products[translationKey]; processed.push({ id: product.id, translationKey: translationKey, locale: 'de', slug: product.slug, path: `/de/product/${product.slug}`, name: product.name, shortDescriptionHtml: product.shortDescriptionHtml, descriptionHtml: sanitizeHTML(product.descriptionHtml), images: product.images, featuredImage: product.featuredImage, sku: product.sku, regularPrice: product.regularPrice, salePrice: product.salePrice, currency: product.currency, stockStatus: product.stockStatus, categories: product.categories, attributes: product.attributes, variations: product.variations, updatedAt: product.updatedAt, translation: enMatch ? 
{ locale: 'en', id: enMatch.en } : null }); }); return processed; } // Process product categories function processProductCategories(categoriesEN, categoriesDE, translationMapping) { const processed = []; categoriesEN.forEach(category => { const translationKey = category.slug; const deMatch = translationMapping.productCategories[translationKey]; processed.push({ id: category.id, translationKey: translationKey, locale: 'en', slug: category.slug, name: category.name, path: `/product-category/${category.slug}`, description: category.description, count: category.count, translation: deMatch ? { locale: 'de', id: deMatch.de } : null }); }); categoriesDE.forEach(category => { const translationKey = category.slug; const enMatch = translationMapping.productCategories[translationKey]; processed.push({ id: category.id, translationKey: translationKey, locale: 'de', slug: category.slug, name: category.name, path: `/de/product-category/${category.slug}`, description: category.description, count: category.count, translation: enMatch ? 
{ locale: 'en', id: enMatch.en } : null }); }); return processed; } // Process media manifest function processMedia(media) { return media.map(item => ({ id: item.id, filename: item.filename, url: item.url, localPath: `/media/${item.filename}`, alt: item.alt, width: item.width, height: item.height, mimeType: item.mime_type })); } // Generate asset map for URL replacement function generateAssetMap(media) { const map = {}; media.forEach(item => { if (item.url) { map[item.url] = `/media/${item.filename}`; } }); return map; } // Main processing function async function main() { const exportDir = getLatestExportDir(); console.log('🔄 Processing WordPress Data for Next.js (with bg_image support)'); console.log('===============================================================\n'); // Load media mapping and asset map const mediaMapping = loadMediaMapping(); const assetMap = loadAssetMap(); console.log(`📊 Media mapping loaded: ${Object.keys(mediaMapping).length} IDs`); console.log(`🔗 Asset map loaded: ${Object.keys(assetMap).length} URLs\n`); // Load raw data const loadJSON = (file) => { try { return JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8')); } catch (e) { console.error(`❌ Failed to load ${file}:`, e.message); return []; } }; const translationMapping = loadJSON('translation-mapping.json'); const pagesEN = loadJSON('pages.en.json'); const pagesDE = loadJSON('pages.de.json'); const postsEN = loadJSON('posts.en.json'); const postsDE = loadJSON('posts.de.json'); const productsEN = loadJSON('products.en.json'); const productsDE = loadJSON('products.de.json'); const categoriesEN = loadJSON('product-categories.en.json'); const categoriesDE = loadJSON('product-categories.de.json'); const media = loadJSON('media.json'); const redirects = loadJSON('redirects.json'); const siteInfo = loadJSON('site-info.json'); console.log('📊 Processing content types...\n'); // Process each content type with bg_image resolution const pages = await processPages(pagesEN, pagesDE, 
translationMapping, mediaMapping, assetMap); const posts = await processPosts(postsEN, postsDE, translationMapping, mediaMapping, assetMap); const products = processProducts(productsEN, productsDE, translationMapping); const categories = processProductCategories(categoriesEN, categoriesDE, translationMapping); const processedMedia = processMedia(media); const finalAssetMap = generateAssetMap(media); // Create processed data structure const processedData = { site: { title: siteInfo.siteTitle, description: siteInfo.siteDescription, baseUrl: siteInfo.baseUrl, defaultLocale: siteInfo.defaultLocale || 'en', locales: ['en', 'de'] }, content: { pages, posts, products, categories }, assets: { media: processedMedia, map: finalAssetMap }, redirects, exportDate: new Date().toISOString() }; // Save processed data const outputPath = path.join(PROCESSED_DIR, 'wordpress-data.json'); fs.writeFileSync(outputPath, JSON.stringify(processedData, null, 2)); // Save individual files for easier access fs.writeFileSync(path.join(PROCESSED_DIR, 'pages.json'), JSON.stringify(pages, null, 2)); fs.writeFileSync(path.join(PROCESSED_DIR, 'posts.json'), JSON.stringify(posts, null, 2)); fs.writeFileSync(path.join(PROCESSED_DIR, 'products.json'), JSON.stringify(products, null, 2)); fs.writeFileSync(path.join(PROCESSED_DIR, 'categories.json'), JSON.stringify(categories, null, 2)); fs.writeFileSync(path.join(PROCESSED_DIR, 'media.json'), JSON.stringify(processedMedia, null, 2)); fs.writeFileSync(path.join(PROCESSED_DIR, 'asset-map.json'), JSON.stringify(finalAssetMap, null, 2)); // Summary console.log('✅ Data Processing Complete\n'); console.log('📦 Processed Content:'); console.log(` Pages: ${pages.length} (with translations)`); console.log(` Posts: ${posts.length} (with translations)`); console.log(` Products: ${products.length} (with translations)`); console.log(` Categories: ${categories.length} (with translations)`); console.log(` Media: ${processedMedia.length} files`); console.log(` Redirects: 
${redirects.length} rules\n`); console.log('📁 Output Files:'); console.log(` ${outputPath}`); console.log(` ${path.join(PROCESSED_DIR, 'pages.json')}`); console.log(` ${path.join(PROCESSED_DIR, 'posts.json')}`); console.log(` ${path.join(PROCESSED_DIR, 'products.json')}`); console.log(` ${path.join(PROCESSED_DIR, 'categories.json')}`); console.log(` ${path.join(PROCESSED_DIR, 'media.json')}`); console.log(` ${path.join(PROCESSED_DIR, 'asset-map.json')}\n`); // Check for bg_image replacements const bgImagePattern = /bg_image="(\d+)"/g; const layerImagePattern = /layer_one_image="(\d+)"/g; const imageUrlPattern = /image_url="(\d+)"/g; let bgImageMatches = 0; let layerImageMatches = 0; let imageUrlMatches = 0; [...pages, ...posts].forEach(item => { const bgMatches = (item.contentHtml.match(bgImagePattern) || []).length; const layerMatches = (item.contentHtml.match(layerImagePattern) || []).length; const imageMatches = (item.contentHtml.match(imageUrlPattern) || []).length; bgImageMatches += bgMatches; layerImageMatches += layerMatches; imageUrlMatches += imageMatches; }); if (bgImageMatches > 0 || layerImageMatches > 0 || imageUrlMatches > 0) { console.log('🖼️ bg_image Resolution:'); console.log(` bg_image IDs found: ${bgImageMatches}`); console.log(` layer_one_image IDs found: ${layerImageMatches}`); console.log(` image_url IDs found: ${imageUrlMatches}`); console.log(' ✅ All bg_image IDs have been replaced with local paths\n'); } // Sample data if (pages.length > 0) { console.log('📄 Sample Page:'); console.log(` Title: ${pages[0].title}`); console.log(` Path: ${pages[0].path}`); console.log(` Locale: ${pages[0].locale}`); console.log(` Translation: ${pages[0].translation ? 
'Yes' : 'No'}\n`); } if (posts.length > 0) { console.log('📝 Sample Post:'); console.log(` Title: ${posts[0].title}`); console.log(` Path: ${posts[0].path}`); console.log(` Locale: ${posts[0].locale}`); console.log(` Date: ${posts[0].datePublished}\n`); } console.log('💡 Next: Ready for Next.js project setup!'); } if (require.main === module) { main().catch(console.error); } module.exports = { processPages, processPosts, processProducts, processProductCategories, processMedia, generateAssetMap, replaceBgImageIds, replaceUrlsWithLocalPaths, processVideoAttributes };