Files
klz-cables.com/scripts/improve-translation-mapping.js
2025-12-28 23:28:31 +01:00

246 lines
8.3 KiB
JavaScript
Executable File

#!/usr/bin/env node
/**
* Improved Translation Mapping Script
* Creates translation pairs by analyzing content similarity and patterns
*/
const fs = require('fs');
const path = require('path');
const DATA_DIR = path.join(__dirname, '..', 'data', 'raw');
// Find the latest export directory
/**
 * Locate the most recent export directory under DATA_DIR.
 *
 * Sub-directories are ordered by name with the default string sort and the
 * last one in that order is returned. This assumes directory names sort
 * chronologically (presumably timestamped names; verify against the exporter).
 *
 * @returns {string} Path of the selected export directory.
 * @throws {Error} If DATA_DIR contains no sub-directories.
 */
function getLatestExportDir() {
  const dirs = fs.readdirSync(DATA_DIR).filter((entry) =>
    fs.statSync(path.join(DATA_DIR, entry)).isDirectory()
  );

  // Without this guard, path.join(DATA_DIR, undefined) fails with an opaque
  // TypeError that says nothing about the missing export.
  if (dirs.length === 0) {
    throw new Error(`No export directories found in ${DATA_DIR}`);
  }

  dirs.sort();
  return path.join(DATA_DIR, dirs[dirs.length - 1]);
}
// Normalized Levenshtein similarity: 1 - (edit distance / length of the longer string)
/**
 * Score how alike two strings are on a 0..1 scale.
 *
 * The score is (L - d) / L, where L is the length of the longer string and d
 * is the Levenshtein edit distance between the two strings. Identical strings
 * score 1; two empty strings are treated as identical.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Similarity ratio between 0 and 1.
 */
function similarity(str1, str2) {
  const [longer, shorter] = str1.length > str2.length ? [str1, str2] : [str2, str1];
  const longLen = longer.length;
  if (longLen === 0) {
    return 1.0;
  }

  const shortLen = shorter.length;
  // Distances from the empty prefix of `longer` to every prefix of `shorter`.
  let prevRow = Array.from({ length: shortLen + 1 }, (_, col) => col);

  for (let row = 1; row <= longLen; row += 1) {
    // Column 0: turning a `row`-character prefix into "" costs `row` deletions.
    const currRow = [row];
    for (let col = 1; col <= shortLen; col += 1) {
      if (longer.charAt(row - 1) === shorter.charAt(col - 1)) {
        currRow[col] = prevRow[col - 1];
      } else {
        // Cheapest of substitution, insertion, deletion, plus one edit.
        currRow[col] = 1 + Math.min(prevRow[col - 1], currRow[col - 1], prevRow[col]);
      }
    }
    prevRow = currRow;
  }

  const distance = prevRow[shortLen];
  return (longLen - distance) / longLen;
}
// Extract keywords from slug
/**
 * Reduce a slug to a canonical keyword string.
 *
 * The slug is split on '-', segments of three characters or fewer are
 * discarded, and the remaining segments are sorted (default string order)
 * and re-joined with '-'.
 *
 * @param {string} slug - Hyphen-separated slug.
 * @returns {string} Sorted keywords joined by '-', or '' if none remain.
 */
function extractKeywords(slug) {
  const keywords = [];
  for (const segment of slug.split('-')) {
    if (segment.length > 3) {
      keywords.push(segment);
    }
  }
  keywords.sort();
  return keywords.join('-');
}
// Find translation pairs using multiple strategies
/**
 * Pair English items with their most likely German counterparts.
 *
 * For every EN item, each not-yet-claimed DE item is scored with a weighted
 * blend of three similarities:
 *   - 0.4 x similarity of the slug keywords (see extractKeywords),
 *   - 0.4 x similarity of the tag-stripped, lower-cased titles,
 *   - 0.2 x similarity of the tag-stripped, lower-cased first 200 characters
 *     of the content.
 * A title or content score is 0 unless both items provide that field as a
 * non-empty string. The best-scoring DE item is paired if its score is
 * strictly greater than `threshold`. Matching is greedy in EN order: once a
 * DE item is paired it is no longer available to later EN items.
 *
 * @param {Array<{id: *, slug: string, titleHtml?: string, contentHtml?: string}>} itemsEN
 *   English items.
 * @param {Array<{id: *, slug: string, titleHtml?: string, contentHtml?: string}>} itemsDE
 *   German items.
 * @param {number} [threshold=0.6] - Minimum (exclusive) combined score.
 * @returns {Array<{translationKey: string, en: *, de: *, score: number,
 *   enSlug: string, deSlug: string}>} One entry per matched pair; the
 *   translation key is the EN slug.
 */
function findTranslationPairs(itemsEN, itemsDE, threshold = 0.6) {
  const pairs = [];
  if (itemsEN.length === 0 || itemsDE.length === 0) {
    return pairs;
  }

  const stripTags = (html) => html.replace(/<[^>]*>/g, '').toLowerCase();
  const hasText = (value) => typeof value === 'string' && value !== '';

  // Normalise each item once. Doing this inside the EN x DE loop repeated the
  // same keyword extraction and regex work for every combination.
  // NOTE(review): the content preview is cut to 200 characters BEFORE tags are
  // stripped, so a tag split at the cut-off survives as text and the preview
  // may hold little real content. Kept as-is so scores stay unchanged.
  const normalise = (item) => ({
    item,
    keywords: extractKeywords(item.slug),
    title: hasText(item.titleHtml) ? stripTags(item.titleHtml) : null,
    preview: hasText(item.contentHtml)
      ? stripTags(item.contentHtml.substring(0, 200))
      : null
  });

  const candidatesDE = itemsDE.map(normalise);
  const usedDE = new Set();

  for (const enItem of itemsEN) {
    const en = normalise(enItem);
    let bestMatch = null;
    let bestScore = 0;

    for (const de of candidatesDE) {
      if (usedDE.has(de.item.id)) continue;

      // Strategy 1: keyword similarity
      const keywordScore = similarity(en.keywords, de.keywords);

      // Strategy 2: title similarity (only when both sides have a title)
      const titleScore = en.title !== null && de.title !== null
        ? similarity(en.title, de.title)
        : 0;

      // Strategy 3: content preview similarity (only when both have content)
      const contentScore = en.preview !== null && de.preview !== null
        ? similarity(en.preview, de.preview)
        : 0;

      // Combined score (weighted)
      const combinedScore = (keywordScore * 0.4) + (titleScore * 0.4) + (contentScore * 0.2);
      if (combinedScore > bestScore && combinedScore > threshold) {
        bestScore = combinedScore;
        bestMatch = de.item;
      }
    }

    if (bestMatch) {
      usedDE.add(bestMatch.id);
      pairs.push({
        translationKey: `${enItem.slug}`,
        en: enItem.id,
        de: bestMatch.id,
        score: bestScore,
        enSlug: enItem.slug,
        deSlug: bestMatch.slug
      });
    }
  }

  return pairs;
}
// Main function
/**
 * Script entry point.
 *
 * Loads the EN/DE JSON exports for pages, posts, products and product
 * categories from the latest export directory, pairs them with
 * findTranslationPairs, writes the result to
 * `translation-mapping-improved.json` in that directory, and prints a summary
 * (pair counts, sample pairs, unmatched counts) to the console.
 *
 * A missing export file is treated as an empty list. A file that exists but
 * cannot be read or parsed, or that does not contain a JSON array, is also
 * treated as empty but a warning is printed so the problem is not hidden.
 */
function main() {
  const exportDir = getLatestExportDir();
  console.log('🔧 Improving Translation Mapping');
  console.log('================================\n');

  // Load data
  const loadJSON = (file) => {
    let parsed;
    try {
      parsed = JSON.parse(fs.readFileSync(path.join(exportDir, file), 'utf8'));
    } catch (e) {
      // An absent file just means "no content of this type"; anything else
      // (malformed JSON, permissions, ...) deserves a visible warning.
      if (e.code !== 'ENOENT') {
        console.warn(`⚠️ Could not load ${file}: ${e.message}`);
      }
      return [];
    }
    if (!Array.isArray(parsed)) {
      console.warn(`⚠️ Ignoring ${file}: expected a JSON array`);
      return [];
    }
    return parsed;
  };

  const pagesEN = loadJSON('pages.en.json');
  const pagesDE = loadJSON('pages.de.json');
  const postsEN = loadJSON('posts.en.json');
  const postsDE = loadJSON('posts.de.json');
  const productsEN = loadJSON('products.en.json');
  const productsDE = loadJSON('products.de.json');
  const categoriesEN = loadJSON('product-categories.en.json');
  const categoriesDE = loadJSON('product-categories.de.json');

  console.log('📊 Content loaded:');
  console.log(` Pages: ${pagesEN.length} EN, ${pagesDE.length} DE`);
  console.log(` Posts: ${postsEN.length} EN, ${postsDE.length} DE`);
  console.log(` Products: ${productsEN.length} EN, ${productsDE.length} DE`);
  console.log(` Categories: ${categoriesEN.length} EN, ${categoriesDE.length} DE\n`);

  // Find pairs
  console.log('🔍 Finding translation pairs...\n');
  const pagePairs = findTranslationPairs(pagesEN, pagesDE, 0.5);
  const postPairs = findTranslationPairs(postsEN, postsDE, 0.5);
  const productPairs = findTranslationPairs(productsEN, productsDE, 0.6);
  const categoryPairs = findTranslationPairs(categoriesEN, categoriesDE, 0.5);

  // Build mapping: translationKey -> { en, de, score } per content type
  const toMapping = (pairs) => {
    const byKey = {};
    pairs.forEach((pair) => {
      byKey[pair.translationKey] = { en: pair.en, de: pair.de, score: pair.score };
    });
    return byKey;
  };
  const mapping = {
    pages: toMapping(pagePairs),
    posts: toMapping(postPairs),
    products: toMapping(productPairs),
    productCategories: toMapping(categoryPairs)
  };

  // Save improved mapping
  const outputFile = path.join(exportDir, 'translation-mapping-improved.json');
  fs.writeFileSync(outputFile, JSON.stringify(mapping, null, 2));

  // Summary
  console.log('✅ Translation Mapping Complete\n');
  console.log('Pairs found:');
  console.log(` Pages: ${pagePairs.length}`);
  console.log(` Posts: ${postPairs.length}`);
  console.log(` Products: ${productPairs.length}`);
  console.log(` Categories: ${categoryPairs.length}`);
  console.log(` Total: ${pagePairs.length + postPairs.length + productPairs.length + categoryPairs.length}\n`);

  // Show some examples
  const printSamples = (heading, pairs) => {
    if (pairs.length === 0) return;
    console.log(heading);
    pairs.slice(0, 3).forEach((pair) => {
      console.log(` ${pair.enSlug} (${pair.score.toFixed(2)})`);
      console.log(`${pair.deSlug}`);
      console.log('');
    });
  };
  printSamples('📝 Sample Post Pairs:', postPairs);
  printSamples('📦 Sample Product Pairs:', productPairs);

  // Show unmatched items. IDs are compared within one content type only:
  // a single set merged across all types would let an ID matched in one type
  // hide an unmatched item with the same ID in another type.
  const countUnmatched = (items, pairs, lang) => {
    const matchedIds = new Set(pairs.map((pair) => pair[lang]));
    return items.filter((item) => !matchedIds.has(item.id)).length;
  };
  const unmatchedEN = {
    pages: countUnmatched(pagesEN, pagePairs, 'en'),
    posts: countUnmatched(postsEN, postPairs, 'en'),
    products: countUnmatched(productsEN, productPairs, 'en'),
    categories: countUnmatched(categoriesEN, categoryPairs, 'en')
  };
  const unmatchedDE = {
    pages: countUnmatched(pagesDE, pagePairs, 'de'),
    posts: countUnmatched(postsDE, postPairs, 'de'),
    products: countUnmatched(productsDE, productPairs, 'de'),
    categories: countUnmatched(categoriesDE, categoryPairs, 'de')
  };

  console.log('🔍 Unmatched Items (may need manual review):');
  console.log(` EN: ${unmatchedEN.pages} pages, ${unmatchedEN.posts} posts, ${unmatchedEN.products} products, ${unmatchedEN.categories} categories`);
  console.log(` DE: ${unmatchedDE.pages} pages, ${unmatchedDE.posts} posts, ${unmatchedDE.products} products, ${unmatchedDE.categories} categories`);
  console.log('\n💾 File saved:', outputFile);
  console.log('\n💡 Next steps:');
  console.log(' 1. Review the improved mapping for accuracy');
  console.log(' 2. Manually add any missing pairs');
  console.log(' 3. Use this mapping for Next.js i18n implementation');
}
// Run main() only when this file is executed directly, not when it is
// loaded via require() from another module.
const isEntryPoint = require.main === module;
if (isEntryPoint) {
  main();
}