#!/usr/bin/env node
|
|
|
|
/**
 * Improved Translation Mapping Script
 *
 * Creates translation pairs by analyzing content similarity and patterns.
 */
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
// Directory whose sub-directories are the raw exports: <this script's dir>/../data/raw.
const DATA_DIR = path.join(__dirname, '..', 'data', 'raw');
|
|
|
|
/**
 * Find the latest export directory inside DATA_DIR.
 *
 * "Latest" is the sub-directory whose name comes last in the default string
 * sort order. NOTE(review): this presumably relies on export directories
 * being named with a sortable timestamp — confirm against the exporter.
 *
 * @returns {string} DATA_DIR joined with the name of the selected sub-directory.
 * @throws {Error} If DATA_DIR contains no sub-directories. Errors from reading
 *   DATA_DIR itself (e.g. ENOENT) propagate unchanged.
 */
function getLatestExportDir() {
  const dirs = fs
    .readdirSync(DATA_DIR)
    .filter((name) => fs.statSync(path.join(DATA_DIR, name)).isDirectory())
    .sort();

  // Without this guard path.join() would receive `undefined` and fail with an
  // unhelpful "path argument must be of type string" TypeError.
  if (dirs.length === 0) {
    throw new Error(`No export directories found in "${DATA_DIR}"`);
  }

  return path.join(DATA_DIR, dirs[dirs.length - 1]);
}
|
|
|
|
/**
 * Normalized text similarity derived from the Levenshtein edit distance.
 *
 * The distance is counted in UTF-16 code units (insertions, deletions and
 * substitutions each cost 1) and scaled by the length of the longer string.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} (longerLength - distance) / longerLength, i.e. 1 for
 *   identical strings and 0 when nothing aligns; 1.0 when both are empty.
 */
function similarity(str1, str2) {
  const [longer, shorter] = str1.length > str2.length ? [str1, str2] : [str2, str1];
  const longerLength = longer.length;

  if (longerLength === 0) {
    return 1.0;
  }

  // Classic dynamic programme keeping only two rows of the distance matrix:
  // previousRow[c] is the distance between the first (row - 1) characters of
  // `longer` and the first c characters of `shorter`.
  let previousRow = Array.from({ length: shorter.length + 1 }, (_, column) => column);

  for (let row = 1; row <= longerLength; row += 1) {
    const currentRow = [row];
    for (let column = 1; column <= shorter.length; column += 1) {
      const substitutionCost = longer.charAt(row - 1) === shorter.charAt(column - 1) ? 0 : 1;
      const viaDeletion = previousRow[column] + 1;
      const viaInsertion = currentRow[column - 1] + 1;
      const viaSubstitution = previousRow[column - 1] + substitutionCost;
      currentRow.push(Math.min(viaDeletion, viaInsertion, viaSubstitution));
    }
    previousRow = currentRow;
  }

  const distance = previousRow[shorter.length];
  return (longerLength - distance) / longerLength;
}
|
|
|
|
/**
 * Reduce a slug to an order-independent keyword string.
 *
 * The slug is split on '-', parts of three characters or fewer are dropped,
 * and the remaining parts are sorted (default string order) and re-joined
 * with '-'.
 *
 * @param {string} slug - Hyphen-separated slug.
 * @returns {string} Sorted keywords joined by '-', or '' if none remain.
 */
function extractKeywords(slug) {
  const keywords = [];

  for (const part of slug.split('-')) {
    if (part.length >= 4) {
      keywords.push(part);
    }
  }

  keywords.sort();
  return keywords.join('-');
}
|
|
|
|
/**
 * Find translation pairs using multiple strategies.
 *
 * Every EN item is scored against every DE item that has not been paired yet:
 *   - 40% slug keywords (extractKeywords + similarity),
 *   - 40% title text    (titleHtml with tags stripped, lower-cased),
 *   - 20% content text  (first 200 characters of contentHtml after tags are
 *                        stripped, lower-cased).
 * The best-scoring DE item is taken if its score is strictly above `threshold`.
 *
 * A signal that is missing or empty on either side contributes 0. When both
 * slugs consist only of short words (no keywords survive), the raw slugs are
 * compared instead, so unrelated short slugs do not count as identical.
 *
 * NOTE(review): matching is greedy in EN input order — an earlier EN item can
 * claim a DE item that a later EN item matches better.
 * NOTE(review): a pair with neither titles nor content can score at most 0.4,
 * which is below the default threshold of 0.6.
 *
 * @param {Array<Object>} itemsEN - English items; `id` and `slug` are read,
 *   `titleHtml` and `contentHtml` are used when truthy.
 * @param {Array<Object>} itemsDE - German items with the same fields.
 * @param {number} [threshold=0.6] - Minimum combined score (exclusive).
 * @returns {Array<{translationKey: string, en: *, de: *, score: number,
 *   enSlug: string, deSlug: string}>} One entry per matched EN item;
 *   `translationKey` is the EN slug.
 */
function findTranslationPairs(itemsEN, itemsDE, threshold = 0.6) {
  const PREVIEW_LENGTH = 200;

  if (itemsEN.length === 0 || itemsDE.length === 0) {
    return [];
  }

  const stripTags = (html) => html.replace(/<[^>]*>/g, '');

  // Normalize each item once instead of once per EN/DE combination.
  const prepare = (item) => ({
    item,
    keywords: extractKeywords(item.slug),
    title: item.titleHtml ? stripTags(item.titleHtml).toLowerCase() : '',
    // Strip tags BEFORE truncating: cutting the raw HTML first can split a tag
    // in half, and the unterminated fragment would survive the tag regex.
    preview: item.contentHtml
      ? stripTags(item.contentHtml).substring(0, PREVIEW_LENGTH).toLowerCase()
      : ''
  });

  // Two empty strings carry no information; similarity('', '') would be 1.
  const textScore = (a, b) => (a !== '' && b !== '' ? similarity(a, b) : 0);

  const slugScore = (en, de) => {
    if (en.keywords === '' && de.keywords === '') {
      // No keyword survived on either side (e.g. "faq"): compare raw slugs.
      return textScore(en.item.slug, de.item.slug);
    }
    return similarity(en.keywords, de.keywords);
  };

  const candidatesDE = itemsDE.map(prepare);
  const usedDE = new Set();
  const pairs = [];

  itemsEN.forEach((enItem) => {
    const en = prepare(enItem);
    let bestMatch = null;
    let bestScore = 0;

    candidatesDE.forEach((de) => {
      if (usedDE.has(de.item.id)) return;

      const keywordScore = slugScore(en, de);
      const titleScore = textScore(en.title, de.title);
      const contentScore = textScore(en.preview, de.preview);

      // Combined score (weighted)
      const combinedScore = (keywordScore * 0.4) + (titleScore * 0.4) + (contentScore * 0.2);

      if (combinedScore > bestScore && combinedScore > threshold) {
        bestScore = combinedScore;
        bestMatch = de.item;
      }
    });

    if (bestMatch) {
      usedDE.add(bestMatch.id);
      pairs.push({
        translationKey: `${enItem.slug}`,
        en: enItem.id,
        de: bestMatch.id,
        score: bestScore,
        enSlug: enItem.slug,
        deSlug: bestMatch.slug
      });
    }
  });

  return pairs;
}
|
|
|
|
/**
 * Main function: build the EN/DE translation mapping for the latest export.
 *
 * Steps:
 *   1. Resolve the export directory via getLatestExportDir().
 *   2. Load `<type>.en.json` / `<type>.de.json` for pages, posts, products and
 *      product categories. A missing file counts as an empty list; unreadable,
 *      malformed or non-array files are reported with a warning and also
 *      treated as empty.
 *   3. Pair items per content type with findTranslationPairs().
 *   4. Write `translation-mapping-improved.json` into the export directory,
 *      keyed by content type and then by translation key.
 *   5. Print a summary, sample pairs and per-type unmatched counts.
 *
 * @returns {void}
 * @throws Propagates errors from getLatestExportDir() and from writing the
 *   output file.
 */
function main() {
  const exportDir = getLatestExportDir();
  console.log('🔧 Improving Translation Mapping');
  console.log('================================\n');

  // Load data (best effort: problems are reported, never fatal).
  const loadJSON = (file) => {
    let raw;
    try {
      raw = fs.readFileSync(path.join(exportDir, file), 'utf8');
    } catch (err) {
      // A missing file simply means "no content of this type".
      if (err.code !== 'ENOENT') {
        console.warn(`⚠️  Could not read ${file}: ${err.message}`);
      }
      return [];
    }

    try {
      const parsed = JSON.parse(raw);
      if (Array.isArray(parsed)) {
        return parsed;
      }
      console.warn(`⚠️  Ignoring ${file}: expected a JSON array`);
    } catch (err) {
      console.warn(`⚠️  Ignoring ${file}: invalid JSON (${err.message})`);
    }
    return [];
  };

  // One row per content type; everything below is driven by this table.
  const sections = [
    { key: 'pages', label: 'Pages', noun: 'pages', file: 'pages', threshold: 0.5, sampleHeading: null },
    { key: 'posts', label: 'Posts', noun: 'posts', file: 'posts', threshold: 0.5, sampleHeading: '📝 Sample Post Pairs:' },
    { key: 'products', label: 'Products', noun: 'products', file: 'products', threshold: 0.6, sampleHeading: '📦 Sample Product Pairs:' },
    { key: 'productCategories', label: 'Categories', noun: 'categories', file: 'product-categories', threshold: 0.5, sampleHeading: null }
  ].map((section) => Object.assign({}, section, {
    en: loadJSON(`${section.file}.en.json`),
    de: loadJSON(`${section.file}.de.json`)
  }));

  console.log('📊 Content loaded:');
  sections.forEach(({ label, en, de }) => {
    console.log(` ${label}: ${en.length} EN, ${de.length} DE`);
  });
  console.log('');

  // Find pairs
  console.log('🔍 Finding translation pairs...\n');
  const results = sections.map((section) => Object.assign({}, section, {
    pairs: findTranslationPairs(section.en, section.de, section.threshold)
  }));

  // Build mapping
  const mapping = {};
  results.forEach(({ key, pairs }) => {
    mapping[key] = {};
    pairs.forEach(({ translationKey, en, de, score }) => {
      mapping[key][translationKey] = { en, de, score };
    });
  });

  // Save improved mapping
  const outputPath = path.join(exportDir, 'translation-mapping-improved.json');
  fs.writeFileSync(outputPath, JSON.stringify(mapping, null, 2));

  // Summary
  console.log('✅ Translation Mapping Complete\n');
  console.log('Pairs found:');
  results.forEach(({ label, pairs }) => {
    console.log(` ${label}: ${pairs.length}`);
  });
  const totalPairs = results.reduce((sum, { pairs }) => sum + pairs.length, 0);
  console.log(` Total: ${totalPairs}\n`);

  // Show some examples
  results.forEach(({ sampleHeading, pairs }) => {
    if (sampleHeading === null || pairs.length === 0) return;
    console.log(sampleHeading);
    pairs.slice(0, 3).forEach((pair) => {
      console.log(` ${pair.enSlug} (${pair.score.toFixed(2)})`);
      console.log(` ↔ ${pair.deSlug}`);
      console.log('');
    });
  });

  // Show unmatched items. Matched ids are tracked per content type: ids from
  // different types may coincide, so one shared set would hide unmatched items.
  const describeUnmatched = (side) => results
    .map((result) => {
      const matchedIds = new Set(result.pairs.map((pair) => pair[side]));
      const count = result[side].filter((item) => !matchedIds.has(item.id)).length;
      return `${count} ${result.noun}`;
    })
    .join(', ');

  console.log('🔍 Unmatched Items (may need manual review):');
  console.log(` EN: ${describeUnmatched('en')}`);
  console.log(` DE: ${describeUnmatched('de')}`);

  console.log('\n💾 File saved:', outputPath);
  console.log('\n💡 Next steps:');
  console.log(' 1. Review the improved mapping for accuracy');
  console.log(' 2. Manually add any missing pairs');
  console.log(' 3. Use this mapping for Next.js i18n implementation');
}
|
|
|
|
// Run main() only when this file is executed directly (e.g. `node script.js`),
// not when it is loaded from another module via require().
const isExecutedDirectly = require.main === module;
if (isExecutedDirectly) {
  main();
}