pdfs
This commit is contained in:
173
test-enrichment.js
Normal file
173
test-enrichment.js
Normal file
@@ -0,0 +1,173 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { execSync } = require('child_process');
|
||||
|
||||
// Copy the key functions from the PDF script
|
||||
// Excel workbooks to index. Each relative path is resolved against the
// directory the script is launched from (process.cwd()).
const EXCEL_SOURCE_FILES = [
  'data/source/high-voltage.xlsx',
  'data/source/medium-voltage-KM.xlsx',
  'data/source/low-voltage-KM.xlsx',
  'data/source/solar-cables.xlsx',
].map((relativePath) => path.join(process.cwd(), relativePath));
|
||||
|
||||
/**
 * Builds the canonical lookup key used to match product identifiers against
 * Excel part numbers.
 *
 * The value is stringified (falsy values become ''), trimmed, upper-cased,
 * stripped of one trailing `-<digits>` suffix, and finally reduced to the
 * characters A-Z and 0-9.
 *
 * @param {*} value - Raw identifier (name, slug, SKU, part number, ...).
 * @returns {string} Normalized key; '' when nothing usable remains.
 */
function normalizeExcelKey(value) {
  return String(value || '')
    // Trim first: the suffix pattern below is anchored at the end of the
    // string, so trailing whitespace would otherwise keep the numeric suffix
    // and yield a different key for the same identifier.
    .trim()
    .toUpperCase()
    .replace(/-\d+$/, '')
    .replace(/[^A-Z0-9]+/g, '');
}
|
||||
|
||||
/**
 * Reads a workbook by shelling out to `npx -y xlsx-cli -j "<filePath>"` and
 * parsing the JSON array found on stdout.
 *
 * Anything printed before the first '[' is skipped. stderr of the child
 * process is discarded.
 *
 * @param {string} filePath - Path of the workbook to convert.
 * @returns {Array} Parsed rows, or [] when stdout contains no '[' or the
 *   text from the first '[' onward is not valid JSON.
 * @throws {Error} Whatever `execSync` throws (e.g. the command exits non-zero).
 */
function loadExcelRows(filePath) {
  // NOTE(review): filePath is interpolated into a shell command line; this is
  // fine for the fixed paths used in this file but would need escaping for
  // arbitrary input.
  const out = execSync(`npx -y xlsx-cli -j "${filePath}"`, {
    encoding: 'utf8',
    stdio: ['ignore', 'pipe', 'ignore'],
    // Node's default maxBuffer is 1 MiB; a larger JSON dump would make
    // execSync throw instead of returning the rows.
    maxBuffer: 64 * 1024 * 1024,
  });

  const trimmed = out.trim();
  const jsonStart = trimmed.indexOf('[');
  if (jsonStart < 0) return [];

  try {
    return JSON.parse(trimmed.slice(jsonStart));
  } catch (err) {
    // Stay best-effort (empty result), but say why the file yielded no rows.
    console.warn(`Could not parse xlsx-cli output for ${filePath}: ${err.message}`);
    return [];
  }
}
|
||||
|
||||
/**
 * Returns the lookup index built from every existing file in
 * EXCEL_SOURCE_FILES, building it on the first call and reusing the result
 * (stored on `getExcelIndex.cached`) afterwards.
 *
 * Index shape: Map<normalized part number, { rows: object[], units: object }>.
 * Within a file, the row whose 'Part Number' equals 'Units' supplies the
 * per-column unit strings and is not indexed itself.
 *
 * @returns {Map<string, {rows: Array, units: Object}>}
 */
function getExcelIndex() {
  if (getExcelIndex.cached) return getExcelIndex.cached;

  const index = new Map();

  for (const sourceFile of EXCEL_SOURCE_FILES) {
    if (!fs.existsSync(sourceFile)) continue;

    const sheetRows = loadExcelRows(sourceFile);

    // Collect the non-empty unit labels of this file, keyed by column name.
    const unitsRow = sheetRows.find((row) => row && row['Part Number'] === 'Units');
    const units = {};
    if (unitsRow) {
      Object.entries(unitsRow).forEach(([column, rawUnit]) => {
        if (column === 'Part Number') return;
        const unit = String(rawUnit ?? '').trim();
        if (unit) units[column] = unit;
      });
    }

    for (const row of sheetRows) {
      const partNumber = row?.['Part Number'];
      if (!partNumber || partNumber === 'Units') continue;

      const key = normalizeExcelKey(String(partNumber));
      if (!key) continue;

      const entry = index.get(key);
      if (entry) {
        entry.rows.push(row);
        // Keep whichever units map covers more columns.
        const hasFewerUnits = Object.keys(entry.units).length < Object.keys(units).length;
        if (hasFewerUnits) entry.units = units;
      } else {
        index.set(key, { rows: [row], units });
      }
    }
  }

  getExcelIndex.cached = index;
  return index;
}
|
||||
|
||||
/**
 * Looks up the Excel index entry for a product.
 *
 * Tries, in order: the product name, the slug without a trailing `-<digits>`
 * suffix, the SKU and the translation key. Empty candidates are skipped and
 * the first one whose normalized key maps to an entry with at least one row
 * wins.
 *
 * @param {Object} product - Product record (name, slug, sku, translationKey).
 * @returns {{rows: Array, units: Object}|null} Matching entry, or null.
 */
function findExcelForProduct(product) {
  const index = getExcelIndex();
  const lookupValues = [
    product.name,
    product.slug ? product.slug.replace(/-\d+$/g, '') : '',
    product.sku,
    product.translationKey,
  ];

  for (const lookupValue of lookupValues) {
    if (!lookupValue) continue;

    const entry = index.get(normalizeExcelKey(lookupValue));
    if (entry && entry.rows.length) {
      return entry;
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Convenience wrapper around findExcelForProduct that returns only the rows.
 *
 * @param {Object} product - Product record to look up.
 * @returns {Array} Matching Excel rows, or [] when nothing matched.
 */
function findExcelRowsForProduct(product) {
  const entry = findExcelForProduct(product);
  if (!entry) {
    return [];
  }
  return entry.rows || [];
}
|
||||
|
||||
/**
 * Finds the first column name of `row` that matches one of `patterns`.
 *
 * Patterns are tried in the given order, so earlier patterns take priority
 * over the order of the row's columns.
 *
 * @param {Object|null|undefined} row - Row whose own keys are the column names.
 * @param {RegExp[]} patterns - Candidate patterns, most specific first.
 * @returns {string|null} The matching column name, or null when none matches.
 */
function guessColumnKey(row, patterns) {
  const keys = Object.keys(row || {});
  for (const re of patterns) {
    const k = keys.find((x) => {
      // RegExp#test resumes from lastIndex for /g and /y patterns; reset it so
      // a regex reused across keys or calls cannot skip a real match.
      re.lastIndex = 0;
      return re.test(String(x));
    });
    if (k) return k;
  }
  return null;
}
|
||||
|
||||
/**
 * Turns a raw cell value into clean display text: stringifies it, removes
 * anything that looks like an HTML tag, collapses whitespace runs to a single
 * space and trims the ends.
 *
 * null, undefined, false and NaN yield ''. Numeric zero is kept as '0' so a
 * legitimate zero value is not mistaken for an empty cell.
 *
 * @param {*} value - Raw value.
 * @returns {string} Normalized text, possibly ''.
 */
function normalizeValue(value) {
  const isBlank = value == null || value === false || Number.isNaN(value);
  const text = isBlank ? '' : String(value);
  return text
    .replace(/<[^>]*>/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Normalizes every option, drops the empty ones and removes duplicates
 * case-insensitively, keeping the first spelling seen and the original order.
 *
 * @param {Array} options - Raw values.
 * @returns {string[]} Distinct normalized values.
 */
function getUniqueNonEmpty(options) {
  // Map preserves insertion order, so values() yields first occurrences in order.
  const firstSpellingByLowercase = new Map();

  options.map(normalizeValue).forEach((text) => {
    if (!text) return;
    const folded = text.toLowerCase();
    if (!firstSpellingByLowercase.has(folded)) {
      firstSpellingByLowercase.set(folded, text);
    }
  });

  return [...firstSpellingByLowercase.values()];
}
|
||||
|
||||
/**
 * Reports whether a value, once normalized, is a plain decimal number:
 * optional leading minus, digits, and an optional fractional part. Commas are
 * treated as decimal points.
 *
 * @param {*} value - Raw value.
 * @returns {boolean} True when the normalized text is numeric.
 */
function looksNumeric(value) {
  const candidate = normalizeValue(value).replace(/,/g, '.');
  const isPlainDecimal = /^-?\d+(?:\.\d+)?$/.test(candidate);
  return isPlainDecimal;
}
|
||||
|
||||
// Test the enrichment for a specific product.
// Usage: node test-enrichment.js [slug]
// The slug defaults to 'na2xsfl2y-3' when no argument is given.
const targetSlug = process.argv[2] || 'na2xsfl2y-3';
const products = JSON.parse(fs.readFileSync('data/processed/products.json', 'utf8'));
const testProduct = products.find(p => p.slug === targetSlug);

if (testProduct) {
  console.log('=== Original Product ===');
  console.log('ID:', testProduct.id);
  console.log('Slug:', testProduct.slug);
  console.log('Name:', testProduct.name);
  console.log('Attributes:', testProduct.attributes?.length || 0);

  const rows = findExcelRowsForProduct(testProduct);
  console.log('\n=== Excel Rows Found ===');
  console.log('Rows:', rows.length);

  if (rows.length > 0) {
    console.log('\nFirst row columns:', Object.keys(rows[0]));
    // Cap the dump at 500 characters to keep the output readable.
    console.log('\nFirst row sample:', JSON.stringify(rows[0], null, 2).substring(0, 500));

    // Test cross-section detection
    // NOTE(review): the last pattern may be missing a leading "c"; as written
    // it still matches "Cross section conductor" — confirm before changing.
    const csKey = guessColumnKey(rows[0], [
      /number of cores and cross-section/i,
      /cross.?section/i,
      /ross section conductor/i,
    ]);
    console.log('\nCross-section key:', csKey);

    if (csKey) {
      const cfgOptions = rows
        .map(r => normalizeValue(String(r?.[csKey] ?? '')))
        .filter(Boolean);
      console.log('Configurations found:', cfgOptions.length);
      console.log('Sample configs:', cfgOptions.slice(0, 5));
    }

    // Test additional columns
    const conductorKey = guessColumnKey(rows[0], [/conductor/i]);
    const insulationKey = guessColumnKey(rows[0], [/insulation/i]);
    const sheathKey = guessColumnKey(rows[0], [/sheath/i]);
    const normKey = guessColumnKey(rows[0], [/norm|standard|iec|vde/i]);

    console.log('\nAdditional column keys:');
    console.log(' Conductor:', conductorKey);
    console.log(' Insulation:', insulationKey);
    console.log(' Sheath:', sheathKey);
    console.log(' Norm:', normKey);

    if (conductorKey) {
      const values = getUniqueNonEmpty(rows.map(r => normalizeValue(String(r?.[conductorKey] ?? ''))));
      console.log('\nConductor values:', values);
    }
  }
} else {
  // Previously the script ended without any output here, which was
  // indistinguishable from a run that never started.
  console.error(`No product with slug "${targetSlug}" found in data/processed/products.json`);
}
|
||||
Reference in New Issue
Block a user