/**
 * scripts/merge-umami-data.ts
 *
 * Reconstructed from the patch "feat(analytics): add umami data distribution
 * refinement script and cleanup temporary data exports" (commit c2e790e5).
 *
 * Reads per-page CSV exports (columns `URL`, `Views`, `Visitors`, `Title`),
 * merges them per URL, and generates synthetic page-view history spread over
 * a fixed period. The result is written twice: as a JSON event list and as a
 * SQL script that first deletes rows in the target period and then inserts
 * `session` / `website_event` rows.
 *
 * Usage: pass CSV paths as command-line arguments to override the built-in
 * default locations, e.g. `tsx scripts/merge-umami-data.ts a.csv b.csv`.
 */
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import crypto from 'crypto';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Input files used when no paths are given on the command line. */
const DEFAULT_CSV_PATHS = [
  '/Users/marcmintel/Downloads/pages.csv',
  '/Users/marcmintel/Downloads/pages(1).csv',
  '/Users/marcmintel/Downloads/pages(2).csv',
];
const cliCsvPaths = process.argv.slice(2);
const CSV_PATHS = cliCsvPaths.length > 0 ? cliCsvPaths : DEFAULT_CSV_PATHS;

const JSON_OUTPUT_PATH = path.join(__dirname, '../data/umami-import-merged.json');
const SQL_OUTPUT_PATH = path.join(__dirname, '../data/umami-import-new.sql');
const WEBSITE_ID = '59a7db94-0100-4c7e-98ef-99f45b17f9c3';
const HOSTNAME = 'klz-cables.com';

// Generated timestamps are spread across this period.
const START_DATE = new Date('2025-01-01T08:00:00Z');
const END_DATE = new Date('2026-02-13T20:00:00Z');

// Window wiped by the generated cleanup statements. It must fully contain
// [START_DATE, END_DATE + 30 min], otherwise re-running the import would leave
// rows from the previous run behind.
const CLEANUP_FROM = '2025-01-01 00:00:00';
const CLEANUP_TO = '2026-02-13 23:59:59';

/** One parsed CSV record, keyed by header name. Missing trailing cells are `undefined`. */
type CsvRow = Record<string, string | undefined>;

/** Merged statistics for one URL path. */
interface PageStats {
  views: number;
  visitors: number;
  title: string;
}

/** A generated session that page-view events are attached to. */
interface SyntheticSession {
  sessionId: string;
  visitId: string;
  date: Date;
}

/** Shape of one entry in the JSON output file. */
interface ImportEvent {
  website_id: string;
  hostname: string;
  path: string;
  referrer: string;
  event_name: null;
  pageview: boolean;
  session: boolean;
  duration: number;
  created_at: string;
}

/**
 * Splits one CSV line into trimmed fields.
 *
 * Commas inside double-quoted fields are kept, and a doubled quote (`""`)
 * inside a quoted field yields one literal quote character.
 */
function splitCsvLine(line: string): string[] {
  const fields: string[] = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const char = line[i];
    if (char === '"') {
      if (inQuotes && line[i + 1] === '"') {
        // Escaped quote inside a quoted field: emit one quote, skip its twin.
        current += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      fields.push(current.trim());
      current = '';
    } else {
      current += char;
    }
  }
  fields.push(current.trim());
  return fields;
}

/**
 * Parses CSV text whose first line is a header row.
 *
 * Handles a leading UTF-8 byte-order mark, LF and CRLF line endings, quoted
 * fields and escaped quotes. Blank lines are skipped. Quoted fields that span
 * several lines are not supported.
 *
 * @param content Raw CSV text.
 * @returns One object per data line, keyed by header name.
 */
function parseCSV(content: string): CsvRow[] {
  // A BOM would otherwise become part of the first header name.
  const lines = content.replace(/^\uFEFF/, '').split(/\r?\n/);
  const headers = splitCsvLine(lines[0] ?? '');
  const data: CsvRow[] = [];

  for (const line of lines.slice(1)) {
    if (!line.trim()) continue;

    const values = splitCsvLine(line);
    const row: CsvRow = {};
    headers.forEach((header, index) => {
      row[header] = values[index];
    });
    data.push(row);
  }
  return data;
}

/**
 * Reduces a URL or path to a path that starts with `/`.
 *
 * Absolute `http://` / `https://` URLs are reduced to their pathname (query
 * string and fragment are dropped). Empty input maps to `/`.
 *
 * @param url Absolute URL, path, or empty string.
 * @returns The normalized path.
 */
function normalizeURL(url: string): string {
  if (!url) return '/';
  if (/^https?:\/\//i.test(url)) {
    try {
      return new URL(url).pathname;
    } catch {
      // Not parseable as a URL: fall through and treat it as a plain path.
    }
  }
  return url.startsWith('/') ? url : `/${url}`;
}

/** Escapes a value for use inside a single-quoted SQL string literal. */
function escapeSqlLiteral(value: string): string {
  return value.replace(/'/g, "''");
}

/** Formats a date as `YYYY-MM-DD HH:MM:SS` using its UTC fields. */
function toSqlTimestamp(date: Date): string {
  return date.toISOString().slice(0, 19).replace('T', ' ');
}

/**
 * Reads every existing CSV file and merges its rows per normalized URL.
 *
 * For a URL that appears more than once, the largest `Views` and `Visitors`
 * values win and the first non-empty `Title` is kept. Missing files are
 * reported with a warning and skipped.
 */
function aggregatePageStats(csvPaths: readonly string[]): Map<string, PageStats> {
  const aggregated = new Map<string, PageStats>();

  for (const csvPath of csvPaths) {
    if (!fs.existsSync(csvPath)) {
      console.warn(`File not found: ${csvPath}`);
      continue;
    }

    for (const row of parseCSV(fs.readFileSync(csvPath, 'utf-8'))) {
      const url = normalizeURL(row.URL ?? '');
      const views = Number.parseInt(row.Views ?? '', 10) || 0;
      const visitors = Number.parseInt(row.Visitors ?? '', 10) || 0;
      const title = row.Title || '';

      const existing = aggregated.get(url);
      if (!existing) {
        aggregated.set(url, { views, visitors, title });
        continue;
      }
      existing.views = Math.max(existing.views, views);
      existing.visitors = Math.max(existing.visitors, visitors);
      if (!existing.title && title) {
        existing.title = title;
      }
    }
  }
  return aggregated;
}

/**
 * Builds a sampler that returns random dates within `[startTs, endTs]`.
 *
 * Candidates are drawn uniformly and accepted with a probability that is the
 * product of three weights: linear growth (0.2 at the start, 1.0 at the end),
 * a weekend factor (0.3 on Saturday/Sunday) and a sine-based monthly factor.
 * Accepted dates get a random hour between 08 and 19 and a random minute.
 * Day of week, month and hour are evaluated in the machine's local time zone.
 */
function createWeightedDateSampler(startTs: number, endTs: number): () => Date {
  const spanMs = endTs - startTs;

  return function getRandomWeightedDate(): Date {
    for (;;) {
      const progress = Math.random();
      const date = new Date(startTs + progress * spanMs);

      const growthWeight = 0.2 + progress * 0.8;
      const dayOfWeek = date.getDay();
      const weekendWeight = dayOfWeek === 0 || dayOfWeek === 6 ? 0.3 : 1.0;
      const seasonWeight = 0.8 + Math.sin((date.getMonth() / 12) * Math.PI * 2) * 0.2;

      if (Math.random() >= growthWeight * weekendWeight * seasonWeight) continue;

      date.setHours(Math.floor(Math.random() * 12) + 8);
      date.setMinutes(Math.floor(Math.random() * 60));

      // Overwriting the hour can move the date out of the period (first and
      // last day, or unusual time zones); redraw so that every generated row
      // stays inside the cleanup window.
      const ts = date.getTime();
      if (ts >= startTs && ts <= endTs) return date;
    }
  };
}

/**
 * Merges the CSV exports and writes the JSON and SQL import files.
 *
 * Nothing is written when the inputs yield no page views, because the SQL
 * file would then contain only the destructive cleanup statements.
 */
async function mergeData(): Promise<void> {
  console.log('Reading CSVs...');
  const aggregatedData = aggregatePageStats(CSV_PATHS);

  const jsonEvents: ImportEvent[] = [];
  const sqlStatements: string[] = [];

  // NOTE(review): the session DELETE is not restricted by hostname, so it
  // removes every session of this website in the window - confirm intended.
  sqlStatements.push(`-- Cleanup previous artificial imports (Full Year 2025 and 2026 until now)
DELETE FROM website_event WHERE website_id = '${WEBSITE_ID}' AND created_at >= '${CLEANUP_FROM}' AND created_at <= '${CLEANUP_TO}' AND hostname = '${HOSTNAME}';
DELETE FROM session WHERE website_id = '${WEBSITE_ID}' AND created_at >= '${CLEANUP_FROM}' AND created_at <= '${CLEANUP_TO}';
`);

  const getRandomWeightedDate = createWeightedDateSampler(
    START_DATE.getTime(),
    END_DATE.getTime(),
  );

  console.log(`Processing ${aggregatedData.size} aggregated URLs...`);

  for (const [url, { views, visitors, title }] of aggregatedData) {
    if (views <= 0) continue;

    // One session per visitor, and always at least one so that the views
    // below have something to attach to.
    // NOTE(review): if visitors exceeds views, the surplus sessions get no
    // events - confirm that is acceptable for the import target.
    const sessionCount = Math.max(visitors, 1);
    const sessions: SyntheticSession[] = [];
    for (let v = 0; v < sessionCount; v++) {
      const session: SyntheticSession = {
        sessionId: crypto.randomUUID(),
        visitId: crypto.randomUUID(),
        date: getRandomWeightedDate(),
      };
      sessions.push(session);

      sqlStatements.push(`INSERT INTO session (session_id, website_id, browser, os, device, screen, language, country, created_at)
      VALUES ('${session.sessionId}', '${WEBSITE_ID}', 'Chrome', 'Windows', 'desktop', '1920x1080', 'en', 'DE', '${toSqlTimestamp(session.date)}')
      ON CONFLICT (session_id) DO NOTHING;`);
    }

    // Distribute the views round-robin across the sessions.
    for (let i = 0; i < views; i++) {
      const session = sessions[i % sessions.length];
      if (!session) continue; // unreachable: sessions is never empty

      const eventId = crypto.randomUUID();
      // Each event happens within 30 minutes after its session started.
      const eventDate = new Date(session.date.getTime() + Math.random() * 1000 * 60 * 30);

      jsonEvents.push({
        website_id: WEBSITE_ID,
        hostname: HOSTNAME,
        path: url,
        referrer: '',
        event_name: null,
        pageview: true,
        session: true,
        duration: Math.floor(Math.random() * 120) + 10,
        created_at: eventDate.toISOString(),
      });

      sqlStatements.push(`INSERT INTO website_event (event_id, website_id, session_id, created_at, url_path, url_query, referrer_path, referrer_query, referrer_domain, page_title, event_type, event_name, visit_id, hostname)
      VALUES ('${eventId}', '${WEBSITE_ID}', '${session.sessionId}', '${toSqlTimestamp(eventDate)}', '${escapeSqlLiteral(url)}', '', '', '', '', '${escapeSqlLiteral(title)}', 1, NULL, '${session.visitId}', '${HOSTNAME}');`);
    }
  }

  if (jsonEvents.length === 0) {
    console.warn('No page views found in the input CSVs; no output files were written.');
    return;
  }

  fs.mkdirSync(path.dirname(JSON_OUTPUT_PATH), { recursive: true });
  fs.mkdirSync(path.dirname(SQL_OUTPUT_PATH), { recursive: true });

  console.log(`Writing ${jsonEvents.length} events to ${JSON_OUTPUT_PATH}...`);
  fs.writeFileSync(JSON_OUTPUT_PATH, JSON.stringify(jsonEvents, null, 2));

  console.log(`Writing SQL statements to ${SQL_OUTPUT_PATH}...`);
  fs.writeFileSync(SQL_OUTPUT_PATH, sqlStatements.join('\n'));

  console.log('✅ Refined Restoration Script complete!');
}

mergeData().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the calling shell / CI instead of exiting with 0.
  process.exitCode = 1;
});