feat(analytics): add umami data distribution refinement script and cleanup temporary data exports

This commit is contained in:
2026-02-16 18:08:58 +01:00
parent bfd3c8164b
commit 52b17423dd

205
scripts/merge-umami-data.ts Normal file
View File

@@ -0,0 +1,205 @@
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import crypto from 'crypto';
// ES modules have no CommonJS-style `__filename`/`__dirname` globals, so both are
// derived from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Input CSV files. mergeData() reads them in this order and skips (with a warning)
// any path that does not exist.
// NOTE(review): these are absolute paths into one user's Downloads folder, so the
// script only finds its input on that machine — consider making them configurable.
const CSV_PATHS = [
'/Users/marcmintel/Downloads/pages.csv',
'/Users/marcmintel/Downloads/pages(1).csv',
'/Users/marcmintel/Downloads/pages(2).csv',
];
// Output files, resolved relative to this script's directory (a sibling `data` folder).
const JSON_OUTPUT_PATH = path.join(__dirname, '../data/umami-import-merged.json');
const SQL_OUTPUT_PATH = path.join(__dirname, '../data/umami-import-new.sql');
// Value written to the `website_id` column/field of every generated row and used to
// scope the cleanup DELETE statements.
const WEBSITE_ID = '59a7db94-0100-4c7e-98ef-99f45b17f9c3';
// Value written to the `hostname` column/field of every generated event.
const HOSTNAME = 'klz-cables.com';
/**
 * Parses CSV text into one object per data row, keyed by the header row.
 *
 * The parser scans the whole text character by character so it can honour the
 * quoting rules that a per-line split cannot:
 * - commas and line breaks inside a quoted field belong to the field,
 * - a doubled quote (`""`) inside a quoted field is a literal `"`,
 * - the header row is parsed with the same rules as data rows.
 *
 * Every field is trimmed (which also drops the `\r` of CRLF line endings and a
 * leading byte-order mark), and records that contain only whitespace are skipped.
 * An unterminated quote swallows the rest of the input into the current field.
 *
 * @param content Raw CSV text; the first non-blank record is the header row.
 * @returns One object per data record. Columns missing from a short record are
 *          returned as empty strings; extra values beyond the header are dropped.
 */
function parseCSV(content: string): Record<string, string>[] {
  const records: string[][] = [];
  let fields: string[] = [];
  let field = '';
  let inQuotes = false;
  // True until the current record has seen a non-whitespace character.
  let recordIsBlank = true;

  const endField = (): void => {
    fields.push(field.trim());
    field = '';
  };
  const endRecord = (): void => {
    endField();
    if (!recordIsBlank) records.push(fields);
    fields = [];
    recordIsBlank = true;
  };

  for (let i = 0; i < content.length; i++) {
    const char = content.charAt(i);

    if (inQuotes) {
      if (char !== '"') {
        field += char;
      } else if (content.charAt(i + 1) === '"') {
        // Escaped quote: emit one literal quote and skip its partner.
        field += '"';
        i++;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (char === '\n') {
      endRecord();
      continue;
    }

    if (char.trim() !== '') recordIsBlank = false;

    if (char === '"') {
      inQuotes = true;
    } else if (char === ',') {
      endField();
    } else {
      field += char;
    }
  }
  // Flush the final record when the input does not end with a newline.
  if (!recordIsBlank) endRecord();

  const [headers, ...rows] = records;
  if (headers === undefined) return [];

  return rows.map((values) => {
    const row: Record<string, string> = {};
    headers.forEach((header, index) => {
      row[header] = values[index] ?? '';
    });
    return row;
  });
}
/**
 * Reduces a URL or path from the CSV to a site-relative path starting with `/`.
 *
 * - Empty input maps to `/`.
 * - Absolute `http://` / `https://` URLs (scheme matched case-insensitively) are
 *   reduced to their pathname, which drops host, query string and fragment.
 *   If such a URL cannot be parsed it is returned unchanged.
 * - Anything else is treated as a path and gets a leading `/` if it lacks one.
 *
 * The scheme is matched including `://` so that relative paths which merely begin
 * with the letters "http" (e.g. `http-cables`) are handled as paths.
 *
 * @param url Raw URL or path value.
 * @returns The normalized path.
 */
function normalizeURL(url: string): string {
  if (!url) return '/';

  if (/^https?:\/\//i.test(url)) {
    try {
      return new URL(url).pathname;
    } catch {
      // Malformed absolute URL: keep the raw value rather than guessing a path.
      return url;
    }
  }

  return url.startsWith('/') ? url : `/${url}`;
}
/** Escapes a value for use inside a single-quoted SQL string literal by doubling embedded quotes. */
function escapeSqlLiteral(value: string): string {
  return value.replace(/'/g, "''");
}

/** Formats a date as `YYYY-MM-DD HH:MM:SS` using its UTC wall-clock time. */
function toSqlTimestamp(date: Date): string {
  return date.toISOString().slice(0, 19).replace('T', ' ');
}

/**
 * Merges the page statistics from all CSV files in `CSV_PATHS` and writes
 * synthetic pageview data for them as a JSON file and as a SQL script.
 *
 * Steps:
 * 1. Read every existing CSV and aggregate rows per normalized URL. When a URL
 *    occurs more than once, the maximum of `Views` / `Visitors` is kept (not the
 *    sum) and the first non-empty `Title` wins.
 * 2. For each URL create `Visitors` sessions (at least one) on weighted-random
 *    dates and spread its `Views` over them round-robin as pageview events.
 * 3. Write the events to `JSON_OUTPUT_PATH` and the SQL (cleanup DELETEs followed
 *    by INSERTs) to `SQL_OUTPUT_PATH`, creating the output directory if needed.
 *
 * All date arithmetic is done in UTC because the timestamps are serialized from
 * `toISOString()`. This keeps the output independent of the machine's time zone
 * and guarantees every generated timestamp lies between START_DATE and END_DATE,
 * i.e. inside the window the cleanup statements delete — so re-running the
 * generated SQL does not leave rows from a previous run behind.
 *
 * @returns A promise that resolves once both files have been written.
 */
async function mergeData(): Promise<void> {
  console.log('Reading CSVs...');
  const aggregatedData = new Map<string, { views: number; visitors: number; title: string }>();

  for (const csvPath of CSV_PATHS) {
    if (!fs.existsSync(csvPath)) {
      console.warn(`File not found: ${csvPath}`);
      continue;
    }
    const csvContent = fs.readFileSync(csvPath, 'utf-8');
    for (const row of parseCSV(csvContent)) {
      const url = normalizeURL(row.URL);
      const views = parseInt(row.Views, 10) || 0;
      const visitors = parseInt(row.Visitors, 10) || 0;
      const title = row.Title || '';

      const existing = aggregatedData.get(url);
      if (!existing) {
        aggregatedData.set(url, { views, visitors, title });
        continue;
      }
      existing.views = Math.max(existing.views, views);
      existing.visitors = Math.max(existing.visitors, visitors);
      if (!existing.title && title) {
        existing.title = title;
      }
    }
  }

  const jsonEvents: Record<string, unknown>[] = [];
  const sqlStatements: string[] = [];

  // Spread data across the whole period since early 2025 launch
  const START_DATE = new Date('2025-01-01T08:00:00Z');
  const END_DATE = new Date('2026-02-13T20:00:00Z');
  const startTs = START_DATE.getTime();
  const endTs = END_DATE.getTime();

  // Work in whole UTC calendar days from the start day to the end day inclusive.
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  const firstDayTs = Date.UTC(
    START_DATE.getUTCFullYear(),
    START_DATE.getUTCMonth(),
    START_DATE.getUTCDate(),
  );
  const lastDayTs = Date.UTC(
    END_DATE.getUTCFullYear(),
    END_DATE.getUTCMonth(),
    END_DATE.getUTCDate(),
  );
  const totalDays = Math.round((lastDayTs - firstDayTs) / MS_PER_DAY) + 1;

  // Cleanup for the target period.
  // WARNING: these statements remove every event (for this hostname) and every
  // session of the website inside the window, not only previously imported rows.
  sqlStatements.push(`-- Cleanup previous artificial imports (Full Year 2025 and 2026 until now)
DELETE FROM website_event WHERE website_id = '${WEBSITE_ID}' AND created_at >= '2025-01-01 00:00:00' AND created_at <= '2026-02-13 23:59:59' AND hostname = '${HOSTNAME}';
DELETE FROM session WHERE website_id = '${WEBSITE_ID}' AND created_at >= '2025-01-01 00:00:00' AND created_at <= '2026-02-13 23:59:59';
`);

  /**
   * Draws a random timestamp within [START_DATE, END_DATE] by rejection sampling:
   * a uniformly random day is accepted with a probability equal to the product
   * of the growth, weekend and seasonality weights (each at most 1).
   */
  function getRandomWeightedDate(): Date {
    while (true) {
      const dayIndex = Math.floor(Math.random() * totalDays);
      const date = new Date(firstDayTs + dayIndex * MS_PER_DAY);

      // 1. Growth factor: rises linearly from 0.2 at the start towards 1.0 at the end.
      const growthWeight = 0.2 + (dayIndex / totalDays) * 0.8;
      // 2. Weekend factor: Saturdays and Sundays get 30% of a weekday's weight.
      const dayOfWeek = date.getUTCDay();
      const weekendWeight = dayOfWeek === 0 || dayOfWeek === 6 ? 0.3 : 1.0;
      // 3. Seasonality: sine over the month index, ranging from 0.6 to 1.0.
      const month = date.getUTCMonth();
      const seasonWeight = 0.8 + Math.sin((month / 12) * Math.PI * 2) * 0.2;

      const combinedWeight = growthWeight * weekendWeight * seasonWeight;
      if (Math.random() < combinedWeight) {
        // Random time between 08:00:00 and 19:59:59 UTC (business hours mostly).
        date.setUTCHours(
          Math.floor(Math.random() * 12) + 8,
          Math.floor(Math.random() * 60),
          Math.floor(Math.random() * 60),
          0,
        );
        // Clamp so the result can never leave the configured period.
        return new Date(Math.min(Math.max(date.getTime(), startTs), endTs));
      }
    }
  }

  console.log(`Processing ${aggregatedData.size} aggregated URLs...`);
  for (const [url, { views, visitors, title }] of aggregatedData) {
    if (views === 0) continue;

    // Both values originate from CSV text, so escape them before embedding in SQL.
    const sqlUrl = escapeSqlLiteral(url);
    const sqlTitle = escapeSqlLiteral(title);

    // We distribute views across visitors; a URL with views but no recorded
    // visitors still gets one session.
    const sessionData: { sessionId: string; visitId: string; date: Date }[] = [];
    const sessionCount = visitors || 1;
    for (let v = 0; v < sessionCount; v++) {
      const sessionId = crypto.randomUUID();
      const visitId = crypto.randomUUID();
      const sessionDate = getRandomWeightedDate();
      const dateStr = toSqlTimestamp(sessionDate);
      sessionData.push({ sessionId, visitId, date: sessionDate });
      sqlStatements.push(`INSERT INTO session (session_id, website_id, browser, os, device, screen, language, country, created_at)
VALUES ('${sessionId}', '${WEBSITE_ID}', 'Chrome', 'Windows', 'desktop', '1920x1080', 'en', 'DE', '${dateStr}')
ON CONFLICT (session_id) DO NOTHING;`);
    }

    // Distribute views across these sessions (round-robin).
    for (let i = 0; i < views; i++) {
      const session = sessionData[i % sessionData.length];
      const eventId = crypto.randomUUID();
      // Event date should be close to session date: within 30 minutes after it,
      // but never later than END_DATE.
      const eventDate = new Date(
        Math.min(endTs, session.date.getTime() + Math.random() * 1000 * 60 * 30),
      );
      const timestamp = eventDate.toISOString();
      const dateStr = toSqlTimestamp(eventDate);

      // JSON Format
      jsonEvents.push({
        website_id: WEBSITE_ID,
        hostname: HOSTNAME,
        path: url,
        referrer: '',
        event_name: null,
        pageview: true,
        session: true,
        duration: Math.floor(Math.random() * 120) + 10,
        created_at: timestamp,
      });

      // SQL Format
      sqlStatements.push(`INSERT INTO website_event (event_id, website_id, session_id, created_at, url_path, url_query, referrer_path, referrer_query, referrer_domain, page_title, event_type, event_name, visit_id, hostname)
VALUES ('${eventId}', '${WEBSITE_ID}', '${session.sessionId}', '${dateStr}', '${sqlUrl}', '', '', '', '', '${sqlTitle}', 1, NULL, '${session.visitId}', '${HOSTNAME}');`);
    }
  }

  // The output directory may not exist yet (e.g. on a fresh checkout).
  fs.mkdirSync(path.dirname(JSON_OUTPUT_PATH), { recursive: true });
  fs.mkdirSync(path.dirname(SQL_OUTPUT_PATH), { recursive: true });

  console.log(`Writing ${jsonEvents.length} events to ${JSON_OUTPUT_PATH}...`);
  fs.writeFileSync(JSON_OUTPUT_PATH, JSON.stringify(jsonEvents, null, 2));
  console.log(`Writing SQL statements to ${SQL_OUTPUT_PATH}...`);
  fs.writeFileSync(SQL_OUTPUT_PATH, sqlStatements.join('\n'));
  console.log('✅ Refined Restoration Script complete!');
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit code instead of a silent success.
mergeData().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});