diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index cb44bc41..d5dda66d 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -61,8 +61,9 @@ function generateReport(resultsDir = RESULTS_DIR, opts = {}) { }).filter(r => r.data); // Load fixture images for Vision tab (base64) + // Skip in live mode — saves ~43MB of base64 per regeneration, making per-test updates instant const fixtureImages = {}; - if (fs.existsSync(FIXTURES_DIR)) { + if (!liveMode && fs.existsSync(FIXTURES_DIR)) { try { const frames = fs.readdirSync(FIXTURES_DIR).filter(f => /\.(png|jpg|jpeg)$/i.test(f)); for (const f of frames) { @@ -131,8 +132,8 @@ function buildHTML(allResults, fixtureImages, { liveMode = false, liveStatus = n const fixtureJSON = JSON.stringify(fixtureImages); - // Live mode: auto-refresh meta tag - const refreshMeta = liveMode ? '' : ''; + // Live mode: JS-based reload (stateful, preserves active tab + scroll) + const refreshMeta = ''; const liveBannerHTML = liveMode ? buildLiveBanner(liveStatus) : ''; return ` @@ -434,7 +435,7 @@ function buildSidebar() { let html = ''; for (const [family, runs] of Object.entries(groups)) { html += '
'; - html += '
' + esc(family) + ' (' + runs.length + ')
'; + html += '
' + esc(family) + ' (' + runs.length + ')
'; html += '
'; for (const r of runs.reverse()) { const sel = selectedIndices.has(r._idx); @@ -508,6 +509,14 @@ function renderPerformance() { html += statCard('Server Decode', fmt(srvDecode), 'tok/s', 'From llama-server /metrics'); html += statCard('Total Time', fmt(totalTime / 1000), 's', run.total + ' tests'); html += statCard('Throughput', fmt(tokPerSec), 'tok/s', fmtK(run.tokens || 0) + ' total tokens'); + + // GPU & Memory cards (from resource samples) + const res = perf?.resource; + if (res) { + html += statCard('GPU Utilization', res.gpu ? res.gpu.util + '' : '—', '%', res.gpu ? 'Renderer: ' + res.gpu.renderer + '% · Tiler: ' + res.gpu.tiler + '%' : 'MPS not available'); + html += statCard('GPU Memory', res.gpu?.memUsedGB != null ? fmt(res.gpu.memUsedGB) : '—', 'GB', res.gpu?.memAllocGB != null ? 'Alloc: ' + fmt(res.gpu.memAllocGB) + ' GB' : 'MPS not available'); + html += statCard('System Memory', fmt(res.sys?.usedGB), 'GB', 'of ' + fmt(res.sys?.totalGB) + ' GB total · Free: ' + fmt(res.sys?.freeGB) + ' GB'); + } html += '
'; // Comparison table if multiple selected @@ -611,7 +620,36 @@ function renderQuality() { // Multi-run comparison if (sel.length > 1) { - html += '
Quality Comparison
'; + // High-level summary comparison + html += '
Overall Comparison
'; + html += '
'; + for (const r of sel) html += ''; + html += ''; + const hasVlm = sel.some(r => r.vlmTotal > 0); + const hiRows = [ + ['Pass Rate', r => r.total > 0 ? pct(r.passed, r.total) + '%' : '—'], + ['Score', r => r.passed + '/' + r.total], + ['LLM Score', r => r.llmTotal > 0 ? (r.llmPassed || 0) + '/' + (r.llmTotal || 0) : '—'], + ...(hasVlm ? [['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : '—']] : []), + ['Failed', r => String(r.failed)], + ['Time', r => fmt(r.timeMs / 1000) + 's'], + ['Throughput', r => r.timeMs > 0 && r.tokens ? fmt(r.tokens / (r.timeMs / 1000)) + ' tok/s' : '—'], + ]; + for (const [label, fn] of hiRows) { + html += ''; + // Find best value for highlighting + const vals = sel.map(fn); + for (let i = 0; i < sel.length; i++) { + const isBest = label === 'Failed' ? vals[i] === String(Math.min(...sel.map(r => r.failed))) : + label === 'Pass Rate' ? vals[i] === pct(Math.max(...sel.map(r => r.passed)), sel[0].total) + '%' : false; + html += ' 1 ? ' style="color:var(--green);font-weight:600"' : '') + '>' + vals[i] + ''; + } + html += ''; + } + html += '
Metric' + esc(modelShort(r.model)) + '
' + shortDate(r.timestamp) + '
' + label + '
'; + + // Per-suite breakdown + html += '
Suite Comparison
'; html += '
'; for (const r of sel) html += ''; html += ''; @@ -823,9 +861,15 @@ function getActiveTab() { function renderActiveTab() { const tab = getActiveTab(); - if (tab === 'performance') renderPerformance(); - else if (tab === 'quality') renderQuality(); - else if (tab === 'vision') renderVision(); + try { + if (tab === 'performance') renderPerformance(); + else if (tab === 'quality') renderQuality(); + else if (tab === 'vision') renderVision(); + } catch (e) { + const panel = document.getElementById('tab-' + tab); + if (panel) panel.innerHTML = '
Render error: ' + e.message + '
' + e.stack + '
'; + console.error('Tab render error:', e); + } } // ═══════════════════════════════════════════════════════════════════════════════ @@ -837,6 +881,52 @@ function refresh() { renderActiveTab(); } +// ═══════════════════════════════════════════════════════════════════════════════ +// LIVE RELOAD (stateful — preserves tab + scroll) +// ═══════════════════════════════════════════════════════════════════════════════ +const IS_LIVE = ${liveMode ? 'true' : 'false'}; + +function saveState() { + try { + sessionStorage.setItem('_bench_tab', getActiveTab()); + sessionStorage.setItem('_bench_scroll', String(window.scrollY)); + sessionStorage.setItem('_bench_selected', JSON.stringify([...selectedIndices])); + sessionStorage.setItem('_bench_primary', String(primaryIndex)); + } catch {} +} + +function restoreState() { + try { + // Restore selection + const savedSel = sessionStorage.getItem('_bench_selected'); + if (savedSel) { + const arr = JSON.parse(savedSel).filter(i => i >= 0 && i < ALL_RUNS.length); + if (arr.length > 0) { selectedIndices = new Set(arr); } + } + const savedPrimary = sessionStorage.getItem('_bench_primary'); + if (savedPrimary != null) { + const pi = parseInt(savedPrimary); + if (pi >= 0 && pi < ALL_RUNS.length) primaryIndex = pi; + } + // Restore tab + const tab = sessionStorage.getItem('_bench_tab'); + if (tab && tab !== 'performance') { + document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); + document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active')); + const tabEl = document.querySelector('.tab[data-tab="' + tab + '"]'); + if (tabEl) tabEl.classList.add('active'); + const panel = document.getElementById('tab-' + tab); + if (panel) panel.classList.add('active'); + } + const scroll = parseInt(sessionStorage.getItem('_bench_scroll') || '0'); + if (scroll > 0) setTimeout(() => window.scrollTo(0, scroll), 50); + } catch {} +} + +if (IS_LIVE) { + setTimeout(() => { saveState(); location.reload(); }, 5000); +} + // ═══════════════════════════════════════════════════════════════════════════════ // INIT // ═══════════════════════════════════════════════════════════════════════════════ @@ -846,6 +936,7 @@ document.getElementById('btn-compare').addEventListener('click', () => { if (selectedIndices.size > 1) renderActiveTab(); }); +restoreState(); refresh(); @@ -865,15 +956,17 @@ function buildLiveBanner(status) { if (!status) { return `
Benchmark starting\u2026
`; } - const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', startedAt = '' } = status; + const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', currentTest = '', testsCompleted = 0, startedAt = '' } = status; const pct = totalSuites > 0 ? Math.round((suitesCompleted / totalSuites) * 100) : 0; const elapsed = startedAt ? Math.round((Date.now() - new Date(startedAt).getTime()) / 1000) : 0; const elapsedStr = elapsed > 60 ? Math.floor(elapsed / 60) + 'm ' + (elapsed % 60) + 's' : elapsed + 's'; + const testInfo = currentTest ? ` — ✅ ${escHtml(currentTest)}` : ''; return `
LIVE — Suite ${suitesCompleted}/${totalSuites} (${pct}%) - ${currentSuite ? ' — ' + currentSuite + '' : ''} - ${elapsedStr} elapsed + ${currentSuite ? ' — 🔧 ' + escHtml(currentSuite) + '' : ''} + ${testInfo} + ${testsCompleted} tests · ${elapsedStr} elapsed
`; } diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 3193e7f8..8598be17 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -157,6 +157,7 @@ const results = { totals: { passed: 0, failed: 0, skipped: 0, total: 0, timeMs: 0 }, tokenTotals: { prompt: 0, completion: 0, total: 0 }, perfTotals: { ttftMs: [], decodeTokensPerSec: [], prefillTokensPerSec: null, serverDecodeTokensPerSec: null }, + resourceSamples: [], // GPU/memory snapshots taken after each suite }; async function llmCall(messages, opts = {}) { @@ -505,29 +506,110 @@ function assert(condition, msg) { if (!condition) throw new Error(msg || 'Assertion failed'); } +// ─── Resource Metrics (GPU/MPS + Memory) ───────────────────────────────────── + +/** + * Sample GPU (Apple Silicon MPS) utilization and system memory. + * Uses `ioreg` for GPU stats (no sudo needed). + */ +function sampleResourceMetrics() { + const os = require('os'); + const sample = { + timestamp: new Date().toISOString(), + sys: { + totalGB: parseFloat((os.totalmem() / 1073741824).toFixed(1)), + freeGB: parseFloat((os.freemem() / 1073741824).toFixed(1)), + usedGB: parseFloat(((os.totalmem() - os.freemem()) / 1073741824).toFixed(1)), + }, + process: { + rssMB: parseFloat((process.memoryUsage().rss / 1048576).toFixed(0)), + }, + gpu: null, + }; + + // Apple Silicon GPU via ioreg (macOS only) + if (process.platform === 'darwin') { + try { + const out = execSync('ioreg -r -c AGXAccelerator 2>/dev/null', { encoding: 'utf8', timeout: 3000 }); + const m = (key) => { const r = new RegExp('"' + key + '"=(\\d+)'); const match = out.match(r); return match ? parseInt(match[1]) : null; }; + const deviceUtil = m('Device Utilization %'); + const rendererUtil = m('Renderer Utilization %'); + const tilerUtil = m('Tiler Utilization %'); + const memUsed = m('In use system memory'); + const memAlloc = m('Alloc system memory'); + if (deviceUtil !== null) { + sample.gpu = { + util: deviceUtil, + renderer: rendererUtil, + tiler: tilerUtil, + memUsedGB: memUsed ? parseFloat((memUsed / 1073741824).toFixed(1)) : null, + memAllocGB: memAlloc ? parseFloat((memAlloc / 1073741824).toFixed(1)) : null, + }; + } + } catch { /* ioreg not available or timed out */ } + } + + return sample; +} + // ─── Live progress: intermediate saves + report regeneration ──────────────── let _liveReportOpened = false; +let _runStartedAt = null; // Set when runSuites() begins +let _currentTestName = null; // Set during test execution for live banner +let _currentSuiteIndex = 0; // Current suite index for live progress +let _totalSuites = 0; // Total number of suites /** * Save the current (in-progress) results to disk and regenerate the live report. - * Called after each suite completes so the browser auto-refreshes with updated data. + * Called after each test completes so the browser auto-refreshes with updated data. */ -function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName) { +function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName, currentTest) { try { fs.mkdirSync(RESULTS_DIR, { recursive: true }); // Save current results as a live file (will be overwritten each time) const liveFile = path.join(RESULTS_DIR, '_live_progress.json'); + // Include the in-progress suite so Quality/Vision tabs can render partial data + const liveSuites = [...results.suites]; + if (currentSuite && currentSuite.tests.length > 0 && !results.suites.includes(currentSuite)) { + liveSuites.push(currentSuite); + } const liveResults = { ...results, + suites: liveSuites, _live: true, - _progress: { suitesCompleted, totalSuites, startedAt }, + _progress: { suitesCompleted, totalSuites, startedAt, currentTest: currentTest || null }, }; fs.writeFileSync(liveFile, JSON.stringify(liveResults, null, 2)); // Build a temporary index with just the live file const indexFile = path.join(RESULTS_DIR, 'index.json'); - const liveIndex = [{ + + // Compute live performance summary from accumulated data + const ttftArr = [...results.perfTotals.ttftMs]; + const decArr = [...results.perfTotals.decodeTokensPerSec]; + const livePerfSummary = (ttftArr.length > 0 || decArr.length > 0) ? { + ttft: ttftArr.length > 0 ? { + avgMs: Math.round(ttftArr.reduce((a, b) => a + b, 0) / ttftArr.length), + p50Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.5)], + p95Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.95)], + samples: ttftArr.length, + } : null, + decode: decArr.length > 0 ? { + avgTokensPerSec: parseFloat((decArr.reduce((a, b) => a + b, 0) / decArr.length).toFixed(1)), + samples: decArr.length, + } : null, + server: { + prefillTokensPerSec: results.perfTotals.prefillTokensPerSec, + decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec, + }, + resource: results.resourceSamples.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null, + } : null; + + // Preserve previous runs in index for comparison sidebar + let existingIndex = []; + try { existingIndex = JSON.parse(fs.readFileSync(indexFile, 'utf8')).filter(e => e.file !== '_live_progress.json'); } catch { } + const liveEntry = { file: '_live_progress.json', model: results.model.name || 'loading...', vlm: results.model.vlm || null, @@ -540,32 +622,43 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName vlmPassed: 0, vlmTotal: 0, timeMs: Date.now() - new Date(startedAt).getTime(), tokens: results.tokenTotals.total, - perfSummary: null, - }]; - fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2)); + perfSummary: livePerfSummary, + }; + fs.writeFileSync(indexFile, JSON.stringify([...existingIndex, liveEntry], null, 2)); // Regenerate report in live mode const reportScript = path.join(__dirname, 'generate-report.cjs'); // Clear require cache to pick up any code changes delete require.cache[require.resolve(reportScript)]; const { generateReport } = require(reportScript); + const testsCompleted = liveSuites.reduce((n, s) => n + s.tests.length, 0); + const testsTotal = liveSuites.reduce((n, s) => n + s.tests.length, 0) + (currentTest ? 0 : 0); const reportPath = generateReport(RESULTS_DIR, { liveMode: true, liveStatus: { suitesCompleted, totalSuites, - currentSuite: nextSuiteName || 'Finishing...', + currentSuite: currentSuite?.name || nextSuiteName || 'Finishing...', + currentTest: currentTest || null, + testsCompleted, startedAt, }, }); // Open browser on first save (so user sees live progress from the start) - if (!_liveReportOpened && !NO_OPEN && !IS_SKILL_MODE && reportPath) { - try { - const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open'; - execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' }); - log(' 📊 Live report opened in browser (auto-refreshes every 5s)'); - } catch { } + if (!_liveReportOpened && !NO_OPEN && reportPath) { + if (IS_SKILL_MODE) { + // Ask Aegis to open in its embedded browser window + emit({ event: 'open_report', reportPath }); + log(' 📊 Requested Aegis to open live report'); + } else { + // Standalone: open in system browser + try { + const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open'; + execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' }); + log(' 📊 Live report opened in browser (auto-refreshes every 5s)'); + } catch { } + } _liveReportOpened = true; } } catch (err) { @@ -575,9 +668,11 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName } async function runSuites() { - const startedAt = new Date().toISOString(); + _runStartedAt = new Date().toISOString(); + _totalSuites = suites.length; for (let si = 0; si < suites.length; si++) { const s = suites[si]; + _currentSuiteIndex = si; currentSuite = { name: s.name, tests: [], passed: 0, failed: 0, skipped: 0, timeMs: 0 }; log(`\n${'─'.repeat(60)}`); log(` ${s.name}`); @@ -594,8 +689,16 @@ async function runSuites() { emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs }); - // Live progress: save intermediate results + regenerate report after each suite - saveLiveProgress(startedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null); + // Sample resource metrics (GPU + memory) after each suite + const resourceSample = sampleResourceMetrics(); + resourceSample.suite = s.name; + results.resourceSamples.push(resourceSample); + + // Scrape server metrics after each suite so live perf cards update + await scrapeServerMetrics(); + + // Live progress: save after suite (also saved per-test, but suite boundary is a clean checkpoint) + saveLiveProgress(_runStartedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null); } } @@ -641,6 +744,12 @@ async function test(name, fn) { currentSuite.timeMs += testResult.timeMs; currentSuite.tests.push(testResult); emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens, perf: testResult.perf }); + + // Live progress: save after each test for real-time updates in commander center + if (_runStartedAt) { + _currentTestName = null; // Test just completed + saveLiveProgress(_runStartedAt, _currentSuiteIndex, _totalSuites, null, name); + } } function skip(name, reason) { @@ -2357,7 +2466,10 @@ async function main() { vlmPassed, vlmTotal, timeMs, tokens: results.tokenTotals.total, - perfSummary: results.perfSummary || null, + perfSummary: { + ...(results.perfSummary || {}), + resource: results.resourceSamples?.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null, + }, }); fs.writeFileSync(indexFile, JSON.stringify(index, null, 2));
Suite' + esc(modelShort(r.model)) + '