diff --git a/.gitignore b/.gitignore index 085a6d7..28f5878 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,7 @@ venv/ ENV/ .venv -# Test files and outputs -tests/ +# Test outputs *_updated*.xml *_applied*.xml *_diffs/ diff --git a/docs/agents_monitoring_handoff.md b/docs/agents_monitoring_handoff.md new file mode 100644 index 0000000..4880c31 --- /dev/null +++ b/docs/agents_monitoring_handoff.md @@ -0,0 +1,128 @@ +# Long-Running Agents Monitoring Handoff + +## Summary + +This handoff documents the implemented V1 monitoring capability: + +- New **Agents** tab in Electron UI for starting/stopping long-running monitoring. +- Continuous Python worker (`anomaly_monitor.py`) with: + - deterministic historical-deviation scoring, + - quality/staleness gates, + - optional LLM triage, + - Neo4j persistence for `AgentRun` and `AnomalyEvent`, + - event dedup and retention cleanup. +- IPC surface and stream channels from Electron main to renderer: + - `agents:start`, `agents:status`, `agents:stop`, + - `agents:list-events`, `agents:get-event`, `agents:ack-event`, `agents:cleanup`, + - channels: `agent-status`, `agent-event`, `agent-error`, `agent-complete`. +- Graph drill-down integration with anomaly node support. + +## Files Changed + +### Electron + +- `electron-ui/index.html` + - Added **Agents** nav button. + - Added `tab-agents` page shell with controls, filters, feed, and detail panel. + - Added graph filter option for anomaly layer. + +- `electron-ui/styles.css` + - Added Agents tab styles (`agents-*`, `status-chip`, feed cards, detail panel). + +- `electron-ui/preload.js` + - Added `agents*` API bridge methods. + - Added event listeners for `agent-status/event/error/complete`. + +- `electron-ui/main.js` + - Added background agent runtime management (`activeAgentRun`). + - Added stream parser for monitor stdout markers (`[AGENT_STATUS]`, etc.). + - Added full `agents:*` IPC handlers. + - Added graceful stop handling on app shutdown. 
+ +- `electron-ui/renderer.js` + - Added Agents tab state management. + - Added start/stop/refresh/cleanup/ack handlers. + - Added realtime feed updates from agent channels. + - Added event detail rendering and graph drill-down action. + +### Python backend + +- `scripts/anomaly_rules.py` (new) + - Deterministic scoring logic (`z`, `MAD`, rate, drift trend, flatline). + - Quality/staleness helpers and dedup key generator. + +- `scripts/anomaly_monitor.py` (new) + - Long-running monitoring worker with CLI subcommands: + - `run`, `status`, `list-events`, `get-event`, `ack-event`, `cleanup`, `replay-fixtures`. + - Neo4j persistence + dedup + retention cleanup. + - Optional LLM triage with structured JSON fallback. + +- `scripts/ignition_api_client.py` + - Added `query_tag_history(...)` and local-time-to-UTC conversion helper. + +- `scripts/neo4j_ontology.py` + - Added monitoring schema constraints/indexes for `AgentRun` / `AnomalyEvent`. + - Added helper methods: list/get/cleanup anomaly events. + - Added CLI commands: + - `init-agent-schema` + - `list-anomaly-events` + - `get-anomaly-event` + - `cleanup-anomaly-events` + +- `scripts/graph_api.py` + - Added node groups/colors for `AgentRun` and `AnomalyEvent`. + - Extended neighbor center-node lookup to support `event_id` and `run_id`. + +### Fixtures + +- `scripts/fixtures/anomaly_replay_cases.json` (new) + - Deterministic replay cases: + - normal baseline, + - sudden spike, + - slow drift, + - flatline/stuck. 
+ +## Runtime Commands + +### Deterministic replay validation + +```bash +python3 scripts/anomaly_monitor.py replay-fixtures --fixture-file scripts/fixtures/anomaly_replay_cases.json +``` + +### Monitor worker manual run + +```bash +python3 scripts/anomaly_monitor.py run --run-id demo-run --config-json '{"pollIntervalMs":1000}' +``` + +### Event operations + +```bash +python3 scripts/anomaly_monitor.py list-events --limit 50 +python3 scripts/anomaly_monitor.py get-event --event-id +python3 scripts/anomaly_monitor.py ack-event --event-id --note "Reviewed by operator" +python3 scripts/anomaly_monitor.py cleanup --retention-days 14 +``` + +## Known Environment Requirements + +The Python environment must include packages from `requirements.txt`: + +- `neo4j` +- `anthropic` (for LLM triage; deterministic fallback works without API key) +- `python-dotenv` +- `requests` + +If `ANTHROPIC_API_KEY` is absent, triage automatically falls back to deterministic explanations. + +## Validation Status + +- Syntax checks passed: + - Python (`py_compile`) for all modified scripts. + - JS syntax checks (`node --check`) for Electron files. +- Fixture replay passed: + - `4/4` deterministic scenarios. + +Live end-to-end validation against actual Ignition + Neo4j + Anthropic requires connected runtime services. + diff --git a/electron-ui/index.html b/electron-ui/index.html index 03b808e..08adc5d 100644 --- a/electron-ui/index.html +++ b/electron-ui/index.html @@ -3,7 +3,7 @@ - + Axilon @@ -36,6 +36,13 @@ Assist + + + + + +
+ Idle + No active run +
+ + +
+ + + + + + + + + + + + + + + +
+ +
+
Cycle (ms)0
+
Candidates0
+
Triaged0
+
Emitted0
+
Last heartbeatn/a
+
+ +
+
+

Subsystem Health

+
+ +
+
+
+
Start monitoring to see subsystem health.
+
+
+ +
+ + +
+
+

Event Details

+
+ + + +
+
+
+

Select an anomaly event from the feed.

+
+
+
+ +
@@ -630,6 +739,7 @@

Ontology Graph

+ @@ -1427,6 +1537,8 @@

Graph: Node

+ + diff --git a/electron-ui/main.js b/electron-ui/main.js index b5cdb4d..f3034f1 100644 --- a/electron-ui/main.js +++ b/electron-ui/main.js @@ -4,6 +4,8 @@ const fs = require('fs'); const { spawn } = require('child_process'); let mainWindow; +let activeAgentRun = null; +let isAppShuttingDown = false; // --------------------------------------------------------------------------- // Python backend configuration (works in both dev and packaged modes) @@ -88,6 +90,10 @@ function createWindow() { }); mainWindow.loadFile('index.html'); + + mainWindow.on('closed', () => { + mainWindow = null; + }); // Open DevTools in development if (process.argv.includes('--dev')) { @@ -103,6 +109,20 @@ app.on('window-all-closed', () => { } }); +app.on('before-quit', () => { + isAppShuttingDown = true; + console.info('[Shutdown] before-quit triggered'); + if (activeAgentRun && activeAgentRun.process && !activeAgentRun.process.killed) { + try { + console.info(`[Shutdown] Stopping active agent run ${activeAgentRun.runId}`); + activeAgentRun.process.kill('SIGTERM'); + } catch (err) { + // Ignore termination errors during shutdown. 
+ console.warn('[Shutdown] Failed to terminate active agent process:', err.message); + } + } +}); + app.on('activate', () => { if (BrowserWindow.getAllWindows().length === 0) { createWindow(); @@ -124,27 +144,27 @@ function runPythonScript(scriptName, args = [], options = {}) { stdout += text; // Send streaming output to renderer if enabled - if (streaming && mainWindow) { + if (streaming) { // Parse and emit tool calls separately const lines = text.split('\n'); for (const line of lines) { if (line.startsWith('[TOOL]')) { - mainWindow.webContents.send('tool-call', { + sendToRenderer('tool-call', { streamId, tool: line.replace('[TOOL]', '').trim() - }); + }, 'runPythonScript stdout tool'); } else if (line.startsWith('[DEBUG]')) { - mainWindow.webContents.send('stream-output', { + sendToRenderer('stream-output', { streamId, text: line, type: 'debug' - }); + }, 'runPythonScript stdout debug'); } else if (line.trim()) { - mainWindow.webContents.send('stream-output', { + sendToRenderer('stream-output', { streamId, text: line, type: 'output' - }); + }, 'runPythonScript stdout output'); } } } @@ -155,21 +175,21 @@ function runPythonScript(scriptName, args = [], options = {}) { stderr += text; // Stream stderr too (useful for verbose output) - if (streaming && mainWindow) { - mainWindow.webContents.send('stream-output', { + if (streaming) { + sendToRenderer('stream-output', { streamId, text, type: 'stderr' - }); + }, 'runPythonScript stderr'); } }); pythonProcess.on('close', (code) => { - if (streaming && mainWindow) { - mainWindow.webContents.send('stream-complete', { + if (streaming) { + sendToRenderer('stream-complete', { streamId, success: code === 0 - }); + }, 'runPythonScript close'); } if (code === 0) { @@ -185,6 +205,173 @@ function runPythonScript(scriptName, args = [], options = {}) { }); } +function normalizeAgentConfig(config = {}) { + const thresholds = (config && typeof config.thresholds === 'object' && config.thresholds) || {}; + const scope = (config && 
typeof config.scope === 'object' && config.scope) || {}; + return { + pollIntervalMs: Math.max(1000, Number(config.pollIntervalMs || 1000)), + historyWindowMinutes: Math.max(10, Number(config.historyWindowMinutes || 360)), + minHistoryPoints: Math.max(10, Number(config.minHistoryPoints || 30)), + maxMonitoredTags: Math.max(10, Number(config.maxMonitoredTags || 200)), + maxCandidatesPerCycle: Math.max(1, Number(config.maxCandidatesPerCycle || 25)), + maxCandidatesPerSubsystem: Math.max(1, Number(config.maxCandidatesPerSubsystem || 8)), + maxLlmTriagesPerCycle: Math.max(0, Number(config.maxLlmTriagesPerCycle || 5)), + maxLlmTriagesPerSubsystem: Math.max(0, Number(config.maxLlmTriagesPerSubsystem || 2)), + dedupCooldownMinutes: Math.max(1, Number(config.dedupCooldownMinutes || 10)), + retentionDays: Math.max(1, Number(config.retentionDays || 14)), + cleanupEveryCycles: Math.max(1, Number(config.cleanupEveryCycles || 40)), + thresholds: { + z: Number(thresholds.z ?? 3.0), + mad: Number(thresholds.mad ?? 3.5), + rate: Number(thresholds.rate ?? 0.0), + stalenessSec: Number(thresholds.stalenessSec ?? 120), + flatline_std_epsilon: Number(thresholds.flatline_std_epsilon ?? 1e-6), + stuck_window_size: Number(thresholds.stuck_window_size ?? 20), + }, + scope: { + project: scope.project || null, + equipmentTags: Array.isArray(scope.equipmentTags) ? scope.equipmentTags : [], + tagRegex: scope.tagRegex || null, + subsystemMode: String(scope.subsystemMode || 'auto').toLowerCase() === 'global' ? 'global' : 'auto', + subsystemPriority: Array.isArray(scope.subsystemPriority) && scope.subsystemPriority.length + ? scope.subsystemPriority.map(String) + : ['view', 'equipment', 'group', 'global'], + subsystemInclude: Array.isArray(scope.subsystemInclude) ? 
scope.subsystemInclude.map(String) : [], + includeUnlinkedTags: Boolean(scope.includeUnlinkedTags), + }, + }; +} + +function canSendToRenderer() { + if (!mainWindow) return false; + if (typeof mainWindow.isDestroyed === 'function' && mainWindow.isDestroyed()) return false; + const wc = mainWindow.webContents; + if (!wc) return false; + if (typeof wc.isDestroyed === 'function' && wc.isDestroyed()) return false; + return true; +} + +function sendToRenderer(channel, payload, context = '') { + if (!canSendToRenderer()) { + if (isAppShuttingDown) { + console.info(`[Shutdown] Dropped renderer message ${channel}${context ? ` (${context})` : ''}`); + } else { + console.warn(`[IPC] Renderer unavailable for ${channel}${context ? ` (${context})` : ''}`); + } + return false; + } + try { + mainWindow.webContents.send(channel, payload); + return true; + } catch (err) { + console.warn(`[IPC] Failed sending ${channel}${context ? ` (${context})` : ''}: ${err.message}`); + return false; + } +} + +function routeAgentMessage(channel, payload) { + const ok = sendToRenderer(channel, payload, 'agent-stream'); + if (!ok) { + console.warn(`[Agent IPC] Failed to route message on ${channel}`); + } +} + +function parseAgentLine(line) { + const trimmed = (line || '').trim(); + if (!trimmed) return null; + const prefixes = [ + { key: '[AGENT_STATUS]', channel: 'agent-status' }, + { key: '[AGENT_EVENT]', channel: 'agent-event' }, + { key: '[AGENT_ERROR]', channel: 'agent-error' }, + { key: '[AGENT_COMPLETE]', channel: 'agent-complete' }, + ]; + for (const prefix of prefixes) { + if (!trimmed.startsWith(prefix.key)) continue; + const jsonText = trimmed.slice(prefix.key.length).trim(); + try { + const payload = JSON.parse(jsonText); + return { channel: prefix.channel, payload }; + } catch (err) { + return { + channel: 'agent-error', + payload: { + runId: activeAgentRun ? 
activeAgentRun.runId : null, + code: 'invalid_agent_json', + message: `Failed to parse agent stream line: ${trimmed.slice(0, 200)}`, + recoverable: true, + timestamp: new Date().toISOString(), + }, + }; + } + } + return null; +} + +function handleAgentStdoutChunk(text) { + if (!activeAgentRun) return; + activeAgentRun.stdoutBuffer += text; + const lines = activeAgentRun.stdoutBuffer.split(/\r?\n/); + activeAgentRun.stdoutBuffer = lines.pop() || ''; + for (const line of lines) { + const parsed = parseAgentLine(line); + if (!parsed) { + if (line.trim().startsWith('[AGENT')) { + console.warn('[Agent stream] Unparsed line:', line.slice(0, 300)); + } + continue; + } + if (parsed.channel === 'agent-status' && parsed.payload) { + activeAgentRun.status = parsed.payload.state || activeAgentRun.status; + activeAgentRun.metrics = { + cycleMs: parsed.payload.cycleMs || 0, + candidates: parsed.payload.candidates || 0, + triaged: parsed.payload.triaged || 0, + emitted: parsed.payload.emitted || 0, + timestamp: parsed.payload.timestamp || new Date().toISOString(), + }; + } + routeAgentMessage(parsed.channel, parsed.payload); + } +} + +async function stopActiveAgent(reason = 'stopped_by_user') { + if (!activeAgentRun || !activeAgentRun.process || activeAgentRun.process.killed) { + return { success: false, error: 'No active agent run' }; + } + const runId = activeAgentRun.runId; + activeAgentRun.status = 'stopping'; + + return new Promise((resolve) => { + const proc = activeAgentRun.process; + let settled = false; + const done = (result) => { + if (settled) return; + settled = true; + resolve(result); + }; + + proc.once('close', () => { + done({ success: true, runId, stoppedAt: new Date().toISOString(), reason }); + }); + + try { + proc.kill('SIGTERM'); + } catch (err) { + done({ success: false, error: err.message }); + return; + } + + setTimeout(() => { + if (proc.killed) return; + try { + proc.kill('SIGKILL'); + } catch (err) { + // Ignore forced termination errors. 
+ } + }, 5000); + }); +} + // IPC Handlers // Select file dialog @@ -421,22 +608,22 @@ ipcMain.handle('troubleshoot', async (event, question, history) => { stderr += text; // Stream tool calls, debug info, and Claude response from stderr to frontend - if (mainWindow) { + if (canSendToRenderer()) { // Check for special prefixes first (they appear on their own lines) if (text.includes('[TOOL]') || text.includes('[DEBUG]') || text.includes('[INFO]')) { const lines = text.split('\n'); for (const line of lines) { if (line.startsWith('[TOOL]')) { - mainWindow.webContents.send('tool-call', { + sendToRenderer('tool-call', { streamId, tool: line.replace('[TOOL]', '').trim() - }); + }, 'troubleshoot stderr tool'); } else if (line.startsWith('[DEBUG]') || line.startsWith('[INFO]')) { - mainWindow.webContents.send('stream-output', { + sendToRenderer('stream-output', { streamId, text: line, type: 'debug' - }); + }, 'troubleshoot stderr debug'); } } } else if (text.includes('[STREAM]')) { @@ -444,29 +631,29 @@ ipcMain.handle('troubleshoot', async (event, question, history) => { const streamStart = text.indexOf('[STREAM]'); const afterStream = text.substring(streamStart + 8); // 8 = length of '[STREAM]' if (afterStream) { - mainWindow.webContents.send('stream-output', { + sendToRenderer('stream-output', { streamId, text: afterStream, type: 'claude-stream' - }); + }, 'troubleshoot stderr stream-start'); } } else if (text && !text.startsWith('[')) { // Continuation of Claude streaming (no prefix) - mainWindow.webContents.send('stream-output', { + sendToRenderer('stream-output', { streamId, text: text, type: 'claude-stream' - }); + }, 'troubleshoot stderr stream-cont'); } } }); proc.on('close', (code) => { - if (mainWindow) { - mainWindow.webContents.send('stream-complete', { + if (canSendToRenderer()) { + sendToRenderer('stream-complete', { streamId, success: code === 0 - }); + }, 'troubleshoot close'); } if (code === 0) { @@ -1019,25 +1206,25 @@ ipcMain.handle('graph:ai-propose', 
async (event, description) => { stderr += text; // Stream tool calls to frontend - if (mainWindow && text.includes('[TOOL]')) { + if (canSendToRenderer() && text.includes('[TOOL]')) { const lines = text.split('\n'); for (const line of lines) { if (line.startsWith('[TOOL]')) { - mainWindow.webContents.send('tool-call', { + sendToRenderer('tool-call', { streamId, tool: line.replace('[TOOL]', '').trim() - }); + }, 'ai-propose stderr tool'); } } } }); proc.on('close', (code) => { - if (mainWindow) { - mainWindow.webContents.send('stream-complete', { + if (canSendToRenderer()) { + sendToRenderer('stream-complete', { streamId, success: code === 0 - }); + }, 'ai-propose close'); } if (code === 0) { @@ -1304,7 +1491,9 @@ function readDbCredentials() { if (!fs.existsSync(credPath)) return {}; try { return JSON.parse(fs.readFileSync(credPath, 'utf-8')); - } catch { return {}; } + } catch { + return {}; + } } // Get database connections from Neo4j + credential status from db_credentials.json @@ -1314,10 +1503,8 @@ ipcMain.handle('get-db-connections', async () => { const proc = spawnPythonProcess('neo4j_ontology.py', ['db-connections', '--json']); let stdout = ''; - let stderr = ''; proc.stdout.on('data', (data) => { stdout += data.toString(); }); - proc.stderr.on('data', (data) => { stderr += data.toString(); }); proc.on('close', (code) => { if (code !== 0) { @@ -1335,7 +1522,7 @@ ipcMain.handle('get-db-connections', async () => { })); resolve({ success: true, connections: enriched }); - } catch (e) { + } catch { resolve({ success: true, connections: [] }); } }); @@ -1349,7 +1536,7 @@ ipcMain.handle('get-db-connections', async () => { ipcMain.handle('save-db-credentials', async (event, credentials) => { try { const credPath = getDbCredentialsPath(); - let existing = readDbCredentials(); + const existing = readDbCredentials(); for (const [name, cred] of Object.entries(credentials)) { existing[name] = { @@ -1392,4 +1579,198 @@ ipcMain.handle('test-db-connection', async (event, 
connectionName) => { } catch (error) { return { success: false, error: error.message }; } +}); + +// ============================================ +// Long-running Agent Monitoring IPC Handlers +// ============================================ + +ipcMain.handle('agents:start', async (event, rawConfig = {}) => { + if (activeAgentRun && activeAgentRun.process && !activeAgentRun.process.killed) { + return { success: false, error: `Agent run already active: ${activeAgentRun.runId}`, runId: activeAgentRun.runId }; + } + + const runId = `agent-${Date.now()}`; + const config = normalizeAgentConfig(rawConfig); + + try { + const proc = spawnPythonProcess('anomaly_monitor.py', [ + 'run', + '--run-id', + runId, + '--config-json', + JSON.stringify(config), + ]); + + activeAgentRun = { + runId, + process: proc, + status: 'starting', + startedAt: new Date().toISOString(), + metrics: { + cycleMs: 0, + candidates: 0, + triaged: 0, + emitted: 0, + timestamp: new Date().toISOString(), + }, + stdoutBuffer: '', + config, + }; + + proc.stdout.on('data', (data) => { + handleAgentStdoutChunk(data.toString()); + }); + + proc.stderr.on('data', (data) => { + const text = data.toString().trim(); + if (!text) return; + console.warn('[Agent stderr]', text.slice(0, 500)); + routeAgentMessage('agent-error', { + runId, + code: 'worker_stderr', + message: text, + recoverable: true, + timestamp: new Date().toISOString(), + }); + }); + + proc.on('close', (code) => { + const hadActive = activeAgentRun && activeAgentRun.runId === runId; + if (hadActive) { + routeAgentMessage('agent-complete', { + runId, + success: code === 0, + reason: code === 0 ? 
'completed' : 'worker_exit_error', + stoppedAt: new Date().toISOString(), + }); + activeAgentRun = null; + } + }); + + proc.on('error', (err) => { + routeAgentMessage('agent-error', { + runId, + code: 'worker_spawn_error', + message: err.message, + recoverable: false, + timestamp: new Date().toISOString(), + }); + activeAgentRun = null; + }); + + return { success: true, runId, startedAt: activeAgentRun.startedAt, config }; + } catch (error) { + activeAgentRun = null; + return { success: false, error: error.message, runId }; + } +}); + +ipcMain.handle('agents:status', async (event, runId) => { + if (activeAgentRun && (!runId || runId === activeAgentRun.runId)) { + return { + success: true, + runId: activeAgentRun.runId, + status: activeAgentRun.status, + metrics: activeAgentRun.metrics, + lastHeartbeatAt: activeAgentRun.metrics.timestamp, + startedAt: activeAgentRun.startedAt, + config: activeAgentRun.config, + active: true, + }; + } + + if (!runId) { + return { success: true, active: false, status: 'idle' }; + } + + try { + const output = await runPythonScript('anomaly_monitor.py', ['status', '--run-id', runId]); + const parsed = JSON.parse(output || '{}'); + return parsed; + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:stop', async (event, runId = null) => { + if (!activeAgentRun) { + return { success: false, error: 'No active agent run' }; + } + if (runId && runId !== activeAgentRun.runId) { + return { success: false, error: `Requested run ${runId} does not match active run ${activeAgentRun.runId}` }; + } + return stopActiveAgent('stopped_by_user'); +}); + +ipcMain.handle('agents:list-events', async (event, filters = {}) => { + const args = ['list-events']; + if (filters.limit) args.push('--limit', String(filters.limit)); + if (filters.state) args.push('--state', String(filters.state)); + if (filters.severity) args.push('--severity', String(filters.severity)); + if (filters.runId) args.push('--run-id', 
String(filters.runId)); + + try { + const output = await runPythonScript('anomaly_monitor.py', args); + return JSON.parse(output || '{"success":true,"events":[]}'); + } catch (error) { + return { success: false, error: error.message, events: [] }; + } +}); + +ipcMain.handle('agents:get-event', async (event, eventId) => { + try { + const output = await runPythonScript('anomaly_monitor.py', ['get-event', '--event-id', String(eventId)]); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:ack-event', async (event, eventId, note = '') => { + try { + const args = ['ack-event', '--event-id', String(eventId)]; + if (note) args.push('--note', String(note)); + const output = await runPythonScript('anomaly_monitor.py', args); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:clear-event', async (event, eventId, note = '') => { + try { + const args = ['clear-event', '--event-id', String(eventId)]; + if (note) args.push('--note', String(note)); + const output = await runPythonScript('anomaly_monitor.py', args); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:deep-analyze', async (event, eventId) => { + try { + const output = await runPythonScript('anomaly_monitor.py', [ + 'deep-analyze', + '--event-id', + String(eventId), + ]); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:cleanup', async (event, retentionDays = 14) => { + try { + const output = await runPythonScript('anomaly_monitor.py', [ + 'cleanup', + '--retention-days', + String(retentionDays), + ]); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } }); \ No newline at end of file diff --git 
a/electron-ui/preload.js b/electron-ui/preload.js index d3c8171..cf1d75c 100644 --- a/electron-ui/preload.js +++ b/electron-ui/preload.js @@ -70,6 +70,17 @@ contextBridge.exposeInMainWorld('api', { getSettings: () => ipcRenderer.invoke('get-settings'), saveSettings: (settings) => ipcRenderer.invoke('save-settings', settings), testIgnitionConnection: (options) => ipcRenderer.invoke('test-ignition-connection', options), + + // Long-running agents monitoring + agentsStart: (config) => ipcRenderer.invoke('agents:start', config), + agentsStatus: (runId) => ipcRenderer.invoke('agents:status', runId), + agentsStop: (runId) => ipcRenderer.invoke('agents:stop', runId), + agentsListEvents: (filters) => ipcRenderer.invoke('agents:list-events', filters), + agentsGetEvent: (eventId) => ipcRenderer.invoke('agents:get-event', eventId), + agentsAckEvent: (eventId, note) => ipcRenderer.invoke('agents:ack-event', eventId, note), + agentsClearEvent: (eventId, note) => ipcRenderer.invoke('agents:clear-event', eventId, note), + agentsDeepAnalyze: (eventId) => ipcRenderer.invoke('agents:deep-analyze', eventId), + agentsCleanup: (retentionDays) => ipcRenderer.invoke('agents:cleanup', retentionDays), // Database connections getDbConnections: () => ipcRenderer.invoke('get-db-connections'), @@ -91,6 +102,26 @@ contextBridge.exposeInMainWorld('api', { const handler = (event, data) => callback(data); ipcRenderer.on('stream-complete', handler); return () => ipcRenderer.removeListener('stream-complete', handler); + }, + onAgentStatus: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-status', handler); + return () => ipcRenderer.removeListener('agent-status', handler); + }, + onAgentEvent: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-event', handler); + return () => ipcRenderer.removeListener('agent-event', handler); + }, + onAgentError: (callback) => { + const handler = (event, data) => callback(data); + 
ipcRenderer.on('agent-error', handler); + return () => ipcRenderer.removeListener('agent-error', handler); + }, + onAgentComplete: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-complete', handler); + return () => ipcRenderer.removeListener('agent-complete', handler); } }); diff --git a/electron-ui/renderer.js b/electron-ui/renderer.js index 53974f5..bba8767 100644 --- a/electron-ui/renderer.js +++ b/electron-ui/renderer.js @@ -3536,6 +3536,752 @@ btnSaveDbCreds?.addEventListener('click', async () => { btnSaveDbCreds.disabled = false; } }); +// Agents Tab - Long-running monitoring +// ============================================ + +const HEALTH_TREND_MAX_CYCLES = 20; + +const agentsState = { + runId: null, + status: 'idle', + events: [], + selectedEventId: null, + selectedSubsystemId: null, + listenersReady: false, + subsystemHealth: {}, + subsystemHistory: {}, +}; + +function getAgentsElements() { + return { + btnStart: document.getElementById('btn-agents-start'), + btnStop: document.getElementById('btn-agents-stop'), + btnRefresh: document.getElementById('btn-agents-refresh'), + btnCleanup: document.getElementById('btn-agents-cleanup'), + btnDeepAnalyze: document.getElementById('btn-agents-deep-analyze'), + btnOpenGraph: document.getElementById('btn-agents-open-graph'), + btnAck: document.getElementById('btn-agents-ack'), + statusChip: document.getElementById('agents-status-chip'), + statusText: document.getElementById('agents-status-text'), + list: document.getElementById('agents-event-list'), + detail: document.getElementById('agents-event-detail'), + filterState: document.getElementById('agents-filter-state'), + filterSeverity: document.getElementById('agents-filter-severity'), + filterSearch: document.getElementById('agents-filter-search'), + metricCycle: document.getElementById('agents-metric-cycle'), + metricCandidates: document.getElementById('agents-metric-candidates'), + metricTriaged: 
document.getElementById('agents-metric-triaged'), + metricEmitted: document.getElementById('agents-metric-emitted'), + metricHeartbeat: document.getElementById('agents-metric-heartbeat'), + cfgPoll: document.getElementById('agents-config-poll-ms'), + cfgHist: document.getElementById('agents-config-history-min'), + cfgPoints: document.getElementById('agents-config-min-points'), + cfgAutoLlm: document.getElementById('agents-config-auto-llm'), + cfgMaxLlm: document.getElementById('agents-config-max-llm'), + cfgZ: document.getElementById('agents-config-threshold-z'), + cfgMad: document.getElementById('agents-config-threshold-mad'), + cfgStale: document.getElementById('agents-config-staleness-sec'), + }; +} + +function getAgentsConfigFromUI() { + const el = getAgentsElements(); + return { + pollIntervalMs: Number(el.cfgPoll?.value || 1000), + historyWindowMinutes: Number(el.cfgHist?.value || 360), + minHistoryPoints: Number(el.cfgPoints?.value || 30), + maxCandidatesPerSubsystem: 8, + maxLlmTriagesPerCycle: el.cfgAutoLlm?.checked ? Number(el.cfgMaxLlm?.value || 5) : 0, + maxLlmTriagesPerSubsystem: el.cfgAutoLlm?.checked ? 
2 : 0, + thresholds: { + z: Number(el.cfgZ?.value || 3), + mad: Number(el.cfgMad?.value || 3.5), + stalenessSec: Number(el.cfgStale?.value || 120), + }, + scope: { + subsystemMode: 'auto', + subsystemPriority: ['view', 'equipment', 'group', 'global'], + includeUnlinkedTags: false, + }, + }; +} + +function formatAgentTime(ts) { + if (!ts) return 'n/a'; + const d = new Date(ts); + if (Number.isNaN(d.getTime())) return String(ts); + return d.toLocaleString(); +} + +function computeHealthLevel(signal) { + const avgAbsZ = parseFloat(signal.avgAbsZ || 0); + const candidateRatio = parseFloat(signal.candidateRatio || 0); + const maxAbsZ = parseFloat(signal.maxAbsZ || 0); + if (candidateRatio >= 0.25 || maxAbsZ >= 5) return 'critical'; + if (candidateRatio >= 0.10 || avgAbsZ >= 2.5) return 'warning'; + if (signal.shiftRatio > 0.1 || avgAbsZ >= 1.5) return 'elevated'; + return 'healthy'; +} + +function healthLevelToScore(level) { + return { healthy: 0.1, elevated: 0.4, warning: 0.7, critical: 1.0 }[level] || 0.1; +} + +function updateSubsystemHealthFromDiagnostics(diagnostics) { + const tagMap = diagnostics?.subsystemTagMap; + if (tagMap && typeof tagMap === 'object') { + for (const [subId, info] of Object.entries(tagMap)) { + if (!agentsState.subsystemHealth[subId]) { + agentsState.subsystemHealth[subId] = { + subsystemId: subId, + subsystemType: info.type || 'global', + subsystemName: info.name || subId, + evaluated: (info.tags || []).length, + candidate: 0, + nearShift: 0, + maxAbsZ: 0, + avgAbsZ: 0, + healthLevel: 'healthy', + tagSignals: (info.tags || []).map((t) => ({ + path: t.path, + name: t.name || t.path, + z: 0, + mad: 0, + value: null, + })), + }; + } + } + } + + const signals = diagnostics?.subsystemShiftSignals; + if (Array.isArray(signals) && signals.length) { + for (const sig of signals) { + const subId = sig.subsystemId || sig.subsystemName || 'global:all'; + const healthLevel = computeHealthLevel(sig); + agentsState.subsystemHealth[subId] = { ...sig, 
healthLevel }; + + if (!agentsState.subsystemHistory[subId]) { + agentsState.subsystemHistory[subId] = []; + } + const history = agentsState.subsystemHistory[subId]; + history.push({ + healthLevel, + avgAbsZ: parseFloat(sig.avgAbsZ || 0), + candidateRatio: parseFloat(sig.candidateRatio || 0), + candidates: parseInt(sig.candidate || 0, 10), + evaluated: parseInt(sig.evaluated || 0, 10), + ts: Date.now(), + }); + if (history.length > HEALTH_TREND_MAX_CYCLES) { + history.splice(0, history.length - HEALTH_TREND_MAX_CYCLES); + } + } + } + + renderSubsystemHealthGrid(); +} + +function renderSubsystemHealthGrid() { + const container = document.getElementById('agents-health-grid'); + if (!container) return; + + const entries = Object.entries(agentsState.subsystemHealth); + if (!entries.length) { + container.innerHTML = '
Start monitoring to see subsystem health.
'; + return; + } + + const severityOrder = { critical: 0, warning: 1, elevated: 2, healthy: 3 }; + entries.sort((a, b) => { + const sa = severityOrder[a[1].healthLevel] ?? 3; + const sb = severityOrder[b[1].healthLevel] ?? 3; + if (sa !== sb) return sa - sb; + return (b[1].candidate || 0) - (a[1].candidate || 0); + }); + + container.innerHTML = entries + .map(([subId, sig]) => { + const level = sig.healthLevel || 'healthy'; + const isExpanded = agentsState.selectedSubsystemId === subId; + const expandedClass = isExpanded ? ' expanded selected' : ''; + const name = sig.subsystemName || subId; + const type = sig.subsystemType || 'global'; + const evaluated = parseInt(sig.evaluated || 0, 10); + const candidates = parseInt(sig.candidate || 0, 10); + const maxZ = parseFloat(sig.maxAbsZ || 0).toFixed(1); + const anomalyClass = candidates > 0 ? (level === 'critical' ? ' has-critical' : ' has-anomalies') : ''; + const history = agentsState.subsystemHistory[subId] || []; + + let expandedBody = ''; + if (isExpanded) { + const bigTrend = renderTrendBars(history, 48); + const tagRows = renderTagSignalRows(sig.tagSignals || []); + const tagCount = (sig.tagSignals || []).length; + expandedBody = ` +
+
${bigTrend}
+
+

Tags

+ ${tagCount} tags +
+
+ NameTrendz-scoreAvgCurrent +
+
${tagRows}
+
+ `; + } else { + expandedBody = `
${renderTrendBars(history, 28)}
`; + } + + return ` +
+
+
+ + ${escapeHtml(name)} +
+ ${escapeHtml(type)} +
+
+
+ Tags + ${evaluated} +
+
+ Anomalies + ${candidates} +
+
+ Peak z + ${maxZ} +
+
+ ${expandedBody} + ${escapeHtml(level)} +
+ `; + }) + .join(''); + + container.querySelectorAll('.agents-health-card').forEach((card) => { + card.addEventListener('click', (e) => { + if (e.target.closest('.health-tag-list')) return; + const subId = card.getAttribute('data-subsystem-id'); + selectSubsystem(subId); + }); + }); +} + +function renderTrendBars(history, maxHeight) { + const h = maxHeight || 28; + const slots = HEALTH_TREND_MAX_CYCLES; + const bars = []; + for (let i = 0; i < slots; i++) { + const idx = history.length - slots + i; + if (idx < 0) { + bars.push('
'); + continue; + } + const entry = history[idx]; + const level = entry.healthLevel || 'healthy'; + const score = healthLevelToScore(level); + const height = Math.max(3, Math.round(score * h)); + bars.push(`
`); + } + return bars.join(''); +} + +function tagZToHealthLevel(absZ) { + if (absZ >= 5) return 'critical'; + if (absZ >= 2.5) return 'warning'; + if (absZ >= 1.5) return 'elevated'; + return 'healthy'; +} + +function renderSparklineSvg(values, width, height) { + if (!values || values.length < 2) { + return ``; + } + const min = Math.min(...values); + const max = Math.max(...values); + const range = max - min || 1; + const pad = 1; + const usableH = height - pad * 2; + const step = width / (values.length - 1); + const points = values + .map((v, i) => `${(i * step).toFixed(1)},${(pad + usableH - ((v - min) / range) * usableH).toFixed(1)}`) + .join(' '); + return ``; +} + +function renderTagSignalRows(tagSignals) { + if (!tagSignals || !tagSignals.length) { + return '
No tag data available yet.
'; + } + + return tagSignals + .map((tag) => { + const absZ = Math.abs(tag.z || 0); + const level = tagZToHealthLevel(absZ); + const currentVal = tag.value != null ? String(tag.value) : '—'; + const avgVal = tag.avg != null ? String(tag.avg) : '—'; + const zDisplay = (tag.z || 0).toFixed(2); + const sparkline = tag.sparkline && tag.sparkline.length >= 2 + ? renderSparklineSvg(tag.sparkline, 120, 24) + : renderSparklineSvg(null, 120, 24); + return ` +
+ ${escapeHtml(tag.name || tag.path || '')} +
${sparkline}
+ z ${escapeHtml(zDisplay)} + ${escapeHtml(avgVal)} + ${escapeHtml(currentVal)} +
+ `; + }) + .join(''); +} + +function selectSubsystem(subId) { + const clearBtn = document.getElementById('btn-agents-clear-subsystem'); + if (agentsState.selectedSubsystemId === subId) { + agentsState.selectedSubsystemId = null; + if (clearBtn) clearBtn.style.display = 'none'; + } else { + agentsState.selectedSubsystemId = subId; + if (clearBtn) clearBtn.style.display = ''; + } + renderSubsystemHealthGrid(); + renderAgentEventList(); +} + +function updateAgentStatusUi(status, text) { + const el = getAgentsElements(); + if (!el.statusChip || !el.statusText) return; + el.statusChip.className = 'status-chip'; + const normalized = (status || 'idle').toLowerCase(); + if (normalized === 'running') el.statusChip.classList.add('running'); + if (normalized === 'failed' || normalized === 'error') el.statusChip.classList.add('error'); + el.statusChip.textContent = normalized; + el.statusText.textContent = text || normalized; + if (el.btnStart) el.btnStart.disabled = normalized === 'running' || normalized === 'starting'; + if (el.btnStop) el.btnStop.disabled = !(normalized === 'running' || normalized === 'starting' || normalized === 'stopping'); +} + +function updateAgentMetrics(metrics = {}, heartbeatTs = null) { + const el = getAgentsElements(); + if (el.metricCycle) el.metricCycle.textContent = String(metrics.cycleMs ?? metrics.lastCycleMs ?? 0); + if (el.metricCandidates) el.metricCandidates.textContent = String(metrics.candidates ?? metrics.lastCandidates ?? 0); + if (el.metricTriaged) el.metricTriaged.textContent = String(metrics.triaged ?? metrics.lastTriaged ?? 0); + if (el.metricEmitted) el.metricEmitted.textContent = String(metrics.emitted ?? metrics.lastEmitted ?? 
0); + if (el.metricHeartbeat) el.metricHeartbeat.textContent = formatAgentTime(heartbeatTs || metrics.timestamp); +} + +function getFilteredAgentEvents() { + const el = getAgentsElements(); + const state = (el.filterState?.value || '').toLowerCase(); + const severity = (el.filterSeverity?.value || '').toLowerCase(); + const search = (el.filterSearch?.value || '').trim().toLowerCase(); + const subFilter = agentsState.selectedSubsystemId || ''; + return agentsState.events.filter((event) => { + if (state && String(event.state || '').toLowerCase() !== state) return false; + if (severity && String(event.severity || '').toLowerCase() !== severity) return false; + if (subFilter) { + const eventSubId = event.subsystem_id + || `${(event.subsystem_type || 'global')}:${(event.subsystem_name || 'all').toLowerCase()}`; + if (eventSubId !== subFilter) return false; + } + if (search) { + const haystack = [ + event.summary, + event.source_tag, + event.tag_name, + event.subsystem_name, + event.subsystem_type, + ...(event.equipment || []), + ...(event.tags || []), + ] + .filter(Boolean) + .join(' ') + .toLowerCase(); + if (!haystack.includes(search)) return false; + } + return true; + }); +} + +function renderAgentEventList() { + const el = getAgentsElements(); + if (!el.list) return; + const events = getFilteredAgentEvents(); + if (!events.length) { + const subName = agentsState.selectedSubsystemId + ? (agentsState.subsystemHealth[agentsState.selectedSubsystemId]?.subsystemName || agentsState.selectedSubsystemId) + : ''; + const msg = subName + ? `No anomaly events for "${subName}".` + : 'No anomaly events match the current filters.'; + el.list.innerHTML = `
${escapeHtml(msg)}
`; + return; + } + el.list.innerHTML = events + .map((event) => { + const active = event.event_id === agentsState.selectedEventId ? ' active' : ''; + const sev = String(event.severity || 'low').toLowerCase(); + const equipment = (event.equipment || []).slice(0, 2).join(', '); + const subsystemLabel = event.subsystem_name + ? `${event.subsystem_type || 'subsystem'}: ${event.subsystem_name}` + : ''; + const baseMeta = [event.tag_name || event.source_tag || '', equipment, subsystemLabel] + .filter(Boolean) + .join(' • '); + return ` +
+
+ ${escapeHtml(sev)} + ${escapeHtml(formatAgentTime(event.created_at))} +
+
${escapeHtml(event.summary || 'Untitled anomaly')}
+
${escapeHtml(baseMeta)}
+
+ `; + }) + .join(''); + + el.list.querySelectorAll('.agents-event-card').forEach((card) => { + card.addEventListener('click', () => { + const eventId = card.getAttribute('data-event-id'); + if (!eventId) return; + selectAgentEvent(eventId); + }); + }); +} + +function resolveAgentGraphTarget(event) { + if (String(event.subsystem_type || '').toLowerCase() === 'view' && event.subsystem_name) { + return { name: event.subsystem_name, type: 'View' }; + } + const equipment = (event.equipment || []).find(Boolean); + if (equipment) return { name: equipment, type: 'Equipment' }; + if (String(event.subsystem_type || '').toLowerCase() === 'equipment' && event.subsystem_name) { + return { name: event.subsystem_name, type: 'Equipment' }; + } + const tagName = event.tag_name || (event.tags || []).find(Boolean) || event.source_tag; + if (tagName) return { name: tagName, type: 'ScadaTag' }; + return null; +} + +function renderAgentEventDetails(event) { + const el = getAgentsElements(); + if (!el.detail) return; + if (!event) { + el.detail.innerHTML = '

Select an anomaly event from the feed.

'; + if (el.btnDeepAnalyze) el.btnDeepAnalyze.disabled = true; + if (el.btnOpenGraph) el.btnOpenGraph.disabled = true; + if (el.btnAck) el.btnAck.disabled = true; + return; + } + + let checks = []; + let causes = []; + let safety = []; + try { checks = JSON.parse(event.recommended_checks_json || '[]'); } catch (e) {} + try { causes = JSON.parse(event.probable_causes_json || '[]'); } catch (e) {} + try { safety = JSON.parse(event.safety_notes_json || '[]'); } catch (e) {} + + el.detail.innerHTML = ` +
+
Event ID${escapeHtml(event.event_id || '')}
+
State${escapeHtml(event.state || '')}
+
Severity${escapeHtml(event.severity || '')}
+
Confidence${escapeHtml(String(event.confidence ?? ''))}
+
Category${escapeHtml(event.category || '')}
+
Timestamp${escapeHtml(formatAgentTime(event.created_at))}
+
Subsystem Type${escapeHtml(event.subsystem_type || 'global')}
+
Subsystem${escapeHtml(event.subsystem_name || 'all')}
+
Source Tag${escapeHtml(event.source_tag || '')}
+
Tag Name${escapeHtml(event.tag_name || '')}
+
z-score${escapeHtml(String(event.z_score ?? '0'))}
+
MAD score${escapeHtml(String(event.mad_score ?? '0'))}
+
+
+
Summary
+
${escapeHtml(event.summary || '')}
+
+
+
Explanation
+
${escapeHtml(event.explanation || '')}
+
+
+
Probable Causes
+
    ${(causes || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+
+
Verification Checks
+
    ${(checks || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+
+
Safety Notes
+
    ${(safety || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+ `; + + if (el.btnDeepAnalyze) { + el.btnDeepAnalyze.disabled = false; + el.btnDeepAnalyze.textContent = event.llm_triaged ? 'Re-Analyze' : 'Deep Analyze'; + } + if (el.btnOpenGraph) el.btnOpenGraph.disabled = !resolveAgentGraphTarget(event); + if (el.btnAck) { + const state = String(event.state || '').toLowerCase(); + if (state === 'acknowledged') { + el.btnAck.textContent = 'Clear'; + el.btnAck.disabled = false; + } else if (state === 'cleared') { + el.btnAck.textContent = 'Cleared'; + el.btnAck.disabled = true; + } else { + el.btnAck.textContent = 'Acknowledge'; + el.btnAck.disabled = false; + } + } +} + +async function selectAgentEvent(eventId) { + agentsState.selectedEventId = eventId; + const existing = agentsState.events.find((e) => e.event_id === eventId); + if (existing && existing.explanation && existing.recommended_checks_json) { + renderAgentEventList(); + renderAgentEventDetails(existing); + return; + } + const detailResult = await window.api.agentsGetEvent(eventId); + if (detailResult.success && detailResult.event) { + const idx = agentsState.events.findIndex((e) => e.event_id === eventId); + if (idx >= 0) { + agentsState.events[idx] = { ...agentsState.events[idx], ...detailResult.event }; + } else { + agentsState.events.unshift(detailResult.event); + } + renderAgentEventList(); + renderAgentEventDetails(detailResult.event); + } +} + +async function loadAgentEvents() { + const el = getAgentsElements(); + const result = await window.api.agentsListEvents({ + limit: 200, + state: el.filterState?.value || undefined, + severity: el.filterSeverity?.value || undefined, + runId: agentsState.runId || undefined, + }); + if (!result.success) return; + agentsState.events = Array.isArray(result.events) ? 
result.events : []; + renderAgentEventList(); + + if (agentsState.selectedEventId) { + const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + renderAgentEventDetails(selected || null); + } +} + +async function refreshAgentStatus() { + const status = await window.api.agentsStatus(agentsState.runId || undefined); + if (!status.success) { + updateAgentStatusUi('error', status.error || 'Failed to fetch status'); + return; + } + if (status.active) { + agentsState.runId = status.runId || agentsState.runId; + agentsState.status = status.status || 'running'; + updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId}`); + updateAgentMetrics(status.metrics || {}, status.lastHeartbeatAt); + } else { + agentsState.status = 'idle'; + updateAgentStatusUi('idle', 'No active run'); + } +} + +async function startAgentsMonitoring() { + const config = getAgentsConfigFromUI(); + agentsState.subsystemHealth = {}; + agentsState.subsystemHistory = {}; + agentsState.selectedSubsystemId = null; + renderSubsystemHealthGrid(); + const clearSubBtn = document.getElementById('btn-agents-clear-subsystem'); + if (clearSubBtn) clearSubBtn.style.display = 'none'; + const result = await window.api.agentsStart(config); + if (!result.success) { + console.error('[Agents start failed]', result); + updateAgentStatusUi('error', result.error || 'Failed to start monitoring'); + return; + } + console.log('[Agents] started, runId=' + (result.runId || 'n/a')); + agentsState.runId = result.runId; + agentsState.status = 'running'; + updateAgentStatusUi('running', `Run ${result.runId}`); + await loadAgentEvents(); +} + +async function stopAgentsMonitoring() { + const result = await window.api.agentsStop(agentsState.runId || undefined); + if (!result.success) { + updateAgentStatusUi('error', result.error || 'Failed to stop monitoring'); + return; + } + agentsState.status = 'stopped'; + updateAgentStatusUi('stopped', 'Monitoring stopped'); +} + +async function 
deepAnalyzeSelectedEvent() { + if (!agentsState.selectedEventId) return; + const el = getAgentsElements(); + if (el.btnDeepAnalyze) { + el.btnDeepAnalyze.disabled = true; + el.btnDeepAnalyze.textContent = 'Analyzing…'; + } + try { + const result = await window.api.agentsDeepAnalyze(agentsState.selectedEventId); + if (result.success && result.event) { + const idx = agentsState.events.findIndex((e) => e.event_id === agentsState.selectedEventId); + if (idx >= 0) agentsState.events[idx] = { ...agentsState.events[idx], ...result.event }; + renderAgentEventList(); + renderAgentEventDetails(result.event); + } else { + console.error('[Agents] deep-analyze failed:', result.error); + if (el.btnDeepAnalyze) { + el.btnDeepAnalyze.textContent = 'Failed — Retry'; + el.btnDeepAnalyze.disabled = false; + } + } + } catch (err) { + console.error('[Agents] deep-analyze error:', err); + if (el.btnDeepAnalyze) { + el.btnDeepAnalyze.textContent = 'Failed — Retry'; + el.btnDeepAnalyze.disabled = false; + } + } +} + +async function acknowledgeSelectedAgentEvent() { + if (!agentsState.selectedEventId) return; + const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + const state = String(selected?.state || '').toLowerCase(); + const result = state === 'acknowledged' + ? 
await window.api.agentsClearEvent(agentsState.selectedEventId, '') + : await window.api.agentsAckEvent(agentsState.selectedEventId, ''); + if (!result.success) return; + await loadAgentEvents(); + const refreshed = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + renderAgentEventDetails(refreshed || null); +} + +function upsertRealtimeAgentEvent(payload) { + if (!payload || !payload.eventId) return; + const idx = agentsState.events.findIndex((e) => e.event_id === payload.eventId); + const next = { + event_id: payload.eventId, + severity: payload.severity || 'medium', + summary: payload.summary || 'Anomaly detected', + category: payload.category || 'deviation', + created_at: payload.createdAt || new Date().toISOString(), + source_tag: payload.entityRefs?.sourceTag || payload.entityRefs?.tag || '', + tag_name: payload.entityRefs?.tag || '', + subsystem_type: payload.entityRefs?.subsystemType || '', + subsystem_name: payload.entityRefs?.subsystemName || '', + state: 'open', + }; + if (idx >= 0) { + agentsState.events[idx] = { ...agentsState.events[idx], ...next }; + } else { + agentsState.events.unshift(next); + } + renderAgentEventList(); +} + +function ensureAgentListeners() { + if (agentsState.listenersReady) return; + agentsState.listenersReady = true; + + window.api.onAgentStatus((payload) => { + if (!payload) return; + if (payload.runId) agentsState.runId = payload.runId; + agentsState.status = payload.state || agentsState.status; + updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId || 'n/a'}`); + updateAgentMetrics(payload, payload.timestamp); + const diagnostics = payload.diagnostics || {}; + const phase = diagnostics.phase || '?'; + console.log(`[Agents] phase=${phase} tags=${diagnostics.monitoredTags ?? '?'}`); + + if (phase === 'cycle_complete') { + const signals = diagnostics.subsystemShiftSignals; + const subCount = Array.isArray(signals) ? 
signals.length : 0; + const evaluated = (diagnostics.evaluatedLinked || 0) + (diagnostics.evaluatedUnlinked || 0); + console.log(`[Agents] cycle_complete: ${subCount} subsystems, ${evaluated} evaluated, ${diagnostics.candidateLinked || 0} candidates`); + } + + updateSubsystemHealthFromDiagnostics(diagnostics); + }); + + window.api.onAgentEvent((payload) => { + upsertRealtimeAgentEvent(payload); + }); + + window.api.onAgentError((payload) => { + if (!payload) return; + console.error('[Agents error]', payload); + updateAgentStatusUi('error', payload.message || 'Agent runtime error'); + }); + + window.api.onAgentComplete((payload) => { + if (!payload) return; + console.log('[Agents] run complete, success=' + payload.success); + agentsState.status = payload.success ? 'stopped' : 'failed'; + updateAgentStatusUi(agentsState.status, payload.reason || 'Run complete'); + refreshAgentStatus(); + }); +} + +function initAgentsTab() { + ensureAgentListeners(); + const el = getAgentsElements(); + if (!el.btnStart) return; + if (!el.btnStart.dataset.bound) { + el.btnStart.dataset.bound = '1'; + el.btnStart.addEventListener('click', startAgentsMonitoring); + el.btnStop?.addEventListener('click', stopAgentsMonitoring); + el.btnRefresh?.addEventListener('click', loadAgentEvents); + el.btnCleanup?.addEventListener('click', async () => { + await window.api.agentsCleanup(14); + await loadAgentEvents(); + }); + el.btnDeepAnalyze?.addEventListener('click', deepAnalyzeSelectedEvent); + el.btnAck?.addEventListener('click', acknowledgeSelectedAgentEvent); + el.btnOpenGraph?.addEventListener('click', () => { + const event = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + if (!event) return; + const target = resolveAgentGraphTarget(event); + if (!target) return; + openGraphModal(target.name, target.type, event.summary || target.name); + }); + el.filterState?.addEventListener('change', loadAgentEvents); + el.filterSeverity?.addEventListener('change', 
loadAgentEvents); + el.filterSearch?.addEventListener('input', renderAgentEventList); + + const clearSubBtn = document.getElementById('btn-agents-clear-subsystem'); + clearSubBtn?.addEventListener('click', () => { + agentsState.selectedSubsystemId = null; + clearSubBtn.style.display = 'none'; + renderSubsystemHealthGrid(); + renderAgentEventList(); + }); + } + refreshAgentStatus(); + loadAgentEvents(); + renderSubsystemHealthGrid(); +} // Initialize graph tab when it's first shown navButtons.forEach(btn => { @@ -3558,6 +4304,9 @@ navButtons.forEach(btn => { loadSettings(); loadDbConnections(); } + if (btn.dataset.tab === 'agents') { + setTimeout(initAgentsTab, 100); + } }); }); @@ -3569,5 +4318,6 @@ setTimeout(() => { loadTiaProjects(); loadSettings(); loadDbConnections(); + ensureAgentListeners(); }, 500); diff --git a/electron-ui/styles.css b/electron-ui/styles.css index 5ba9186..c967b08 100644 --- a/electron-ui/styles.css +++ b/electron-ui/styles.css @@ -2979,3 +2979,661 @@ select.input, .connection-status .status-dot { flex-shrink: 0; } + +/* ============================================ + AGENTS TAB + ============================================ */ + +.agents-topbar { + display: flex; + justify-content: space-between; + align-items: center; + gap: var(--space-4); + margin-bottom: var(--space-3); + flex-wrap: wrap; +} + +.agents-run-controls { + display: flex; + gap: var(--space-2); + flex-wrap: wrap; +} + +.agents-run-status { + display: flex; + align-items: center; + gap: var(--space-2); + color: var(--color-text-secondary); + font-size: var(--text-sm); +} + +.status-chip { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: 999px; + border: 1px solid var(--color-border); + background: var(--color-bg-panel-2); + color: var(--color-text-secondary); + font-size: var(--text-xs); + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.4px; +} + +.status-chip.running { + color: 
var(--color-success); + border-color: rgba(34, 197, 94, 0.35); + background: rgba(34, 197, 94, 0.12); +} + +.status-chip.error { + color: var(--color-danger); + border-color: rgba(239, 68, 68, 0.35); + background: rgba(239, 68, 68, 0.12); +} + +.agents-config-row { + display: grid; + grid-template-columns: repeat(16, minmax(0, 1fr)); + gap: var(--space-2); + margin-bottom: var(--space-4); + align-items: center; +} + +.agents-toggle-label { + display: flex; + align-items: center; + gap: 4px; + grid-column: span 2; + cursor: pointer; +} + +.agents-config-row label { + font-size: var(--text-xs); + color: var(--color-text-secondary); + text-transform: uppercase; + letter-spacing: 0.35px; +} + +.agents-config-row .input { + min-width: 0; +} + +.agents-metrics-row { + display: grid; + grid-template-columns: repeat(5, minmax(0, 1fr)); + gap: var(--space-2); + margin-bottom: var(--space-4); +} + +.metric-card { + border: 1px solid var(--color-border); + background: var(--color-bg-panel); + border-radius: var(--radius-md); + padding: var(--space-2) var(--space-3); + display: flex; + flex-direction: column; + gap: 2px; +} + +.metric-label { + font-size: var(--text-xs); + color: var(--color-text-muted); +} + +.metric-value { + font-family: var(--font-mono); + font-size: var(--text-sm); + color: var(--color-text); +} + +/* ---- Subsystem Health Dashboard ---- */ + +.agents-health-section { + margin-bottom: var(--space-4); +} + +.agents-health-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: var(--space-3); +} + +.agents-health-header h3 { + font-size: var(--text-md); + font-weight: 600; + color: var(--color-text); +} + +.agents-health-actions { + display: flex; + gap: var(--space-2); + align-items: center; +} + +.agents-health-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(260px, 1fr)); + gap: var(--space-3); +} + +.agents-health-empty { + grid-column: 1 / -1; + color: var(--color-text-muted); + 
font-size: var(--text-sm); + padding: var(--space-4); + text-align: center; + border: 1px dashed var(--color-border); + border-radius: var(--radius-lg); + background: var(--color-bg-panel); +} + +.agents-health-card { + border: 1px solid var(--color-border); + background: var(--color-bg-panel); + border-radius: var(--radius-lg); + padding: var(--space-3); + cursor: pointer; + transition: border-color var(--transition-fast), transform var(--transition-fast), box-shadow var(--transition-fast); + position: relative; + border-left: 3px solid var(--color-border); +} + +.agents-health-card:hover { + border-color: var(--color-border-active); + transform: translateY(-1px); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2); +} + +.agents-health-card.selected { + border-color: var(--color-accent); + box-shadow: 0 0 0 1px rgba(34, 211, 238, 0.25) inset, 0 2px 12px rgba(34, 211, 238, 0.08); +} + +.agents-health-card.health-healthy { + border-left-color: #22c55e; +} + +.agents-health-card.health-elevated { + border-left-color: #eab308; +} + +.agents-health-card.health-warning { + border-left-color: #f97316; +} + +.agents-health-card.health-critical { + border-left-color: #ef4444; +} + +.health-card-top { + display: flex; + justify-content: space-between; + align-items: flex-start; + margin-bottom: var(--space-2); +} + +.health-card-identity { + display: flex; + align-items: center; + gap: var(--space-2); + min-width: 0; + flex: 1; +} + +.health-indicator { + width: 10px; + height: 10px; + border-radius: 50%; + flex-shrink: 0; + box-shadow: 0 0 6px currentColor; +} + +.health-indicator.health-healthy { + background: #22c55e; + color: #22c55e; +} + +.health-indicator.health-elevated { + background: #eab308; + color: #eab308; +} + +.health-indicator.health-warning { + background: #f97316; + color: #f97316; +} + +.health-indicator.health-critical { + background: #ef4444; + color: #ef4444; + animation: pulse-critical 2s ease-in-out infinite; +} + +@keyframes pulse-critical { + 0%, 100% { 
opacity: 1; box-shadow: 0 0 6px currentColor; } + 50% { opacity: 0.6; box-shadow: 0 0 12px currentColor; } +} + +.health-card-name { + font-size: var(--text-sm); + font-weight: 600; + color: var(--color-text); + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.health-card-type { + font-size: 10px; + text-transform: uppercase; + letter-spacing: 0.5px; + color: var(--color-text-muted); + padding: 1px 6px; + border-radius: 999px; + border: 1px solid var(--color-border); + background: var(--color-bg-panel-2); + flex-shrink: 0; + white-space: nowrap; +} + +.health-card-stats { + display: grid; + grid-template-columns: 1fr 1fr 1fr; + gap: var(--space-1); + margin-bottom: var(--space-2); +} + +.health-stat { + display: flex; + flex-direction: column; + gap: 1px; +} + +.health-stat-label { + font-size: 10px; + color: var(--color-text-muted); + text-transform: uppercase; + letter-spacing: 0.3px; +} + +.health-stat-value { + font-family: var(--font-mono); + font-size: var(--text-sm); + color: var(--color-text); +} + +.health-stat-value.has-anomalies { + color: #f97316; +} + +.health-stat-value.has-critical { + color: #ef4444; +} + +.health-trend { + display: flex; + align-items: flex-end; + gap: 2px; + height: 28px; + padding-top: var(--space-1); + border-top: 1px solid var(--color-border-subtle); +} + +.health-trend-bar { + flex: 1; + min-width: 3px; + max-width: 8px; + border-radius: 2px 2px 0 0; + transition: height 0.3s ease; +} + +.health-trend-bar.trend-healthy { + background: rgba(34, 197, 94, 0.5); +} + +.health-trend-bar.trend-elevated { + background: rgba(234, 179, 8, 0.5); +} + +.health-trend-bar.trend-warning { + background: rgba(249, 115, 22, 0.6); +} + +.health-trend-bar.trend-critical { + background: rgba(239, 68, 68, 0.6); +} + +.health-trend-bar.trend-empty { + background: var(--color-border-subtle); +} + +.health-card-health-label { + font-size: 10px; + text-transform: uppercase; + letter-spacing: 0.4px; + font-weight: 600; + 
margin-top: 2px; +} + +.health-card-health-label.health-healthy { color: #22c55e; } +.health-card-health-label.health-elevated { color: #eab308; } +.health-card-health-label.health-warning { color: #f97316; } +.health-card-health-label.health-critical { color: #ef4444; } + +/* ---- Expanded Subsystem Card ---- */ + +.agents-health-card.expanded { + grid-column: 1 / -1; + border-color: var(--color-accent); + background: var(--color-bg-elevated); +} + +.health-expanded-body { + margin-top: var(--space-3); + border-top: 1px solid var(--color-border-subtle); + padding-top: var(--space-3); +} + +.health-expanded-trend { + display: flex; + align-items: flex-end; + gap: 3px; + height: 48px; + margin-bottom: var(--space-3); +} + +.health-expanded-trend .health-trend-bar { + max-width: 14px; +} + +.health-tag-list-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: var(--space-2); +} + +.health-tag-list-header h4 { + font-size: var(--text-sm); + font-weight: 600; + color: var(--color-text-secondary); +} + +.health-tag-list-header span { + font-size: var(--text-xs); + color: var(--color-text-muted); +} + +.health-tag-col-headers { + display: grid; + grid-template-columns: minmax(110px, 1fr) 120px 55px 55px 55px; + gap: var(--space-2); + padding: 0 var(--space-2) 2px; + font-size: 10px; + font-weight: 600; + color: var(--color-text-muted); + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.health-tag-col-headers span:nth-child(n+3) { + text-align: right; +} + +.health-tag-list { + display: flex; + flex-direction: column; + gap: var(--space-1); + max-height: 320px; + overflow-y: auto; +} + +.health-tag-row { + display: grid; + grid-template-columns: minmax(110px, 1fr) 120px 55px 55px 55px; + gap: var(--space-2); + align-items: center; + padding: 5px var(--space-2); + border-radius: var(--radius-sm); + background: var(--color-bg-panel); + border: 1px solid var(--color-border-subtle); + font-size: var(--text-xs); +} + 
+.health-tag-row:hover { + border-color: var(--color-border-active); +} + +.health-tag-name { + font-family: var(--font-mono); + color: var(--color-text); + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.health-tag-sparkline { + display: flex; + align-items: center; + justify-content: center; +} + +.tag-sparkline { + display: block; + width: 120px; + height: 24px; +} + +.health-tag-zscore { + font-family: var(--font-mono); + color: var(--color-text-secondary); + text-align: right; +} + +.health-tag-zscore.tag-z-healthy { color: #22c55e; } +.health-tag-zscore.tag-z-elevated { color: #eab308; } +.health-tag-zscore.tag-z-warning { color: #f97316; } +.health-tag-zscore.tag-z-critical { color: #ef4444; } + +.health-tag-avg { + font-family: var(--font-mono); + color: var(--color-text-muted); + text-align: right; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.health-tag-value { + font-family: var(--font-mono); + color: var(--color-text); + text-align: right; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.health-tag-empty { + color: var(--color-text-muted); + font-size: var(--text-sm); + padding: var(--space-3); + text-align: center; +} + +.agents-main { + display: grid; + grid-template-columns: minmax(300px, 38%) minmax(0, 1fr); + gap: var(--space-3); + min-height: 480px; +} + +.agents-feed-panel, +.agents-detail-panel { + border: 1px solid var(--color-border); + background: var(--color-bg-panel); + border-radius: var(--radius-lg); + overflow: hidden; + display: flex; + flex-direction: column; +} + +.agents-feed-header, +.agents-detail-header { + padding: var(--space-3); + border-bottom: 1px solid var(--color-border-subtle); + display: flex; + justify-content: space-between; + align-items: center; + gap: var(--space-2); +} + +.agents-feed-header h3, +.agents-detail-header h3 { + font-size: var(--text-md); + font-weight: 600; +} + +.agents-feed-filters { + display: flex; + gap: 
var(--space-2); + flex-wrap: wrap; +} + +.agents-feed-filters .input { + min-width: 120px; +} + +.agents-event-list { + overflow-y: auto; + padding: var(--space-2); + display: flex; + flex-direction: column; + gap: var(--space-2); + flex: 1; +} + +.agents-empty { + color: var(--color-text-muted); + font-size: var(--text-sm); + padding: var(--space-4); + text-align: center; +} + +.agents-event-card { + border: 1px solid var(--color-border); + background: var(--color-bg-panel-2); + border-radius: var(--radius-md); + padding: var(--space-2) var(--space-3); + cursor: pointer; + transition: border-color var(--transition-fast), transform var(--transition-fast); +} + +.agents-event-card:hover { + border-color: var(--color-border-active); + transform: translateY(-1px); +} + +.agents-event-card.active { + border-color: var(--color-accent); + box-shadow: 0 0 0 1px rgba(34, 211, 238, 0.35) inset; +} + +.agents-event-line-top { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 4px; + gap: var(--space-2); +} + +.agents-severity { + font-size: var(--text-xs); + text-transform: uppercase; + letter-spacing: 0.4px; + padding: 2px 6px; + border-radius: 999px; + border: 1px solid transparent; +} + +.agents-severity.sev-critical { + color: #fecaca; + background: rgba(239, 68, 68, 0.2); + border-color: rgba(239, 68, 68, 0.4); +} + +.agents-severity.sev-high { + color: #fdba74; + background: rgba(249, 115, 22, 0.18); + border-color: rgba(249, 115, 22, 0.35); +} + +.agents-severity.sev-medium { + color: #fde68a; + background: rgba(245, 158, 11, 0.15); + border-color: rgba(245, 158, 11, 0.35); +} + +.agents-severity.sev-low { + color: #bfdbfe; + background: rgba(59, 130, 246, 0.15); + border-color: rgba(59, 130, 246, 0.35); +} + +.agents-event-time { + font-size: var(--text-xs); + color: var(--color-text-muted); + font-family: var(--font-mono); +} + +.agents-event-summary { + font-size: var(--text-sm); + color: var(--color-text); + margin-bottom: 
4px; +} + +.agents-event-meta { + font-size: var(--text-xs); + color: var(--color-text-muted); +} + +.agents-detail-content { + padding: var(--space-3); + overflow-y: auto; + font-size: var(--text-sm); + display: flex; + flex-direction: column; + gap: var(--space-3); +} + +.agents-detail-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: var(--space-2) var(--space-3); +} + +.agents-detail-item { + display: flex; + flex-direction: column; + gap: 2px; +} + +.agents-detail-label { + font-size: var(--text-xs); + color: var(--color-text-muted); + text-transform: uppercase; + letter-spacing: 0.3px; +} + +.agents-detail-value { + font-family: var(--font-mono); + color: var(--color-text); +} + +.agents-list { + margin-left: var(--space-4); + color: var(--color-text-secondary); +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..3b2c446 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +python_files = test_*.py +addopts = -q diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py new file mode 100644 index 0000000..0d0d91a --- /dev/null +++ b/scripts/anomaly_monitor.py @@ -0,0 +1,2195 @@ +#!/usr/bin/env python3 +""" +Long-running anomaly monitor worker. 
+
+Modes:
+ - run: start continuous monitoring loop
+ - status: get run status
+ - list-events: list persisted anomaly events
+ - get-event: fetch one anomaly event
+ - ack-event: mark event as acknowledged
+ - cleanup: delete old events by retention policy
+ - replay-fixtures: run deterministic fixture validation
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import signal
+import sys
+import time
+import uuid
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+try:
+    from dotenv import load_dotenv
+except ImportError:  # pragma: no cover - optional fallback for minimal environments
+    # python-dotenv is optional: without it, configuration must already be
+    # present in the process environment, so the fallback is a no-op.
+    def load_dotenv(*_args, **_kwargs):
+        return False
+
+from anomaly_rules import (
+    compute_deviation_scores,
+    dedup_key,
+    is_quality_good,
+    is_stale,
+    parse_timestamp,
+    safe_float,
+)
+
+
+load_dotenv()
+
+
+def utc_now_iso() -> str:
+    """Return the current UTC time as an ISO-8601 string (offset-aware)."""
+    return datetime.now(timezone.utc).isoformat()
+
+
+def emit(prefix: str, payload: Dict[str, Any]) -> None:
+    """Emit machine-parseable messages for Electron main process."""
+    # One message per line: "[PREFIX] {json}". The Electron main process parses
+    # these stdout markers (e.g. [AGENT_STATUS], [AGENT_EVENT]); flush so each
+    # message is delivered immediately rather than buffered.
+    print(f"[{prefix}] {json.dumps(payload, default=str)}", flush=True)
+
+
+# Order in which subsystem kinds win when choosing a tag's primary subsystem
+# (see derive_subsystems_for_tag): view beats equipment beats group beats global.
+DEFAULT_SUBSYSTEM_PRIORITY = ["view", "equipment", "group", "global"]
+
+
+def _preview_value(value: Any, max_len: int = 120) -> Any:
+    """Return a short display-safe preview of ``value``.
+
+    None/bool/int/float pass through unchanged; anything else is stringified
+    and truncated to ``max_len`` characters, ellipsis included.
+    """
+    if value is None or isinstance(value, (bool, int, float)):
+        return value
+    text = str(value)
+    if len(text) <= max_len:
+        return text
+    return text[: max_len - 3] + "..."
+ + +def make_default_diagnostics( + *, + staleness_threshold_sec: int = 120, + phase: str = "initializing", + reason: str = "", +) -> Dict[str, Any]: + return { + "phase": phase, + "reason": reason, + "monitoredTags": 0, + "linkedTags": 0, + "unlinkedTags": 0, + "validLiveCount": 0, + "missingTimestampCount": 0, + "inferredTimestampCount": 0, + "liveErrorCount": 0, + "liveErrorLinked": 0, + "liveErrorUnlinked": 0, + "qualityFilteredCount": 0, + "qualityFilteredLinked": 0, + "qualityFilteredUnlinked": 0, + "staleFilteredCount": 0, + "staleFilteredLinked": 0, + "staleFilteredUnlinked": 0, + "historyErrorCount": 0, + "historyErrorLinked": 0, + "historyErrorUnlinked": 0, + "insufficientHistoryCount": 0, + "lowHistoryCandidateCount": 0, + "evaluatedLinked": 0, + "evaluatedUnlinked": 0, + "candidateLinked": 0, + "candidateUnlinked": 0, + "nearShiftCount": 0, + "nearShiftLinked": 0, + "nearShiftUnlinked": 0, + "stalenessThresholdSec": staleness_threshold_sec, + "staleSamples": [], + "timestampParseNote": "Naive timestamps are treated as local time by parse_timestamp().", + "detectedSubsystemCount": 0, + "detectedSubsystems": [], + "candidateSubsystemCount": 0, + "candidateBySubsystem": {}, + "subsystemShiftSignals": [], + "maxCandidatesPerSubsystem": 0, + "maxLlmTriagesPerSubsystem": 0, + "llmTriagedCount": 0, + "dedupSuppressedCount": 0, + "toolCalls": [], + } + + +def _canonical_subsystem_type(kind: Any) -> str: + value = str(kind or "").strip().lower() + if value in {"view", "views"}: + return "view" + if value in {"equipment", "equip", "asset"}: + return "equipment" + if value in {"group", "groups", "folder", "path", "prefix", "tag_group"}: + return "group" + if value in {"global", "all", "system"}: + return "global" + return "group" + + +def _subsystem_ref(kind: Any, name: Any) -> Dict[str, str]: + subsystem_type = _canonical_subsystem_type(kind) + subsystem_name = str(name or "").strip() + if not subsystem_name: + subsystem_type = "global" + subsystem_name = "all" 
+    return {
+        "type": subsystem_type,
+        "name": subsystem_name,
+        "id": f"{subsystem_type}:{subsystem_name.lower()}",
+    }
+
+
+def infer_tag_group(tag_path: Optional[str], folder_name: Optional[str] = None) -> Optional[str]:
+    """Infer a coarse "group" subsystem name for a tag.
+
+    Prefers the first segment of ``folder_name``; otherwise uses the first
+    folder segment of ``tag_path`` (a leading "[provider]" prefix is stripped).
+    Returns None for empty input or flat tags with no folder component.
+    """
+    folder = str(folder_name or "").strip().strip("/")
+    if folder:
+        head = folder.split("/", 1)[0].strip()
+        if head:
+            return head
+
+    raw = str(tag_path or "").strip()
+    if not raw:
+        return None
+    if raw.startswith("[") and "]" in raw:
+        raw = raw.split("]", 1)[1]
+    raw = raw.strip("/")
+    if not raw:
+        return None
+    parts = [p.strip() for p in raw.split("/") if p.strip()]
+    # Ignore flat tags and only infer a group when there is at least one folder segment.
+    if len(parts) < 2:
+        return None
+    return parts[0]
+
+
+def _last_segment_from_tag_path(tag_path: Optional[str]) -> str:
+    """Return the final path segment of a tag path, or "" when empty.
+
+    A leading "[provider]" prefix is stripped before splitting on "/".
+    """
+    raw = str(tag_path or "").strip()
+    if not raw:
+        return ""
+    if raw.startswith("[") and "]" in raw:
+        raw = raw.split("]", 1)[1]
+    raw = raw.strip("/")
+    if not raw:
+        return ""
+    parts = [p.strip() for p in raw.split("/") if p.strip()]
+    return parts[-1] if parts else raw
+
+
+def _looks_like_live_tag_path(value: Optional[str]) -> bool:
+    """Heuristic: does ``value`` look like a readable live tag path?"""
+    path = str(value or "").strip()
+    if not path:
+        return False
+    # Typical Ignition path shape: [provider]Folder/Tag or Folder/Tag
+    if path.startswith("[") and "]" in path:
+        return True
+    # Paths containing braces/parens look like expressions, not tag paths.
+    if "/" in path and not any(ch in path for ch in "{}()"):
+        return True
+    return False
+
+
+def derive_subsystems_for_tag(
+    tag_meta: Dict[str, Any],
+    subsystem_mode: str = "auto",
+    priority: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, str]], Dict[str, str]]:
+    """Derive (all subsystem refs, primary ref) from one tag's metadata.
+
+    In "global"/"off"/"disabled" mode everything maps to the single global
+    subsystem; otherwise refs are collected from the tag's views, equipment,
+    and inferred group, in that order.
+    """
+    mode = str(subsystem_mode or "auto").strip().lower()
+    if mode in {"global", "off", "disabled"}:
+        global_ref = _subsystem_ref("global", "all")
+        return [global_ref], global_ref
+
+    refs: List[Dict[str, str]] = []
+    seen: Set[str] = set()
+
+    def add_ref(kind: str, name: Optional[str]) -> None:
+        # Deduplicate by canonical subsystem id, preserving insertion order.
+        if not name:
+            return
+        ref = _subsystem_ref(kind, name)
+        if ref["id"] in seen:
+ return + seen.add(ref["id"]) + refs.append(ref) + + for view_name in tag_meta.get("views") or []: + add_ref("view", str(view_name)) + for equipment_name in tag_meta.get("equipment") or []: + add_ref("equipment", str(equipment_name)) + add_ref("group", infer_tag_group(tag_meta.get("path"), tag_meta.get("folder_name"))) + + if not refs: + refs = [_subsystem_ref("global", "all")] + + ordered_priority = [ + _canonical_subsystem_type(x) for x in (priority or DEFAULT_SUBSYSTEM_PRIORITY) + ] + primary = refs[0] + for kind in ordered_priority: + found = next((s for s in refs if s.get("type") == kind), None) + if found: + primary = found + break + + return refs, primary + + +def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + raw = dict(config or {}) + thresholds = raw.get("thresholds", {}) if isinstance(raw.get("thresholds"), dict) else {} + defaults = { + "pollIntervalMs": 1000, + "historyWindowMinutes": 360, + "minHistoryPoints": 30, + "maxMonitoredTags": 200, + "maxCandidatesPerCycle": 25, + "maxCandidatesPerSubsystem": 8, + "maxLlmTriagesPerCycle": 0, + "maxLlmTriagesPerSubsystem": 0, + "dedupCooldownMinutes": 10, + "retentionDays": 14, + "cleanupEveryCycles": 40, + "historyCacheTtlSec": 30, + "tagCacheTtlSec": 60, + "runMode": "live", + "scope": { + "project": None, + "equipmentTags": [], + "tagRegex": None, + "subsystemMode": "auto", + "subsystemPriority": list(DEFAULT_SUBSYSTEM_PRIORITY), + "subsystemInclude": [], + "includeUnlinkedTags": False, + }, + "thresholds": { + "z": 3.0, + "mad": 3.5, + "rate": 0.0, + "stalenessSec": 120, + "flatline_std_epsilon": 1e-6, + "stuck_window_size": 20, + }, + } + cfg = dict(defaults) + cfg["scope"] = dict(defaults["scope"]) + cfg["thresholds"] = dict(defaults["thresholds"]) + cfg.update({k: v for k, v in raw.items() if k in defaults and k != "thresholds"}) + cfg["thresholds"].update({k: v for k, v in thresholds.items() if v is not None}) + if isinstance(raw.get("scope"), dict): + 
cfg["scope"].update(raw["scope"]) + scope_cfg = cfg["scope"] + mode = str(scope_cfg.get("subsystemMode") or "auto").strip().lower() + if mode not in {"auto", "global", "off", "disabled"}: + mode = "auto" + scope_cfg["subsystemMode"] = mode + if not isinstance(scope_cfg.get("subsystemPriority"), list) or not scope_cfg.get("subsystemPriority"): + scope_cfg["subsystemPriority"] = list(DEFAULT_SUBSYSTEM_PRIORITY) + scope_cfg["subsystemPriority"] = [ + str(x).strip() + for x in scope_cfg.get("subsystemPriority", []) + if str(x).strip() + ] or list(DEFAULT_SUBSYSTEM_PRIORITY) + if not isinstance(scope_cfg.get("subsystemInclude"), list): + scope_cfg["subsystemInclude"] = [] + scope_cfg["subsystemInclude"] = [ + str(x).strip().lower() + for x in scope_cfg.get("subsystemInclude", []) + if str(x).strip() + ] + scope_cfg["includeUnlinkedTags"] = bool(scope_cfg.get("includeUnlinkedTags", False)) + return cfg + + +class AnomalyMonitor: + def __init__(self, config: Dict[str, Any], run_id: Optional[str] = None): + self.config = merge_defaults(config) + self.run_id = run_id or f"agent-run-{uuid.uuid4()}" + from ignition_api_client import IgnitionApiClient + from neo4j_ontology import get_ontology_graph + + self.graph = get_ontology_graph() + + self.api = IgnitionApiClient( + base_url=self.config.get("ignitionApiUrl") or os.getenv("IGNITION_API_URL"), + api_token=self.config.get("ignitionApiToken") or os.getenv("IGNITION_API_TOKEN"), + timeout=15.0, + ) + + self.llm = None + self._llm_enabled = bool(os.getenv("ANTHROPIC_API_KEY")) + if self._llm_enabled: + try: + from claude_client import ClaudeClient + + self.llm = ClaudeClient( + enable_tools=False, + ignition_api_url=self.config.get("ignitionApiUrl"), + ignition_api_token=self.config.get("ignitionApiToken"), + ) + except Exception as exc: + self._llm_enabled = False + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "llm_init_failed", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + 
self._running = True + self._cycle_count = 0 + self._prev_values: Dict[str, float] = {} + self._history_cache: Dict[str, Dict[str, Any]] = {} + self._tag_cache: Optional[Dict[str, Any]] = None + self._tag_cache_at: float = 0.0 + + # ----------------------------- + # Schema / run lifecycle + # ----------------------------- + def init_schema(self) -> None: + self.graph.init_agent_monitoring_schema() + + def upsert_run(self, status: str, reason: Optional[str] = None) -> None: + with self.graph.session() as session: + session.run( + """ + MERGE (r:AgentRun {run_id: $run_id}) + SET r.status = $status, + r.updated_at = datetime(), + r.last_heartbeat_at = datetime(), + r.config_json = $config_json, + r.cycle_count = $cycle_count, + r.started_at = coalesce(r.started_at, datetime()), + r.stopped_at = CASE WHEN $status IN ['stopped', 'failed'] THEN datetime() ELSE r.stopped_at END, + r.stop_reason = CASE WHEN $reason IS NULL THEN r.stop_reason ELSE $reason END + """, + run_id=self.run_id, + status=status, + config_json=json.dumps(self.config, default=str), + cycle_count=self._cycle_count, + reason=reason, + ) + + def heartbeat(self, metrics: Dict[str, Any]) -> None: + with self.graph.session() as session: + session.run( + """ + MATCH (r:AgentRun {run_id: $run_id}) + SET r.last_heartbeat_at = datetime(), + r.cycle_count = $cycle_count, + r.last_cycle_ms = $cycle_ms, + r.last_candidates = $candidates, + r.last_triaged = $triaged, + r.last_emitted = $emitted + """, + run_id=self.run_id, + cycle_count=self._cycle_count, + cycle_ms=metrics.get("cycleMs", 0), + candidates=metrics.get("candidates", 0), + triaged=metrics.get("triaged", 0), + emitted=metrics.get("emitted", 0), + ) + + # ----------------------------- + # Tag and context collection + # ----------------------------- + def get_monitored_tags(self) -> List[Dict[str, Any]]: + ttl = float(self.config.get("tagCacheTtlSec", 60)) + now = time.time() + if self._tag_cache is not None and ttl > 0 and (now - self._tag_cache_at) < 
ttl: + return self._tag_cache + + result = self._fetch_monitored_tags() + self._tag_cache = result + self._tag_cache_at = time.time() + return result + + def _fetch_monitored_tags(self) -> List[Dict[str, Any]]: + max_tags = int(self.config.get("maxMonitoredTags", 200)) + scope = self.config.get("scope", {}) + tag_regex = scope.get("tagRegex") + equipment_tags = { + str(x).strip().lower() + for x in (scope.get("equipmentTags") or []) + if str(x).strip() + } + subsystem_mode = str(scope.get("subsystemMode") or "auto").strip().lower() + subsystem_priority = scope.get("subsystemPriority") or list(DEFAULT_SUBSYSTEM_PRIORITY) + subsystem_include = { + str(x).strip().lower() + for x in (scope.get("subsystemInclude") or []) + if str(x).strip() + } + include_unlinked = bool(scope.get("includeUnlinkedTags", False)) + tag_map: Dict[str, Dict[str, Any]] = {} + + def upsert_tag( + *, + tag_path: str, + tag_name: str, + folder_name: str = "", + views: Optional[List[str]] = None, + equipment: Optional[List[str]] = None, + source: str = "unknown", + ) -> None: + path = str(tag_path or "").strip() + if not path: + return + entry = tag_map.setdefault( + path, + { + "path": path, + "name": str(tag_name or _last_segment_from_tag_path(path) or path), + "folder_name": str(folder_name or ""), + "views": [], + "equipment": [], + "source": source, + "bound_to_view": False, + }, + ) + if source == "view_binding": + entry["bound_to_view"] = True + entry["source"] = source + if folder_name and not entry.get("folder_name"): + entry["folder_name"] = str(folder_name) + if tag_name and ( + not entry.get("name") + or entry.get("name") == entry.get("path") + or entry.get("name") == _last_segment_from_tag_path(entry.get("path")) + ): + entry["name"] = str(tag_name) + for view_name in views or []: + v = str(view_name or "").strip() + if v and v not in entry["views"]: + entry["views"].append(v) + for eq_name in equipment or []: + eq = str(eq_name or "").strip() + if eq and eq not in 
entry["equipment"]: + entry["equipment"].append(eq) + + with self.graph.session() as session: + bound_result = session.run( + """ + MATCH (v:View)-[:HAS_COMPONENT]->(c:ViewComponent)-[r:BINDS_TO]->(n) + WHERE r.tag_path IS NOT NULL + AND trim(r.tag_path) <> '' + AND toLower(coalesce(r.binding_type, 'tag')) = 'tag' + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(n) + RETURN DISTINCT trim(r.tag_path) AS tag_path, + coalesce(n.name, '') AS tag_name, + collect(DISTINCT v.name) AS views, + collect(DISTINCT eq.name) AS equipment + LIMIT $limit + """, + limit=max_tags * 4, + ) + for r in bound_result: + path = str(r["tag_path"] or "").strip() + if not _looks_like_live_tag_path(path): + continue + upsert_tag( + tag_path=path, + tag_name=str(r["tag_name"] or _last_segment_from_tag_path(path)), + folder_name=infer_tag_group(path) or "", + views=[x for x in (r["views"] or []) if x], + equipment=[x for x in (r["equipment"] or []) if x], + source="view_binding", + ) + + scada_result = session.run( + """ + MATCH (t:ScadaTag) + WHERE t.opc_item_path IS NOT NULL + AND trim(t.opc_item_path) <> '' + OPTIONAL MATCH (c:ViewComponent)-[:BINDS_TO]->(t) + OPTIONAL MATCH (v:View)-[:HAS_COMPONENT]->(c) + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) + RETURN DISTINCT trim(t.opc_item_path) AS tag_path, + coalesce(t.name, t.opc_item_path) AS tag_name, + coalesce(t.folder_name, '') AS folder_name, + collect(DISTINCT v.name) AS views, + collect(DISTINCT eq.name) AS equipment + LIMIT $limit + """, + limit=max_tags * 6, + ) + for r in scada_result: + path = str(r["tag_path"] or "").strip() + if not _looks_like_live_tag_path(path): + continue + upsert_tag( + tag_path=path, + tag_name=str(r["tag_name"] or _last_segment_from_tag_path(path)), + folder_name=str(r["folder_name"] or ""), + views=[x for x in (r["views"] or []) if x], + equipment=[x for x in (r["equipment"] or []) if x], + source="scada_tag", + ) + + tags = list(tag_map.values()) + + if not include_unlinked: + linked = [t for t in tags if 
(t.get("views") or t.get("equipment") or t.get("bound_to_view"))] + if linked: + tags = linked + + if tag_regex: + import re + try: + pattern = re.compile(tag_regex, re.IGNORECASE) + tags = [t for t in tags if pattern.search(t["path"]) or pattern.search(t["name"])] + except re.error: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "invalid_tag_regex", + "message": f"Invalid regex: {tag_regex}", + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + if equipment_tags: + tags = [ + t for t in tags + if t["name"].lower() in equipment_tags + or t["path"].lower() in equipment_tags + or any(str(eq).strip().lower() in equipment_tags for eq in t.get("equipment", [])) + ] + + tags.sort( + key=lambda t: ( + 0 if t.get("bound_to_view") else 1, + 0 if (t.get("views") or t.get("equipment")) else 1, + str(t.get("path", "")), + ) + ) + + for tag in tags: + subsystems, primary = derive_subsystems_for_tag( + tag_meta=tag, + subsystem_mode=subsystem_mode, + priority=subsystem_priority, + ) + tag["subsystems"] = subsystems + tag["primary_subsystem"] = primary + + if subsystem_include: + tags = [ + t + for t in tags + if any( + s.get("id", "").lower() in subsystem_include + or s.get("name", "").lower() in subsystem_include + for s in (t.get("subsystems") or []) + ) + ] + + return tags[:max_tags] + + def _extract_history_values(self, history_data: Any, tag_path: str) -> List[float]: + """Normalize multiple gateway response shapes to numeric values list.""" + values: List[float] = [] + if history_data is None: + return values + if isinstance(history_data, dict) and history_data.get("error"): + return values + + rows: List[Any] = [] + if isinstance(history_data, list): + rows = history_data + elif isinstance(history_data, dict): + for key in ("rows", "data", "results", "values", "history"): + chunk = history_data.get(key) + if isinstance(chunk, list): + rows = chunk + break + if not rows and "tagHistory" in history_data and isinstance(history_data["tagHistory"], list): + 
rows = history_data["tagHistory"] + + prefixed = self.api._ensure_provider_prefix(tag_path) if hasattr(self, "api") else tag_path + stripped = tag_path + if stripped.startswith("[") and "]" in stripped: + stripped = stripped[stripped.index("]") + 1:] + path_variants = {tag_path, prefixed, stripped} + + for row in rows: + if isinstance(row, (int, float, str)): + val = safe_float(row) + if val is not None: + values.append(val) + continue + if not isinstance(row, dict): + continue + candidate = None + if "value" in row: + candidate = row.get("value") + else: + matched_key = next((k for k in path_variants if k in row), None) + if matched_key: + candidate = row.get(matched_key) + elif len(row) <= 2: + for k, v in row.items(): + if k.lower() in {"timestamp", "ts", "t", "time"}: + continue + candidate = v + break + val = safe_float(candidate) + if val is not None: + values.append(val) + return values + + def fetch_history_values(self, tag_path: str) -> tuple[List[float], Optional[str]]: + ttl = float(self.config.get("historyCacheTtlSec", 30)) + now = time.time() + cached = self._history_cache.get(tag_path) + if cached and ttl > 0 and (now - cached["fetched_at"]) < ttl: + return list(cached["values"]), cached.get("error") + + minutes = int(self.config.get("historyWindowMinutes", 360)) + end_dt = datetime.now(timezone.utc) + start_dt = end_dt - timedelta(minutes=minutes) + data = self.api.query_tag_history( + [tag_path], + start_dt.isoformat(), + end_dt.isoformat(), + return_size=max(100, int(self.config.get("minHistoryPoints", 30)) * 4), + aggregation_mode="Average", + return_format="Wide", + ) + if isinstance(data, dict) and data.get("error"): + err = str(data.get("error")) + self._history_cache[tag_path] = {"values": [], "error": err, "fetched_at": now} + return [], err + values = self._extract_history_values(data, tag_path) + self._history_cache[tag_path] = {"values": values, "error": None, "fetched_at": now} + return values, None + + def fetch_history_batch(self, 
tag_paths: List[str]) -> Dict[str, Tuple[List[float], Optional[str]]]: + """Fetch history for many tags, using cache and batched API calls.""" + ttl = float(self.config.get("historyCacheTtlSec", 30)) + now = time.time() + results: Dict[str, Tuple[List[float], Optional[str]]] = {} + uncached: List[str] = [] + + for path in tag_paths: + cached = self._history_cache.get(path) + if cached and ttl > 0 and (now - cached["fetched_at"]) < ttl: + results[path] = (list(cached["values"]), cached.get("error")) + else: + uncached.append(path) + + if not uncached: + return results + + minutes = int(self.config.get("historyWindowMinutes", 360)) + end_dt = datetime.now(timezone.utc) + start_dt = end_dt - timedelta(minutes=minutes) + return_size = max(100, int(self.config.get("minHistoryPoints", 30)) * 4) + batch_size = 20 + + for i in range(0, len(uncached), batch_size): + batch = uncached[i : i + batch_size] + data = self.api.query_tag_history( + batch, + start_dt.isoformat(), + end_dt.isoformat(), + return_size=return_size, + aggregation_mode="Average", + return_format="Wide", + ) + fetch_ts = time.time() + + if isinstance(data, dict) and data.get("error"): + err = str(data.get("error")) + for path in batch: + results[path] = ([], err) + self._history_cache[path] = {"values": [], "error": err, "fetched_at": fetch_ts} + continue + + for path in batch: + values = self._extract_history_values(data, path) + results[path] = (values, None) + self._history_cache[path] = {"values": values, "error": None, "fetched_at": fetch_ts} + + return results + + def get_context(self, tag_path: str) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (t:ScadaTag) + WHERE t.name = $tag OR t.opc_item_path = $tag + OPTIONAL MATCH (vc:ViewComponent)-[:BINDS_TO]->(t) + OPTIONAL MATCH (v:View)-[:HAS_COMPONENT]->(vc) + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) + OPTIONAL MATCH (eq)-[:HAS_SYMPTOM]->(s:FaultSymptom) + OPTIONAL MATCH 
(s)-[:CAUSED_BY]->(fc:FaultCause) + OPTIONAL MATCH (eq)-[:HAS_PATTERN]->(p:ControlPattern) + OPTIONAL MATCH (eq)-[:SAFETY_CRITICAL]->(se:SafetyElement) + RETURN t, + collect(DISTINCT v.name) AS views, + collect(DISTINCT eq.name) AS equipment, + collect(DISTINCT s.symptom) AS symptoms, + collect(DISTINCT fc.cause) AS causes, + collect(DISTINCT p.name) AS patterns, + collect(DISTINCT se.name) AS safety + LIMIT 1 + """, + tag=tag_path, + ) + record = result.single() + fallback_views: List[str] = [] + fallback_equipment: List[str] = [] + fallback_result = session.run( + """ + MATCH (v:View)-[:HAS_COMPONENT]->(vc:ViewComponent)-[r:BINDS_TO]->(n) + WHERE r.tag_path = $tag + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(n) + RETURN collect(DISTINCT v.name) AS views, + collect(DISTINCT eq.name) AS equipment + LIMIT 1 + """, + tag=tag_path, + ).single() + if fallback_result: + fallback_views = [x for x in (fallback_result["views"] or []) if x] + fallback_equipment = [x for x in (fallback_result["equipment"] or []) if x] + + if not record: + return { + "tag_path": tag_path, + "tag_name": _last_segment_from_tag_path(tag_path) or tag_path, + "views": fallback_views, + "equipment": fallback_equipment, + "group": infer_tag_group(tag_path), + "symptoms": [], + "causes": [], + "patterns": [], + "safety": [], + } + node = record["t"] + return { + "tag_path": tag_path, + "tag_name": node.get("name") if node else (_last_segment_from_tag_path(tag_path) or tag_path), + "views": sorted(set([x for x in record["views"] if x] + fallback_views)), + "equipment": sorted(set([x for x in record["equipment"] if x] + fallback_equipment)), + "group": infer_tag_group(tag_path, node.get("folder_name") if node else None), + "symptoms": [x for x in record["symptoms"] if x], + "causes": [x for x in record["causes"] if x], + "patterns": [x for x in record["patterns"] if x], + "safety": [x for x in record["safety"] if x], + } + + # ----------------------------- + # Triage and persistence + # 
----------------------------- + def run_llm_triage( + self, + context: Dict[str, Any], + deterministic: Dict[str, Any], + live_sample: Dict[str, Any], + ) -> Dict[str, Any]: + fallback = { + "summary": f"Deterministic anomaly on {context.get('tag_name', context['tag_path'])}", + "category": deterministic.get("category", "deviation"), + "severity": "medium", + "confidence": 0.55, + "probable_causes": ["Signal deviates from historical baseline."], + "verification_checks": [ + f"Check live quality/timestamp for {context.get('tag_path')}", + "Inspect upstream interlocks and communication health.", + ], + "safety_notes": context.get("safety", []), + "rationale": "LLM triage unavailable; using deterministic fallback.", + "related_entities": [ + {"label": "Equipment", "name": e} for e in context.get("equipment", [])[:3] + ] + [{"label": "View", "name": v} for v in context.get("views", [])[:2]], + } + if not self.llm: + return fallback + + system_prompt = ( + "You are an industrial anomaly triage assistant. " + "Return ONLY valid JSON with keys: summary, category, severity, confidence, " + "probable_causes, verification_checks, safety_notes, rationale, related_entities. " + "Severity must be one of critical/high/medium/low. " + "Category must be one of spike/drift/stuck/state-conflict/quality-issue/deviation. " + "related_entities is a list of objects: {label,name}." 
+ ) + user_prompt = json.dumps( + { + "context": context, + "deterministic": deterministic, + "live_sample": live_sample, + }, + default=str, + ) + try: + result = self.llm.query_json( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=900, + use_tools=False, + ) + data = result.get("data") + if not isinstance(data, dict): + return fallback + merged = dict(fallback) + merged.update({k: v for k, v in data.items() if v is not None}) + return merged + except Exception as exc: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "llm_triage_failed", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + return fallback + + def _severity_from_scores(self, deterministic: Dict[str, Any], llm_out: Dict[str, Any]) -> str: + sev = str(llm_out.get("severity", "")).lower() + if sev in {"critical", "high", "medium", "low"}: + return sev + z = abs(float(deterministic.get("z_score", 0.0))) + if z >= 8: + return "critical" + if z >= 5: + return "high" + if z >= 3: + return "medium" + return "low" + + def is_duplicate_recent(self, dedup_sig: str) -> bool: + cooldown = max(1, int(self.config.get("dedupCooldownMinutes", 10))) + with self.graph.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {dedup_key: $dedup_key}) + WHERE e.created_at IS NOT NULL + AND datetime(e.created_at) > datetime() - duration({minutes: $minutes}) + RETURN count(e) AS cnt + """, + dedup_key=dedup_sig, + minutes=cooldown, + ) + row = result.single() + return bool(row and row["cnt"] > 0) + + def persist_event( + self, + context: Dict[str, Any], + deterministic: Dict[str, Any], + live_sample: Dict[str, Any], + triage: Dict[str, Any], + subsystem: Optional[Dict[str, str]] = None, + ) -> Optional[Dict[str, Any]]: + category = triage.get("category") or deterministic.get("category", "deviation") + subsystem_ref = subsystem or _subsystem_ref("global", "all") + dedup_source = f"{context['tag_path']}::{subsystem_ref.get('id', 'global:all')}" 
+ dedup_sig = dedup_key(dedup_source, category, int(self.config.get("dedupCooldownMinutes", 10))) + if self.is_duplicate_recent(dedup_sig): + return None + + event_id = f"ae-{uuid.uuid4()}" + severity = self._severity_from_scores(deterministic, triage) + confidence = float(max(0.0, min(1.0, triage.get("confidence", 0.5)))) + event_data = { + "event_id": event_id, + "run_id": self.run_id, + "event_schema_version": 1, + "state": "open", + "severity": severity, + "confidence": confidence, + "category": category, + "summary": triage.get("summary", f"Anomaly on {context['tag_path']}"), + "explanation": triage.get("rationale", ""), + "recommended_checks_json": json.dumps(triage.get("verification_checks", []), default=str), + "probable_causes_json": json.dumps(triage.get("probable_causes", []), default=str), + "safety_notes_json": json.dumps(triage.get("safety_notes", []), default=str), + "deterministic_reasons_json": json.dumps(deterministic.get("reasons", []), default=str), + "z_score": float(deterministic.get("z_score", 0.0)), + "mad_score": float(deterministic.get("mad_score", 0.0)), + "delta_rate": float(deterministic.get("delta_rate", 0.0)), + "window_volatility": float(deterministic.get("window_volatility", 0.0)), + "source_tag": context["tag_path"], + "tag_name": context.get("tag_name") or context["tag_path"], + "subsystem_type": subsystem_ref.get("type"), + "subsystem_name": subsystem_ref.get("name"), + "subsystem_id": subsystem_ref.get("id"), + "live_quality": live_sample.get("quality"), + "live_timestamp": live_sample.get("timestamp"), + "live_value": str(live_sample.get("value")), + "dedup_key": dedup_sig, + "created_at": utc_now_iso(), + "updated_at": utc_now_iso(), + } + + with self.graph.session() as session: + session.run( + """ + MATCH (r:AgentRun {run_id: $run_id}) + CREATE (e:AnomalyEvent $props) + MERGE (r)-[:EMITTED]->(e) + """, + run_id=self.run_id, + props=event_data, + ) + + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH 
(t:ScadaTag) + WHERE t.name = $tag OR t.opc_item_path = $tag + MERGE (e)-[:OBSERVED_ON]->(t) + """, + event_id=event_id, + tag=context["tag_path"], + ) + + for equipment_name in context.get("equipment", [])[:5]: + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH (eq:Equipment {name: $name}) + MERGE (e)-[:AFFECTS]->(eq) + """, + event_id=event_id, + name=equipment_name, + ) + + if subsystem_ref.get("type") == "view": + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH (v:View {name: $name}) + MERGE (e)-[:SCOPED_TO]->(v) + """, + event_id=event_id, + name=subsystem_ref.get("name"), + ) + elif subsystem_ref.get("type") == "equipment": + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH (eq:Equipment {name: $name}) + MERGE (e)-[:SCOPED_TO]->(eq) + """, + event_id=event_id, + name=subsystem_ref.get("name"), + ) + + related_inputs: List[Dict[str, str]] = [] + for item in triage.get("related_entities", []) or []: + if isinstance(item, dict) and item.get("label") and item.get("name"): + related_inputs.append({"label": str(item["label"]), "name": str(item["name"])}) + for name in context.get("symptoms", [])[:3]: + related_inputs.append({"label": "FaultSymptom", "name": name}) + for name in context.get("causes", [])[:3]: + related_inputs.append({"label": "FaultCause", "name": name}) + + for rel in related_inputs[:8]: + label = rel["label"] + if label not in {"FaultSymptom", "FaultCause", "ControlPattern", "SafetyElement", "Equipment", "ScadaTag", "View"}: + continue + session.run( + f""" + MATCH (e:AnomalyEvent {{event_id: $event_id}}) + MATCH (n:{label}) + WHERE n.name = $name OR n.symptom = $name OR n.cause = $name + MERGE (e)-[:RELATED_TO]->(n) + """, + event_id=event_id, + name=rel["name"], + ) + + return event_data + + def _emit_persisted_event(self, persisted: Dict[str, Any]) -> None: + """Emit normalized AGENT_EVENT payload for UI stream.""" + emit("AGENT_EVENT", { + "runId": self.run_id, + 
"eventId": persisted["event_id"], + "severity": persisted["severity"], + "summary": persisted["summary"], + "category": persisted.get("category"), + "entityRefs": { + "tag": persisted.get("tag_name") or persisted.get("source_tag"), + "sourceTag": persisted.get("source_tag"), + "subsystemType": persisted.get("subsystem_type"), + "subsystemName": persisted.get("subsystem_name"), + }, + "createdAt": persisted.get("created_at"), + }) + + def emit_provider_failure_event( + self, + code: str, + message: str, + *, + severity: str = "high", + category: str = "quality-issue", + source_tag: Optional[str] = None, + details: Optional[Dict[str, Any]] = None, + subsystem: Optional[Dict[str, str]] = None, + ) -> bool: + """ + Persist and stream provider-health anomalies so failures appear in feed. + + Returns: + True if a new event was persisted (false if deduped). + """ + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": code, + "message": message, + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + tag = source_tag or f"provider://{code}" + detail_blob = json.dumps(details or {}, default=str) + context = { + "tag_path": tag, + "tag_name": source_tag or "ProviderHealth", + "equipment": [], + "symptoms": [], + "causes": [], + "patterns": [], + "safety": [], + } + deterministic = { + "candidate": True, + "reasons": [code], + "category": category, + "z_score": 0.0, + "mad_score": 0.0, + "delta_rate": 0.0, + "window_volatility": 0.0, + "history_points": 0, + } + triage = { + "summary": message, + "category": category, + "severity": severity, + "confidence": 0.9, + "probable_causes": [message], + "verification_checks": [ + "Check Ignition gateway connectivity and credentials.", + "Validate tag provider availability and endpoint health.", + ], + "safety_notes": [], + "rationale": f"Provider health event ({code}). 
Details: {detail_blob}", + "related_entities": [], + } + persisted = self.persist_event( + context=context, + deterministic=deterministic, + live_sample={ + "path": tag, + "value": "", + "quality": "Bad", + "timestamp": utc_now_iso(), + "data_type": "provider_health", + }, + triage=triage, + subsystem=subsystem, + ) + if persisted: + self._emit_persisted_event(persisted) + return True + return False + + # ----------------------------- + # Monitoring loop + # ----------------------------- + def run_cycle(self) -> Dict[str, Any]: + cycle_start = time.time() + thresholds = self.config.get("thresholds", {}) + stale_threshold_sec = int(thresholds.get("stalenessSec", 120)) + metrics = { + "candidates": 0, + "triaged": 0, + "emitted": 0, + "cycleMs": 0, + "diagnostics": make_default_diagnostics( + staleness_threshold_sec=stale_threshold_sec, + phase="cycle_start", + reason="cycle_initialized", + ), + } + min_history = int(self.config.get("minHistoryPoints", 30)) + max_candidates_total = max(1, int(self.config.get("maxCandidatesPerCycle", 25))) + max_candidates_per_subsystem = max(1, int(self.config.get("maxCandidatesPerSubsystem", 8))) + max_triage_total = max(0, int(self.config.get("maxLlmTriagesPerCycle", 5))) + max_triage_per_subsystem = max(0, int(self.config.get("maxLlmTriagesPerSubsystem", 2))) + + if not self.api.is_configured: + emitted = self.emit_provider_failure_event( + "ignition_not_configured", + "Ignition API URL/token not configured.", + severity="critical", + category="state-conflict", + ) + if emitted: + metrics["emitted"] += 1 + metrics["diagnostics"]["phase"] = "cycle_early_exit" + metrics["diagnostics"]["reason"] = "ignition_not_configured" + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + + tags = self.get_monitored_tags() + if not tags: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "no_tags_found", + "message": "No ScadaTag nodes with readable tag paths found.", + "recoverable": True, + "timestamp": 
utc_now_iso(), + }) + metrics["diagnostics"]["phase"] = "cycle_early_exit" + metrics["diagnostics"]["reason"] = "no_tags_found" + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + + tag_paths = [t["path"] for t in tags] + tag_lookup = {t["path"]: t for t in tags} + linked_tag_count = sum( + 1 for t in tags if (t.get("views") or t.get("equipment")) + ) + unlinked_tag_count = max(0, len(tags) - linked_tag_count) + detected_subsystems = sorted( + { + (t.get("primary_subsystem") or _subsystem_ref("global", "all")).get("id", "global:all") + for t in tags + } + ) + + subsystem_tag_map: Dict[str, Dict[str, Any]] = {} + for t in tags: + sub = t.get("primary_subsystem") or _subsystem_ref("global", "all") + sub_id = sub.get("id", "global:all") + bucket = subsystem_tag_map.setdefault(sub_id, { + "type": sub.get("type", "global"), + "name": sub.get("name", "all"), + "tags": [], + }) + bucket["tags"].append({ + "path": t["path"], + "name": t.get("name", t["path"]), + "views": t.get("views", []), + "equipment": t.get("equipment", []), + "allSubsystems": [s.get("id") for s in (t.get("subsystems") or [])], + }) + + live_values = self.api.read_tags(tag_paths) + tool_calls: List[Dict[str, Any]] = [] + tool_calls.append({ + "tool": "read_tags", + "request": { + "count": len(tag_paths), + "samplePaths": tag_paths[:8], + }, + "result": { + "count": len(live_values), + "errorCount": sum(1 for tv in live_values if tv.error), + "qualityGoodCount": sum(1 for tv in live_values if is_quality_good(tv.quality)), + "timestampMissingCount": sum(1 for tv in live_values if not tv.timestamp), + "timestampInferredCount": sum( + 1 + for tv in live_values + if isinstance(tv.config, dict) and bool(tv.config.get("timestamp_inferred")) + ), + "sample": [ + { + "path": tv.path, + "value": _preview_value(tv.value), + "quality": tv.quality, + "timestamp": tv.timestamp, + "timestampInferred": bool(tv.config.get("timestamp_inferred")) + if isinstance(tv.config, dict) + else 
False, + "configKeys": sorted(list(tv.config.keys()))[:8] + if isinstance(tv.config, dict) + else [], + "error": tv.error, + } + for tv in live_values[:5] + ], + }, + }) + candidates: List[Dict[str, Any]] = [] + now = datetime.now(timezone.utc) + live_error_count = 0 + live_error_samples: List[str] = [] + history_error_count = 0 + history_error_samples: List[str] = [] + valid_live_count = 0 + missing_timestamp_count = 0 + inferred_timestamp_count = 0 + quality_filtered_count = 0 + stale_filtered_count = 0 + insufficient_history_count = 0 + low_history_candidate_count = 0 + candidate_subsystem_counts: Dict[str, int] = {} + live_error_linked = 0 + live_error_unlinked = 0 + history_error_linked = 0 + history_error_unlinked = 0 + quality_filtered_linked = 0 + quality_filtered_unlinked = 0 + stale_filtered_linked = 0 + stale_filtered_unlinked = 0 + evaluated_linked = 0 + evaluated_unlinked = 0 + candidate_linked = 0 + candidate_unlinked = 0 + near_shift_count = 0 + near_shift_linked = 0 + near_shift_unlinked = 0 + stale_samples: List[Dict[str, Any]] = [] + subsystem_shift_signals: Dict[str, Dict[str, Any]] = {} + processed_live_count = 0 + total_live_count = len(live_values) + last_progress_emit = 0.0 + + def emit_cycle_progress(reason: str, current_tag: str = "", include_tag_map: bool = False) -> None: + nonlocal last_progress_emit + diag = make_default_diagnostics( + staleness_threshold_sec=stale_threshold_sec, + phase="cycle_in_progress", + reason=reason, + ) + diag.update({ + "processedLiveCount": processed_live_count, + "totalLiveCount": total_live_count, + "currentTag": current_tag, + "candidatesSoFar": len(candidates), + "liveErrorCount": live_error_count, + "qualityFilteredCount": quality_filtered_count, + "staleFilteredCount": stale_filtered_count, + "historyErrorCount": history_error_count, + "monitoredTags": len(tags), + "linkedTags": linked_tag_count, + "unlinkedTags": unlinked_tag_count, + "detectedSubsystemCount": len(detected_subsystems), + 
"detectedSubsystems": detected_subsystems[:10], + }) + if include_tag_map: + diag["subsystemTagMap"] = subsystem_tag_map + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": int((time.time() - cycle_start) * 1000), + "candidates": len(candidates), + "triaged": 0, + "emitted": metrics.get("emitted", 0), + "diagnostics": diag, + "timestamp": utc_now_iso(), + }) + last_progress_emit = time.time() + + emit_cycle_progress("cycle_started", include_tag_map=True) + + def _update_subsystem_signal( + subsystem_ref: Dict[str, str], deterministic: Dict[str, Any], + tag_path: str, live_value: Any = None, + ) -> None: + sub_id = subsystem_ref.get("id", "global:all") + abs_z = abs(float(deterministic.get("z_score", 0.0))) + z = float(deterministic.get("z_score", 0.0)) + mad = float(deterministic.get("mad_score", 0.0)) + bucket = subsystem_shift_signals.setdefault( + sub_id, + { + "subsystemId": sub_id, + "subsystemType": subsystem_ref.get("type", "global"), + "subsystemName": subsystem_ref.get("name", "all"), + "evaluated": 0, + "candidate": 0, + "nearShift": 0, + "sumAbsZ": 0.0, + "sumZ": 0.0, + "maxAbsZ": 0.0, + "sampleTag": tag_path, + "_tagEntries": [], + }, + ) + bucket["evaluated"] += 1 + bucket["sumAbsZ"] += abs_z + bucket["sumZ"] += z + if abs_z >= 1.5: + bucket["nearShift"] += 1 + if abs_z > bucket["maxAbsZ"]: + bucket["maxAbsZ"] = abs_z + bucket["sampleTag"] = tag_path + tag_name = tag_path.rsplit("/", 1)[-1] if "/" in str(tag_path) else str(tag_path) + bucket["_tagEntries"].append({ + "path": str(tag_path), + "name": tag_name, + "z": round(z, 3), + "absZ": round(abs_z, 3), + "mad": round(mad, 3), + "value": live_value, + }) + + # ---- Phase 1: Filter live values (no I/O) ---- + TagEntry = Tuple[Any, Dict[str, Any], Dict[str, str], bool] # (tv, tag_meta, subsystem, is_linked) + tags_for_history: List[TagEntry] = [] + + for idx, tv in enumerate(live_values): + processed_live_count += 1 + tag_meta = ( + tags[idx] if idx < len(tags) + else 
tag_lookup.get(tv.path, {"path": tv.path, "name": tv.path}) + ) + subsystem = tag_meta.get("primary_subsystem") or _subsystem_ref("global", "all") + is_linked = bool(tag_meta.get("views") or tag_meta.get("equipment")) + + if tv.error: + live_error_count += 1 + if is_linked: + live_error_linked += 1 + else: + live_error_unlinked += 1 + if len(live_error_samples) < 5: + live_error_samples.append(f"{tv.path}: {tv.error}") + continue + valid_live_count += 1 + if not tv.timestamp: + missing_timestamp_count += 1 + if isinstance(tv.config, dict) and bool(tv.config.get("timestamp_inferred")): + inferred_timestamp_count += 1 + if not is_quality_good(tv.quality): + quality_filtered_count += 1 + if is_linked: + quality_filtered_linked += 1 + else: + quality_filtered_unlinked += 1 + continue + parsed_ts = parse_timestamp(tv.timestamp) + age_sec = (now - parsed_ts).total_seconds() if parsed_ts is not None else None + if is_stale(tv.timestamp, stale_threshold_sec, now=now): + stale_filtered_count += 1 + if is_linked: + stale_filtered_linked += 1 + else: + stale_filtered_unlinked += 1 + if len(stale_samples) < 8: + stale_samples.append({ + "path": tv.path, + "timestampRaw": tv.timestamp, + "timestampParsedUtc": parsed_ts.isoformat() if parsed_ts else None, + "ageSec": round(age_sec, 3) if age_sec is not None else None, + "thresholdSec": stale_threshold_sec, + "reason": "timestamp_parse_failed" if parsed_ts is None else "age_exceeds_threshold", + }) + continue + + tags_for_history.append((tv, tag_meta, subsystem, is_linked)) + + emit_cycle_progress( + "filtering_complete", + current_tag=f"{len(tags_for_history)} tags passed filters", + ) + + # ---- Phase 2: Batched history fetch ---- + history_fetch_start = time.time() + history_paths = [tv.path for tv, _, _, _ in tags_for_history] + history_results = self.fetch_history_batch(history_paths) if history_paths else {} + history_fetch_elapsed = time.time() - history_fetch_start + emit_cycle_progress( + "history_complete", + 
current_tag=f"{len(history_results)} in {round(history_fetch_elapsed, 1)}s", + ) + + # ---- Phase 3: Score and build candidates using pre-fetched history ---- + for tv, tag_meta, subsystem, is_linked in tags_for_history: + history, history_error = history_results.get(tv.path, ([], "No history result")) + + if len(tool_calls) < 18: + tool_calls.append({ + "tool": "query_tag_history", + "request": { + "tagPath": tv.path, + "historyWindowMinutes": int(self.config.get("historyWindowMinutes", 360)), + }, + "result": { + "historyPoints": len(history), + "error": history_error, + }, + }) + if history_error: + history_error_count += 1 + if is_linked: + history_error_linked += 1 + else: + history_error_unlinked += 1 + if len(history_error_samples) < 5: + history_error_samples.append(f"{tv.path}: {history_error}") + continue + if len(history) < min_history: + insufficient_history_count += 1 + if len(history) >= 5: + prev_val = self._prev_values.get(tv.path) + deterministic = compute_deviation_scores( + current_value=tv.value, + history_values=history, + prev_value=prev_val, + thresholds=thresholds, + ) + curr_num = safe_float(tv.value) + if curr_num is not None: + self._prev_values[tv.path] = curr_num + + _update_subsystem_signal(subsystem, deterministic, tv.path, live_value=tv.value) + if is_linked: + evaluated_linked += 1 + else: + evaluated_unlinked += 1 + if abs(float(deterministic.get("z_score", 0.0))) >= 1.5: + near_shift_count += 1 + if is_linked: + near_shift_linked += 1 + else: + near_shift_unlinked += 1 + + if deterministic.get("candidate"): + sub_bucket = subsystem_shift_signals.setdefault( + subsystem.get("id", "global:all"), + { + "subsystemId": subsystem.get("id", "global:all"), + "subsystemType": subsystem.get("type", "global"), + "subsystemName": subsystem.get("name", "all"), + "evaluated": 0, + "candidate": 0, + "nearShift": 0, + "sumAbsZ": 0.0, + "sumZ": 0.0, + "maxAbsZ": 0.0, + "sampleTag": tv.path, + "_tagEntries": [], + }, + ) + sub_bucket["candidate"] 
+= 1 + if is_linked: + candidate_linked += 1 + else: + candidate_unlinked += 1 + deterministic["reasons"] = list(deterministic.get("reasons", [])) + ["low_history_override"] + deterministic["history_quality"] = "low" + context = self.get_context(tv.path) + context["subsystem"] = subsystem + context["subsystems"] = tag_meta.get("subsystems") or [subsystem] + candidates.append( + { + "context": context, + "deterministic": deterministic, + "live_sample": { + "path": tv.path, + "value": tv.value, + "quality": tv.quality, + "timestamp": tv.timestamp, + "data_type": tv.data_type, + }, + "subsystem": subsystem, + } + ) + sub_id = subsystem.get("id", "global:all") + candidate_subsystem_counts[sub_id] = candidate_subsystem_counts.get(sub_id, 0) + 1 + low_history_candidate_count += 1 + continue + + prev_val = self._prev_values.get(tv.path) + deterministic = compute_deviation_scores( + current_value=tv.value, + history_values=history, + prev_value=prev_val, + thresholds=thresholds, + ) + curr_num = safe_float(tv.value) + if curr_num is not None: + self._prev_values[tv.path] = curr_num + + _update_subsystem_signal(subsystem, deterministic, tv.path, live_value=tv.value) + if is_linked: + evaluated_linked += 1 + else: + evaluated_unlinked += 1 + if abs(float(deterministic.get("z_score", 0.0))) >= 1.5: + near_shift_count += 1 + if is_linked: + near_shift_linked += 1 + else: + near_shift_unlinked += 1 + + if deterministic.get("candidate"): + sub_bucket = subsystem_shift_signals.setdefault( + subsystem.get("id", "global:all"), + { + "subsystemId": subsystem.get("id", "global:all"), + "subsystemType": subsystem.get("type", "global"), + "subsystemName": subsystem.get("name", "all"), + "evaluated": 0, + "candidate": 0, + "nearShift": 0, + "sumAbsZ": 0.0, + "sumZ": 0.0, + "maxAbsZ": 0.0, + "sampleTag": tv.path, + "_tagEntries": [], + }, + ) + sub_bucket["candidate"] += 1 + if is_linked: + candidate_linked += 1 + else: + candidate_unlinked += 1 + context = self.get_context(tv.path) + 
context["subsystem"] = subsystem + context["subsystems"] = tag_meta.get("subsystems") or [subsystem] + candidates.append( + { + "context": context, + "deterministic": deterministic, + "live_sample": { + "path": tv.path, + "value": tv.value, + "quality": tv.quality, + "timestamp": tv.timestamp, + "data_type": tv.data_type, + }, + "subsystem": subsystem, + } + ) + sub_id = subsystem.get("id", "global:all") + candidate_subsystem_counts[sub_id] = candidate_subsystem_counts.get(sub_id, 0) + 1 + + emit_cycle_progress("scoring_complete") + + if live_values and live_error_count == len(live_values): + emitted = self.emit_provider_failure_event( + "live_tag_provider_failed", + f"Live tag provider failed for all reads ({live_error_count}/{len(live_values)}).", + severity="high", + category="quality-issue", + details={"samples": live_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + elif live_error_count > 0: + emitted = self.emit_provider_failure_event( + "live_tag_provider_partial_failure", + f"Live tag provider partially failed ({live_error_count}/{len(live_values)} reads).", + severity="medium", + category="quality-issue", + details={"samples": live_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + + if valid_live_count > 0 and history_error_count >= max(1, int(valid_live_count * 0.8)): + emitted = self.emit_provider_failure_event( + "history_provider_failed", + f"History provider failed for most queries ({history_error_count}/{valid_live_count}).", + severity="high", + category="quality-issue", + details={"samples": history_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + elif history_error_count > 0: + emitted = self.emit_provider_failure_event( + "history_provider_partial_failure", + f"History provider partially failed ({history_error_count}/{valid_live_count}).", + severity="medium", + category="quality-issue", + details={"samples": history_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + + if valid_live_count > 0 and 
stale_filtered_count >= max(1, int(valid_live_count * 0.8)): + emitted = self.emit_provider_failure_event( + "live_timestamp_stale", + f"Most live samples were stale ({stale_filtered_count}/{valid_live_count}).", + severity="medium", + category="quality-issue", + details={"staleCount": stale_filtered_count, "validLiveCount": valid_live_count}, + ) + if emitted: + metrics["emitted"] += 1 + + if valid_live_count > 0 and quality_filtered_count >= max(1, int(valid_live_count * 0.8)): + emitted = self.emit_provider_failure_event( + "live_quality_bad", + f"Most live samples had non-good quality ({quality_filtered_count}/{valid_live_count}).", + severity="medium", + category="quality-issue", + details={"qualityFilteredCount": quality_filtered_count, "validLiveCount": valid_live_count}, + ) + if emitted: + metrics["emitted"] += 1 + + metrics["candidates"] = len(candidates) + shortlisted: List[Dict[str, Any]] = [] + selected_per_subsystem: Dict[str, int] = {} + for candidate in candidates: + subsystem = candidate.get("subsystem") or _subsystem_ref("global", "all") + sub_id = subsystem.get("id", "global:all") + if selected_per_subsystem.get(sub_id, 0) >= max_candidates_per_subsystem: + continue + shortlisted.append(candidate) + selected_per_subsystem[sub_id] = selected_per_subsystem.get(sub_id, 0) + 1 + if len(shortlisted) >= max_candidates_total: + break + + llm_total = 0 + llm_per_subsystem: Dict[str, int] = {} + dedup_suppressed_count = 0 + + if shortlisted: + emit_cycle_progress( + "triage_started", + current_tag=f"{len(shortlisted)} candidates to process", + ) + + for ci, candidate in enumerate(shortlisted): + subsystem = candidate.get("subsystem") or _subsystem_ref("global", "all") + sub_id = subsystem.get("id", "global:all") + tag_name = candidate["context"].get("tag_name", candidate["context"].get("tag_path", "?")) + use_llm = ( + llm_total < max_triage_total + and llm_per_subsystem.get(sub_id, 0) < max_triage_per_subsystem + ) + triage = ( + self.run_llm_triage( + 
candidate["context"], + candidate["deterministic"], + candidate["live_sample"], + ) + if use_llm + else { + "summary": ( + f"Deviation on {candidate['context'].get('tag_name', candidate['context']['tag_path'])} " + f"in subsystem {subsystem.get('name', 'all')}" + ), + "category": candidate["deterministic"].get("category", "deviation"), + "severity": "medium", + "confidence": 0.5, + "verification_checks": [], + "probable_causes": [], + "safety_notes": [], + "rationale": "Deterministic-only triage (LLM triage disabled or cap reached).", + "related_entities": [], + } + ) + if use_llm: + llm_total += 1 + llm_per_subsystem[sub_id] = llm_per_subsystem.get(sub_id, 0) + 1 + metrics["triaged"] += 1 + persisted = self.persist_event( + candidate["context"], + candidate["deterministic"], + candidate["live_sample"], + triage, + subsystem=subsystem, + ) + if persisted: + metrics["emitted"] += 1 + self._emit_persisted_event(persisted) + else: + dedup_suppressed_count += 1 + + if (ci + 1) % 5 == 0 or ci == len(shortlisted) - 1: + emit_cycle_progress( + "triaging", + current_tag=f"{ci + 1}/{len(shortlisted)} ({tag_name})", + ) + + top_candidates_by_subsystem = dict( + sorted(candidate_subsystem_counts.items(), key=lambda item: item[1], reverse=True)[:10] + ) + top_shift_signals = sorted( + subsystem_shift_signals.values(), + key=lambda item: ( + int(item.get("candidate", 0)), + float(item.get("maxAbsZ", 0.0)), + int(item.get("nearShift", 0)), + int(item.get("evaluated", 0)), + ), + reverse=True, + ) + sparkline_size = 20 + for item in top_shift_signals: + evaluated = max(1, int(item.get("evaluated", 0))) + item["avgAbsZ"] = round(float(item.get("sumAbsZ", 0.0)) / evaluated, 3) + item["avgZ"] = round(float(item.get("sumZ", 0.0)) / evaluated, 3) + item["shiftRatio"] = round(float(item.get("nearShift", 0)) / evaluated, 3) + item["candidateRatio"] = round(float(item.get("candidate", 0)) / evaluated, 3) + item.pop("sumAbsZ", None) + item.pop("sumZ", None) + raw_tags = 
item.pop("_tagEntries", []) + sorted_tags = sorted(raw_tags, key=lambda t: t.get("absZ", 0.0), reverse=True) + tag_signals = [] + for t in sorted_tags: + entry = {k: v for k, v in t.items() if k != "absZ"} + cached_hist = self._history_cache.get(t.get("path", "")) + if cached_hist and cached_hist.get("values"): + vals = cached_hist["values"] + entry["avg"] = round(sum(vals) / len(vals), 2) + if len(vals) <= sparkline_size: + entry["sparkline"] = [round(v, 2) for v in vals] + else: + step = len(vals) / sparkline_size + entry["sparkline"] = [round(vals[int(i * step)], 2) for i in range(sparkline_size)] + tag_signals.append(entry) + item["tagSignals"] = tag_signals + + metrics["diagnostics"] = { + **make_default_diagnostics( + staleness_threshold_sec=int(thresholds.get("stalenessSec", 120)), + phase="cycle_complete", + reason="ok", + ), + "monitoredTags": len(tag_paths), + "linkedTags": linked_tag_count, + "unlinkedTags": unlinked_tag_count, + "validLiveCount": valid_live_count, + "missingTimestampCount": missing_timestamp_count, + "inferredTimestampCount": inferred_timestamp_count, + "liveErrorCount": live_error_count, + "liveErrorLinked": live_error_linked, + "liveErrorUnlinked": live_error_unlinked, + "qualityFilteredCount": quality_filtered_count, + "qualityFilteredLinked": quality_filtered_linked, + "qualityFilteredUnlinked": quality_filtered_unlinked, + "staleFilteredCount": stale_filtered_count, + "staleFilteredLinked": stale_filtered_linked, + "staleFilteredUnlinked": stale_filtered_unlinked, + "historyErrorCount": history_error_count, + "historyErrorLinked": history_error_linked, + "historyErrorUnlinked": history_error_unlinked, + "insufficientHistoryCount": insufficient_history_count, + "lowHistoryCandidateCount": low_history_candidate_count, + "evaluatedLinked": evaluated_linked, + "evaluatedUnlinked": evaluated_unlinked, + "candidateLinked": candidate_linked, + "candidateUnlinked": candidate_unlinked, + "nearShiftCount": near_shift_count, + 
"nearShiftLinked": near_shift_linked, + "nearShiftUnlinked": near_shift_unlinked, + "stalenessThresholdSec": int(thresholds.get("stalenessSec", 120)), + "staleSamples": stale_samples, + "timestampParseNote": "Naive timestamps are treated as local time by parse_timestamp().", + "detectedSubsystemCount": len(detected_subsystems), + "detectedSubsystems": detected_subsystems[:10], + "subsystemTagMap": subsystem_tag_map, + "candidateSubsystemCount": len(candidate_subsystem_counts), + "candidateBySubsystem": top_candidates_by_subsystem, + "subsystemShiftSignals": top_shift_signals, + "maxCandidatesPerSubsystem": max_candidates_per_subsystem, + "maxLlmTriagesPerSubsystem": max_triage_per_subsystem, + "llmTriagedCount": llm_total, + "dedupSuppressedCount": dedup_suppressed_count, + "toolCalls": tool_calls, + } + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + + def cleanup_retention(self) -> int: + retention_days = int(self.config.get("retentionDays", 14)) + return self.graph.cleanup_anomaly_events(retention_days=retention_days) + + def run_forever(self) -> int: + self.init_schema() + self.upsert_run("running") + startup_diag = make_default_diagnostics( + staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), + phase="startup", + reason="worker_started", + ) + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": 0, + "candidates": 0, + "triaged": 0, + "emitted": 0, + "diagnostics": startup_diag, + "timestamp": utc_now_iso(), + }) + + poll_ms = int(self.config.get("pollIntervalMs", 1000)) + cleanup_every = max(1, int(self.config.get("cleanupEveryCycles", 40))) + exit_code = 0 + reason = "stopped" + + while self._running: + self._cycle_count += 1 + cycle_started = time.time() + try: + metrics = self.run_cycle() + self.heartbeat(metrics) + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": metrics["cycleMs"], + "candidates": metrics["candidates"], + 
"triaged": metrics["triaged"], + "emitted": metrics["emitted"], + "diagnostics": metrics.get("diagnostics", {}), + "timestamp": utc_now_iso(), + }) + if self._cycle_count % cleanup_every == 0: + deleted = self.cleanup_retention() + if deleted > 0: + cleanup_diag = make_default_diagnostics( + staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), + phase="retention_cleanup", + reason="cleanup_complete", + ) + cleanup_diag["emittedCleanupCount"] = deleted + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "retention_cleanup", + "cycleMs": 0, + "candidates": 0, + "triaged": 0, + "emitted": deleted, + "diagnostics": cleanup_diag, + "timestamp": utc_now_iso(), + }) + except Exception as exc: + reason = "failed" + exit_code = 1 + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "cycle_error", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + error_diag = make_default_diagnostics( + staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), + phase="cycle_error", + reason="exception", + ) + error_diag["errorMessage"] = str(exc) + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": int((time.time() - cycle_started) * 1000), + "candidates": 0, + "triaged": 0, + "emitted": 0, + "diagnostics": error_diag, + "timestamp": utc_now_iso(), + }) + + elapsed_ms = int((time.time() - cycle_started) * 1000) + remaining = max(0, poll_ms - elapsed_ms) / 1000.0 + if remaining > 0: + time.sleep(remaining) + + self.upsert_run("stopped" if reason != "failed" else "failed", reason=reason) + emit("AGENT_COMPLETE", { + "runId": self.run_id, + "success": exit_code == 0, + "reason": reason, + "stoppedAt": utc_now_iso(), + }) + return exit_code + + # ----------------------------- + # Single-operation helpers + # ----------------------------- + def list_events(self, limit: int, state: Optional[str], severity: Optional[str], run_id: Optional[str]) -> 
Dict[str, Any]: + events = self.graph.list_anomaly_events(limit=limit, state=state, severity=severity, run_id=run_id) + return {"success": True, "events": events} + + def get_event(self, event_id: str) -> Dict[str, Any]: + event = self.graph.get_anomaly_event(event_id) + if not event: + return {"success": False, "error": f"Event not found: {event_id}"} + return {"success": True, "event": event} + + def ack_event(self, event_id: str, note: Optional[str]) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + SET e.state = 'acknowledged', + e.acknowledged_at = datetime(), + e.ack_note = $note, + e.updated_at = datetime() + RETURN count(e) AS cnt + """, + event_id=event_id, + note=note or "", + ) + record = result.single() + if not record or record["cnt"] == 0: + return {"success": False, "error": f"Event not found: {event_id}"} + return {"success": True, "eventId": event_id} + + def clear_event(self, event_id: str, note: Optional[str]) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + SET e.state = 'cleared', + e.cleared_at = datetime(), + e.clear_note = $note, + e.updated_at = datetime() + RETURN count(e) AS cnt + """, + event_id=event_id, + note=note or "", + ) + record = result.single() + if not record or record["cnt"] == 0: + return {"success": False, "error": f"Event not found: {event_id}"} + return {"success": True, "eventId": event_id} + + def deep_analyze(self, event_id: str) -> Dict[str, Any]: + """Run LLM triage on an existing event and update it in-place.""" + event = self.graph.get_anomaly_event(event_id) + if not event: + return {"success": False, "error": f"Event not found: {event_id}"} + + tag_path = event.get("source_tag") or event.get("tag_name", "") + if not tag_path: + return {"success": False, "error": "Event has no source_tag"} + + context = self.get_context(tag_path) + 
context["subsystem"] = { + "id": event.get("subsystem_id", "global:all"), + "type": event.get("subsystem_type", "global"), + "name": event.get("subsystem_name", "all"), + } + + deterministic = { + "candidate": True, + "z_score": float(event.get("z_score", 0)), + "mad_score": float(event.get("mad_score", 0)), + "delta_rate": float(event.get("delta_rate", 0)), + "window_volatility": float(event.get("window_volatility", 0)), + "reasons": json.loads(event.get("deterministic_reasons_json", "[]")), + "category": event.get("category", "deviation"), + } + + live_sample = { + "value": event.get("live_value"), + "quality": event.get("live_quality"), + "timestamp": event.get("live_timestamp"), + } + + if not self.llm: + return {"success": False, "error": "LLM client not configured"} + + triage = self.run_llm_triage(context, deterministic, live_sample) + + severity = self._severity_from_scores(deterministic, triage) + with self.graph.session() as session: + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + SET e.summary = $summary, + e.explanation = $explanation, + e.severity = $severity, + e.confidence = $confidence, + e.recommended_checks_json = $checks, + e.probable_causes_json = $causes, + e.safety_notes_json = $safety, + e.updated_at = $updated_at, + e.llm_triaged = true + RETURN e + """, + event_id=event_id, + summary=triage.get("summary", ""), + explanation=triage.get("rationale", ""), + severity=severity, + confidence=float(max(0.0, min(1.0, triage.get("confidence", 0.5)))), + checks=json.dumps(triage.get("verification_checks", []), default=str), + causes=json.dumps(triage.get("probable_causes", []), default=str), + safety=json.dumps(triage.get("safety_notes", []), default=str), + updated_at=utc_now_iso(), + ) + + updated_event = self.graph.get_anomaly_event(event_id) + return {"success": True, "event": updated_event} + + def get_status(self, run_id: str) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH 
(r:AgentRun {run_id: $run_id}) + RETURN r + LIMIT 1 + """, + run_id=run_id, + ) + row = result.single() + if not row: + return {"success": False, "error": f"Run not found: {run_id}"} + props = dict(row["r"]) + return { + "success": True, + "status": props.get("status"), + "metrics": { + "cycleCount": props.get("cycle_count", 0), + "lastCycleMs": props.get("last_cycle_ms", 0), + "lastCandidates": props.get("last_candidates", 0), + "lastTriaged": props.get("last_triaged", 0), + "lastEmitted": props.get("last_emitted", 0), + }, + "lastHeartbeatAt": props.get("last_heartbeat_at"), + "run": props, + } + + +def _load_fixture_cases(path: Path) -> List[Dict[str, Any]]: + data = json.loads(path.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data.get("cases", []) + if isinstance(data, list): + return data + return [] + + +def replay_fixtures(config_json: Optional[str], fixture_path: str) -> Dict[str, Any]: + config = merge_defaults(json.loads(config_json) if config_json else {}) + path = Path(fixture_path) + cases = _load_fixture_cases(path) + thresholds = config.get("thresholds", {}) + passed = 0 + failures: List[Dict[str, Any]] = [] + + for case in cases: + result = compute_deviation_scores( + current_value=case.get("current_value"), + history_values=case.get("history_values", []), + prev_value=case.get("prev_value"), + thresholds=thresholds, + ) + expected = bool(case.get("expected_candidate", False)) + if result.get("candidate") == expected: + passed += 1 + else: + failures.append( + { + "id": case.get("id"), + "expected_candidate": expected, + "actual_candidate": result.get("candidate"), + "category": result.get("category"), + "reasons": result.get("reasons", []), + } + ) + + return { + "success": len(failures) == 0, + "total": len(cases), + "passed": passed, + "failed": len(failures), + "failures": failures, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="Anomaly monitor worker") + sub = 
def main() -> int:
    """CLI entry point for the anomaly monitor worker.

    Builds the subcommand parser, constructs an AnomalyMonitor where needed,
    and dispatches to the requested command.

    Returns:
        Process exit code: 0 on success, 1 on failure/unknown command.
    """
    parser = argparse.ArgumentParser(description="Anomaly monitor worker")
    sub = parser.add_subparsers(dest="command", required=True)

    p_run = sub.add_parser("run", help="Run continuous anomaly monitoring")
    p_run.add_argument("--run-id", help="Optional run id")
    p_run.add_argument("--config-json", default="{}", help="JSON config string")

    p_status = sub.add_parser("status", help="Get status for one run")
    p_status.add_argument("--run-id", required=True)

    p_list = sub.add_parser("list-events", help="List anomaly events")
    p_list.add_argument("--limit", type=int, default=100)
    p_list.add_argument("--state")
    p_list.add_argument("--severity")
    p_list.add_argument("--run-id")

    p_get = sub.add_parser("get-event", help="Get one anomaly event")
    p_get.add_argument("--event-id", required=True)

    p_ack = sub.add_parser("ack-event", help="Acknowledge one anomaly event")
    p_ack.add_argument("--event-id", required=True)
    p_ack.add_argument("--note")

    p_clear = sub.add_parser("clear-event", help="Clear one acknowledged anomaly event")
    p_clear.add_argument("--event-id", required=True)
    p_clear.add_argument("--note")

    p_deep = sub.add_parser("deep-analyze", help="Run LLM triage on an existing event")
    p_deep.add_argument("--event-id", required=True)

    p_cleanup = sub.add_parser("cleanup", help="Delete old anomaly events")
    p_cleanup.add_argument("--retention-days", type=int, default=14)

    p_replay = sub.add_parser("replay-fixtures", help="Validate deterministic scoring against fixtures")
    p_replay.add_argument("--fixture-file", required=True)
    p_replay.add_argument("--config-json", default="{}")

    args = parser.parse_args()

    # replay-fixtures is purely local validation: handle it before building
    # the monitor so it needs no Neo4j/gateway connectivity.
    if args.command == "replay-fixtures":
        result = replay_fixtures(args.config_json, args.fixture_file)
        print(json.dumps(result))
        return 0 if result["success"] else 1

    try:
        # Not every subcommand defines these flags; getattr keeps the
        # construction uniform across commands.
        monitor = AnomalyMonitor(
            config=json.loads(getattr(args, "config_json", "{}") or "{}"),
            run_id=getattr(args, "run_id", None),
        )
    except Exception as exc:
        print(json.dumps({"success": False, "error": str(exc)}))
        return 1

    if args.command == "run":
        def _signal_handler(_signum, _frame):
            # Flip the loop flag so run_forever() can exit gracefully.
            monitor._running = False

        signal.signal(signal.SIGTERM, _signal_handler)
        if hasattr(signal, "SIGINT"):
            signal.signal(signal.SIGINT, _signal_handler)
        return monitor.run_forever()

    if args.command == "status":
        print(json.dumps(monitor.get_status(args.run_id), default=str))
        return 0

    if args.command == "list-events":
        print(json.dumps(monitor.list_events(args.limit, args.state, args.severity, args.run_id), default=str))
        return 0

    if args.command == "get-event":
        print(json.dumps(monitor.get_event(args.event_id), default=str))
        return 0

    if args.command == "ack-event":
        print(json.dumps(monitor.ack_event(args.event_id, args.note), default=str))
        return 0

    if args.command == "clear-event":
        print(json.dumps(monitor.clear_event(args.event_id, args.note), default=str))
        return 0

    if args.command == "deep-analyze":
        print(json.dumps(monitor.deep_analyze(args.event_id), default=str))
        return 0

    if args.command == "cleanup":
        deleted = monitor.graph.cleanup_anomaly_events(args.retention_days)
        print(json.dumps({"success": True, "deleted": deleted}))
        return 0

    # Unreachable with required=True subparsers, kept as a defensive default.
    return 1


if __name__ == "__main__":
    sys.exit(main())
+""" + +from __future__ import annotations + +import hashlib +import math +from datetime import datetime, timezone +from statistics import mean, median, pstdev +from typing import Any, Dict, List, Optional + + +def safe_float(value: Any) -> Optional[float]: + """Best-effort conversion to float.""" + if value is None: + return None + if isinstance(value, bool): + return float(value) + if isinstance(value, (int, float)): + if math.isnan(value) or math.isinf(value): + return None + return float(value) + text = str(value).strip() + if not text: + return None + try: + result = float(text) + except ValueError: + return None + if math.isnan(result) or math.isinf(result): + return None + return result + + +def parse_timestamp(ts: Optional[str]) -> Optional[datetime]: + """Parse an ISO-like timestamp to UTC-aware datetime.""" + if not ts: + return None + text = str(ts).strip() + if not text: + return None + # Handle unix epoch (seconds or milliseconds) represented as numeric text. + if text.isdigit(): + try: + raw = int(text) + if raw > 10_000_000_000: # likely milliseconds + raw = raw / 1000.0 + return datetime.fromtimestamp(raw, tz=timezone.utc) + except (ValueError, OSError, OverflowError): + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(text) + except ValueError: + return None + if dt.tzinfo is None: + # Ignition often returns naive local timestamps; assume local timezone. 
+ local_tz = datetime.now().astimezone().tzinfo or timezone.utc + dt = dt.replace(tzinfo=local_tz) + return dt.astimezone(timezone.utc) + + +def is_quality_good(quality: Optional[str]) -> bool: + """Conservative quality gate.""" + if quality is None: + return False + q = str(quality).strip().lower() + if not q: + return False + if "good" in q or "ok" in q or q in {"192"}: + return True + return False + + +def is_stale(timestamp: Optional[str], staleness_sec: int, now: Optional[datetime] = None) -> bool: + """Return True if sample timestamp is stale or invalid.""" + if staleness_sec <= 0: + return False + parsed = parse_timestamp(timestamp) + if parsed is None: + return True + baseline = now or datetime.now(timezone.utc) + age = (baseline - parsed).total_seconds() + return age > staleness_sec + + +def _mad(values: List[float]) -> float: + """Median absolute deviation.""" + if not values: + return 0.0 + med = median(values) + abs_dev = [abs(v - med) for v in values] + return median(abs_dev) if abs_dev else 0.0 + + +def _percentile_rank(values: List[float], current: float) -> float: + """Approximate percentile rank of current within values.""" + if not values: + return 0.0 + less_or_equal = sum(1 for v in values if v <= current) + return less_or_equal / len(values) + + +def compute_deviation_scores( + current_value: Any, + history_values: List[Any], + prev_value: Any = None, + thresholds: Optional[Dict[str, float]] = None, +) -> Dict[str, Any]: + """ + Compute deterministic anomaly scores and candidate flags. + + Threshold defaults are intentionally conservative and should be configured + per process during rollout. 
+ """ + cfg = { + "z": 3.0, + "mad": 3.5, + "rate": 0.0, + "flatline_std_epsilon": 1e-6, + "stuck_window_size": 20, + } + if thresholds: + cfg.update({k: v for k, v in thresholds.items() if v is not None}) + + current = safe_float(current_value) + hist = [v for v in (safe_float(x) for x in history_values) if v is not None] + previous = safe_float(prev_value) + + result: Dict[str, Any] = { + "candidate": False, + "reasons": [], + "category": "normal", + "z_score": 0.0, + "mad_score": 0.0, + "delta_rate": 0.0, + "window_volatility": 0.0, + "percentile_rank": 0.0, + "drift_score": 0.0, + "history_points": len(hist), + } + + if current is None: + result["category"] = "invalid_value" + result["reasons"].append("current_value_not_numeric") + return result + if not hist: + result["category"] = "insufficient_history" + result["reasons"].append("history_empty") + return result + + mu = mean(hist) + sigma = pstdev(hist) if len(hist) > 1 else 0.0 + sigma = max(sigma, 1e-9) + z_score = (current - mu) / sigma + result["z_score"] = z_score + result["window_volatility"] = sigma + result["percentile_rank"] = _percentile_rank(hist, current) + + mad = _mad(hist) + mad_denom = max(mad * 1.4826, 1e-9) + mad_score = abs(current - median(hist)) / mad_denom + result["mad_score"] = mad_score + + if previous is not None: + result["delta_rate"] = abs(current - previous) + + if abs(z_score) >= float(cfg["z"]): + result["candidate"] = True + result["reasons"].append("z_score_threshold") + if mad_score >= float(cfg["mad"]): + result["candidate"] = True + result["reasons"].append("mad_score_threshold") + if float(cfg["rate"]) > 0 and result["delta_rate"] >= float(cfg["rate"]): + result["candidate"] = True + result["reasons"].append("delta_rate_threshold") + + if len(hist) >= 20: + midpoint = len(hist) // 2 + first_half = hist[:midpoint] + second_half = hist[midpoint:] + trend_delta = abs(mean(second_half) - mean(first_half)) + trend_score = trend_delta / sigma + result["drift_score"] = 
trend_score + if trend_score >= 1.25 and (result["percentile_rank"] >= 0.85 or result["percentile_rank"] <= 0.15): + result["candidate"] = True + result["reasons"].append("drift_trend") + + recent = hist[-int(max(3, cfg["stuck_window_size"])) :] + recent_std = pstdev(recent) if len(recent) > 1 else 0.0 + if recent_std <= float(cfg["flatline_std_epsilon"]): + if previous is not None and abs(current - previous) <= float(cfg["flatline_std_epsilon"]): + result["candidate"] = True + result["reasons"].append("flatline_detected") + result["category"] = "stuck" + + if result["category"] == "normal" and result["candidate"]: + if "flatline_detected" in result["reasons"]: + result["category"] = "stuck" + elif result["delta_rate"] > 0 and "delta_rate_threshold" in result["reasons"]: + result["category"] = "spike" + elif "drift_trend" in result["reasons"]: + result["category"] = "drift" + elif abs(z_score) > 0 and len(hist) >= 20: + # Drift-like heuristic for sustained tail position with moderate rate + if result["percentile_rank"] >= 0.95 or result["percentile_rank"] <= 0.05: + result["category"] = "drift" + else: + result["category"] = "spike" + else: + result["category"] = "deviation" + + return result + + +def dedup_key(tag_path: str, category: str, bucket_minutes: int = 10) -> str: + """Create a deterministic dedup signature for event cooldown windows.""" + now = datetime.now(timezone.utc) + bucket = int(now.timestamp() // max(1, bucket_minutes * 60)) + raw = f"{tag_path}|{category}|{bucket}" + return hashlib.sha1(raw.encode("utf-8")).hexdigest() + diff --git a/scripts/fixtures/anomaly_replay_cases.json b/scripts/fixtures/anomaly_replay_cases.json new file mode 100644 index 0000000..544cd3f --- /dev/null +++ b/scripts/fixtures/anomaly_replay_cases.json @@ -0,0 +1,32 @@ +{ + "cases": [ + { + "id": "normal-baseline", + "current_value": 50.3, + "prev_value": 50.1, + "history_values": [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2, 50.1, 50.0, 49.9, 50.2, 50.1, 
50.0, 50.2, 49.8, 50.0, 50.1, 50.0, 49.9, 50.1, 50.2, 50.0, 50.1, 49.9, 50.0, 50.1, 50.0], + "expected_candidate": false + }, + { + "id": "sudden-spike", + "current_value": 91.0, + "prev_value": 49.8, + "history_values": [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2, 50.1, 50.0, 49.9, 50.2, 50.1, 50.0, 50.2, 49.8, 50.0, 50.1, 50.0, 49.9, 50.1, 50.2, 50.0, 50.1, 49.9, 50.0, 50.1, 50.0], + "expected_candidate": true + }, + { + "id": "slow-drift-tail", + "current_value": 61.5, + "prev_value": 61.0, + "history_values": [50.0, 50.2, 50.3, 50.5, 50.7, 50.9, 51.1, 51.4, 51.8, 52.1, 52.6, 53.0, 53.5, 54.0, 54.5, 55.1, 55.6, 56.0, 56.6, 57.0, 57.5, 58.0, 58.4, 58.9, 59.4, 59.9, 60.2, 60.6, 60.9, 61.2], + "expected_candidate": true + }, + { + "id": "flatline-stuck", + "current_value": 72.0, + "prev_value": 72.0, + "history_values": [72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0], + "expected_candidate": true + } + ] +} diff --git a/scripts/graph_api.py b/scripts/graph_api.py index 8e36e7c..e3bff45 100644 --- a/scripts/graph_api.py +++ b/scripts/graph_api.py @@ -77,6 +77,8 @@ class GraphAPI: "processdeviation": "mes", "functionallocation": "mes", "vendor": "mes", + "agentrun": "anomaly", + "anomalyevent": "anomaly", } # Color palette for node types @@ -91,6 +93,7 @@ class GraphAPI: "flows": "#E91E63", "overview": "#607D8B", "mes": "#00897B", + "anomaly": "#F44336", "other": "#9E9E9E", } @@ -252,9 +255,11 @@ def get_neighbors( WHERE center.name = $node_id OR center.name ENDS WITH $node_id OR center.name CONTAINS $node_id + OR center.event_id = $node_id + OR center.run_id = $node_id RETURN elementId(center) as id, labels(center)[0] as type, - center.name as label, + coalesce(center.name, center.event_id, center.run_id, center.symptom, center.phrase, 'unknown') as label, properties(center) as props LIMIT 1 """ @@ -264,9 +269,11 @@ def 
get_neighbors( WHERE center.name = $node_id OR center.name ENDS WITH $node_id OR center.name CONTAINS $node_id + OR center.event_id = $node_id + OR center.run_id = $node_id RETURN elementId(center) as id, labels(center)[0] as type, - center.name as label, + coalesce(center.name, center.event_id, center.run_id, center.symptom, center.phrase, 'unknown') as label, properties(center) as props LIMIT 1 """ diff --git a/scripts/ignition_api_client.py b/scripts/ignition_api_client.py index d0d7e41..eae959a 100644 --- a/scripts/ignition_api_client.py +++ b/scripts/ignition_api_client.py @@ -17,12 +17,17 @@ import os import json import logging +from datetime import datetime, timezone from typing import Dict, List, Optional, Any from dataclasses import dataclass, field from urllib.parse import urljoin, quote import requests -from dotenv import load_dotenv +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback for minimal envs + def load_dotenv(*_args, **_kwargs): + return False load_dotenv() @@ -235,7 +240,11 @@ def read_tags(self, paths: List[str]) -> List[TagValue]: for p in normalised ] - return self._parse_tags_response(normalised, data) + return self._parse_tags_response( + normalised, + data, + fallback_timestamp=datetime.now(timezone.utc).isoformat(), + ) # --------------------------------------------------------------------- # # Tag history – WebDev module endpoint @@ -243,60 +252,55 @@ def read_tags(self, paths: List[str]) -> List[TagValue]: @staticmethod def _local_iso_to_utc(dt_str: str) -> str: - """Convert a bare ISO datetime string (assumed local) to UTC. + """ + Convert a bare ISO datetime string (assumed local time) to UTC. - If the string already has a timezone indicator (Z, +, -) - or looks like epoch milliseconds, it is returned unchanged. + If the input already contains timezone info or appears to be epoch + milliseconds, it is returned unchanged. 
""" from datetime import datetime, timezone - s = str(dt_str).strip() + text = str(dt_str).strip() + if not text: + return text - # Epoch ms – pass through - if s.isdigit(): - return s + # Epoch millis (or seconds) should pass through unchanged. + if text.isdigit(): + return text - # Already has TZ info – pass through - if s.endswith("Z") or "+" in s[10:] or s[10:].count("-") > 0: - return s + # Already timezone-aware. + if text.endswith("Z") or "+" in text[10:] or text[10:].count("-") > 0: + return text try: - naive = datetime.fromisoformat(s) - local_dt = naive.astimezone() # attach local TZ + naive = datetime.fromisoformat(text) + local_dt = naive.astimezone() utc_dt = local_dt.astimezone(timezone.utc) return utc_dt.strftime("%Y-%m-%dT%H:%M:%S") except (ValueError, TypeError): - return s + return text def query_tag_history( self, tag_paths: List[str], start_date: str, end_date: str, - return_size: int = 100, + return_size: int = 200, aggregation_mode: str = "Average", return_format: str = "Wide", interval_minutes: Optional[int] = None, include_bounding_values: bool = False, ) -> Optional[Any]: - """Query historical tag values via the WebDev queryTagHistory endpoint. - - Bare ISO datetime strings (no timezone suffix) are assumed to be in - the server's local timezone and are converted to UTC before sending - to the gateway (which interprets all times as UTC). - - Args: - tag_paths: Tag paths with provider prefix, e.g. ['[default]Folder/Tag']. - start_date: ISO datetime string (local) or epoch ms. - end_date: ISO datetime string (local) or epoch ms. - return_size: Max rows to return (default 100). - aggregation_mode: Average, MinMax, LastValue, Sum, Minimum, Maximum. - return_format: Wide or Tall. - interval_minutes: Aggregation interval in minutes. - include_bounding_values: Include values at boundaries. """ - normalised = [self._ensure_provider_prefix(p) for p in tag_paths] + Query historical tag values from the WebDev queryTagHistory endpoint. 
+ Dates may be passed as local ISO strings; they are converted to UTC + to match Ignition endpoint expectations. + """ + if not tag_paths: + return {"error": "No tag paths provided", "tagPaths": []} + + normalised = [self._ensure_provider_prefix(p) for p in tag_paths] utc_start = self._local_iso_to_utc(start_date) utc_end = self._local_iso_to_utc(end_date) @@ -304,19 +308,17 @@ def query_tag_history( "tagPaths": ",".join(normalised), "startDate": utc_start, "endDate": utc_end, - "returnSize": return_size, + "returnSize": int(return_size), "aggregationMode": aggregation_mode, "returnFormat": return_format, - "includeBoundingValues": str(include_bounding_values).lower(), + "includeBoundingValues": str(bool(include_bounding_values)).lower(), } if interval_minutes is not None: - params["intervalMinutes"] = interval_minutes + params["intervalMinutes"] = int(interval_minutes) data = self._get("system/webdev/Axilon/queryTagHistory", params=params) - if data is None: return {"error": "API request failed or not configured", "tagPaths": normalised} - return data # --------------------------------------------------------------------- # @@ -347,11 +349,36 @@ def _ensure_provider_prefix(path: str) -> str: return path return f"[default]{path}" - _TAG_ITEM_KNOWN_KEYS = {"value", "quality", "tagPath", "isGood", - "timestamp", "t", "dataType", "data_type"} + _TAG_ITEM_KNOWN_KEYS = { + "value", + "v", + "quality", + "q", + "tagPath", + "path", + "fullPath", + "isGood", + "timestamp", + "t", + "ts", + "time", + "timeStamp", + "dateTime", + "datetime", + "lastChange", + "lastChanged", + "timestampMs", + "eventTime", + "dataType", + "data_type", + } @staticmethod - def _parse_tags_response(paths: List[str], data: Any) -> List["TagValue"]: + def _parse_tags_response( + paths: List[str], + data: Any, + fallback_timestamp: Optional[str] = None, + ) -> List["TagValue"]: """Parse the response from the WebDev getTags endpoint. 
Expected shape: {"allGood": bool, "success": bool, "count": N, @@ -370,9 +397,41 @@ def _parse_tags_response(paths: List[str], data: Any) -> List["TagValue"]: return [TagValue(path=p, value=data, quality="Unknown") for p in paths] by_path: Dict[str, dict] = {} + + def extract_item_path(item: Dict[str, Any]) -> Optional[str]: + for key in ("tagPath", "path", "fullPath"): + val = item.get(key) + if isinstance(val, str) and val.strip(): + return val.strip() + return None + + def extract_item_timestamp(item: Dict[str, Any]) -> Optional[str]: + for key in ( + "timestamp", + "t", + "ts", + "time", + "timeStamp", + "dateTime", + "datetime", + "lastChange", + "lastChanged", + "timestampMs", + "eventTime", + ): + val = item.get(key) + if val is None: + continue + text = str(val).strip() + if text: + return text + return None + for item in items: - if isinstance(item, dict) and "tagPath" in item: - by_path[item["tagPath"]] = item + if isinstance(item, dict): + item_path = extract_item_path(item) + if item_path: + by_path[item_path] = item results: List[TagValue] = [] for i, path in enumerate(paths): @@ -383,13 +442,22 @@ def _parse_tags_response(paths: List[str], data: Any) -> List["TagValue"]: if item is None: results.append(TagValue(path=path, error="No data returned for this path")) elif isinstance(item, dict): + ts = extract_item_timestamp(item) + inferred_timestamp = False + if not ts and fallback_timestamp: + ts = fallback_timestamp + inferred_timestamp = True extra = {k: v for k, v in item.items() if k not in IgnitionApiClient._TAG_ITEM_KNOWN_KEYS} or None + if inferred_timestamp: + if extra is None: + extra = {} + extra["timestamp_inferred"] = True results.append(TagValue( - path=item.get("tagPath", path), - value=item.get("value"), - quality=str(item.get("quality", "Good" if item.get("isGood") else "Unknown")), - timestamp=item.get("timestamp") or item.get("t"), + path=extract_item_path(item) or path, + value=item.get("value", item.get("v")), + 
quality=str(item.get("quality", item.get("q", "Good" if item.get("isGood") else "Unknown"))), + timestamp=ts, data_type=item.get("dataType") or item.get("data_type"), config=extra, )) diff --git a/scripts/neo4j_ontology.py b/scripts/neo4j_ontology.py index 110719f..380e3cb 100644 --- a/scripts/neo4j_ontology.py +++ b/scripts/neo4j_ontology.py @@ -9,7 +9,11 @@ from typing import Dict, List, Optional, Any, Union from dataclasses import dataclass, field from contextlib import contextmanager -from dotenv import load_dotenv +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback for minimal envs + def load_dotenv(*_args, **_kwargs): + return False from neo4j import GraphDatabase, Driver, Session @@ -147,6 +151,8 @@ def create_indexes(self) -> None: "CREATE CONSTRAINT project_name IF NOT EXISTS FOR (p:Project) REQUIRE p.name IS UNIQUE", "CREATE CONSTRAINT script_name IF NOT EXISTS FOR (s:Script) REQUIRE s.name IS UNIQUE", "CREATE CONSTRAINT namedquery_name IF NOT EXISTS FOR (q:NamedQuery) REQUIRE q.name IS UNIQUE", + "CREATE CONSTRAINT agentrun_id IF NOT EXISTS FOR (r:AgentRun) REQUIRE r.run_id IS UNIQUE", + "CREATE CONSTRAINT anomalyevent_id IF NOT EXISTS FOR (e:AnomalyEvent) REQUIRE e.event_id IS UNIQUE", ] # Regular indexes @@ -186,6 +192,11 @@ def create_indexes(self) -> None: "CREATE INDEX hmitextlist_name IF NOT EXISTS FOR (htl:HMITextList) ON (htl.name)", "CREATE INDEX plctagtable_name IF NOT EXISTS FOR (pt:PLCTagTable) ON (pt.name)", "CREATE INDEX plctag_name IF NOT EXISTS FOR (ptg:PLCTag) ON (ptg.name)", + # Agent monitoring indexes + "CREATE INDEX anomalyevent_created IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.created_at)", + "CREATE INDEX anomalyevent_state IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.state)", + "CREATE INDEX anomalyevent_severity IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.severity)", + "CREATE INDEX anomalyevent_dedup_key IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.dedup_key)", ] for constraint in constraints: 
@@ -202,6 +213,95 @@ def create_indexes(self) -> None: if "already exists" not in str(e).lower(): print(f"[WARNING] Index error: {e}") + def init_agent_monitoring_schema(self) -> None: + """Ensure agent monitoring labels and indexes exist.""" + self.create_indexes() + + def list_anomaly_events( + self, + limit: int = 100, + state: Optional[str] = None, + severity: Optional[str] = None, + run_id: Optional[str] = None, + ) -> List[Dict[str, Any]]: + """List persisted anomaly events for UI feeds.""" + with self.session() as session: + clauses = [] + params: Dict[str, Any] = {"limit": max(1, min(limit, 500))} + if state: + clauses.append("e.state = $state") + params["state"] = state + if severity: + clauses.append("e.severity = $severity") + params["severity"] = severity + if run_id: + clauses.append("e.run_id = $run_id") + params["run_id"] = run_id + where = f"WHERE {' AND '.join(clauses)}" if clauses else "" + query = f""" + MATCH (e:AnomalyEvent) + {where} + OPTIONAL MATCH (e)-[:OBSERVED_ON]->(t:ScadaTag) + OPTIONAL MATCH (e)-[:AFFECTS]->(eq:Equipment) + RETURN e, collect(DISTINCT t.name) AS tags, collect(DISTINCT eq.name) AS equipment + ORDER BY e.created_at DESC + LIMIT $limit + """ + result = session.run(query, **params) + events: List[Dict[str, Any]] = [] + for record in result: + node = record["e"] + props = dict(node) + props["tags"] = [x for x in record["tags"] if x] + props["equipment"] = [x for x in record["equipment"] if x] + events.append(props) + return events + + def get_anomaly_event(self, event_id: str) -> Optional[Dict[str, Any]]: + """Get one anomaly event with linked context labels.""" + with self.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + OPTIONAL MATCH (e)-[:OBSERVED_ON]->(t:ScadaTag) + OPTIONAL MATCH (e)-[:AFFECTS]->(eq:Equipment) + OPTIONAL MATCH (e)-[r:RELATED_TO]->(n) + RETURN e, + collect(DISTINCT t.name) AS tags, + collect(DISTINCT eq.name) AS equipment, + collect(DISTINCT {type: 
type(r), label: labels(n)[0], name: coalesce(n.name, n.symptom, n.phrase)}) AS related + LIMIT 1 + """, + event_id=event_id, + ) + record = result.single() + if not record: + return None + data = dict(record["e"]) + data["tags"] = [x for x in record["tags"] if x] + data["equipment"] = [x for x in record["equipment"] if x] + data["related"] = [ + x for x in record["related"] if x and x.get("name") + ] + return data + + def cleanup_anomaly_events(self, retention_days: int = 14) -> int: + """Delete old anomaly events outside retention window.""" + with self.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent) + WHERE e.created_at IS NOT NULL + AND datetime(e.created_at) < datetime() - duration({days: $days}) + WITH collect(e) AS old_events + FOREACH (n IN old_events | DETACH DELETE n) + RETURN size(old_events) AS deleted + """, + days=max(1, retention_days), + ) + record = result.single() + return int(record["deleted"]) if record else 0 + def clear_all(self) -> None: """Clear all nodes and relationships. 
USE WITH CAUTION.""" with self.session() as session: @@ -4192,12 +4292,22 @@ def main(): "tia-projects", "tia-project-resources", "db-connections", + "init-agent-schema", + "list-anomaly-events", + "get-anomaly-event", + "cleanup-anomaly-events", ], help="Command to execute", ) parser.add_argument("--file", "-f", help="JSON file for import/export") parser.add_argument("--query", "-q", help="Query string for search") parser.add_argument("--project", "-p", help="Project name for project-resources") + parser.add_argument("--event-id", help="Event ID for get-anomaly-event") + parser.add_argument("--state", help="Filter anomaly events by state") + parser.add_argument("--severity", help="Filter anomaly events by severity") + parser.add_argument("--run-id", help="Filter anomaly events by run_id") + parser.add_argument("--limit", type=int, default=100, help="Limit results for list commands") + parser.add_argument("--retention-days", type=int, default=14, help="Retention window in days") parser.add_argument("--json", action="store_true", help="Output in JSON format") parser.add_argument( "--enrichment-status", @@ -4437,7 +4547,43 @@ def main(): f" {c['name']} ({c['database_type']}) " f"- {c['url']} [{enabled}]" ) + elif args.command == "init-agent-schema": + graph.init_agent_monitoring_schema() + print("[OK] Initialized agent monitoring schema") + + elif args.command == "list-anomaly-events": + events = graph.list_anomaly_events( + limit=args.limit, + state=args.state, + severity=args.severity, + run_id=args.run_id, + ) + if args.json: + print(json_module.dumps(events)) + else: + print(f"Anomaly events: {len(events)}") + for event in events: + print( + f"- {event.get('event_id')} {event.get('severity')} " + f"{event.get('summary', '')[:80]}" + ) + + elif args.command == "get-anomaly-event": + if not args.event_id: + print("[ERROR] --event-id required for get-anomaly-event") + return + event = graph.get_anomaly_event(args.event_id) + if args.json: + 
print(json_module.dumps(event or {})) + else: + if not event: + print(f"[ERROR] Event not found: {args.event_id}") + return + print(json_module.dumps(event, indent=2)) + elif args.command == "cleanup-anomaly-events": + deleted = graph.cleanup_anomaly_events(args.retention_days) + print(f"[OK] Deleted {deleted} anomaly events older than {args.retention_days} days") if __name__ == "__main__": main() diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..350a8d4 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,46 @@ +# Test Flow: Agents Monitoring + Ingest + +This repository now includes a lightweight test scaffold using `pytest`. + +## Layout + +- `tests/unit/` + - `test_anomaly_rules.py` + Unit tests for deterministic anomaly scoring and quality/staleness gates. + - `test_ingest_workbench_parser.py` + Unit tests for workbench ingest parsing. + - `test_ingest_siemens_parser.py` + Unit tests for Siemens `.st` ingest parsing. + +- `tests/integration/` + - `simulated_ignition_server.py` + Local simulated live/history webserver implementing: + - `/system/webdev/Axilon/getTags` + - `/system/webdev/Axilon/queryTagHistory` + - `test_live_value_sim_server.py` + Integration tests for `IgnitionApiClient` + anomaly scoring with simulated live values. + +## Run all tests + +```bash +python3 -m pytest +``` + +## Run only unit tests + +```bash +python3 -m pytest tests/unit +``` + +## Run only integration tests + +```bash +python3 -m pytest tests/integration +``` + +## Notes + +- Integration tests are fully local and do **not** require a real Ignition gateway. +- LLM services are not required for these tests. +- Neo4j is not required for this test suite. 
+ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5b51088 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPTS_DIR = REPO_ROOT / "scripts" +INTEGRATION_DIR = REPO_ROOT / "tests" / "integration" + +for path in (SCRIPTS_DIR, INTEGRATION_DIR): + path_str = str(path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + + +@pytest.fixture +def sim_ignition(): + from simulated_ignition_server import ( + start_simulated_ignition_server, + stop_simulated_ignition_server, + ) + + server, thread, state, base_url = start_simulated_ignition_server() + try: + yield { + "server": server, + "thread": thread, + "state": state, + "base_url": base_url, + } + finally: + stop_simulated_ignition_server(server, thread) diff --git a/tests/integration/simulated_ignition_server.py b/tests/integration/simulated_ignition_server.py new file mode 100644 index 0000000..607f316 --- /dev/null +++ b/tests/integration/simulated_ignition_server.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Simulated Ignition WebDev endpoints for local integration tests. 
#!/usr/bin/env python3
"""Simulated Ignition WebDev endpoints for local integration tests."""

from __future__ import annotations

import json
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Dict, List, Tuple
from urllib.parse import parse_qs, urlparse


def _utc_iso(offset_minutes: int = 0) -> str:
    """Return an ISO-8601 UTC timestamp shifted by ``offset_minutes``."""
    return (datetime.now(timezone.utc) + timedelta(minutes=offset_minutes)).isoformat()


@dataclass
class SimulatedIgnitionState:
    """Mutable server state; tests flip the ``fail_*`` flags to force errors."""

    fail_live_reads: bool = False
    fail_history_reads: bool = False
    live_tags: Dict[str, Dict] = field(default_factory=dict)
    tag_history: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Seed deterministic defaults only when the caller supplied nothing.
        if not self.live_tags:
            self.live_tags = {
                "[default]Line/Throughput": {
                    "value": 95.0,
                    "quality": "Good",
                    "timestamp": _utc_iso(),
                    "dataType": "Float8",
                },
                "[default]Line/Temperature": {
                    "value": 42.0,
                    "quality": "Good",
                    "timestamp": _utc_iso(),
                    "dataType": "Float8",
                },
            }
        if not self.tag_history:
            baseline = [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2]
            self.tag_history = {
                "[default]Line/Throughput": [
                    (_utc_iso(offset_minutes=-(len(baseline) - idx)), sample)
                    for idx, sample in enumerate(baseline)
                ],
                "[default]Line/Temperature": [
                    (_utc_iso(offset_minutes=-(len(baseline) - idx)), 41.5 + (idx * 0.1))
                    for idx in range(len(baseline))
                ],
            }


class _IgnitionHandler(BaseHTTPRequestHandler):
    """HTTP handler; a per-server subclass injects the shared ``state``."""

    state: SimulatedIgnitionState

    def _send_json(self, payload, status: int = 200) -> None:
        """Serialize ``payload`` as JSON and write a complete response."""
        body = json.dumps(payload).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    @staticmethod
    def _requested_paths(query) -> List[str]:
        """Split the comma-separated ``tagPaths`` query parameter."""
        raw = query.get("tagPaths", [""])[0]
        return [part.strip() for part in raw.split(",") if part.strip()]

    def _handle_get_tags(self, query) -> None:
        """Serve live tag values, or a 503 when failure injection is on."""
        if self.state.fail_live_reads:
            self._send_json({"error": "simulated live provider failure"}, status=503)
            return

        tags = []
        for tag_path in self._requested_paths(query):
            data = self.state.live_tags.get(tag_path)
            if not data:
                # Unknown tag: report a Bad-quality null reading.
                tags.append(
                    {
                        "tagPath": tag_path,
                        "value": None,
                        "quality": "Bad",
                        "isGood": False,
                        "timestamp": _utc_iso(),
                        "dataType": "Unknown",
                    }
                )
                continue
            quality = data.get("quality", "Good")
            tags.append(
                {
                    "tagPath": tag_path,
                    "value": data.get("value"),
                    "quality": quality,
                    "isGood": str(quality).lower() == "good",
                    "timestamp": data.get("timestamp", _utc_iso()),
                    "dataType": data.get("dataType", "Unknown"),
                }
            )
        self._send_json({"success": True, "count": len(tags), "tags": tags})

    def _handle_query_history(self, query) -> None:
        """Serve wide-format history rows keyed by the first tag's timeline."""
        if self.state.fail_history_reads:
            self._send_json({"error": "simulated history provider failure"}, status=503)
            return

        tag_paths = self._requested_paths(query)
        primary_path = tag_paths[0] if tag_paths else "[default]Line/Throughput"

        rows = []
        for ts, _ in self.state.tag_history.get(primary_path, []):
            row = {"timestamp": ts}
            for tag_path in tag_paths:
                samples = self.state.tag_history.get(tag_path, [])
                # Exact timestamp match; otherwise fall back to the last sample.
                matched = next((val for hist_ts, val in samples if hist_ts == ts), None)
                if matched is None and samples:
                    matched = samples[-1][1]
                row[tag_path] = matched
            rows.append(row)

        self._send_json(
            {
                "success": True,
                "rows": rows,
                "tagPaths": tag_paths,
                "returnFormat": "Wide",
            }
        )

    def do_GET(self):  # noqa: N802 - BaseHTTPRequestHandler naming
        parsed = urlparse(self.path)
        query = parse_qs(parsed.query)

        if parsed.path == "/system/webdev/Axilon/getTags":
            self._handle_get_tags(query)
        elif parsed.path == "/system/webdev/Axilon/queryTagHistory":
            self._handle_query_history(query)
        else:
            self._send_json({"error": f"unsupported endpoint: {parsed.path}"}, status=404)

    def log_message(self, format, *args):  # noqa: A003 - stdlib signature
        # Silence default HTTP request logs during tests.
        return


def start_simulated_ignition_server() -> tuple[HTTPServer, threading.Thread, SimulatedIgnitionState, str]:
    """Start the server on an ephemeral localhost port.

    Returns (server, thread, state, base_url). The handler subclass is
    created per call so each server gets its own state object.
    """
    state = SimulatedIgnitionState()
    handler_cls = type(
        "IgnitionHandlerWithState",
        (_IgnitionHandler,),
        {"state": state},
    )
    server = HTTPServer(("127.0.0.1", 0), handler_cls)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    host, port = server.server_address
    return server, worker, state, f"http://{host}:{port}"


def stop_simulated_ignition_server(server: HTTPServer, thread: threading.Thread) -> None:
    """Stop the serve loop, close the listening socket, join the worker."""
    server.shutdown()
    server.server_close()
    thread.join(timeout=3)
compute_deviation_scores( + current_value=tv.value, + history_values=history_values, + prev_value=55.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + assert score["candidate"] + assert score["category"] in {"spike", "deviation", "drift"} + finally: + client.close() + + +def test_live_provider_failure_surfaces_as_read_error(sim_ignition): + state = sim_ignition["state"] + state.fail_live_reads = True + + client = IgnitionApiClient(base_url=sim_ignition["base_url"], api_token="token") + try: + tv = client.read_tag("[default]Line/Throughput") + assert tv.error is not None + assert "failed" in tv.error.lower() + finally: + client.close() + + +def test_history_provider_failure_surfaces_error_payload(sim_ignition): + state = sim_ignition["state"] + state.fail_history_reads = True + + client = IgnitionApiClient(base_url=sim_ignition["base_url"], api_token="token") + try: + start = (datetime.now(timezone.utc) - timedelta(hours=1)).replace(microsecond=0).isoformat() + end = datetime.now(timezone.utc).replace(microsecond=0).isoformat() + history = client.query_tag_history( + ["[default]Line/Throughput"], + start, + end, + return_size=100, + ) + assert isinstance(history, dict) + assert "error" in history + finally: + client.close() + diff --git a/tests/quick_import_test.py b/tests/quick_import_test.py new file mode 100644 index 0000000..1edd415 --- /dev/null +++ b/tests/quick_import_test.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +""" +Quick test script to verify import files are ready. +Shows what will be imported without needing CODESYS running. 
+""" + +import os +from pathlib import Path + + +def analyze_import_directory(import_dir): + """Analyze what will be imported.""" + import_path = Path(import_dir) + + if not import_path.exists(): + print(f"Error: Directory not found: {import_dir}") + return + + print(f"Analyzing import directory: {import_dir}\n") + print("=" * 60) + + pous = [] + gvls = [] + + for st_file in import_path.rglob("*.st"): + filename = st_file.name + + if filename.endswith('.prg.st'): + name = filename.replace('.prg.st', '') + pous.append(('Program', name, st_file)) + elif filename.endswith('.fb.st'): + name = filename.replace('.fb.st', '') + pous.append(('FunctionBlock', name, st_file)) + elif filename.endswith('.fun.st'): + name = filename.replace('.fun.st', '') + pous.append(('Function', name, st_file)) + elif filename.endswith('.gvl.st'): + name = filename.replace('.gvl.st', '') + gvls.append((name, st_file)) + + print(f"\nPOUs to import: {len(pous)}") + for pou_type, name, filepath in pous: + print(f" - {name} ({pou_type})") + # Show first few lines + with open(filepath, 'r', encoding='utf-8') as f: + lines = f.readlines()[:5] + for line in lines: + print(f" {line.rstrip()}") + print() + + print(f"\nGVLs to import: {len(gvls)}") + for name, filepath in gvls: + print(f" - {name}") + # Show content + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + print(f" {content.strip()}") + print() + + print("=" * 60) + print(f"\nTotal: {len(pous)} POUs, {len(gvls)} GVLs") + print(f"\nTo import, run inside CODESYS:") + print(f' codesys_import.py "" "{import_dir}"') + + +if __name__ == "__main__": + import sys + + if len(sys.argv) < 2: + print("Usage: python quick_import_test.py ") + print("\nExample:") + print(" python quick_import_test.py test_cross_applied_export") + sys.exit(1) + + analyze_import_directory(sys.argv[1]) + diff --git a/tests/unit/test_anomaly_monitor_subsystems.py b/tests/unit/test_anomaly_monitor_subsystems.py new file mode 100644 index 0000000..83795ce 
--- /dev/null +++ b/tests/unit/test_anomaly_monitor_subsystems.py @@ -0,0 +1,74 @@ +from anomaly_monitor import ( + _last_segment_from_tag_path, + _looks_like_live_tag_path, + derive_subsystems_for_tag, + infer_tag_group, +) + + +def test_infer_tag_group_prefers_folder_name(): + group = infer_tag_group("[default]Area1/Pump101/Speed", folder_name="LineA/Area1") + assert group == "LineA" + + +def test_infer_tag_group_from_tag_path(): + group = infer_tag_group("[default]Boiler/Feedwater/Flow") + assert group == "Boiler" + + +def test_infer_tag_group_none_for_flat_tag(): + assert infer_tag_group("[default]SingleTag") is None + + +def test_derive_subsystems_auto_with_priority(): + subsystems, primary = derive_subsystems_for_tag( + tag_meta={ + "path": "[default]Line1/PumpA/Pressure", + "folder_name": "Line1/PumpA", + "views": ["Overview/Main"], + "equipment": ["PumpA"], + }, + subsystem_mode="auto", + priority=["equipment", "view", "group"], + ) + subsystem_ids = {item["id"] for item in subsystems} + assert "equipment:pumpa" in subsystem_ids + assert "view:overview/main" in subsystem_ids + assert "group:line1" in subsystem_ids + assert primary["type"] == "equipment" + assert primary["name"] == "PumpA" + + +def test_derive_subsystems_global_mode(): + subsystems, primary = derive_subsystems_for_tag( + tag_meta={ + "path": "[default]Line1/PumpA/Pressure", + "views": ["Overview/Main"], + "equipment": ["PumpA"], + }, + subsystem_mode="global", + ) + assert subsystems == [{"type": "global", "name": "all", "id": "global:all"}] + assert primary == {"type": "global", "name": "all", "id": "global:all"} + + +def test_derive_subsystems_falls_back_to_global_when_no_ontology_links(): + subsystems, primary = derive_subsystems_for_tag( + tag_meta={"path": "[default]TagOnly"}, + subsystem_mode="auto", + ) + assert len(subsystems) == 1 + assert primary["id"] == "global:all" + + +def test_tag_path_helpers_identify_live_paths(): + assert 
_looks_like_live_tag_path("[default]Line/Pump/Speed") + assert _looks_like_live_tag_path("Line/Pump/Speed") + assert not _looks_like_live_tag_path("SimpleTagNameOnly") + assert not _looks_like_live_tag_path("{../props.value}") + + +def test_last_segment_from_tag_path(): + assert _last_segment_from_tag_path("[default]Line/Pump/Speed") == "Speed" + assert _last_segment_from_tag_path("Line/Pump/Speed") == "Speed" + assert _last_segment_from_tag_path("Speed") == "Speed" diff --git a/tests/unit/test_anomaly_rules.py b/tests/unit/test_anomaly_rules.py new file mode 100644 index 0000000..7a75c9e --- /dev/null +++ b/tests/unit/test_anomaly_rules.py @@ -0,0 +1,78 @@ +from datetime import datetime, timedelta, timezone + +import pytest + +from anomaly_rules import compute_deviation_scores, is_quality_good, is_stale, parse_timestamp + + +def test_detects_sharp_rise_and_sharp_drop(): + baseline = [50.0, 49.9, 50.1, 50.2, 49.8, 50.0, 50.1, 49.9, 50.0, 50.2] * 3 + + rise = compute_deviation_scores( + current_value=95.0, + history_values=baseline, + prev_value=52.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + drop = compute_deviation_scores( + current_value=12.0, + history_values=baseline, + prev_value=49.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + + assert rise["candidate"] + assert drop["candidate"] + + +def test_detects_flatline_stuck_pattern(): + flat = [72.0] * 30 + result = compute_deviation_scores( + current_value=72.0, + history_values=flat, + prev_value=72.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 1.0, "stuck_window_size": 20}, + ) + assert result["candidate"] + assert "flatline_detected" in result["reasons"] + assert result["category"] == "stuck" + + +@pytest.mark.parametrize( + "quality,expected", + [("Good", True), ("OK", True), ("Bad", False), (None, False)], +) +def test_quality_helper(quality, expected): + assert is_quality_good(quality) is expected + + +def test_staleness_helper(): + recent_ts = 
datetime.now(timezone.utc).isoformat() + old_ts = (datetime.now(timezone.utc) - timedelta(minutes=15)).isoformat() + assert not is_stale(recent_ts, staleness_sec=300) + assert is_stale(old_ts, staleness_sec=300) + + +def test_staleness_accepts_epoch_seconds_and_millis(): + now = datetime.now(timezone.utc) + recent = int(now.timestamp()) + recent_ms = int(now.timestamp() * 1000) + assert not is_stale(str(recent), staleness_sec=300, now=now) + assert not is_stale(str(recent_ms), staleness_sec=300, now=now) + + +def test_parse_timestamp_naive_assumed_local_time(): + local_now = datetime.now().replace(microsecond=0) + parsed = parse_timestamp(local_now.isoformat()) + assert parsed is not None + + +def test_non_numeric_current_value_is_rejected(): + result = compute_deviation_scores( + current_value="not-a-number", + history_values=[1, 2, 3, 4, 5], + prev_value=3, + ) + assert not result["candidate"] + assert result["category"] == "invalid_value" + diff --git a/tests/unit/test_ignition_api_client_parser.py b/tests/unit/test_ignition_api_client_parser.py new file mode 100644 index 0000000..2157673 --- /dev/null +++ b/tests/unit/test_ignition_api_client_parser.py @@ -0,0 +1,45 @@ +from ignition_api_client import IgnitionApiClient + + +def test_parse_tags_response_infers_timestamp_when_missing(): + paths = ["[default]Feed_Storage/Tank1_Level"] + fallback_ts = "2026-03-02T00:00:00+00:00" + payload = { + "tags": [ + { + "tagPath": paths[0], + "value": 42.5, + "quality": "Good", + } + ] + } + + rows = IgnitionApiClient._parse_tags_response(paths, payload, fallback_timestamp=fallback_ts) + assert len(rows) == 1 + assert rows[0].path == paths[0] + assert rows[0].timestamp == fallback_ts + assert rows[0].config is not None + assert rows[0].config.get("timestamp_inferred") is True + + +def test_parse_tags_response_supports_alt_keys(): + paths = ["[default]Feed_Storage/Tank1_Pressure"] + payload = { + "items": [ + { + "path": paths[0], + "v": 101.3, + "q": "Good", + "ts": 
"1710000000000", + "data_type": "Float8", + } + ] + } + + rows = IgnitionApiClient._parse_tags_response(paths, payload) + assert len(rows) == 1 + assert rows[0].path == paths[0] + assert rows[0].value == 101.3 + assert rows[0].quality == "Good" + assert rows[0].timestamp == "1710000000000" + assert rows[0].data_type == "Float8" diff --git a/tests/unit/test_ingest_siemens_parser.py b/tests/unit/test_ingest_siemens_parser.py new file mode 100644 index 0000000..935bf71 --- /dev/null +++ b/tests/unit/test_ingest_siemens_parser.py @@ -0,0 +1,72 @@ +from pathlib import Path + +from siemens_parser import SiemensSTParser + + +SAMPLE_ST = """ +NAMESPACE Plant.Process + +TYPE MotorData : STRUCT + Speed : REAL; +END_STRUCT +END_TYPE + +CLASS MotorFB +VAR_INPUT + StartCmd : BOOL; // start command +END_VAR +VAR_OUTPUT + Running : BOOL; +END_VAR +METHOD PUBLIC Execute : BOOL +VAR + tempVar : INT := 1; +END_VAR +Running := StartCmd; +END_METHOD +END_CLASS + +PROGRAM MainProgram +VAR + Counter : INT := 0; +END_VAR +Counter := Counter + 1; +END_PROGRAM + +CONFIGURATION Config1 +TASK MainTask(INTERVAL := T#100MS, PRIORITY := 1); +PROGRAM PLC_PRG WITH MainTask: MainProgram; +END_CONFIGURATION + +END_NAMESPACE +""" + + +def test_parse_structured_text_blocks(tmp_path): + st_path = Path(tmp_path) / "sample.st" + st_path.write_text(SAMPLE_ST, encoding="utf-8") + + parser = SiemensSTParser() + blocks = parser.parse_file(str(st_path)) + assert len(blocks) >= 4 + + by_name = {b.name: b for b in blocks} + assert "MotorData" in by_name + assert by_name["MotorData"].type == "UDT" + assert by_name["MotorData"].local_tags[0].name == "Speed" + + assert "MotorFB" in by_name + fb = by_name["MotorFB"] + assert fb.type == "FB" + assert any(t.name == "StartCmd" and t.direction == "INPUT" for t in fb.input_tags) + assert any(t.name == "Running" and t.direction == "OUTPUT" for t in fb.output_tags) + assert any(r["name"] == "Execute" for r in fb.routines) + + assert "MainProgram" in by_name + assert 
by_name["MainProgram"].type == "PROGRAM" + assert "Counter := Counter + 1" in by_name["MainProgram"].raw_implementation + + assert "Config1" in by_name + assert by_name["Config1"].type == "CONFIGURATION" + assert "MainTask" in by_name["Config1"].description + diff --git a/tests/unit/test_ingest_workbench_parser.py b/tests/unit/test_ingest_workbench_parser.py new file mode 100644 index 0000000..7609490 --- /dev/null +++ b/tests/unit/test_ingest_workbench_parser.py @@ -0,0 +1,119 @@ +import json +from pathlib import Path + +from workbench_parser import WorkbenchParser + + +def test_parse_workbench_project_json_with_inline_resources(tmp_path): + root = Path(tmp_path) + + # Script file expected by WorkbenchParser._read_script_file + script_file = root / "scripts" / "PlantA" / "utility" / "tags" / "code.py" + script_file.parent.mkdir(parents=True, exist_ok=True) + script_file.write_text("def read_tag():\n return 42\n", encoding="utf-8") + + data = { + "__typeName": "WorkbenchState", + "version": "1.2.3", + "root": { + "windows": [ + { + "projectName": "PlantA", + "title": "MainView", + "path": "main/view", + "windowType": "perspective", + "rootContainer": { + "meta": {"name": "Root"}, + "type": "ia.container", + "propConfig": { + "props.value": { + "binding": { + "type": "tag", + "config": { + "tagPath": "[default]Line/Speed", + "bidirectional": True, + }, + } + } + }, + "children": [], + }, + } + ], + "namedQueries": [ + { + "projectName": "PlantA", + "queryName": "GetBatches", + "folderPath": "Prod\\Ops", + "query": "SELECT * FROM batches", + } + ], + "scripts": [ + { + "projectName": "PlantA", + "path": ["utility", "tags"], + "scope": "A", + } + ], + "tags": [ + { + "name": "LineSpeed", + "type": "Opc", + "dataType": "Float8", + "opcItemPath": "[default]Line/Speed", + }, + { + "name": "BatchCount", + "type": "Memory", + "dataType": "Int4", + "value": 7, + }, + ], + "udtDefinitions": [ + { + "name": "MotorUDT", + "id": "MotorUDT", + "parameters": { + "area": 
{"dataType": "String", "value": "A1"} + }, + "members": [ + { + "name": "Run", + "type": "opc", + "dataType": "Boolean", + "opcItemPath": "[default]Motor/Run", + "serverName": {"binding": "default"}, + } + ], + } + ], + }, + } + + project_json = root / "project.json" + project_json.write_text(json.dumps(data), encoding="utf-8") + + parser = WorkbenchParser() + backup = parser.parse_file(str(project_json)) + + assert "PlantA" in backup.projects + assert len(backup.windows) == 1 + assert backup.windows[0].name == "MainView" + assert backup.windows[0].components[0].bindings[0].target == "[default]Line/Speed" + + assert len(backup.named_queries) == 1 + assert backup.named_queries[0].id == "Prod/Ops/GetBatches" + assert "SELECT" in backup.named_queries[0].query_text + + assert len(backup.scripts) == 1 + assert "return 42" in backup.scripts[0].script_text + + tag_types = {t.name: t.tag_type for t in backup.tags} + assert tag_types["LineSpeed"] == "opc" + assert tag_types["BatchCount"] == "memory" + + assert len(backup.udt_definitions) == 1 + udt = backup.udt_definitions[0] + assert "area" in udt.parameters + assert udt.members[0].server_name == "default" +