From 5c9f93692af5323ec278878ac3952cba81f32e54 Mon Sep 17 00:00:00 2001 From: Keith Smith Date: Sat, 23 May 2026 16:08:27 -0600 Subject: [PATCH] Add alert rule editing UI --- docs/agent-handoff.md | 2 +- docs/progress.md | 31 ++- frontend/src/api/client.ts | 8 + frontend/src/app/App.tsx | 2 +- frontend/src/pages/AlertsPage.tsx | 257 ++++++++++++++++++++++- frontend/src/pages/NetworkChecksPage.tsx | 16 +- frontend/src/pages/NotificationsPage.tsx | 21 +- frontend/src/pages/WebsitesPage.tsx | 16 +- frontend/src/types/api.ts | 21 ++ worker/app/scheduler.py | 13 ++ 10 files changed, 340 insertions(+), 47 deletions(-) diff --git a/docs/agent-handoff.md b/docs/agent-handoff.md index abcd200..d26ce29 100644 --- a/docs/agent-handoff.md +++ b/docs/agent-handoff.md @@ -76,7 +76,7 @@ Issue source docs: - `docs/progress.md` - `docs/roadmap.md` -Current completed items include TLS expiry monitor support, HTTP/website checks, ping and TCP port checks, basic alert evaluation, incident actions, and webhook notification channels. The next recommended implementation issue is alert rule editing UI, followed by guided SNMP discovery and monitor selection. +Current completed items include TLS expiry monitor support, HTTP/website checks, ping and TCP port checks, basic alert evaluation, alert rule editing UI, incident actions, and webhook notification channels. The next recommended implementation issue is SNMP credential profiles and guided SNMP discovery, followed by SNMP monitor selection. ## Guardrails diff --git a/docs/progress.md b/docs/progress.md index e273512..1cdb667 100644 --- a/docs/progress.md +++ b/docs/progress.md @@ -41,13 +41,22 @@ Implemented notification slice: - Worker sends incident open and recovery notifications. - Notification state/history is stored in incident details to avoid duplicate sends. +Implemented alerting management slice: + +- Alerts page lists alert rules separately from incidents. +- Alert rules can be enabled, disabled, and edited from the UI. 
+- Editable alert rule fields include friendly name, severity, failure threshold, and cooldown. +- HTTPS website alert rules expose TLS certificate expiry check and warning-day controls. +- Existing simple alert conditions are shown in friendly language instead of raw condition data. +- Worker honors the alert rule cooldown: it does not open a new incident while the rule's most recent incident was opened within the cooldown window (a cooldown of 0 disables this check). + ## Known Gaps - Credential vault UI and real credential encryption workflows are not complete. - Audit logging tables exist, but events are not consistently written yet. - User management UI is not implemented. - Role management is basic and needs full admin flows. -- Alert rule editing UI is not implemented. +- Richer alert condition editing is not implemented yet. - Guided SNMP device discovery and friendly SNMP monitor selection are not implemented yet. - SNMP credential profiles, interface status, traffic counters, errors, uptime, CPU, and memory checks are not implemented yet. - Notification routing/policies are not implemented; all enabled webhook channels receive incident notifications. @@ -59,16 +68,16 @@ Implemented notification slice: ## Recommended Next Work -1. Add alert rule editing UI and richer alert conditions. -2. Add SNMP credential profiles and guided SNMP device discovery. -3. Add SNMP discovery selection UI to choose what to monitor and alert on. -4. Add SNMP interface status, traffic, errors, uptime, CPU, and memory collection. -5. Add notification policy/routing controls. -6. Add email/SMTP notification channel. -7. Add audit event writes for auth, monitor, credential, notification, and incident actions. -8. Build credential vault UI with masked secret handling. -9. Add user administration UI. -10. Add graphs for website response time and monitor status history. +1. Add SNMP credential profiles and guided SNMP device discovery. +2. Add SNMP discovery selection UI to choose what to monitor and alert on. +3. 
Add SNMP interface status, traffic, errors, uptime, CPU, and memory collection. +4. Add notification policy/routing controls. +5. Add email/SMTP notification channel. +6. Add audit event writes for auth, monitor, credential, notification, and incident actions. +7. Build credential vault UI with masked secret handling. +8. Add user administration UI. +9. Add graphs for website response time and monitor status history. +10. Add richer alert condition editing. 11. Add backend and worker tests for the website-monitor and notification flows. ## Operational Notes diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 61c1d1f..a7c26af 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -1,4 +1,6 @@ import type { + AlertRule, + AlertRuleUpdate, Asset, CheckResult, Incident, @@ -85,6 +87,12 @@ export const api = { }), monitorResults: (token: string, monitorId: number, limit = 1) => request(`/monitors/${monitorId}/results?limit=${limit}`, token), + alertRules: (token: string) => request("/alerts/rules", token), + updateAlertRule: (token: string, ruleId: number, payload: AlertRuleUpdate) => + request(`/alerts/rules/${ruleId}`, token, { + method: "PATCH", + body: JSON.stringify(payload), + }), incidents: (token: string) => request("/incidents", token), acknowledgeIncident: (token: string, incidentId: number) => request(`/incidents/${incidentId}/acknowledge`, token, { diff --git a/frontend/src/app/App.tsx b/frontend/src/app/App.tsx index 2fbabb8..1fea0a6 100644 --- a/frontend/src/app/App.tsx +++ b/frontend/src/app/App.tsx @@ -84,7 +84,7 @@ export function App() { ) : null} {page === "alerts" ? ( - + ) : null} {page === "discovery" ? : null} {page === "graphs" ? 
: null} diff --git a/frontend/src/pages/AlertsPage.tsx b/frontend/src/pages/AlertsPage.tsx index 2bcad0c..5eb2b02 100644 --- a/frontend/src/pages/AlertsPage.tsx +++ b/frontend/src/pages/AlertsPage.tsx @@ -1,22 +1,52 @@ -import { useState } from "react"; -import { AlertTriangle, BellOff, CheckCheck, RefreshCw } from "lucide-react"; +import { FormEvent, useEffect, useMemo, useState } from "react"; +import { AlertTriangle, BellOff, CheckCheck, Pencil, RefreshCw, Save, ShieldAlert, X } from "lucide-react"; import { api } from "../api/client"; import { Button } from "../components/Button"; -import type { Incident } from "../types/api"; +import type { AlertRule, Incident, Monitor } from "../types/api"; interface AlertsPageProps { token: string; + monitors: Monitor[]; incidents: Incident[]; selectedIncidentId?: number | null; onChanged: () => Promise; } -export function AlertsPage({ token, incidents, selectedIncidentId, onChanged }: AlertsPageProps) { - const [busyId, setBusyId] = useState(null); +export function AlertsPage({ token, monitors, incidents, selectedIncidentId, onChanged }: AlertsPageProps) { + const [rules, setRules] = useState([]); + const [busyIncidentId, setBusyIncidentId] = useState(null); + const [busyRuleId, setBusyRuleId] = useState(null); + const [editingRuleId, setEditingRuleId] = useState(null); + const [ruleName, setRuleName] = useState(""); + const [severity, setSeverity] = useState("warning"); + const [failureThreshold, setFailureThreshold] = useState(3); + const [cooldownSeconds, setCooldownSeconds] = useState(300); + const [enabled, setEnabled] = useState(true); + const [tlsCheckEnabled, setTlsCheckEnabled] = useState(false); + const [tlsWarningDays, setTlsWarningDays] = useState(30); + const [savingRule, setSavingRule] = useState(false); + const [message, setMessage] = useState(null); + + const monitorById = useMemo(() => new Map(monitors.map((monitor) => [monitor.id, monitor])), [monitors]); + const editingRule = rules.find((rule) => rule.id 
=== editingRuleId) ?? null; + const editingMonitor = editingRule ? (monitorById.get(editingRule.monitor_id) ?? null) : null; + const editingHttpsMonitor = editingMonitor && isHttpsWebsiteMonitor(editingMonitor) ? editingMonitor : null; + + async function refreshRules() { + setRules(await api.alertRules(token)); + } + + async function refreshAll() { + await Promise.all([onChanged(), refreshRules()]); + } + + useEffect(() => { + refreshRules().catch(() => setRules([])); + }, [token]); async function runAction(incidentId: number, action: "ack" | "silence") { - setBusyId(incidentId); + setBusyIncidentId(incidentId); try { if (action === "ack") { await api.acknowledgeIncident(token, incidentId); @@ -25,7 +55,78 @@ export function AlertsPage({ token, incidents, selectedIncidentId, onChanged }: } await onChanged(); } finally { - setBusyId(null); + setBusyIncidentId(null); + } + } + + function startEdit(rule: AlertRule) { + setEditingRuleId(rule.id); + setRuleName(rule.name); + setSeverity(rule.severity); + setFailureThreshold(rule.failure_threshold); + setCooldownSeconds(rule.cooldown_seconds); + setEnabled(rule.is_enabled); + const monitor = monitorById.get(rule.monitor_id); + setTlsCheckEnabled(Boolean(monitor?.config?.check_tls_expiry ?? false)); + setTlsWarningDays(Number(monitor?.config?.tls_warning_days ?? 30)); + setMessage(null); + } + + function resetRuleForm() { + setEditingRuleId(null); + setRuleName(""); + setSeverity("warning"); + setFailureThreshold(3); + setCooldownSeconds(300); + setEnabled(true); + setTlsCheckEnabled(false); + setTlsWarningDays(30); + } + + async function saveRule(event: FormEvent) { + event.preventDefault(); + if (!editingRule) return; + + setSavingRule(true); + setMessage(null); + try { + await Promise.all([ + api.updateAlertRule(token, editingRule.id, { + name: ruleName, + severity, + failure_threshold: failureThreshold, + cooldown_seconds: cooldownSeconds, + is_enabled: enabled, + }), + editingHttpsMonitor + ? 
api.updateMonitor(token, editingHttpsMonitor.id, { + config: { + ...(editingHttpsMonitor.config ?? {}), + check_tls_expiry: tlsCheckEnabled, + tls_warning_days: tlsWarningDays, + }, + }) + : Promise.resolve(), + ]); + resetRuleForm(); + await refreshAll(); + } catch (err) { + setMessage(err instanceof Error ? err.message : "Could not save alert rule"); + } finally { + setSavingRule(false); + } + } + + async function toggleRule(rule: AlertRule) { + setBusyRuleId(rule.id); + setMessage(null); + try { + await api.updateAlertRule(token, rule.id, { is_enabled: !rule.is_enabled }); + await refreshRules(); + } catch (err) { + setMessage(err instanceof Error ? err.message : "Could not update alert rule"); + } finally { + setBusyRuleId(null); } } @@ -34,14 +135,129 @@ export function AlertsPage({ token, incidents, selectedIncidentId, onChanged }:

Alerts

-

Open incidents, acknowledgements, silences, recoveries, and notification history.

+

Alert rules, open incidents, acknowledgements, silences, recoveries, and notification history.

-
+
+
+
+ +

{editingRule ? "Edit Alert Rule" : "Alert Rule Settings"}

+
+ + {editingRule ? ( + <> +
+
{editingMonitor?.name ?? `Monitor ${editingRule.monitor_id}`}
+
{describeCondition(editingRule.condition, editingMonitor)}
+
+ + + + + +
+ + +
+ +
+ Enabled + setEnabled(event.target.checked)} type="checkbox" /> +
+ + {editingHttpsMonitor ? ( +
+
TLS Certificate
+
+
+ Expiry Check + setTlsCheckEnabled(event.target.checked)} type="checkbox" /> +
+ +
+
+ ) : null} + + {message ?
{message}
: null} + +
+ + +
+ + ) : ( +
Select a rule from the list to edit severity, threshold, cooldown, or enabled state.
+ )} +
+ +
+
+

Alert Rules

+
+
+ {rules.length ? ( + rules.map((rule) => { + const monitor = monitorById.get(rule.monitor_id); + return ( +
+
+
{rule.name}
+
{monitor?.name ?? `Monitor ${rule.monitor_id}`}
+
+ {describeCondition(rule.condition, monitor)} after {rule.failure_threshold} failed {rule.failure_threshold === 1 ? "check" : "checks"}, cooldown {formatDuration(rule.cooldown_seconds)} +
+
+ + +
+ + +
+
+ ); + }) + ) : ( +
No alert rules yet.
+ )} +
+
+
+
Incident
@@ -78,11 +294,11 @@ export function AlertsPage({ token, incidents, selectedIncidentId, onChanged }: {incident.silenced_until ?
Silenced
: null}
- - @@ -99,6 +315,25 @@ export function AlertsPage({ token, incidents, selectedIncidentId, onChanged }: ); } +function describeCondition(condition: Record, monitor?: Monitor | null) { + if (condition.type === "status_not_up" && monitor?.monitor_type === "http" && monitor.config?.check_tls_expiry) { + return `Website failure or TLS certificate inside ${String(monitor.config.tls_warning_days ?? 30)}-day warning window`; + } + if (condition.type === "status_not_up") return "Monitor is not up"; + return "Custom condition"; +} + +function isHttpsWebsiteMonitor(monitor: Monitor) { + return monitor.monitor_type === "http" && monitor.target.trim().toLowerCase().startsWith("https://"); +} + +function formatDuration(seconds: number) { + if (seconds === 0) return "off"; + if (seconds % 3600 === 0) return `${seconds / 3600}h`; + if (seconds % 60 === 0) return `${seconds / 60}m`; + return `${seconds}s`; +} + function Badge({ value, tone }: { value: string; tone: "critical" | "warning" | "ok" | "neutral" }) { const classes = { critical: "border-red-500/40 bg-red-950/40 text-red-200", diff --git a/frontend/src/pages/NetworkChecksPage.tsx b/frontend/src/pages/NetworkChecksPage.tsx index dc0a0d5..b8ef97e 100644 --- a/frontend/src/pages/NetworkChecksPage.tsx +++ b/frontend/src/pages/NetworkChecksPage.tsx @@ -1,5 +1,5 @@ import { FormEvent, useState } from "react"; -import { Activity, Edit3, PlugZap, Plus, RefreshCw, Trash2, X } from "lucide-react"; +import { Activity, Pencil, PlugZap, Plus, RefreshCw, Trash2, X } from "lucide-react"; import { api } from "../api/client"; import { Button } from "../components/Button"; @@ -205,20 +205,22 @@ export function NetworkChecksPage({ token, monitors, onChanged }: NetworkChecksP
{networkChecks.length ? ( networkChecks.map((monitor) => ( -
+
{monitor.name}
{monitor.target}
{monitor.monitor_type} -
+
{monitor.last_checked_at ? new Date(monitor.last_checked_at).toLocaleTimeString() : "Not checked"}
- -
diff --git a/frontend/src/pages/NotificationsPage.tsx b/frontend/src/pages/NotificationsPage.tsx index 8175490..d96ebbd 100644 --- a/frontend/src/pages/NotificationsPage.tsx +++ b/frontend/src/pages/NotificationsPage.tsx @@ -1,5 +1,5 @@ import { FormEvent, useEffect, useState } from "react"; -import { Bell, Edit3, RefreshCw, Send, Trash2, X } from "lucide-react"; +import { Bell, Pencil, RefreshCw, Send, Trash2, X } from "lucide-react"; import { api } from "../api/client"; import { Button } from "../components/Button"; @@ -177,7 +177,7 @@ export function NotificationsPage({ token }: NotificationsPageProps) {
{channels.length ? ( channels.map((channel) => ( -
+
{channel.name}
{String(channel.settings.username || "OrbitalWard")}
@@ -185,15 +185,18 @@ export function NotificationsPage({ token }: NotificationsPageProps) {
{channel.channel_type}
-
- - -
diff --git a/frontend/src/pages/WebsitesPage.tsx b/frontend/src/pages/WebsitesPage.tsx index 668c169..205b765 100644 --- a/frontend/src/pages/WebsitesPage.tsx +++ b/frontend/src/pages/WebsitesPage.tsx @@ -1,5 +1,5 @@ import { FormEvent, useEffect, useState } from "react"; -import { Edit3, Globe2, Plus, RefreshCw, Trash2, X } from "lucide-react"; +import { Globe2, Pencil, Plus, RefreshCw, Trash2, X } from "lucide-react"; import { api } from "../api/client"; import { Button } from "../components/Button"; @@ -233,7 +233,7 @@ export function WebsitesPage({ token, monitors, onCreated }: WebsitesPageProps)
{websites.length ? ( websites.map((monitor) => ( -
+
{monitor.name}
{monitor.target}
@@ -241,13 +241,15 @@ export function WebsitesPage({ token, monitors, onCreated }: WebsitesPageProps) {monitor.config?.check_tls_expiry ?
TLS warning at {String(monitor.config.tls_warning_days ?? 30)} days
: null}
-
+
{monitor.last_checked_at ? new Date(monitor.last_checked_at).toLocaleTimeString() : "Not checked"}
- -
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index b4585d9..3484e64 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -57,6 +57,27 @@ export interface Incident { details: Record; } +export interface AlertRule { + id: number; + monitor_id: number; + name: string; + severity: string; + condition: Record; + failure_threshold: number; + cooldown_seconds: number; + is_enabled: boolean; + created_at: string; + updated_at: string; +} + +export interface AlertRuleUpdate { + name?: string; + severity?: string; + failure_threshold?: number; + cooldown_seconds?: number; + is_enabled?: boolean; +} + export interface NotificationChannel { id: number; name: string; diff --git a/worker/app/scheduler.py b/worker/app/scheduler.py index 4a66380..0d01b02 100644 --- a/worker/app/scheduler.py +++ b/worker/app/scheduler.py @@ -144,6 +144,19 @@ class Scheduler: ) threshold_met = len(recent_statuses) >= rule.failure_threshold and all(status != "up" for status in recent_statuses) if threshold_met and open_incident is None: + if rule.cooldown_seconds > 0: + latest_incident = db.scalar( + select(Incident) + .where( + Incident.monitor_id == monitor.id, + Incident.alert_rule_id == rule.id, + ) + .order_by(Incident.opened_at.desc()) + .limit(1) + ) + if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now: + return + incident = Incident( asset_id=monitor.asset_id, monitor_id=monitor.id,