Add worker alert and notification tests

2026-05-23 19:50:13 -06:00
parent 68d5e0a705
commit 19d4c6e603
3 changed files with 175 additions and 4 deletions
@@ -50,11 +50,12 @@ Implemented alerting management slice:
 - Existing simple alert conditions are shown in friendly language instead of raw condition data.
 - Worker honors alert rule cooldown before opening a new incident for a recently-triggered rule.

-Implemented backend test coverage:
+Implemented monitor and notification test coverage:

 - Test fixtures isolate API tests with an in-memory database and authenticated owner override.
 - Website monitor tests cover asset creation, default alert rule creation, TLS config persistence, and disabled default alerts.
 - Notification channel tests verify saved webhook URLs are encrypted and are not returned by create, list, or update responses.
+- Worker scheduler tests cover alert threshold incident opening, recovery resolution, notification history deduplication, and alert cooldown behavior.

 ## Known Gaps

@@ -69,7 +70,7 @@ Implemented backend test coverage:
 - Email/SMTP notifications are not implemented yet.
 - Graphing exists only as placeholders; metric visualization is not implemented.
 - Worker scheduling is simple polling, not a Redis queue yet.
-- Tests still need worker notification delivery, alert evaluation, and frontend coverage.
+- Tests still need frontend coverage and broader edge-case coverage across monitor types.
 - Production deployment hardening is not done.

 ## Recommended Next Work
@@ -84,7 +85,7 @@ Implemented backend test coverage:
 8. Add user administration UI.
 9. Add graphs for website response time and monitor status history.
 10. Add richer alert condition editing.
-11. Add worker tests for alert evaluation and notification delivery.
+11. Add frontend coverage for monitor, alert, and notification workflows.

 ## Operational Notes

@@ -154,7 +154,10 @@ class Scheduler:
                    .order_by(Incident.opened_at.desc())
                    .limit(1)
                )
-                if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now:
+                if (
+                    latest_incident is not None
+                    and self._as_utc(latest_incident.opened_at) + timedelta(seconds=rule.cooldown_seconds) > now
+                ):
                    return

            incident = Incident(
@@ -221,6 +224,12 @@ class Scheduler:
            response = await client.post(url, json={"username": username, "text": message})
            response.raise_for_status()

+    @staticmethod
+    def _as_utc(value: datetime) -> datetime:
+        if value.tzinfo is None:
+            return value.replace(tzinfo=UTC)
+        return value.astimezone(UTC)
+
    def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str:
        if event_type == "resolved":
            title = f"RESOLVED: {monitor.name} recovered"
@@ -0,0 +1,161 @@
+import base64
+import hashlib
+import unittest
+from datetime import UTC, datetime, timedelta
+
+from cryptography.fernet import Fernet
+from sqlalchemy import create_engine, select
+from sqlalchemy.orm import Session, sessionmaker
+from sqlalchemy.pool import StaticPool
+
+from app.collectors.website import WebsiteCheckResult
+from app.config import settings
+from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
+from app.scheduler import Scheduler
+
+
+def encrypt_secret(value: str) -> str:
+    digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest()
+    return Fernet(base64.urlsafe_b64encode(digest)).encrypt(value.encode("utf-8")).decode("utf-8")
+
+
+class RecordingScheduler(Scheduler):
+    def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
+        super().__init__()
+        self.results = list(results or [])
+        self.posts: list[dict[str, str]] = []
+
+    async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
+        return self.results.pop(0)
+
+    async def _post_webhook(self, url: str, message: str, username: str) -> None:
+        self.posts.append({"url": url, "message": message, "username": username})
+
+
+class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
+    def setUp(self) -> None:
+        self.engine = create_engine(
+            "sqlite://",
+            connect_args={"check_same_thread": False},
+            poolclass=StaticPool,
+        )
+        Base.metadata.create_all(bind=self.engine)
+        self.session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
+        self.db: Session = self.session_factory()
+
+    def tearDown(self) -> None:
+        self.db.close()
+        Base.metadata.drop_all(bind=self.engine)
+        self.engine.dispose()
+
+    def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]:
+        monitor = Monitor(
+            name="Example Site",
+            monitor_type="http",
+            target="https://example.com",
+            config={"expected_status": 200, "timeout_seconds": 5},
+            interval_seconds=60,
+            status="unknown",
+        )
+        self.db.add(monitor)
+        self.db.flush()
+
+        rule = AlertRule(
+            monitor_id=monitor.id,
+            name="Example Site failure",
+            severity="critical",
+            condition={"type": "status_not_up"},
+            failure_threshold=failure_threshold,
+            cooldown_seconds=cooldown_seconds,
+            is_enabled=True,
+        )
+        self.db.add(rule)
+        self.db.flush()
+        return monitor, rule
+
+    async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
+        monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
+        scheduler = RecordingScheduler(
+            [
+                WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
+                WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"),
+            ]
+        )
+
+        await scheduler._run_monitor(self.db, monitor)
+        assert self.db.scalars(select(Incident)).all() == []
+
+        await scheduler._run_monitor(self.db, monitor)
+
+        incident = self.db.scalar(select(Incident))
+        assert incident is not None
+        assert incident.monitor_id == monitor.id
+        assert incident.alert_rule_id == rule.id
+        assert incident.status == "open"
+        assert incident.severity == "critical"
+        assert incident.details["last_message"] == "HTTP 500 again"
+        assert incident.details["failure_threshold"] == 2
+
+    async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
+        monitor, rule = self.create_monitor_with_rule(failure_threshold=1)
+        channel = NotificationChannel(
+            name="Ops Webhook",
+            channel_type="generic_webhook",
+            settings={"username": "OrbitalWard"},
+            encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"),
+            is_enabled=True,
+        )
+        self.db.add(channel)
+        self.db.flush()
+        scheduler = RecordingScheduler(
+            [
+                WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
+                WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"),
+            ]
+        )
+
+        await scheduler._run_monitor(self.db, monitor)
+
+        incident = self.db.scalar(select(Incident))
+        assert incident is not None
+        assert incident.status == "open"
+        assert len(scheduler.posts) == 1
+        assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward"
+        assert scheduler.posts[0]["username"] == "OrbitalWard"
+        assert incident.details["notification_history"][0]["event"] == "opened"
+
+        await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC))
+        assert len(scheduler.posts) == 1
+
+        await scheduler._run_monitor(self.db, monitor)
+
+        assert incident.status == "resolved"
+        assert incident.resolved_at is not None
+        assert incident.details["recovery_message"] == "Website check passed"
+        assert len(scheduler.posts) == 2
+        assert incident.details["notification_history"][1]["event"] == "resolved"
+
+    async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
+        monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
+        now = datetime.now(UTC)
+        self.db.add(
+            Incident(
+                monitor_id=monitor.id,
+                alert_rule_id=rule.id,
+                title="Example Site is failing",
+                severity="critical",
+                status="resolved",
+                opened_at=now - timedelta(seconds=60),
+                resolved_at=now - timedelta(seconds=30),
+                details={},
+            )
+        )
+        self.db.add(CheckResult(monitor_id=monitor.id, status="down", response_time_ms=100, message="HTTP 500", observed_at=now))
+        monitor.status = "down"
+        self.db.flush()
+        scheduler = RecordingScheduler()
+
+        await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")
+
+        open_incidents = self.db.scalars(select(Incident).where(Incident.status == "open")).all()
+        assert open_incidents == []