Add worker alert and notification tests

This commit is contained in:
Keith Smith
2026-05-23 19:50:13 -06:00
parent 68d5e0a705
commit 19d4c6e603
3 changed files with 175 additions and 4 deletions
+4 -3
View File
@@ -50,11 +50,12 @@ Implemented alerting management slice:
- Existing simple alert conditions are shown in friendly language instead of raw condition data.
- Worker honors alert rule cooldown before opening a new incident for a recently-triggered rule.
Implemented backend test coverage:
Implemented monitor and notification test coverage:
- Test fixtures isolate API tests with an in-memory database and authenticated owner override.
- Website monitor tests cover asset creation, default alert rule creation, TLS config persistence, and disabled default alerts.
- Notification channel tests verify saved webhook URLs are encrypted and are not returned by create, list, or update responses.
- Worker scheduler tests cover alert threshold incident opening, recovery resolution, notification history deduplication, and alert cooldown behavior.
## Known Gaps
@@ -69,7 +70,7 @@ Implemented backend test coverage:
- Email/SMTP notifications are not implemented yet.
- Graphing exists only as placeholders; metric visualization is not implemented.
- Worker scheduling is simple polling, not a Redis queue yet.
- Tests still need worker notification delivery, alert evaluation, and frontend coverage.
- Tests still need frontend coverage and broader edge-case coverage across monitor types.
- Production deployment hardening is not done.
## Recommended Next Work
@@ -84,7 +85,7 @@ Implemented backend test coverage:
8. Add user administration UI.
9. Add graphs for website response time and monitor status history.
10. Add richer alert condition editing.
11. Add worker tests for alert evaluation and notification delivery.
11. Add frontend coverage for monitor, alert, and notification workflows.
## Operational Notes
+10 -1
View File
@@ -154,7 +154,10 @@ class Scheduler:
.order_by(Incident.opened_at.desc())
.limit(1)
)
if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now:
if (
latest_incident is not None
and self._as_utc(latest_incident.opened_at) + timedelta(seconds=rule.cooldown_seconds) > now
):
return
incident = Incident(
@@ -221,6 +224,12 @@ class Scheduler:
response = await client.post(url, json={"username": username, "text": message})
response.raise_for_status()
@staticmethod
def _as_utc(value: datetime) -> datetime:
if value.tzinfo is None:
return value.replace(tzinfo=UTC)
return value.astimezone(UTC)
def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str:
if event_type == "resolved":
title = f"RESOLVED: {monitor.name} recovered"
+161
View File
@@ -0,0 +1,161 @@
import base64
import hashlib
import unittest
from datetime import UTC, datetime, timedelta
from cryptography.fernet import Fernet
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import StaticPool
from app.collectors.website import WebsiteCheckResult
from app.config import settings
from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
from app.scheduler import Scheduler
def encrypt_secret(value: str) -> str:
digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest()
return Fernet(base64.urlsafe_b64encode(digest)).encrypt(value.encode("utf-8")).decode("utf-8")
class RecordingScheduler(Scheduler):
def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
super().__init__()
self.results = list(results or [])
self.posts: list[dict[str, str]] = []
async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
return self.results.pop(0)
async def _post_webhook(self, url: str, message: str, username: str) -> None:
self.posts.append({"url": url, "message": message, "username": username})
class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
def setUp(self) -> None:
self.engine = create_engine(
"sqlite://",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(bind=self.engine)
self.session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
self.db: Session = self.session_factory()
def tearDown(self) -> None:
self.db.close()
Base.metadata.drop_all(bind=self.engine)
self.engine.dispose()
def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]:
monitor = Monitor(
name="Example Site",
monitor_type="http",
target="https://example.com",
config={"expected_status": 200, "timeout_seconds": 5},
interval_seconds=60,
status="unknown",
)
self.db.add(monitor)
self.db.flush()
rule = AlertRule(
monitor_id=monitor.id,
name="Example Site failure",
severity="critical",
condition={"type": "status_not_up"},
failure_threshold=failure_threshold,
cooldown_seconds=cooldown_seconds,
is_enabled=True,
)
self.db.add(rule)
self.db.flush()
return monitor, rule
async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
scheduler = RecordingScheduler(
[
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"),
]
)
await scheduler._run_monitor(self.db, monitor)
assert self.db.scalars(select(Incident)).all() == []
await scheduler._run_monitor(self.db, monitor)
incident = self.db.scalar(select(Incident))
assert incident is not None
assert incident.monitor_id == monitor.id
assert incident.alert_rule_id == rule.id
assert incident.status == "open"
assert incident.severity == "critical"
assert incident.details["last_message"] == "HTTP 500 again"
assert incident.details["failure_threshold"] == 2
async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=1)
channel = NotificationChannel(
name="Ops Webhook",
channel_type="generic_webhook",
settings={"username": "OrbitalWard"},
encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"),
is_enabled=True,
)
self.db.add(channel)
self.db.flush()
scheduler = RecordingScheduler(
[
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"),
]
)
await scheduler._run_monitor(self.db, monitor)
incident = self.db.scalar(select(Incident))
assert incident is not None
assert incident.status == "open"
assert len(scheduler.posts) == 1
assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward"
assert scheduler.posts[0]["username"] == "OrbitalWard"
assert incident.details["notification_history"][0]["event"] == "opened"
await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC))
assert len(scheduler.posts) == 1
await scheduler._run_monitor(self.db, monitor)
assert incident.status == "resolved"
assert incident.resolved_at is not None
assert incident.details["recovery_message"] == "Website check passed"
assert len(scheduler.posts) == 2
assert incident.details["notification_history"][1]["event"] == "resolved"
async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
now = datetime.now(UTC)
self.db.add(
Incident(
monitor_id=monitor.id,
alert_rule_id=rule.id,
title="Example Site is failing",
severity="critical",
status="resolved",
opened_at=now - timedelta(seconds=60),
resolved_at=now - timedelta(seconds=30),
details={},
)
)
self.db.add(CheckResult(monitor_id=monitor.id, status="down", response_time_ms=100, message="HTTP 500", observed_at=now))
monitor.status = "down"
self.db.flush()
scheduler = RecordingScheduler()
await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")
open_incidents = self.db.scalars(select(Incident).where(Incident.status == "open")).all()
assert open_incidents == []