diff --git a/docs/progress.md b/docs/progress.md index 543d502..860a6bc 100644 --- a/docs/progress.md +++ b/docs/progress.md @@ -50,11 +50,12 @@ Implemented alerting management slice: - Existing simple alert conditions are shown in friendly language instead of raw condition data. - Worker honors alert rule cooldown before opening a new incident for a recently-triggered rule. -Implemented backend test coverage: +Implemented monitor and notification test coverage: - Test fixtures isolate API tests with an in-memory database and authenticated owner override. - Website monitor tests cover asset creation, default alert rule creation, TLS config persistence, and disabled default alerts. - Notification channel tests verify saved webhook URLs are encrypted and are not returned by create, list, or update responses. +- Worker scheduler tests cover alert threshold incident opening, recovery resolution, notification history deduplication, and alert cooldown behavior. ## Known Gaps @@ -69,7 +70,7 @@ Implemented backend test coverage: - Email/SMTP notifications are not implemented yet. - Graphing exists only as placeholders; metric visualization is not implemented. - Worker scheduling is simple polling, not a Redis queue yet. -- Tests still need worker notification delivery, alert evaluation, and frontend coverage. +- Tests still need frontend coverage and broader edge-case coverage across monitor types. - Production deployment hardening is not done. ## Recommended Next Work @@ -84,7 +85,7 @@ Implemented backend test coverage: 8. Add user administration UI. 9. Add graphs for website response time and monitor status history. 10. Add richer alert condition editing. -11. Add worker tests for alert evaluation and notification delivery. +11. Add frontend coverage for monitor, alert, and notification workflows. ## Operational Notes diff --git a/worker/app/scheduler.py b/worker/app/scheduler.py index 0d01b02..2d13f3a 100644 --- a/worker/app/scheduler.py +++ b/worker/app/scheduler.py @@ -154,7 +154,10 @@ class Scheduler: .order_by(Incident.opened_at.desc()) .limit(1) ) - if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now: + if ( + latest_incident is not None + and self._as_utc(latest_incident.opened_at) + timedelta(seconds=rule.cooldown_seconds) > now + ): return incident = Incident( @@ -221,6 +224,12 @@ class Scheduler: response = await client.post(url, json={"username": username, "text": message}) response.raise_for_status() + @staticmethod + def _as_utc(value: datetime) -> datetime: + if value.tzinfo is None: + return value.replace(tzinfo=UTC) + return value.astimezone(UTC) + def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str: if event_type == "resolved": title = f"RESOLVED: {monitor.name} recovered" diff --git a/worker/tests/test_scheduler.py b/worker/tests/test_scheduler.py new file mode 100644 index 0000000..eb2361d --- /dev/null +++ b/worker/tests/test_scheduler.py @@ -0,0 +1,161 @@ +import base64 +import hashlib +import unittest +from datetime import UTC, datetime, timedelta + +from cryptography.fernet import Fernet +from sqlalchemy import create_engine, select +from sqlalchemy.orm import Session, sessionmaker +from sqlalchemy.pool import StaticPool + +from app.collectors.website import WebsiteCheckResult +from app.config import settings +from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel +from app.scheduler import Scheduler + + +def encrypt_secret(value: str) -> str: + digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest() + return Fernet(base64.urlsafe_b64encode(digest)).encrypt(value.encode("utf-8")).decode("utf-8") + + +class RecordingScheduler(Scheduler): + def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None: + super().__init__() + self.results = list(results or []) + self.posts: list[dict[str, str]] = [] + + async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult: + return self.results.pop(0) + + async def _post_webhook(self, url: str, message: str, username: str) -> None: + self.posts.append({"url": url, "message": message, "username": username}) + + +class SchedulerTestCase(unittest.IsolatedAsyncioTestCase): + def setUp(self) -> None: + self.engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + Base.metadata.create_all(bind=self.engine) + self.session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False) + self.db: Session = self.session_factory() + + def tearDown(self) -> None: + self.db.close() + Base.metadata.drop_all(bind=self.engine) + self.engine.dispose() + + def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]: + monitor = Monitor( + name="Example Site", + monitor_type="http", + target="https://example.com", + config={"expected_status": 200, "timeout_seconds": 5}, + interval_seconds=60, + status="unknown", + ) + self.db.add(monitor) + self.db.flush() + + rule = AlertRule( + monitor_id=monitor.id, + name="Example Site failure", + severity="critical", + condition={"type": "status_not_up"}, + failure_threshold=failure_threshold, + cooldown_seconds=cooldown_seconds, + is_enabled=True, + ) + self.db.add(rule) + self.db.flush() + return monitor, rule + + async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None: + monitor, rule = self.create_monitor_with_rule(failure_threshold=2) + scheduler = RecordingScheduler( + [ + WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"), + WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"), + ] + ) + + await scheduler._run_monitor(self.db, monitor) + assert self.db.scalars(select(Incident)).all() == [] + + await scheduler._run_monitor(self.db, monitor) + + incident = self.db.scalar(select(Incident)) + assert incident is not None + assert incident.monitor_id == monitor.id + assert incident.alert_rule_id == rule.id + assert incident.status == "open" + assert incident.severity == "critical" + assert incident.details["last_message"] == "HTTP 500 again" + assert incident.details["failure_threshold"] == 2 + + async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None: + monitor, rule = self.create_monitor_with_rule(failure_threshold=1) + channel = NotificationChannel( + name="Ops Webhook", + channel_type="generic_webhook", + settings={"username": "OrbitalWard"}, + encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"), + is_enabled=True, + ) + self.db.add(channel) + self.db.flush() + scheduler = RecordingScheduler( + [ + WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"), + WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"), + ] + ) + + await scheduler._run_monitor(self.db, monitor) + + incident = self.db.scalar(select(Incident)) + assert incident is not None + assert incident.status == "open" + assert len(scheduler.posts) == 1 + assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward" + assert scheduler.posts[0]["username"] == "OrbitalWard" + assert incident.details["notification_history"][0]["event"] == "opened" + + await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC)) + assert len(scheduler.posts) == 1 + + await scheduler._run_monitor(self.db, monitor) + + assert incident.status == "resolved" + assert incident.resolved_at is not None + assert incident.details["recovery_message"] == "Website check passed" + assert len(scheduler.posts) == 2 + assert incident.details["notification_history"][1]["event"] == "resolved" + + async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None: + monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300) + now = datetime.now(UTC) + self.db.add( + Incident( + monitor_id=monitor.id, + alert_rule_id=rule.id, + title="Example Site is failing", + severity="critical", + status="resolved", + opened_at=now - timedelta(seconds=60), + resolved_at=now - timedelta(seconds=30), + details={}, + ) + ) + self.db.add(CheckResult(monitor_id=monitor.id, status="down", response_time_ms=100, message="HTTP 500", observed_at=now)) + monitor.status = "down" + self.db.flush() + scheduler = RecordingScheduler() + + await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500") + + open_incidents = self.db.scalars(select(Incident).where(Incident.status == "open")).all() + assert open_incidents == []