Add worker alert and notification tests

This commit is contained in:
Keith Smith
2026-05-23 19:50:13 -06:00
parent 68d5e0a705
commit 19d4c6e603
3 changed files with 175 additions and 4 deletions
+10 -1
View File
@@ -154,7 +154,10 @@ class Scheduler:
.order_by(Incident.opened_at.desc())
.limit(1)
)
if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now:
if (
latest_incident is not None
and self._as_utc(latest_incident.opened_at) + timedelta(seconds=rule.cooldown_seconds) > now
):
return
incident = Incident(
@@ -221,6 +224,12 @@ class Scheduler:
response = await client.post(url, json={"username": username, "text": message})
response.raise_for_status()
@staticmethod
def _as_utc(value: datetime) -> datetime:
if value.tzinfo is None:
return value.replace(tzinfo=UTC)
return value.astimezone(UTC)
def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str:
if event_type == "resolved":
title = f"RESOLVED: {monitor.name} recovered"
+161
View File
@@ -0,0 +1,161 @@
import base64
import hashlib
import unittest
from datetime import UTC, datetime, timedelta
from cryptography.fernet import Fernet
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import StaticPool
from app.collectors.website import WebsiteCheckResult
from app.config import settings
from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
from app.scheduler import Scheduler
def encrypt_secret(value: str) -> str:
digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest()
return Fernet(base64.urlsafe_b64encode(digest)).encrypt(value.encode("utf-8")).decode("utf-8")
class RecordingScheduler(Scheduler):
def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
super().__init__()
self.results = list(results or [])
self.posts: list[dict[str, str]] = []
async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
return self.results.pop(0)
async def _post_webhook(self, url: str, message: str, username: str) -> None:
self.posts.append({"url": url, "message": message, "username": username})
class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
def setUp(self) -> None:
self.engine = create_engine(
"sqlite://",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(bind=self.engine)
self.session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
self.db: Session = self.session_factory()
def tearDown(self) -> None:
self.db.close()
Base.metadata.drop_all(bind=self.engine)
self.engine.dispose()
def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]:
monitor = Monitor(
name="Example Site",
monitor_type="http",
target="https://example.com",
config={"expected_status": 200, "timeout_seconds": 5},
interval_seconds=60,
status="unknown",
)
self.db.add(monitor)
self.db.flush()
rule = AlertRule(
monitor_id=monitor.id,
name="Example Site failure",
severity="critical",
condition={"type": "status_not_up"},
failure_threshold=failure_threshold,
cooldown_seconds=cooldown_seconds,
is_enabled=True,
)
self.db.add(rule)
self.db.flush()
return monitor, rule
async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
scheduler = RecordingScheduler(
[
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"),
]
)
await scheduler._run_monitor(self.db, monitor)
assert self.db.scalars(select(Incident)).all() == []
await scheduler._run_monitor(self.db, monitor)
incident = self.db.scalar(select(Incident))
assert incident is not None
assert incident.monitor_id == monitor.id
assert incident.alert_rule_id == rule.id
assert incident.status == "open"
assert incident.severity == "critical"
assert incident.details["last_message"] == "HTTP 500 again"
assert incident.details["failure_threshold"] == 2
async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=1)
channel = NotificationChannel(
name="Ops Webhook",
channel_type="generic_webhook",
settings={"username": "OrbitalWard"},
encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"),
is_enabled=True,
)
self.db.add(channel)
self.db.flush()
scheduler = RecordingScheduler(
[
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"),
]
)
await scheduler._run_monitor(self.db, monitor)
incident = self.db.scalar(select(Incident))
assert incident is not None
assert incident.status == "open"
assert len(scheduler.posts) == 1
assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward"
assert scheduler.posts[0]["username"] == "OrbitalWard"
assert incident.details["notification_history"][0]["event"] == "opened"
await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC))
assert len(scheduler.posts) == 1
await scheduler._run_monitor(self.db, monitor)
assert incident.status == "resolved"
assert incident.resolved_at is not None
assert incident.details["recovery_message"] == "Website check passed"
assert len(scheduler.posts) == 2
assert incident.details["notification_history"][1]["event"] == "resolved"
async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
now = datetime.now(UTC)
self.db.add(
Incident(
monitor_id=monitor.id,
alert_rule_id=rule.id,
title="Example Site is failing",
severity="critical",
status="resolved",
opened_at=now - timedelta(seconds=60),
resolved_at=now - timedelta(seconds=30),
details={},
)
)
self.db.add(CheckResult(monitor_id=monitor.id, status="down", response_time_ms=100, message="HTTP 500", observed_at=now))
monitor.status = "down"
self.db.flush()
scheduler = RecordingScheduler()
await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")
open_incidents = self.db.scalars(select(Incident).where(Incident.status == "open")).all()
assert open_incidents == []