Add worker alert and notification tests
This commit is contained in:
+4
-3
@@ -50,11 +50,12 @@ Implemented alerting management slice:
|
||||
- Existing simple alert conditions are shown in friendly language instead of raw condition data.
|
||||
- Worker honors alert rule cooldown before opening a new incident for a recently-triggered rule.
|
||||
|
||||
Implemented backend test coverage:
|
||||
Implemented monitor and notification test coverage:
|
||||
|
||||
- Test fixtures isolate API tests with an in-memory database and authenticated owner override.
|
||||
- Website monitor tests cover asset creation, default alert rule creation, TLS config persistence, and disabled default alerts.
|
||||
- Notification channel tests verify saved webhook URLs are encrypted and are not returned by create, list, or update responses.
|
||||
- Worker scheduler tests cover alert threshold incident opening, recovery resolution, notification history deduplication, and alert cooldown behavior.
|
||||
|
||||
## Known Gaps
|
||||
|
||||
@@ -69,7 +70,7 @@ Implemented backend test coverage:
|
||||
- Email/SMTP notifications are not implemented yet.
|
||||
- Graphing exists only as placeholders; metric visualization is not implemented.
|
||||
- Worker scheduling is simple polling, not a Redis queue yet.
|
||||
- Tests still need worker notification delivery, alert evaluation, and frontend coverage.
|
||||
- Tests still need frontend coverage and broader edge-case coverage across monitor types.
|
||||
- Production deployment hardening is not done.
|
||||
|
||||
## Recommended Next Work
|
||||
@@ -84,7 +85,7 @@ Implemented backend test coverage:
|
||||
8. Add user administration UI.
|
||||
9. Add graphs for website response time and monitor status history.
|
||||
10. Add richer alert condition editing.
|
||||
11. Add worker tests for alert evaluation and notification delivery.
|
||||
11. Add frontend coverage for monitor, alert, and notification workflows.
|
||||
|
||||
## Operational Notes
|
||||
|
||||
|
||||
+10
-1
@@ -154,7 +154,10 @@ class Scheduler:
|
||||
.order_by(Incident.opened_at.desc())
|
||||
.limit(1)
|
||||
)
|
||||
if latest_incident is not None and latest_incident.opened_at + timedelta(seconds=rule.cooldown_seconds) > now:
|
||||
if (
|
||||
latest_incident is not None
|
||||
and self._as_utc(latest_incident.opened_at) + timedelta(seconds=rule.cooldown_seconds) > now
|
||||
):
|
||||
return
|
||||
|
||||
incident = Incident(
|
||||
@@ -221,6 +224,12 @@ class Scheduler:
|
||||
response = await client.post(url, json={"username": username, "text": message})
|
||||
response.raise_for_status()
|
||||
|
||||
@staticmethod
|
||||
def _as_utc(value: datetime) -> datetime:
|
||||
if value.tzinfo is None:
|
||||
return value.replace(tzinfo=UTC)
|
||||
return value.astimezone(UTC)
|
||||
|
||||
def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str:
|
||||
if event_type == "resolved":
|
||||
title = f"RESOLVED: {monitor.name} recovered"
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import unittest
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
from cryptography.fernet import Fernet
|
||||
from sqlalchemy import create_engine, select
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
from sqlalchemy.pool import StaticPool
|
||||
|
||||
from app.collectors.website import WebsiteCheckResult
|
||||
from app.config import settings
|
||||
from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
|
||||
from app.scheduler import Scheduler
|
||||
|
||||
|
||||
def encrypt_secret(value: str) -> str:
|
||||
digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest()
|
||||
return Fernet(base64.urlsafe_b64encode(digest)).encrypt(value.encode("utf-8")).decode("utf-8")
|
||||
|
||||
|
||||
class RecordingScheduler(Scheduler):
|
||||
def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
|
||||
super().__init__()
|
||||
self.results = list(results or [])
|
||||
self.posts: list[dict[str, str]] = []
|
||||
|
||||
async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
|
||||
return self.results.pop(0)
|
||||
|
||||
async def _post_webhook(self, url: str, message: str, username: str) -> None:
|
||||
self.posts.append({"url": url, "message": message, "username": username})
|
||||
|
||||
|
||||
class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
|
||||
def setUp(self) -> None:
|
||||
self.engine = create_engine(
|
||||
"sqlite://",
|
||||
connect_args={"check_same_thread": False},
|
||||
poolclass=StaticPool,
|
||||
)
|
||||
Base.metadata.create_all(bind=self.engine)
|
||||
self.session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
|
||||
self.db: Session = self.session_factory()
|
||||
|
||||
def tearDown(self) -> None:
|
||||
self.db.close()
|
||||
Base.metadata.drop_all(bind=self.engine)
|
||||
self.engine.dispose()
|
||||
|
||||
def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]:
|
||||
monitor = Monitor(
|
||||
name="Example Site",
|
||||
monitor_type="http",
|
||||
target="https://example.com",
|
||||
config={"expected_status": 200, "timeout_seconds": 5},
|
||||
interval_seconds=60,
|
||||
status="unknown",
|
||||
)
|
||||
self.db.add(monitor)
|
||||
self.db.flush()
|
||||
|
||||
rule = AlertRule(
|
||||
monitor_id=monitor.id,
|
||||
name="Example Site failure",
|
||||
severity="critical",
|
||||
condition={"type": "status_not_up"},
|
||||
failure_threshold=failure_threshold,
|
||||
cooldown_seconds=cooldown_seconds,
|
||||
is_enabled=True,
|
||||
)
|
||||
self.db.add(rule)
|
||||
self.db.flush()
|
||||
return monitor, rule
|
||||
|
||||
async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
|
||||
monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
|
||||
scheduler = RecordingScheduler(
|
||||
[
|
||||
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
|
||||
WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"),
|
||||
]
|
||||
)
|
||||
|
||||
await scheduler._run_monitor(self.db, monitor)
|
||||
assert self.db.scalars(select(Incident)).all() == []
|
||||
|
||||
await scheduler._run_monitor(self.db, monitor)
|
||||
|
||||
incident = self.db.scalar(select(Incident))
|
||||
assert incident is not None
|
||||
assert incident.monitor_id == monitor.id
|
||||
assert incident.alert_rule_id == rule.id
|
||||
assert incident.status == "open"
|
||||
assert incident.severity == "critical"
|
||||
assert incident.details["last_message"] == "HTTP 500 again"
|
||||
assert incident.details["failure_threshold"] == 2
|
||||
|
||||
async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
|
||||
monitor, rule = self.create_monitor_with_rule(failure_threshold=1)
|
||||
channel = NotificationChannel(
|
||||
name="Ops Webhook",
|
||||
channel_type="generic_webhook",
|
||||
settings={"username": "OrbitalWard"},
|
||||
encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"),
|
||||
is_enabled=True,
|
||||
)
|
||||
self.db.add(channel)
|
||||
self.db.flush()
|
||||
scheduler = RecordingScheduler(
|
||||
[
|
||||
WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
|
||||
WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"),
|
||||
]
|
||||
)
|
||||
|
||||
await scheduler._run_monitor(self.db, monitor)
|
||||
|
||||
incident = self.db.scalar(select(Incident))
|
||||
assert incident is not None
|
||||
assert incident.status == "open"
|
||||
assert len(scheduler.posts) == 1
|
||||
assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward"
|
||||
assert scheduler.posts[0]["username"] == "OrbitalWard"
|
||||
assert incident.details["notification_history"][0]["event"] == "opened"
|
||||
|
||||
await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC))
|
||||
assert len(scheduler.posts) == 1
|
||||
|
||||
await scheduler._run_monitor(self.db, monitor)
|
||||
|
||||
assert incident.status == "resolved"
|
||||
assert incident.resolved_at is not None
|
||||
assert incident.details["recovery_message"] == "Website check passed"
|
||||
assert len(scheduler.posts) == 2
|
||||
assert incident.details["notification_history"][1]["event"] == "resolved"
|
||||
|
||||
async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
|
||||
monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
|
||||
now = datetime.now(UTC)
|
||||
self.db.add(
|
||||
Incident(
|
||||
monitor_id=monitor.id,
|
||||
alert_rule_id=rule.id,
|
||||
title="Example Site is failing",
|
||||
severity="critical",
|
||||
status="resolved",
|
||||
opened_at=now - timedelta(seconds=60),
|
||||
resolved_at=now - timedelta(seconds=30),
|
||||
details={},
|
||||
)
|
||||
)
|
||||
self.db.add(CheckResult(monitor_id=monitor.id, status="down", response_time_ms=100, message="HTTP 500", observed_at=now))
|
||||
monitor.status = "down"
|
||||
self.db.flush()
|
||||
scheduler = RecordingScheduler()
|
||||
|
||||
await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")
|
||||
|
||||
open_incidents = self.db.scalars(select(Incident).where(Incident.status == "open")).all()
|
||||
assert open_incidents == []
|
||||
Reference in New Issue
Block a user