Files
OrbitWard/worker/tests/test_scheduler.py
T
2026-05-23 19:50:13 -06:00

162 lines
6.3 KiB
Python

import base64
import hashlib
import unittest
from datetime import UTC, datetime, timedelta
from cryptography.fernet import Fernet
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import StaticPool
from app.collectors.website import WebsiteCheckResult
from app.config import settings
from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
from app.scheduler import Scheduler
def encrypt_secret(value: str) -> str:
    """Encrypt *value* with a Fernet key derived from the configured secret key.

    The Fernet key is the URL-safe base64 encoding of the SHA-256 digest of
    ``settings.orbitalward_secret_key``. The resulting token is returned as
    UTF-8 text.

    NOTE(review): presumably this mirrors how the scheduler decrypts
    ``NotificationChannel.encrypted_secret`` -- confirm against the app code.
    """
    secret_bytes = settings.orbitalward_secret_key.encode("utf-8")
    # SHA-256 yields 32 bytes, which is the raw key length Fernet expects
    # once URL-safe base64 encoded.
    fernet_key = base64.urlsafe_b64encode(hashlib.sha256(secret_bytes).digest())
    token = Fernet(fernet_key).encrypt(value.encode("utf-8"))
    return token.decode("utf-8")
class RecordingScheduler(Scheduler):
    """Scheduler test double that replays canned results and records webhook posts.

    Instance attributes:
        results: remaining canned check results, handed out first-in
            first-out by ``_collect_monitor_result``.
        posts: one dict per ``_post_webhook`` call, with the keys ``url``,
            ``message`` and ``username``.

    NOTE(review): the two underscore methods presumably replace the real
    collection / HTTP hooks on ``Scheduler`` -- confirm against app.scheduler.
    """

    def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
        super().__init__()
        # Copy the input so popping results never mutates the caller's list.
        self.results = list(results) if results else []
        self.posts: list[dict[str, str]] = []

    async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
        """Return the next canned result; ``monitor`` is not consulted.

        Raises:
            IndexError: if no canned results remain.
        """
        next_result = self.results.pop(0)
        return next_result

    async def _post_webhook(self, url: str, message: str, username: str) -> None:
        """Record the call arguments in ``self.posts`` instead of sending anything."""
        self.posts.append(dict(url=url, message=message, username=username))
class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
    """Alert-evaluation tests for the scheduler, run against in-memory SQLite."""

    def setUp(self) -> None:
        """Build a fresh in-memory database, its schema, and one session per test."""
        # StaticPool keeps a single connection for the whole engine, so every
        # session sees the same in-memory database; check_same_thread=False
        # lets that one connection be used from other threads.
        engine = create_engine(
            "sqlite://",
            poolclass=StaticPool,
            connect_args={"check_same_thread": False},
        )
        self.engine = engine
        Base.metadata.create_all(bind=engine)
        self.session_factory = sessionmaker(bind=engine, autoflush=False, autocommit=False)
        self.db: Session = self.session_factory()

    def tearDown(self) -> None:
        """Close the session, drop the schema, and dispose of the engine."""
        self.db.close()
        Base.metadata.drop_all(bind=self.engine)
        self.engine.dispose()

    def create_monitor_with_rule(self, *, failure_threshold: int = 2, cooldown_seconds: int = 0) -> tuple[Monitor, AlertRule]:
        """Insert an HTTP monitor plus one enabled critical alert rule for it.

        Both rows are flushed (not committed) so their primary keys are
        populated before they are returned.

        Args:
            failure_threshold: stored on the rule's ``failure_threshold``.
            cooldown_seconds: stored on the rule's ``cooldown_seconds``.

        Returns:
            The ``(monitor, rule)`` pair that was added to ``self.db``.
        """
        session = self.db

        monitor = Monitor(
            name="Example Site",
            monitor_type="http",
            target="https://example.com",
            config={"expected_status": 200, "timeout_seconds": 5},
            interval_seconds=60,
            status="unknown",
        )
        session.add(monitor)
        # Flush first so monitor.id exists for the rule's foreign key.
        session.flush()

        rule = AlertRule(
            monitor_id=monitor.id,
            name="Example Site failure",
            severity="critical",
            condition={"type": "status_not_up"},
            failure_threshold=failure_threshold,
            cooldown_seconds=cooldown_seconds,
            is_enabled=True,
        )
        session.add(rule)
        session.flush()

        return monitor, rule

    async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
        # With failure_threshold=2 the first "down" result must not open an
        # incident; the second one must.
        monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
        first_failure = WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500")
        second_failure = WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again")
        scheduler = RecordingScheduler([first_failure, second_failure])

        await scheduler._run_monitor(self.db, monitor)
        incidents_after_first_run = self.db.scalars(select(Incident)).all()
        assert incidents_after_first_run == []

        await scheduler._run_monitor(self.db, monitor)
        incident = self.db.scalar(select(Incident))
        assert incident is not None
        assert incident.monitor_id == monitor.id
        assert incident.alert_rule_id == rule.id
        assert incident.status == "open"
        assert incident.severity == "critical"
        assert incident.details["last_message"] == "HTTP 500 again"
        assert incident.details["failure_threshold"] == 2

    async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
        # A "down" result opens an incident and notifies once; re-sending the
        # same "opened" event must not post again; a following "up" result
        # resolves the incident and produces exactly one more post.
        monitor, _rule = self.create_monitor_with_rule(failure_threshold=1)
        webhook_url = "https://hooks.example.test/orbitalward"
        channel = NotificationChannel(
            name="Ops Webhook",
            channel_type="generic_webhook",
            settings={"username": "OrbitalWard"},
            encrypted_secret=encrypt_secret(webhook_url),
            is_enabled=True,
        )
        self.db.add(channel)
        self.db.flush()

        failure = WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500")
        recovery = WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed")
        scheduler = RecordingScheduler([failure, recovery])

        await scheduler._run_monitor(self.db, monitor)
        incident = self.db.scalar(select(Incident))
        assert incident is not None
        assert incident.status == "open"
        assert len(scheduler.posts) == 1
        opened_post = scheduler.posts[0]
        assert opened_post["url"] == "https://hooks.example.test/orbitalward"
        assert opened_post["username"] == "OrbitalWard"
        assert incident.details["notification_history"][0]["event"] == "opened"

        # Repeating the "opened" notification must be a no-op.
        await scheduler._send_incident_notifications(self.db, incident, monitor, "opened", datetime.now(UTC))
        assert len(scheduler.posts) == 1

        await scheduler._run_monitor(self.db, monitor)
        assert incident.status == "resolved"
        assert incident.resolved_at is not None
        assert incident.details["recovery_message"] == "Website check passed"
        assert len(scheduler.posts) == 2
        assert incident.details["notification_history"][1]["event"] == "resolved"

    async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
        # An incident resolved 30 seconds ago sits inside the rule's
        # 300-second cooldown, so a new failure must not open another one.
        monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
        now = datetime.now(UTC)

        recently_resolved = Incident(
            monitor_id=monitor.id,
            alert_rule_id=rule.id,
            title="Example Site is failing",
            severity="critical",
            status="resolved",
            opened_at=now - timedelta(seconds=60),
            resolved_at=now - timedelta(seconds=30),
            details={},
        )
        self.db.add(recently_resolved)

        latest_failure = CheckResult(
            monitor_id=monitor.id,
            status="down",
            response_time_ms=100,
            message="HTTP 500",
            observed_at=now,
        )
        self.db.add(latest_failure)
        monitor.status = "down"
        self.db.flush()

        scheduler = RecordingScheduler()
        await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")

        open_incident_query = select(Incident).where(Incident.status == "open")
        open_incidents = self.db.scalars(open_incident_query).all()
        assert open_incidents == []