"""Tests for the scheduler's alert evaluation, incident lifecycle and notifications."""

import base64
import hashlib
import unittest
from datetime import UTC, datetime, timedelta

from cryptography.fernet import Fernet
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import StaticPool

from app.collectors.website import WebsiteCheckResult
from app.config import settings
from app.models import AlertRule, Base, CheckResult, Incident, Monitor, NotificationChannel
from app.scheduler import Scheduler


def encrypt_secret(value: str) -> str:
    """Encrypt *value* with a Fernet key derived from the configured secret key.

    The key is the URL-safe base64 encoding of the SHA-256 digest of
    ``settings.orbitalward_secret_key``. The resulting Fernet token is
    returned as text.

    NOTE(review): presumably mirrors the key derivation the application uses
    to decrypt ``NotificationChannel.encrypted_secret`` -- confirm against the
    scheduler implementation.
    """
    digest = hashlib.sha256(settings.orbitalward_secret_key.encode("utf-8")).digest()
    fernet = Fernet(base64.urlsafe_b64encode(digest))
    return fernet.encrypt(value.encode("utf-8")).decode("utf-8")


class RecordingScheduler(Scheduler):
    """Scheduler test double with scripted check results and recorded webhooks.

    ``results`` is consumed front-to-back, one entry per collected monitor
    result. ``posts`` accumulates one dict per webhook call instead of
    performing the call.
    """

    def __init__(self, results: list[WebsiteCheckResult] | None = None) -> None:
        super().__init__()
        # Copy so the caller's list is not mutated as results are consumed.
        self.results = list(results or [])
        self.posts: list[dict[str, str]] = []

    async def _collect_monitor_result(self, monitor: Monitor) -> WebsiteCheckResult:
        """Return the next scripted result; raises IndexError when exhausted."""
        return self.results.pop(0)

    async def _post_webhook(self, url: str, message: str, username: str) -> None:
        """Record the webhook payload rather than sending it."""
        self.posts.append({"url": url, "message": message, "username": username})


class SchedulerTestCase(unittest.IsolatedAsyncioTestCase):
    """Exercise scheduler behavior against an in-memory SQLite database."""

    def setUp(self) -> None:
        """Create a fresh schema and open a session for the test."""
        # StaticPool with check_same_thread=False keeps one shared connection,
        # so every session sees the same in-memory database.
        self.engine = create_engine(
            "sqlite://",
            connect_args={"check_same_thread": False},
            poolclass=StaticPool,
        )
        Base.metadata.create_all(bind=self.engine)
        self.session_factory = sessionmaker(
            bind=self.engine,
            autoflush=False,
            autocommit=False,
        )
        self.db: Session = self.session_factory()

    def tearDown(self) -> None:
        """Close the session, drop the schema and release the engine."""
        self.db.close()
        Base.metadata.drop_all(bind=self.engine)
        self.engine.dispose()

    def create_monitor_with_rule(
        self,
        *,
        failure_threshold: int = 2,
        cooldown_seconds: int = 0,
    ) -> tuple[Monitor, AlertRule]:
        """Persist an HTTP monitor and an enabled critical alert rule for it.

        Args:
            failure_threshold: Value stored on the rule's ``failure_threshold``.
            cooldown_seconds: Value stored on the rule's ``cooldown_seconds``.

        Returns:
            The flushed ``(monitor, rule)`` pair.
        """
        monitor = Monitor(
            name="Example Site",
            monitor_type="http",
            target="https://example.com",
            config={"expected_status": 200, "timeout_seconds": 5},
            interval_seconds=60,
            status="unknown",
        )
        self.db.add(monitor)
        # Flush so monitor.id is populated before the rule references it.
        self.db.flush()

        rule = AlertRule(
            monitor_id=monitor.id,
            name="Example Site failure",
            severity="critical",
            condition={"type": "status_not_up"},
            failure_threshold=failure_threshold,
            cooldown_seconds=cooldown_seconds,
            is_enabled=True,
        )
        self.db.add(rule)
        self.db.flush()
        return monitor, rule

    async def test_alert_evaluation_opens_incident_after_failure_threshold(self) -> None:
        """An incident opens only once the failure threshold is reached."""
        monitor, rule = self.create_monitor_with_rule(failure_threshold=2)
        scheduler = RecordingScheduler(
            [
                WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
                WebsiteCheckResult(status="down", response_time_ms=110, message="HTTP 500 again"),
            ]
        )

        # First failure: below the threshold, so nothing is opened yet.
        await scheduler._run_monitor(self.db, monitor)
        assert self.db.scalars(select(Incident)).all() == []

        # Second failure: threshold reached.
        await scheduler._run_monitor(self.db, monitor)
        incident = self.db.scalar(select(Incident))

        assert incident is not None
        assert incident.monitor_id == monitor.id
        assert incident.alert_rule_id == rule.id
        assert incident.status == "open"
        assert incident.severity == "critical"
        assert incident.details["last_message"] == "HTTP 500 again"
        assert incident.details["failure_threshold"] == 2

    async def test_recovery_resolves_open_incident_and_sends_notifications_once(self) -> None:
        """Recovery resolves the incident; each event is notified exactly once."""
        monitor, _ = self.create_monitor_with_rule(failure_threshold=1)
        channel = NotificationChannel(
            name="Ops Webhook",
            channel_type="generic_webhook",
            settings={"username": "OrbitalWard"},
            encrypted_secret=encrypt_secret("https://hooks.example.test/orbitalward"),
            is_enabled=True,
        )
        self.db.add(channel)
        self.db.flush()
        scheduler = RecordingScheduler(
            [
                WebsiteCheckResult(status="down", response_time_ms=100, message="HTTP 500"),
                WebsiteCheckResult(status="up", response_time_ms=80, message="Website check passed"),
            ]
        )

        await scheduler._run_monitor(self.db, monitor)
        incident = self.db.scalar(select(Incident))

        assert incident is not None
        assert incident.status == "open"
        assert len(scheduler.posts) == 1
        assert scheduler.posts[0]["url"] == "https://hooks.example.test/orbitalward"
        assert scheduler.posts[0]["username"] == "OrbitalWard"
        assert incident.details["notification_history"][0]["event"] == "opened"

        # Re-sending the already-delivered "opened" event must not post again.
        await scheduler._send_incident_notifications(
            self.db, incident, monitor, "opened", datetime.now(UTC)
        )
        assert len(scheduler.posts) == 1

        await scheduler._run_monitor(self.db, monitor)

        assert incident.status == "resolved"
        assert incident.resolved_at is not None
        assert incident.details["recovery_message"] == "Website check passed"
        assert len(scheduler.posts) == 2
        assert incident.details["notification_history"][1]["event"] == "resolved"

    async def test_alert_cooldown_suppresses_new_incident_after_recent_resolution(self) -> None:
        """No new incident opens while the rule's cooldown window is active."""
        monitor, rule = self.create_monitor_with_rule(failure_threshold=1, cooldown_seconds=300)
        now = datetime.now(UTC)
        # An incident resolved 30 seconds ago, well inside the 300 second cooldown.
        self.db.add(
            Incident(
                monitor_id=monitor.id,
                alert_rule_id=rule.id,
                title="Example Site is failing",
                severity="critical",
                status="resolved",
                opened_at=now - timedelta(seconds=60),
                resolved_at=now - timedelta(seconds=30),
                details={},
            )
        )
        self.db.add(
            CheckResult(
                monitor_id=monitor.id,
                status="down",
                response_time_ms=100,
                message="HTTP 500",
                observed_at=now,
            )
        )
        monitor.status = "down"
        self.db.flush()
        scheduler = RecordingScheduler()

        await scheduler._evaluate_rule(self.db, monitor, rule, now, "HTTP 500")

        # The session uses autoflush=False, so an Incident that was only
        # add()-ed by _evaluate_rule would be invisible to the SELECT below
        # and the absence assertion would pass vacuously. Flush first.
        self.db.flush()
        open_incidents = self.db.scalars(
            select(Incident).where(Incident.status == "open")
        ).all()
        assert open_incidents == []