Initial InfraPulse scaffold
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
__pycache__/
|
||||
.venv/
|
||||
@@ -0,0 +1,13 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
COPY pyproject.toml ./
|
||||
RUN pip install --no-cache-dir -e .
|
||||
|
||||
COPY . .
|
||||
|
||||
CMD ["python", "-m", "app.main"]
|
||||
@@ -0,0 +1 @@
|
||||
"""InfraPulse worker package."""
|
||||
@@ -0,0 +1 @@
|
||||
"""Collector implementations."""
|
||||
@@ -0,0 +1,50 @@
|
||||
from dataclasses import dataclass
|
||||
from time import perf_counter
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class WebsiteCheckConfig:
|
||||
url: str
|
||||
expected_status: int = 200
|
||||
expected_text: str | None = None
|
||||
unexpected_text: str | None = None
|
||||
timeout_seconds: float = 10.0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class WebsiteCheckResult:
|
||||
status: str
|
||||
response_time_ms: int | None
|
||||
message: str
|
||||
|
||||
|
||||
async def run_website_check(config: WebsiteCheckConfig) -> WebsiteCheckResult:
    """Request ``config.url`` once and grade the response.

    The check is "down" when the request fails, when the final status code
    (redirects are followed) differs from ``config.expected_status``, when
    ``config.expected_text`` is set but missing from the body, or when
    ``config.unexpected_text`` is set and present in the body. Otherwise it
    is "up".

    Args:
        config: URL, expectations and timeout for the check.

    Returns:
        A ``WebsiteCheckResult``; ``response_time_ms`` is ``None`` when no
        response was received.
    """
    started = perf_counter()
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=config.timeout_seconds) as client:
            response = await client.get(config.url)
            # Stop the clock as soon as the response is in hand so client
            # teardown is not counted as latency.
            response_time_ms = int((perf_counter() - started) * 1000)
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # InvalidURL is not an HTTPError subclass; without it a malformed
        # target would escape as an unhandled exception instead of "down".
        # Some httpx errors stringify to "", so fall back to the class name.
        return WebsiteCheckResult(
            status="down",
            response_time_ms=None,
            message=str(exc) or type(exc).__name__,
        )

    if response.status_code != config.expected_status:
        return WebsiteCheckResult(
            status="down",
            response_time_ms=response_time_ms,
            message=f"Expected HTTP {config.expected_status}, got {response.status_code}",
        )
    if config.expected_text and config.expected_text not in response.text:
        return WebsiteCheckResult(
            status="down",
            response_time_ms=response_time_ms,
            message="Expected text was not present",
        )
    if config.unexpected_text and config.unexpected_text in response.text:
        return WebsiteCheckResult(
            status="down",
            response_time_ms=response_time_ms,
            message="Unexpected text was present",
        )
    return WebsiteCheckResult(status="up", response_time_ms=response_time_ms, message="Website check passed")
|
||||
@@ -0,0 +1,22 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Worker configuration loaded by pydantic-settings.

    ``env_file=".env"`` adds a local ``.env`` file as a source, and
    ``extra="ignore"`` drops keys that do not match a field below.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # Deployment environment label.
    infrapulse_env: str = "development"
    # Secret used to derive the Fernet key for stored channel secrets.
    # NOTE(review): the default is a placeholder; presumably real deployments override it.
    infrapulse_secret_key: str = "change-me"
    # SQLAlchemy connection URL.
    database_url: str = "postgresql+psycopg://infrapulse:infrapulse@postgres:5432/infrapulse"
    # Redis connection URL.
    redis_url: str = "redis://redis:6379/0"
    # Base URL of the web UI, used when building incident links.
    frontend_url: str = "http://localhost:5173"
    # Base URL of the backend API.
    backend_url: str = "http://localhost:8000"
|
||||
|
||||
|
||||
@lru_cache
def get_settings() -> Settings:
    """Build the ``Settings`` object once and return the cached instance afterwards."""
    return Settings()


# Module-level instance imported by the rest of the worker.
settings = get_settings()
|
||||
@@ -0,0 +1,23 @@
|
||||
from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from app.config import settings
|
||||
|
||||
# pool_pre_ping makes the pool test a connection before handing it out.
engine = create_engine(settings.database_url, pool_pre_ping=True)
# Session factory bound to the engine; flushing is left explicit.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
|
||||
|
||||
|
||||
@contextmanager
def session_scope() -> Generator[Session, None, None]:
    """Provide a transactional session for a ``with`` block.

    The session is committed when the block finishes normally. If the block
    (or the commit) raises, the session is rolled back and the exception is
    re-raised. The session is closed in every case.

    Yields:
        A fresh ``Session`` created by ``SessionLocal``.
    """
    session = SessionLocal()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
@@ -0,0 +1 @@
|
||||
"""Background jobs."""
|
||||
@@ -0,0 +1,16 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AlertEvaluation:
    """Decision produced by evaluating a status rule."""

    # True when a new incident should be opened.
    should_open_incident: bool
    # True when an existing incident should be resolved.
    should_resolve_incident: bool
    # Human-readable reason for the decision.
    message: str


def evaluate_status_rule(current_status: str, failure_count: int, threshold: int) -> AlertEvaluation:
    """Decide whether to open or resolve an incident for a monitor.

    Args:
        current_status: Latest monitor status; only ``"up"`` counts as healthy.
        failure_count: Number of failures observed so far.
        threshold: Failure count at which an incident should be opened.

    Returns:
        An ``AlertEvaluation`` asking to resolve when the monitor is up, to
        open when ``failure_count`` has reached ``threshold``, and to do
        nothing otherwise.
    """
    if current_status == "up":
        return AlertEvaluation(
            should_open_incident=False,
            should_resolve_incident=True,
            message="Monitor recovered",
        )

    if failure_count < threshold:
        return AlertEvaluation(
            should_open_incident=False,
            should_resolve_incident=False,
            message="Failure threshold not reached",
        )

    return AlertEvaluation(
        should_open_incident=True,
        should_resolve_incident=False,
        message=f"Monitor failed {failure_count} times",
    )
|
||||
@@ -0,0 +1,14 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WebhookNotification:
    """Immutable payload for a generic webhook delivery."""

    # Endpoint that receives the POST.
    url: str
    # Message placed under the "text" key of the JSON body.
    text: str
|
||||
|
||||
|
||||
async def send_generic_webhook(notification: WebhookNotification) -> None:
    """POST ``{"text": notification.text}`` as JSON to ``notification.url``.

    Args:
        notification: Target URL and message text.

    Raises:
        httpx.HTTPError: On transport failures, and on 4xx/5xx responses
            (as ``httpx.HTTPStatusError``, an ``HTTPError`` subclass).
    """
    async with httpx.AsyncClient(timeout=10) as client:
        response = await client.post(notification.url, json={"text": notification.text})
        # Without this, a rejected delivery (4xx/5xx) looks like success to the caller.
        response.raise_for_status()
|
||||
@@ -0,0 +1,20 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import signal
|
||||
|
||||
from app.scheduler import Scheduler
|
||||
|
||||
|
||||
async def main() -> None:
    """Configure logging, hook shutdown signals to the scheduler, and run it."""
    logging.basicConfig(level=logging.INFO, format="%(levelname)s [%(name)s] %(message)s")
    scheduler = Scheduler()

    # SIGINT/SIGTERM ask the scheduler to stop instead of killing the process mid-tick.
    event_loop = asyncio.get_running_loop()
    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        event_loop.add_signal_handler(shutdown_signal, scheduler.stop)

    await scheduler.run()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,84 @@
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, JSON, String, Text, func
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by every ORM model in this module."""
|
||||
|
||||
|
||||
class Asset(Base):
    """ORM mapping for the ``assets`` table."""

    __tablename__ = "assets"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Display name, up to 160 characters.
    name: Mapped[str] = mapped_column(String(160))
    # Free-form type label, up to 64 characters.
    asset_type: Mapped[str] = mapped_column(String(64))
    # Optional address of the asset.
    address: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Current status; new rows start as "unknown".
    status: Mapped[str] = mapped_column(String(32), default="unknown")
|
||||
|
||||
|
||||
class Monitor(Base):
    """ORM mapping for the ``monitors`` table."""

    __tablename__ = "monitors"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Optional owning asset; the monitor row is deleted with it (ON DELETE CASCADE).
    asset_id: Mapped[int | None] = mapped_column(ForeignKey("assets.id", ondelete="CASCADE"), nullable=True)
    name: Mapped[str] = mapped_column(String(160))
    # Kind of check to run.
    monitor_type: Mapped[str] = mapped_column(String(64))
    # What the check is pointed at.
    target: Mapped[str] = mapped_column(String(512))
    # Per-monitor options stored as JSON; defaults to an empty dict.
    config: Mapped[dict] = mapped_column(JSON, default=dict)
    # Seconds between checks.
    interval_seconds: Mapped[int] = mapped_column(Integer, default=60)
    # Latest status; new rows start as "unknown".
    status: Mapped[str] = mapped_column(String(32), default="unknown")
    # Timezone-aware timestamp of the last check; NULL until the first check.
    last_checked_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
|
||||
|
||||
|
||||
class CheckResult(Base):
    """ORM mapping for the ``check_results`` table: one row per executed check."""

    __tablename__ = "check_results"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Monitor that produced the result; rows are removed with it (ON DELETE CASCADE).
    monitor_id: Mapped[int] = mapped_column(ForeignKey("monitors.id", ondelete="CASCADE"))
    status: Mapped[str] = mapped_column(String(32))
    # Elapsed time in milliseconds, when one was measured.
    response_time_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
    message: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Defaults to the database's current time when not supplied.
    observed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
|
||||
|
||||
|
||||
class AlertRule(Base):
    """ORM mapping for the ``alert_rules`` table."""

    __tablename__ = "alert_rules"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Monitor the rule applies to; rules are removed with it (ON DELETE CASCADE).
    monitor_id: Mapped[int] = mapped_column(ForeignKey("monitors.id", ondelete="CASCADE"))
    name: Mapped[str] = mapped_column(String(160))
    # Severity label; defaults to "warning".
    severity: Mapped[str] = mapped_column(String(32), default="warning")
    # Rule condition stored as JSON; defaults to an empty dict.
    condition: Mapped[dict] = mapped_column(JSON, default=dict)
    # Number of failures required before the rule fires.
    failure_threshold: Mapped[int] = mapped_column(Integer, default=3)
    cooldown_seconds: Mapped[int] = mapped_column(Integer, default=300)
    is_enabled: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||
|
||||
|
||||
class NotificationChannel(Base):
    """ORM mapping for the ``notification_channels`` table."""

    __tablename__ = "notification_channels"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(160))
    # Kind of channel.
    channel_type: Mapped[str] = mapped_column(String(64))
    # Channel options stored as JSON; defaults to an empty dict.
    settings: Mapped[dict] = mapped_column(JSON, default=dict)
    # Encrypted secret for the channel, if any.
    encrypted_secret: Mapped[str | None] = mapped_column(Text, nullable=True)
    is_enabled: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||
|
||||
|
||||
class Incident(Base):
    """ORM mapping for the ``incidents`` table."""

    __tablename__ = "incidents"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # The three references are nulled (ON DELETE SET NULL) rather than cascading,
    # so an incident row outlives the asset, monitor or rule it points to.
    asset_id: Mapped[int | None] = mapped_column(ForeignKey("assets.id", ondelete="SET NULL"), nullable=True)
    monitor_id: Mapped[int | None] = mapped_column(ForeignKey("monitors.id", ondelete="SET NULL"), nullable=True)
    alert_rule_id: Mapped[int | None] = mapped_column(ForeignKey("alert_rules.id", ondelete="SET NULL"), nullable=True)
    title: Mapped[str] = mapped_column(String(240))
    severity: Mapped[str] = mapped_column(String(32))
    # Lifecycle state; new rows start as "open".
    status: Mapped[str] = mapped_column(String(32), default="open")
    # Defaults to the database's current time when not supplied.
    opened_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
    resolved_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    acknowledged_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    silenced_until: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    # Free-form JSON details; defaults to an empty dict.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
|
||||
@@ -0,0 +1,209 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.orm import Session
|
||||
import httpx
|
||||
|
||||
from app.collectors.website import WebsiteCheckConfig, run_website_check
|
||||
from app.config import settings
|
||||
from app.db import session_scope
|
||||
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, NotificationChannel
|
||||
from app.secrets import decrypt_secret
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Scheduler:
    """Polling loop that runs due HTTP monitors and drives incident state.

    On every tick the scheduler loads the ``http`` monitors whose interval
    has elapsed, runs the website check for each one, stores a
    ``CheckResult``, copies the status onto the linked asset, and opens or
    resolves incidents according to the monitor's enabled alert rules.
    Incident transitions are announced on the enabled webhook-style
    notification channels.
    """

    def __init__(self, poll_interval_seconds: int = 10, max_monitors_per_tick: int = 50) -> None:
        """Create a scheduler.

        Args:
            poll_interval_seconds: Pause between ticks, in seconds.
            max_monitors_per_tick: Upper bound on the monitors checked in a
                single tick; values below 1 are treated as 1.
        """
        self.poll_interval_seconds = poll_interval_seconds
        self.max_monitors_per_tick = max(1, max_monitors_per_tick)
        self._stopped = asyncio.Event()

    async def run(self) -> None:
        """Tick repeatedly until ``stop()`` is called."""
        logger.info("InfraPulse worker started for %s", settings.infrapulse_env)
        while not self._stopped.is_set():
            await self.tick()
            try:
                # Sleep for the poll interval, but wake immediately on stop().
                await asyncio.wait_for(self._stopped.wait(), timeout=self.poll_interval_seconds)
            except TimeoutError:
                continue

    async def tick(self) -> None:
        """Run every due monitor once, committing after each one.

        Database errors abort the tick and are logged. Any other failure is
        confined to the monitor that caused it: the pending changes for that
        monitor are rolled back, the error is logged, and the remaining
        monitors still run. The failing monitor stays due and is retried on
        the next tick.
        """
        try:
            with session_scope() as db:
                for monitor in self._load_due_website_monitors(db):
                    # Read the id up front: after a rollback the instance is
                    # expired and touching it would trigger a reload.
                    monitor_id = monitor.id
                    try:
                        await self._run_monitor(db, monitor)
                        db.commit()
                    except SQLAlchemyError:
                        raise
                    except Exception:
                        # A bad config value or malformed URL on one monitor
                        # must not take the whole worker down.
                        logger.exception("Check failed unexpectedly for monitor %s", monitor_id)
                        db.rollback()
        except SQLAlchemyError:
            logger.exception("Worker tick failed while talking to the database")

    def stop(self) -> None:
        """Ask ``run()`` to exit after the current tick."""
        self._stopped.set()

    def _load_due_website_monitors(self, db: Session) -> list[Monitor]:
        """Return the ``http`` monitors that are due, capped per tick.

        A monitor is due when it has never been checked or when
        ``last_checked_at + interval_seconds`` is not in the future. The cap
        is applied after the due filter, with never-checked monitors first
        and then the longest-waiting ones, so no monitor is starved when more
        monitors exist than fit in one tick.
        """
        now = datetime.now(UTC)
        monitors = db.scalars(select(Monitor).where(Monitor.monitor_type == "http").order_by(Monitor.id)).all()

        never_checked: list[Monitor] = []
        overdue: list[Monitor] = []
        for monitor in monitors:
            if monitor.last_checked_at is None:
                never_checked.append(monitor)
            elif monitor.last_checked_at + timedelta(seconds=monitor.interval_seconds) <= now:
                overdue.append(monitor)

        # Stable sort: monitors with equal timestamps keep their id order.
        overdue.sort(key=lambda item: item.last_checked_at)
        return (never_checked + overdue)[: self.max_monitors_per_tick]

    async def _run_monitor(self, db: Session, monitor: Monitor) -> None:
        """Run one website check and record everything that follows from it.

        Updates the monitor's status and ``last_checked_at``, adds a
        ``CheckResult`` row, mirrors the status onto the linked asset (if
        any), and evaluates each enabled alert rule of the monitor.
        """
        monitor_config = monitor.config or {}
        config = WebsiteCheckConfig(
            url=monitor.target,
            expected_status=int(monitor_config.get("expected_status", 200)),
            expected_text=monitor_config.get("expected_text") or None,
            unexpected_text=monitor_config.get("unexpected_text") or None,
            timeout_seconds=float(monitor_config.get("timeout_seconds", 10)),
        )
        result = await run_website_check(config)
        now = datetime.now(UTC)

        monitor.status = result.status
        monitor.last_checked_at = now
        db.add(
            CheckResult(
                monitor_id=monitor.id,
                status=result.status,
                response_time_ms=result.response_time_ms,
                message=result.message,
                observed_at=now,
            )
        )
        # Flush so the new result is visible to the rule queries below.
        db.flush()

        if monitor.asset_id is not None:
            asset = db.get(Asset, monitor.asset_id)
            if asset is not None:
                asset.status = result.status

        rules = db.scalars(
            select(AlertRule).where(AlertRule.monitor_id == monitor.id, AlertRule.is_enabled.is_(True))
        ).all()
        for rule in rules:
            await self._evaluate_rule(db, monitor, rule, now, result.message)

        logger.info("Checked %s: %s (%s ms)", monitor.name, result.status, result.response_time_ms)

    async def _evaluate_rule(self, db: Session, monitor: Monitor, rule: AlertRule, now: datetime, message: str) -> None:
        """Open or resolve the incident tied to ``rule`` based on recent results.

        When the monitor is up, an open incident for this monitor/rule pair is
        resolved and a "resolved" notification is sent. Otherwise a new
        incident is opened (and announced) once the most recent
        ``failure_threshold`` results are all non-"up" and no incident is
        already open.
        """
        open_incident = db.scalar(
            select(Incident).where(
                Incident.monitor_id == monitor.id,
                Incident.alert_rule_id == rule.id,
                Incident.status == "open",
            )
        )

        if monitor.status == "up":
            if open_incident is not None:
                open_incident.status = "resolved"
                open_incident.resolved_at = now
                open_incident.details = {**(open_incident.details or {}), "recovery_message": message}
                await self._send_incident_notifications(db, open_incident, monitor, "resolved", now)
            return

        # A threshold below 1 would produce LIMIT 0 (or a negative LIMIT) and
        # make the all() check vacuously true; require at least one failure.
        threshold = max(1, rule.failure_threshold)
        recent_statuses = list(
            db.scalars(
                select(CheckResult.status)
                .where(CheckResult.monitor_id == monitor.id)
                .order_by(CheckResult.observed_at.desc())
                .limit(threshold)
            )
        )
        threshold_met = len(recent_statuses) >= threshold and all(status != "up" for status in recent_statuses)
        if threshold_met and open_incident is None:
            incident = Incident(
                asset_id=monitor.asset_id,
                monitor_id=monitor.id,
                alert_rule_id=rule.id,
                title=f"{monitor.name} is failing",
                severity=rule.severity,
                status="open",
                opened_at=now,
                details={"last_message": message, "failure_threshold": rule.failure_threshold},
            )
            db.add(incident)
            # Flush to obtain incident.id for the link in the notification.
            db.flush()
            await self._send_incident_notifications(db, incident, monitor, "opened", now)

    async def _send_incident_notifications(
        self,
        db: Session,
        incident: Incident,
        monitor: Monitor,
        event_type: str,
        now: datetime,
    ) -> None:
        """Announce an incident event on every enabled webhook-style channel.

        Does nothing when the event was already recorded as sent in
        ``incident.details["notification_state"]``. Channels whose secret
        cannot be decrypted, or whose delivery fails, are logged and skipped.
        When at least one delivery succeeds, the send time and the channel
        names are recorded in ``incident.details``.
        """
        state_key = "opened_sent_at" if event_type == "opened" else "resolved_sent_at"
        notification_state = dict((incident.details or {}).get("notification_state") or {})
        if notification_state.get(state_key):
            return

        channels = db.scalars(
            select(NotificationChannel).where(
                NotificationChannel.is_enabled.is_(True),
                NotificationChannel.channel_type.in_(["generic_webhook", "webhook", "mattermost", "zoom", "zoom_team_chat"]),
            )
        ).all()
        if not channels:
            return

        # The text is the same for every channel, so build it once.
        message = self._format_incident_message(incident, monitor, event_type)

        sent_channels: list[str] = []
        for channel in channels:
            # The webhook URL is what is stored in the channel's encrypted secret.
            url = decrypt_secret(channel.encrypted_secret)
            if not url:
                logger.warning("Skipping notification channel %s because its secret cannot be decrypted", channel.id)
                continue
            username = str((channel.settings or {}).get("username") or "InfraPulse")
            try:
                await self._post_webhook(url, message, username)
            except (httpx.HTTPError, httpx.InvalidURL):
                # InvalidURL is not an HTTPError subclass; a malformed stored
                # URL must not abort delivery to the remaining channels.
                logger.exception("Notification delivery failed for channel %s", channel.id)
                continue
            sent_channels.append(channel.name)

        if sent_channels:
            notification_state[state_key] = now.isoformat()
            history = list((incident.details or {}).get("notification_history") or [])
            history.append({"event": event_type, "sent_at": now.isoformat(), "channels": sent_channels})
            # Assign a new dict so the change to the JSON column is detected.
            incident.details = {**(incident.details or {}), "notification_state": notification_state, "notification_history": history}

    async def _post_webhook(self, url: str, message: str, username: str) -> None:
        """POST ``{"username", "text"}`` as JSON to ``url``; raise on 4xx/5xx."""
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(url, json={"username": username, "text": message})
            response.raise_for_status()

    def _format_incident_message(self, incident: Incident, monitor: Monitor, event_type: str) -> str:
        """Build the multi-line notification text for an incident event.

        ``event_type == "resolved"`` yields a recovery message; any other
        value yields the "incident opened" layout, including the last check
        message when one was recorded. Both end with a link to the incident
        in the web UI.
        """
        if event_type == "resolved":
            title = f"RESOLVED: {monitor.name} recovered"
            body = [
                title,
                "",
                f"Monitor: {monitor.name}",
                f"Target: {monitor.target}",
                f"Resolved: {incident.resolved_at or datetime.now(UTC)}",
            ]
        else:
            title = f"{incident.severity.upper()}: {incident.title}"
            body = [
                title,
                "",
                f"Monitor: {monitor.name}",
                f"Target: {monitor.target}",
                f"Status: {monitor.status}",
                f"Started: {incident.opened_at}",
            ]
            last_message = (incident.details or {}).get("last_message")
            if last_message:
                body.append(f"Last response: {last_message}")
        body.extend(["", f"View in InfraPulse: {settings.frontend_url}/incidents/{incident.id}"])
        return "\n".join(str(line) for line in body)
|
||||
@@ -0,0 +1,20 @@
|
||||
import base64
|
||||
import hashlib
|
||||
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
def _fernet() -> Fernet:
    """Return a ``Fernet`` keyed from the configured secret.

    The key is the URL-safe base64 encoding of the SHA-256 digest of
    ``settings.infrapulse_secret_key``.
    """
    secret_bytes = settings.infrapulse_secret_key.encode("utf-8")
    key = base64.urlsafe_b64encode(hashlib.sha256(secret_bytes).digest())
    return Fernet(key)
|
||||
|
||||
|
||||
def decrypt_secret(value: str | None) -> str | None:
    """Decrypt a stored Fernet token into its UTF-8 plaintext.

    Args:
        value: The encrypted token, or ``None``/empty when nothing is stored.

    Returns:
        The decrypted text, or ``None`` when ``value`` is empty or the token
        is rejected with ``InvalidToken``.
    """
    if not value:
        return None

    token = value.encode("utf-8")
    try:
        plaintext = _fernet().decrypt(token)
    except InvalidToken:
        return None
    return plaintext.decode("utf-8")
|
||||
@@ -0,0 +1,17 @@
|
||||
[project]
|
||||
name = "infrapulse-worker"
|
||||
version = "0.1.0"
|
||||
description = "InfraPulse background worker"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"cryptography>=48.0.0",
|
||||
"httpx>=0.27.2",
|
||||
"pydantic-settings>=2.5.2",
|
||||
"redis>=5.0.8",
|
||||
"sqlalchemy>=2.0.35",
|
||||
"psycopg[binary]>=3.2.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=75.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
Reference in New Issue
Block a user