Add ping and TCP monitor types

Adds ping and TCP monitor creation APIs, worker collectors, network checks UI, dashboard monitor status support, and progress documentation.
This commit is contained in:
Keith Smith
2026-05-23 15:01:57 -06:00
parent 597ff18c2a
commit 16932957b2
13 changed files with 577 additions and 35 deletions
+108
View File
@@ -0,0 +1,108 @@
import asyncio
import os
import socket
import struct
from dataclasses import dataclass
from time import perf_counter
@dataclass(frozen=True)
class NetworkCheckResult:
status: str
response_time_ms: int | None
message: str
@dataclass(frozen=True)
class PingCheckConfig:
host: str
timeout_seconds: float = 5.0
@dataclass(frozen=True)
class TcpCheckConfig:
host: str
port: int
timeout_seconds: float = 5.0
async def run_ping_check(config: PingCheckConfig) -> NetworkCheckResult:
try:
response_time_ms = await asyncio.to_thread(_run_ping_check_sync, config.host, config.timeout_seconds)
except PermissionError:
return NetworkCheckResult(status="down", response_time_ms=None, message="ICMP ping requires raw socket permission")
except TimeoutError:
return NetworkCheckResult(status="down", response_time_ms=None, message="Ping timed out")
except OSError as exc:
return NetworkCheckResult(status="down", response_time_ms=None, message=f"Ping failed: {exc}")
return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="Ping check passed")
async def run_tcp_check(config: TcpCheckConfig) -> NetworkCheckResult:
started = perf_counter()
try:
connection = asyncio.open_connection(config.host, config.port)
reader, writer = await asyncio.wait_for(connection, timeout=config.timeout_seconds)
writer.close()
await writer.wait_closed()
reader.feed_eof()
except (TimeoutError, OSError) as exc:
return NetworkCheckResult(status="down", response_time_ms=None, message=f"TCP connection failed: {exc}")
response_time_ms = int((perf_counter() - started) * 1000)
return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="TCP connection succeeded")
def _run_ping_check_sync(host: str, timeout_seconds: float) -> int:
address = _resolve_ipv4(host)
identifier = os.getpid() & 0xFFFF
sequence = 1
packet = _build_icmp_echo_request(identifier, sequence)
with socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP) as sock:
sock.settimeout(timeout_seconds)
started = perf_counter()
sock.sendto(packet, (address, 0))
while True:
response, _ = sock.recvfrom(1024)
if _matches_icmp_echo_reply(response, identifier, sequence):
return int((perf_counter() - started) * 1000)
def _resolve_ipv4(host: str) -> str:
results = socket.getaddrinfo(host, None, socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP)
if not results:
raise OSError("Could not resolve an IPv4 address")
return str(results[0][4][0])
def _build_icmp_echo_request(identifier: int, sequence: int) -> bytes:
payload = b"OrbitalWard ping"
header = struct.pack("!BBHHH", 8, 0, 0, identifier, sequence)
checksum = _icmp_checksum(header + payload)
header = struct.pack("!BBHHH", 8, 0, checksum, identifier, sequence)
return header + payload
def _matches_icmp_echo_reply(response: bytes, identifier: int, sequence: int) -> bool:
if len(response) < 28:
return False
ip_header_length = (response[0] & 0x0F) * 4
icmp_header = response[ip_header_length : ip_header_length + 8]
if len(icmp_header) < 8:
return False
icmp_type, _, _, reply_identifier, reply_sequence = struct.unpack("!BBHHH", icmp_header)
return icmp_type == 0 and reply_identifier == identifier and reply_sequence == sequence
def _icmp_checksum(data: bytes) -> int:
if len(data) % 2:
data += b"\x00"
checksum = 0
for index in range(0, len(data), 2):
checksum += (data[index] << 8) + data[index + 1]
checksum = (checksum & 0xFFFF) + (checksum >> 16)
return ~checksum & 0xFFFF
+37 -13
View File
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
import httpx
from app.collectors.website import WebsiteCheckConfig, run_website_check
from app.collectors.network import PingCheckConfig, TcpCheckConfig, run_ping_check, run_tcp_check
from app.config import settings
from app.db import session_scope
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, NotificationChannel
@@ -33,7 +34,7 @@ class Scheduler:
async def tick(self) -> None:
try:
with session_scope() as db:
due_monitors = self._load_due_website_monitors(db)
due_monitors = self._load_due_monitors(db)
for monitor in due_monitors:
await self._run_monitor(db, monitor)
db.commit()
@@ -43,9 +44,11 @@ class Scheduler:
def stop(self) -> None:
self._stopped.set()
def _load_due_website_monitors(self, db: Session) -> list[Monitor]:
def _load_due_monitors(self, db: Session) -> list[Monitor]:
now = datetime.now(UTC)
monitors = db.scalars(select(Monitor).where(Monitor.monitor_type == "http").order_by(Monitor.id).limit(50)).all()
monitors = db.scalars(
select(Monitor).where(Monitor.monitor_type.in_(["http", "ping", "tcp"])).order_by(Monitor.id).limit(50)
).all()
due: list[Monitor] = []
for monitor in monitors:
if monitor.last_checked_at is None:
@@ -57,16 +60,7 @@ class Scheduler:
return due
async def _run_monitor(self, db: Session, monitor: Monitor) -> None:
config = WebsiteCheckConfig(
url=monitor.target,
expected_status=int(monitor.config.get("expected_status", 200)),
expected_text=monitor.config.get("expected_text") or None,
unexpected_text=monitor.config.get("unexpected_text") or None,
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
)
result = await run_website_check(config)
result = await self._collect_monitor_result(monitor)
now = datetime.now(UTC)
monitor.status = result.status
@@ -93,6 +87,36 @@ class Scheduler:
logger.info("Checked %s: %s (%s ms)", monitor.name, result.status, result.response_time_ms)
async def _collect_monitor_result(self, monitor: Monitor):
if monitor.monitor_type == "http":
config = WebsiteCheckConfig(
url=monitor.target,
expected_status=int(monitor.config.get("expected_status", 200)),
expected_text=monitor.config.get("expected_text") or None,
unexpected_text=monitor.config.get("unexpected_text") or None,
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
)
return await run_website_check(config)
if monitor.monitor_type == "ping":
config = PingCheckConfig(
host=monitor.target,
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
)
return await run_ping_check(config)
if monitor.monitor_type == "tcp":
config = TcpCheckConfig(
host=str(monitor.config.get("host") or monitor.target),
port=int(monitor.config.get("port")),
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
)
return await run_tcp_check(config)
raise ValueError(f"Unsupported monitor type: {monitor.monitor_type}")
async def _evaluate_rule(self, db: Session, monitor: Monitor, rule: AlertRule, now: datetime, message: str) -> None:
open_incident = db.scalar(
select(Incident).where(