Add ping and TCP monitor types
Adds ping and TCP monitor creation APIs, worker collectors, network checks UI, dashboard monitor status support, and progress documentation.
This commit is contained in:
@@ -0,0 +1,108 @@
|
||||
import asyncio
|
||||
import os
|
||||
import socket
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
from time import perf_counter
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class NetworkCheckResult:
|
||||
status: str
|
||||
response_time_ms: int | None
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PingCheckConfig:
|
||||
host: str
|
||||
timeout_seconds: float = 5.0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TcpCheckConfig:
|
||||
host: str
|
||||
port: int
|
||||
timeout_seconds: float = 5.0
|
||||
|
||||
|
||||
async def run_ping_check(config: PingCheckConfig) -> NetworkCheckResult:
|
||||
try:
|
||||
response_time_ms = await asyncio.to_thread(_run_ping_check_sync, config.host, config.timeout_seconds)
|
||||
except PermissionError:
|
||||
return NetworkCheckResult(status="down", response_time_ms=None, message="ICMP ping requires raw socket permission")
|
||||
except TimeoutError:
|
||||
return NetworkCheckResult(status="down", response_time_ms=None, message="Ping timed out")
|
||||
except OSError as exc:
|
||||
return NetworkCheckResult(status="down", response_time_ms=None, message=f"Ping failed: {exc}")
|
||||
return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="Ping check passed")
|
||||
|
||||
|
||||
async def run_tcp_check(config: TcpCheckConfig) -> NetworkCheckResult:
|
||||
started = perf_counter()
|
||||
try:
|
||||
connection = asyncio.open_connection(config.host, config.port)
|
||||
reader, writer = await asyncio.wait_for(connection, timeout=config.timeout_seconds)
|
||||
writer.close()
|
||||
await writer.wait_closed()
|
||||
reader.feed_eof()
|
||||
except (TimeoutError, OSError) as exc:
|
||||
return NetworkCheckResult(status="down", response_time_ms=None, message=f"TCP connection failed: {exc}")
|
||||
|
||||
response_time_ms = int((perf_counter() - started) * 1000)
|
||||
return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="TCP connection succeeded")
|
||||
|
||||
|
||||
def _run_ping_check_sync(host: str, timeout_seconds: float) -> int:
|
||||
address = _resolve_ipv4(host)
|
||||
identifier = os.getpid() & 0xFFFF
|
||||
sequence = 1
|
||||
packet = _build_icmp_echo_request(identifier, sequence)
|
||||
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP) as sock:
|
||||
sock.settimeout(timeout_seconds)
|
||||
started = perf_counter()
|
||||
sock.sendto(packet, (address, 0))
|
||||
|
||||
while True:
|
||||
response, _ = sock.recvfrom(1024)
|
||||
if _matches_icmp_echo_reply(response, identifier, sequence):
|
||||
return int((perf_counter() - started) * 1000)
|
||||
|
||||
|
||||
def _resolve_ipv4(host: str) -> str:
|
||||
results = socket.getaddrinfo(host, None, socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP)
|
||||
if not results:
|
||||
raise OSError("Could not resolve an IPv4 address")
|
||||
return str(results[0][4][0])
|
||||
|
||||
|
||||
def _build_icmp_echo_request(identifier: int, sequence: int) -> bytes:
|
||||
payload = b"OrbitalWard ping"
|
||||
header = struct.pack("!BBHHH", 8, 0, 0, identifier, sequence)
|
||||
checksum = _icmp_checksum(header + payload)
|
||||
header = struct.pack("!BBHHH", 8, 0, checksum, identifier, sequence)
|
||||
return header + payload
|
||||
|
||||
|
||||
def _matches_icmp_echo_reply(response: bytes, identifier: int, sequence: int) -> bool:
|
||||
if len(response) < 28:
|
||||
return False
|
||||
ip_header_length = (response[0] & 0x0F) * 4
|
||||
icmp_header = response[ip_header_length : ip_header_length + 8]
|
||||
if len(icmp_header) < 8:
|
||||
return False
|
||||
icmp_type, _, _, reply_identifier, reply_sequence = struct.unpack("!BBHHH", icmp_header)
|
||||
return icmp_type == 0 and reply_identifier == identifier and reply_sequence == sequence
|
||||
|
||||
|
||||
def _icmp_checksum(data: bytes) -> int:
|
||||
if len(data) % 2:
|
||||
data += b"\x00"
|
||||
|
||||
checksum = 0
|
||||
for index in range(0, len(data), 2):
|
||||
checksum += (data[index] << 8) + data[index + 1]
|
||||
checksum = (checksum & 0xFFFF) + (checksum >> 16)
|
||||
|
||||
return ~checksum & 0xFFFF
|
||||
+37
-13
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
|
||||
import httpx
|
||||
|
||||
from app.collectors.website import WebsiteCheckConfig, run_website_check
|
||||
from app.collectors.network import PingCheckConfig, TcpCheckConfig, run_ping_check, run_tcp_check
|
||||
from app.config import settings
|
||||
from app.db import session_scope
|
||||
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, NotificationChannel
|
||||
@@ -33,7 +34,7 @@ class Scheduler:
|
||||
async def tick(self) -> None:
|
||||
try:
|
||||
with session_scope() as db:
|
||||
due_monitors = self._load_due_website_monitors(db)
|
||||
due_monitors = self._load_due_monitors(db)
|
||||
for monitor in due_monitors:
|
||||
await self._run_monitor(db, monitor)
|
||||
db.commit()
|
||||
@@ -43,9 +44,11 @@ class Scheduler:
|
||||
def stop(self) -> None:
|
||||
self._stopped.set()
|
||||
|
||||
def _load_due_website_monitors(self, db: Session) -> list[Monitor]:
|
||||
def _load_due_monitors(self, db: Session) -> list[Monitor]:
|
||||
now = datetime.now(UTC)
|
||||
monitors = db.scalars(select(Monitor).where(Monitor.monitor_type == "http").order_by(Monitor.id).limit(50)).all()
|
||||
monitors = db.scalars(
|
||||
select(Monitor).where(Monitor.monitor_type.in_(["http", "ping", "tcp"])).order_by(Monitor.id).limit(50)
|
||||
).all()
|
||||
due: list[Monitor] = []
|
||||
for monitor in monitors:
|
||||
if monitor.last_checked_at is None:
|
||||
@@ -57,16 +60,7 @@ class Scheduler:
|
||||
return due
|
||||
|
||||
async def _run_monitor(self, db: Session, monitor: Monitor) -> None:
|
||||
config = WebsiteCheckConfig(
|
||||
url=monitor.target,
|
||||
expected_status=int(monitor.config.get("expected_status", 200)),
|
||||
expected_text=monitor.config.get("expected_text") or None,
|
||||
unexpected_text=monitor.config.get("unexpected_text") or None,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
|
||||
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
|
||||
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
|
||||
)
|
||||
result = await run_website_check(config)
|
||||
result = await self._collect_monitor_result(monitor)
|
||||
now = datetime.now(UTC)
|
||||
|
||||
monitor.status = result.status
|
||||
@@ -93,6 +87,36 @@ class Scheduler:
|
||||
|
||||
logger.info("Checked %s: %s (%s ms)", monitor.name, result.status, result.response_time_ms)
|
||||
|
||||
async def _collect_monitor_result(self, monitor: Monitor):
|
||||
if monitor.monitor_type == "http":
|
||||
config = WebsiteCheckConfig(
|
||||
url=monitor.target,
|
||||
expected_status=int(monitor.config.get("expected_status", 200)),
|
||||
expected_text=monitor.config.get("expected_text") or None,
|
||||
unexpected_text=monitor.config.get("unexpected_text") or None,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
|
||||
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
|
||||
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
|
||||
)
|
||||
return await run_website_check(config)
|
||||
|
||||
if monitor.monitor_type == "ping":
|
||||
config = PingCheckConfig(
|
||||
host=monitor.target,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
|
||||
)
|
||||
return await run_ping_check(config)
|
||||
|
||||
if monitor.monitor_type == "tcp":
|
||||
config = TcpCheckConfig(
|
||||
host=str(monitor.config.get("host") or monitor.target),
|
||||
port=int(monitor.config.get("port")),
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
|
||||
)
|
||||
return await run_tcp_check(config)
|
||||
|
||||
raise ValueError(f"Unsupported monitor type: {monitor.monitor_type}")
|
||||
|
||||
async def _evaluate_rule(self, db: Session, monitor: Monitor, rule: AlertRule, now: datetime, message: str) -> None:
|
||||
open_incident = db.scalar(
|
||||
select(Incident).where(
|
||||
|
||||
Reference in New Issue
Block a user