Add ping and TCP monitor types

Adds ping and TCP monitor creation APIs, worker collectors, network checks UI, dashboard monitor status support, and progress documentation.
This commit is contained in:
Keith Smith
2026-05-23 15:01:57 -06:00
parent 597ff18c2a
commit 16932957b2
13 changed files with 577 additions and 35 deletions
+91 -4
View File
@@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
from app.auth.dependencies import get_current_user, require_role
from app.db.session import get_db
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, User
from app.schemas.core import CheckResultRead, MonitorCreate, MonitorRead, MonitorUpdate, WebsiteMonitorCreate
from app.schemas.core import CheckResultRead, MonitorCreate, MonitorRead, MonitorUpdate, PingMonitorCreate, TcpMonitorCreate, WebsiteMonitorCreate
router = APIRouter(prefix="/monitors", tags=["monitors"])
@@ -80,6 +80,93 @@ def create_website_monitor(
return monitor
@router.post("/ping", response_model=MonitorRead)
def create_ping_monitor(
    payload: PingMonitorCreate,
    _: User = Depends(require_role("admin")),
    db: Session = Depends(get_db),
) -> Monitor:
    # Admin-only endpoint: creates a ping monitor and, depending on the
    # payload flags, a backing "host" asset and a "status_not_up" alert rule.
    # Everything is flushed incrementally and committed once at the end.
    # (Kept as comments on purpose: a docstring here would be published as the
    # OpenAPI operation description.)
    linked_asset_id: int | None = None
    if payload.create_asset:
        host_asset = Asset(
            name=payload.name,
            asset_type="host",
            address=payload.host,
            status="unknown",
            extra={},
        )
        db.add(host_asset)
        # Flush so the database assigns the asset's primary key.
        db.flush()
        linked_asset_id = host_asset.id

    ping_monitor = Monitor(
        asset_id=linked_asset_id,
        name=payload.name,
        monitor_type="ping",
        target=payload.host,
        config={"timeout_seconds": payload.timeout_seconds},
        interval_seconds=payload.interval_seconds,
        status="unknown",
    )
    db.add(ping_monitor)
    # Flush so the alert rule below can reference the monitor id.
    db.flush()

    if payload.alert_enabled:
        failure_rule = AlertRule(
            monitor_id=ping_monitor.id,
            name=f"{payload.name} ping failure",
            severity=payload.alert_severity,
            condition={"type": "status_not_up"},
            failure_threshold=payload.failure_threshold,
            cooldown_seconds=300,
            is_enabled=True,
        )
        db.add(failure_rule)

    db.commit()
    db.refresh(ping_monitor)
    return ping_monitor
@router.post("/tcp", response_model=MonitorRead)
def create_tcp_monitor(
    payload: TcpMonitorCreate,
    _: User = Depends(require_role("admin")),
    db: Session = Depends(get_db),
) -> Monitor:
    # Admin-only endpoint: creates a TCP port monitor and, depending on the
    # payload flags, a backing "tcp_service" asset and a "status_not_up"
    # alert rule. The monitor target is stored as "host:port" while the
    # config keeps host and port as separate values.
    # (Kept as comments on purpose: a docstring here would be published as the
    # OpenAPI operation description.)
    endpoint = f"{payload.host}:{payload.port}"

    linked_asset_id: int | None = None
    if payload.create_asset:
        service_asset = Asset(
            name=payload.name,
            asset_type="tcp_service",
            address=endpoint,
            status="unknown",
            extra={},
        )
        db.add(service_asset)
        # Flush so the database assigns the asset's primary key.
        db.flush()
        linked_asset_id = service_asset.id

    tcp_monitor = Monitor(
        asset_id=linked_asset_id,
        name=payload.name,
        monitor_type="tcp",
        target=endpoint,
        config={"host": payload.host, "port": payload.port, "timeout_seconds": payload.timeout_seconds},
        interval_seconds=payload.interval_seconds,
        status="unknown",
    )
    db.add(tcp_monitor)
    # Flush so the alert rule below can reference the monitor id.
    db.flush()

    if payload.alert_enabled:
        failure_rule = AlertRule(
            monitor_id=tcp_monitor.id,
            name=f"{payload.name} TCP connection failure",
            severity=payload.alert_severity,
            condition={"type": "status_not_up"},
            failure_threshold=payload.failure_threshold,
            cooldown_seconds=300,
            is_enabled=True,
        )
        db.add(failure_rule)

    db.commit()
    db.refresh(tcp_monitor)
    return tcp_monitor
@router.get("/{monitor_id}", response_model=MonitorRead)
def get_monitor(monitor_id: int, _: User = Depends(get_current_user), db: Session = Depends(get_db)) -> Monitor:
monitor = db.get(Monitor, monitor_id)
@@ -110,7 +197,7 @@ def update_monitor(
@router.delete("/{monitor_id}", status_code=204)
def delete_monitor(
monitor_id: int,
cleanup_orphan_website_asset: bool = True,
cleanup_orphan_asset: bool = True,
_: User = Depends(require_role("admin")),
db: Session = Depends(get_db),
) -> None:
@@ -129,10 +216,10 @@ def delete_monitor(
db.delete(monitor)
db.flush()
if cleanup_orphan_website_asset and asset_id is not None:
if cleanup_orphan_asset and asset_id is not None:
remaining = db.scalar(select(func.count(Monitor.id)).where(Monitor.asset_id == asset_id))
asset = db.get(Asset, asset_id)
if remaining == 0 and asset is not None and asset.asset_type == "website":
if remaining == 0 and asset is not None and asset.asset_type in {"website", "host", "tcp_service"}:
db.delete(asset)
db.commit()
+23
View File
@@ -73,6 +73,29 @@ class WebsiteMonitorCreate(BaseModel):
failure_threshold: int = Field(default=3, ge=1, le=20)
class PingMonitorCreate(BaseModel):
    # Request body for POST /monitors/ping.
    # (Plain comments rather than a docstring so the generated schema
    # description is left untouched.)

    # Display name; also used for the optional asset and alert rule names.
    name: str = Field(min_length=1, max_length=160)
    # Host to ping; stored as the monitor target.
    host: str = Field(min_length=1, max_length=255)
    # Per-check timeout, 1-60 seconds.
    timeout_seconds: int = Field(default=5, ge=1, le=60)
    # Seconds between checks; at least 10.
    interval_seconds: int = Field(default=60, ge=10)
    # When true, the route also creates a "host" asset linked to the monitor.
    create_asset: bool = True
    # When true, the route also creates a "status_not_up" alert rule.
    alert_enabled: bool = True
    # Severity copied onto the alert rule. NOTE(review): free-form string;
    # the set of valid severities is not visible here - confirm.
    alert_severity: str = "warning"
    # Failure threshold copied onto the alert rule, 1-20.
    failure_threshold: int = Field(default=3, ge=1, le=20)
class TcpMonitorCreate(BaseModel):
    # Request body for POST /monitors/tcp.
    # (Plain comments rather than a docstring so the generated schema
    # description is left untouched.)

    # Display name; also used for the optional asset and alert rule names.
    name: str = Field(min_length=1, max_length=160)
    # Host to connect to; combined with `port` into the "host:port" target.
    host: str = Field(min_length=1, max_length=255)
    # TCP port, 1-65535.
    port: int = Field(ge=1, le=65535)
    # Per-check timeout, 1-60 seconds.
    timeout_seconds: int = Field(default=5, ge=1, le=60)
    # Seconds between checks; at least 10.
    interval_seconds: int = Field(default=60, ge=10)
    # When true, the route also creates a "tcp_service" asset linked to the monitor.
    create_asset: bool = True
    # When true, the route also creates a "status_not_up" alert rule.
    alert_enabled: bool = True
    # Severity copied onto the alert rule. NOTE(review): free-form string;
    # the set of valid severities is not visible here - confirm.
    alert_severity: str = "warning"
    # Failure threshold copied onto the alert rule, 1-20.
    failure_threshold: int = Field(default=3, ge=1, le=20)
class CheckResultRead(BaseModel):
id: int
monitor_id: int
+4 -1
View File
@@ -30,6 +30,7 @@ OrbitalWard is a secure monitoring appliance focused on the v0.1 vertical slice:
- Website monitor create/edit/delete flow.
- HTTP status and expected-text checks.
- Optional TLS certificate expiry checks for HTTPS monitors.
- Ping and TCP port monitor create/edit/delete flow.
- Alert rules, incident opening/resolution, acknowledge, silence, and webhook notifications.
- Generic webhook, Mattermost, and Zoom Team Chat notification channels.
- Saved webhook URLs encrypted at rest and not returned to the UI.
@@ -43,6 +44,8 @@ After the rename and TLS expiry work, these checks passed in Docker:
- `docker compose -f docker-compose.dev.yml exec -T frontend npm run typecheck`
- `docker compose -f docker-compose.dev.yml exec -T worker python -m compileall app`
- Backend health returned `{"status":"ok","service":"orbitalward-backend"}`.
- Direct worker probes for TCP and ICMP ping checks passed inside the Docker network.
- API probe created and deleted one ping monitor and one TCP monitor successfully.
The final Compose project uses `orbitalward-*` containers, images, network, and volumes.
@@ -72,7 +75,7 @@ Issue source docs:
- `docs/progress.md`
- `docs/roadmap.md`
Current completed items include TLS expiry monitor support, HTTP/website checks, basic alert evaluation, incident actions, and webhook notification channels. Next recommended work starts with ping and TCP port monitors.
Current completed items include TLS expiry monitor support, HTTP/website checks, ping and TCP port checks, basic alert evaluation, incident actions, and webhook notification channels. The next recommended implementation issue is alert rule editing UI.
## Guardrails
+1
View File
@@ -57,6 +57,7 @@ Completed in the initial scaffold:
- React frontend skeleton with authenticated layout.
- Worker skeleton with working HTTP website monitor polling.
- Website monitor create/edit/delete UI.
- Ping and TCP port monitor collectors and UI.
- Basic alert evaluation, incidents, acknowledge, and silence actions.
- Generic webhook, Mattermost, and Zoom Team Chat channel foundations.
- Encrypted webhook URL storage.
+15 -10
View File
@@ -24,6 +24,13 @@ Implemented website-monitor slice:
- Incidents can be acknowledged and silenced from the UI.
- Deleting a monitor resolves any open incidents tied to that monitor.
Implemented network-monitor slice:
- Create, edit, and delete ping and TCP port monitors from the UI.
- Worker performs ICMP ping checks and TCP connection checks.
- Ping and TCP monitors use the same alert rule, incident, recovery, and notification flow as website monitors.
- Dashboard monitor status includes website, ping, and TCP monitors.
Implemented notification slice:
- Create, edit, test, and delete notification channels from the UI.
@@ -43,7 +50,6 @@ Implemented notification slice:
- Alert rule editing UI is not implemented.
- Notification routing/policies are not implemented; all enabled webhook channels receive incident notifications.
- Email/SMTP notifications are not implemented yet.
- Ping and TCP checks are not implemented yet.
- Graphing exists only as placeholders; metric visualization is not implemented.
- Worker scheduling is simple polling, not a Redis queue yet.
- Tests are still minimal and need meaningful backend/worker/frontend coverage.
@@ -51,15 +57,14 @@ Implemented notification slice:
## Recommended Next Work
1. Add ping and TCP port monitors.
2. Add alert rule editing UI and richer alert conditions.
3. Add notification policy/routing controls.
4. Add email/SMTP notification channel.
5. Add audit event writes for auth, monitor, credential, notification, and incident actions.
6. Build credential vault UI with masked secret handling.
7. Add user administration UI.
8. Add graphs for website response time and monitor status history.
9. Add backend and worker tests for the website-monitor and notification flows.
1. Add alert rule editing UI and richer alert conditions.
2. Add notification policy/routing controls.
3. Add email/SMTP notification channel.
4. Add audit event writes for auth, monitor, credential, notification, and incident actions.
5. Build credential vault UI with masked secret handling.
6. Add user administration UI.
7. Add graphs for website response time and monitor status history.
8. Add backend and worker tests for the website-monitor and notification flows.
## Operational Notes
+12
View File
@@ -6,6 +6,8 @@ import type {
NotificationChannel,
NotificationChannelCreate,
NotificationChannelUpdate,
PingMonitorCreate,
TcpMonitorCreate,
User,
WebsiteMonitorCreate,
} from "../types/api";
@@ -61,6 +63,16 @@ export const api = {
method: "POST",
body: JSON.stringify(payload),
}),
// POSTs the payload to /monitors/ping and resolves with the created monitor.
createPingMonitor: (token: string, payload: PingMonitorCreate) =>
  request<Monitor>("/monitors/ping", token, {
    method: "POST",
    body: JSON.stringify(payload),
  }),
// POSTs the payload to /monitors/tcp and resolves with the created monitor.
createTcpMonitor: (token: string, payload: TcpMonitorCreate) =>
  request<Monitor>("/monitors/tcp", token, {
    method: "POST",
    body: JSON.stringify(payload),
  }),
updateMonitor: (token: string, monitorId: number, payload: MonitorUpdate) =>
request<Monitor>(`/monitors/${monitorId}`, token, {
method: "PATCH",
+4
View File
@@ -7,6 +7,7 @@ import { AlertsPage } from "../pages/AlertsPage";
import { DashboardPage } from "../pages/DashboardPage";
import { ListPage } from "../pages/ListPage";
import { LoginPage } from "../pages/LoginPage";
import { NetworkChecksPage } from "../pages/NetworkChecksPage";
import { NotificationsPage } from "../pages/NotificationsPage";
import { WebsitesPage } from "../pages/WebsitesPage";
import type { Asset, Incident, Monitor } from "../types/api";
@@ -79,6 +80,9 @@ export function App() {
{page === "websites" ? (
<WebsitesPage token={auth.token} monitors={monitors} onCreated={refreshData} />
) : null}
{page === "network-checks" ? (
<NetworkChecksPage token={auth.token} monitors={monitors} onChanged={refreshData} />
) : null}
{page === "alerts" ? (
<AlertsPage token={auth.token} incidents={incidents} selectedIncidentId={selectedIncidentId} onChanged={refreshData} />
) : null}
+2
View File
@@ -7,6 +7,7 @@ import {
KeyRound,
LogOut,
Network,
PlugZap,
Radar,
Settings,
Shield,
@@ -20,6 +21,7 @@ const navigation = [
{ id: "dashboard", label: "Dashboard", icon: Gauge },
{ id: "assets", label: "Assets", icon: Network },
{ id: "websites", label: "Websites", icon: Globe },
{ id: "network-checks", label: "Network Checks", icon: PlugZap },
{ id: "alerts", label: "Alerts", icon: Bell },
{ id: "discovery", label: "Discovery", icon: Radar },
{ id: "graphs", label: "Graphs", icon: Activity },
+7 -7
View File
@@ -11,7 +11,6 @@ interface DashboardPageProps {
export function DashboardPage({ assets, monitors, incidents }: DashboardPageProps) {
const attentionMonitors = monitors.filter((monitor) => monitor.status !== "up" && monitor.status !== "unknown").length;
const activeIncidents = incidents.filter((incident) => incident.status === "open").length;
const websites = monitors.filter((monitor) => monitor.monitor_type === "http");
return (
<div className="space-y-6">
@@ -26,29 +25,30 @@ export function DashboardPage({ assets, monitors, incidents }: DashboardPageProp
<StatusTile icon={CheckCircle2} label="Overall Status" value={activeIncidents ? "Attention" : "Healthy"} tone={activeIncidents ? "warn" : "ok"} />
<StatusTile icon={AlertTriangle} label="Active Incidents" value={String(activeIncidents)} tone={activeIncidents ? "warn" : "ok"} />
<StatusTile icon={Server} label="Assets" value={String(assets.length)} />
<StatusTile icon={Globe2} label="Websites" value={String(websites.length)} />
<StatusTile icon={Globe2} label="Monitors" value={String(monitors.length)} />
</section>
<section className="grid gap-4 xl:grid-cols-[minmax(0,1fr)_360px]">
<div className="rounded-md border border-line bg-[#0d131c]">
<div className="flex items-center justify-between border-b border-line p-4">
<h2 className="text-base font-semibold">Website Monitors</h2>
<h2 className="text-base font-semibold">Monitor Status</h2>
<span className="text-sm text-slate-400">{attentionMonitors} need attention</span>
</div>
<div className="divide-y divide-line">
{websites.length ? (
websites.map((monitor) => (
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_120px_110px] md:items-center">
{monitors.length ? (
monitors.map((monitor) => (
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_90px_120px_110px] md:items-center">
<div>
<div className="font-medium">{monitor.name}</div>
<div className="truncate text-sm text-slate-400">{monitor.target}</div>
</div>
<div className="text-sm uppercase text-slate-400">{monitor.monitor_type}</div>
<div className="text-sm text-slate-400">{monitor.interval_seconds}s interval</div>
<StatusBadge status={monitor.status} />
</div>
))
) : (
<div className="p-6 text-sm text-slate-400">No website monitors yet.</div>
<div className="p-6 text-sm text-slate-400">No monitors yet.</div>
)}
</div>
</div>
+250
View File
@@ -0,0 +1,250 @@
import { FormEvent, useState } from "react";
import { Activity, Edit3, PlugZap, Plus, RefreshCw, Trash2, X } from "lucide-react";
import { api } from "../api/client";
import { Button } from "../components/Button";
import type { Monitor } from "../types/api";
/** Props for the Network Checks page. */
interface NetworkChecksPageProps {
  /** Auth token passed to every API call made by the page. */
  token: string;
  /** All monitors; the page filters this list down to ping and TCP monitors. */
  monitors: Monitor[];
  /** Called after a create, update or delete, and by the Refresh button. */
  onChanged: () => Promise<void>;
}
/** The two monitor types this page can create and edit. */
type NetworkCheckType = "ping" | "tcp";
export function NetworkChecksPage({ token, monitors, onChanged }: NetworkChecksPageProps) {
const networkChecks = monitors.filter((monitor) => monitor.monitor_type === "ping" || monitor.monitor_type === "tcp");
const [checkType, setCheckType] = useState<NetworkCheckType>("ping");
const [name, setName] = useState("");
const [host, setHost] = useState("");
const [port, setPort] = useState(443);
const [timeoutSeconds, setTimeoutSeconds] = useState(5);
const [intervalSeconds, setIntervalSeconds] = useState(60);
const [failureThreshold, setFailureThreshold] = useState(3);
const [alertEnabled, setAlertEnabled] = useState(true);
const [editingMonitorId, setEditingMonitorId] = useState<number | null>(null);
const [submitting, setSubmitting] = useState(false);
const [deletingId, setDeletingId] = useState<number | null>(null);
const [error, setError] = useState<string | null>(null);
  /**
   * Form submit handler: updates the monitor being edited, or creates a new
   * TCP or ping monitor depending on the selected check type. On success the
   * form is reset and the parent is asked to refresh; on failure the error
   * message is shown in the form.
   */
  async function handleSubmit(event: FormEvent) {
    event.preventDefault();
    setSubmitting(true);
    setError(null);
    try {
      if (editingMonitorId) {
        // Edit mode goes through the generic monitor update; alert settings are not sent.
        await api.updateMonitor(token, editingMonitorId, {
          name,
          target: checkType === "tcp" ? `${host}:${port}` : host,
          interval_seconds: intervalSeconds,
          config: checkType === "tcp" ? { host, port, timeout_seconds: timeoutSeconds } : { timeout_seconds: timeoutSeconds },
        });
      } else if (checkType === "tcp") {
        await api.createTcpMonitor(token, {
          name,
          host,
          port,
          timeout_seconds: timeoutSeconds,
          interval_seconds: intervalSeconds,
          create_asset: true,
          alert_enabled: alertEnabled,
          alert_severity: "warning",
          failure_threshold: failureThreshold,
        });
      } else {
        await api.createPingMonitor(token, {
          name,
          host,
          timeout_seconds: timeoutSeconds,
          interval_seconds: intervalSeconds,
          create_asset: true,
          alert_enabled: alertEnabled,
          alert_severity: "warning",
          failure_threshold: failureThreshold,
        });
      }
      resetForm();
      await onChanged();
    } catch (err) {
      setError(err instanceof Error ? err.message : "Could not save network check");
    } finally {
      // Always re-enable the submit button, whether the save succeeded or not.
      setSubmitting(false);
    }
  }
function startEdit(monitor: Monitor) {
const nextType = monitor.monitor_type === "tcp" ? "tcp" : "ping";
setEditingMonitorId(monitor.id);
setCheckType(nextType);
setName(monitor.name);
setHost(nextType === "tcp" ? String(monitor.config?.host ?? monitor.target.split(":")[0] ?? "") : monitor.target);
setPort(Number(monitor.config?.port ?? 443));
setTimeoutSeconds(Number(monitor.config?.timeout_seconds ?? 5));
setIntervalSeconds(monitor.interval_seconds);
setAlertEnabled(true);
setFailureThreshold(3);
setError(null);
}
function resetForm() {
setEditingMonitorId(null);
setCheckType("ping");
setName("");
setHost("");
setPort(443);
setTimeoutSeconds(5);
setIntervalSeconds(60);
setFailureThreshold(3);
setAlertEnabled(true);
}
async function deleteMonitor(monitorId: number) {
setDeletingId(monitorId);
setError(null);
try {
await api.deleteMonitor(token, monitorId);
await onChanged();
} catch (err) {
setError(err instanceof Error ? err.message : "Could not delete network check");
} finally {
setDeletingId(null);
}
}
return (
<div className="space-y-6">
<div className="flex flex-col justify-between gap-4 md:flex-row md:items-end">
<div>
<h1 className="text-3xl font-semibold">Network Checks</h1>
<p className="mt-2 text-sm text-slate-400">ICMP ping checks and TCP port availability checks.</p>
</div>
<Button variant="ghost" onClick={onChanged}>
<RefreshCw size={16} />
Refresh
</Button>
</div>
<section className="grid gap-5 xl:grid-cols-[420px_minmax(0,1fr)]">
<form className="space-y-4 rounded-md border border-line bg-[#0d131c] p-5" onSubmit={handleSubmit}>
<div className="flex items-center gap-2">
{checkType === "tcp" ? <PlugZap size={18} className="text-pulse" /> : <Activity size={18} className="text-pulse" />}
<h2 className="text-base font-semibold">{editingMonitorId ? "Edit Network Check" : "Add Network Check"}</h2>
</div>
<div className="grid grid-cols-2 rounded-md border border-line bg-slate-950 p-1">
<button className={modeClass(checkType === "ping")} disabled={Boolean(editingMonitorId)} onClick={() => setCheckType("ping")} type="button">
Ping
</button>
<button className={modeClass(checkType === "tcp")} disabled={Boolean(editingMonitorId)} onClick={() => setCheckType("tcp")} type="button">
TCP
</button>
</div>
<label className="block space-y-2">
<span className="text-sm text-slate-300">Name</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={name} onChange={(event) => setName(event.target.value)} required />
</label>
<label className="block space-y-2">
<span className="text-sm text-slate-300">Host</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={host} onChange={(event) => setHost(event.target.value)} placeholder="router.local or 192.168.1.1" required />
</label>
<div className="grid gap-3 sm:grid-cols-3">
{checkType === "tcp" ? (
<label className="block space-y-2">
<span className="text-sm text-slate-300">Port</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={port} onChange={(event) => setPort(Number(event.target.value))} min={1} max={65535} type="number" />
</label>
) : null}
<label className="block space-y-2">
<span className="text-sm text-slate-300">Timeout</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={timeoutSeconds} onChange={(event) => setTimeoutSeconds(Number(event.target.value))} min={1} max={60} type="number" />
</label>
<label className="block space-y-2">
<span className="text-sm text-slate-300">Interval</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={intervalSeconds} onChange={(event) => setIntervalSeconds(Number(event.target.value))} min={10} type="number" />
</label>
</div>
{!editingMonitorId ? (
<div className="flex items-center justify-between rounded-md border border-line bg-slate-950 px-3 py-2">
<span className="text-sm text-slate-300">Alert on repeated failures</span>
<input className="h-5 w-5 accent-teal-400" checked={alertEnabled} onChange={(event) => setAlertEnabled(event.target.checked)} type="checkbox" />
</div>
) : null}
{!editingMonitorId ? (
<label className="block space-y-2">
<span className="text-sm text-slate-300">Failure Threshold</span>
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={failureThreshold} onChange={(event) => setFailureThreshold(Number(event.target.value))} min={1} max={20} type="number" />
</label>
) : null}
{error ? <div className="rounded-md border border-red-500/40 bg-red-950/40 p-3 text-sm text-red-200">{error}</div> : null}
<div className="flex gap-2">
{editingMonitorId ? (
<Button className="flex-1" onClick={resetForm} type="button" variant="ghost">
<X size={16} />
Cancel
</Button>
) : null}
<Button className="flex-1" disabled={submitting} type="submit">
<Plus size={16} />
{submitting ? "Saving..." : editingMonitorId ? "Save Check" : "Create Check"}
</Button>
</div>
</form>
<div className="rounded-md border border-line bg-[#0d131c]">
<div className="border-b border-line p-4">
<h2 className="text-base font-semibold">Configured Network Checks</h2>
</div>
<div className="divide-y divide-line">
{networkChecks.length ? (
networkChecks.map((monitor) => (
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_90px_110px_170px] md:items-center">
<div>
<div className="font-medium">{monitor.name}</div>
<div className="truncate text-sm text-slate-400">{monitor.target}</div>
</div>
<span className="text-sm uppercase text-slate-400">{monitor.monitor_type}</span>
<Status status={monitor.status} />
<div className="flex items-center justify-between gap-3">
<div className="text-sm text-slate-400">{monitor.last_checked_at ? new Date(monitor.last_checked_at).toLocaleTimeString() : "Not checked"}</div>
<Button aria-label={`Edit ${monitor.name}`} className="h-8 w-8 px-0" onClick={() => startEdit(monitor)} title="Edit check" type="button" variant="ghost">
<Edit3 size={15} />
</Button>
<Button aria-label={`Delete ${monitor.name}`} className="h-8 w-8 px-0" disabled={deletingId === monitor.id} onClick={() => deleteMonitor(monitor.id)} title="Delete check" type="button" variant="ghost">
<Trash2 size={15} />
</Button>
</div>
</div>
))
) : (
<div className="p-6 text-sm text-slate-400">No network checks yet.</div>
)}
</div>
</div>
</section>
</div>
);
}
/** Builds the class list for a Ping/TCP toggle button; `active` marks the selected mode. */
function modeClass(active: boolean) {
  const shared = "h-8 rounded-md text-sm transition disabled:opacity-70";
  const tone = active ? "bg-slate-800 text-white" : "text-slate-400 hover:bg-slate-900 hover:text-white";
  return `${shared} ${tone}`;
}
/** Fixed-width status pill: teal for "up", red for "down", amber for "warning", slate otherwise. */
function Status({ status }: { status: string }) {
  let classes: string;
  switch (status) {
    case "up":
      classes = "border-teal-500/40 bg-teal-950/40 text-teal-200";
      break;
    case "down":
      classes = "border-red-500/40 bg-red-950/40 text-red-200";
      break;
    case "warning":
      classes = "border-amber-500/40 bg-amber-950/40 text-amber-200";
      break;
    default:
      // Any other status (for example "unknown") gets the neutral palette.
      classes = "border-slate-600 bg-slate-900 text-slate-300";
  }
  return <span className={`inline-flex h-7 w-24 items-center justify-center rounded-md border text-xs font-medium ${classes}`}>{status}</span>;
}
+23
View File
@@ -94,3 +94,26 @@ export interface WebsiteMonitorCreate {
alert_severity: string;
failure_threshold: number;
}
/** Request body sent to POST /monitors/ping. */
export interface PingMonitorCreate {
  /** Display name of the monitor. */
  name: string;
  /** Host to ping. */
  host: string;
  /** Per-check timeout in seconds. */
  timeout_seconds: number;
  /** Seconds between checks. */
  interval_seconds: number;
  /** Whether the backend should also create an asset for the host. */
  create_asset: boolean;
  /** Whether the backend should also create a failure alert rule. */
  alert_enabled: boolean;
  /** Severity assigned to the alert rule. */
  alert_severity: string;
  /** Failure threshold assigned to the alert rule. */
  failure_threshold: number;
}
/** Request body sent to POST /monitors/tcp. */
export interface TcpMonitorCreate {
  /** Display name of the monitor. */
  name: string;
  /** Host to connect to. */
  host: string;
  /** TCP port to connect to. */
  port: number;
  /** Per-check timeout in seconds. */
  timeout_seconds: number;
  /** Seconds between checks. */
  interval_seconds: number;
  /** Whether the backend should also create an asset for the service. */
  create_asset: boolean;
  /** Whether the backend should also create a failure alert rule. */
  alert_enabled: boolean;
  /** Severity assigned to the alert rule. */
  alert_severity: string;
  /** Failure threshold assigned to the alert rule. */
  failure_threshold: number;
}
+108
View File
@@ -0,0 +1,108 @@
import asyncio
import os
import socket
import struct
from dataclasses import dataclass
from time import perf_counter
@dataclass(frozen=True)
class NetworkCheckResult:
    """Outcome of a single ping or TCP check."""

    # "up" or "down" as produced by the checks in this module.
    status: str
    # Elapsed time in whole milliseconds; None when the check failed.
    response_time_ms: int | None
    # Human-readable description of the outcome.
    message: str
@dataclass(frozen=True)
class PingCheckConfig:
    """Parameters for one ICMP ping check."""

    # Hostname or IPv4 address to ping (resolved to IPv4 before sending).
    host: str
    # Socket timeout in seconds.
    timeout_seconds: float = 5.0
@dataclass(frozen=True)
class TcpCheckConfig:
    """Parameters for one TCP connection check."""

    # Host to connect to.
    host: str
    # TCP port to connect to.
    port: int
    # Connection timeout in seconds.
    timeout_seconds: float = 5.0
async def run_ping_check(config: PingCheckConfig) -> NetworkCheckResult:
    """Ping ``config.host`` once and report the outcome.

    The blocking raw-socket ping runs in a worker thread. Permission errors,
    timeouts and other OS-level errors are turned into a "down" result
    instead of being raised.
    """
    try:
        elapsed_ms = await asyncio.to_thread(_run_ping_check_sync, config.host, config.timeout_seconds)
    # PermissionError and TimeoutError are OSError subclasses, so they must be
    # handled before the generic OSError branch.
    except PermissionError:
        failure_message = "ICMP ping requires raw socket permission"
    except TimeoutError:
        failure_message = "Ping timed out"
    except OSError as exc:
        failure_message = f"Ping failed: {exc}"
    else:
        return NetworkCheckResult(status="up", response_time_ms=elapsed_ms, message="Ping check passed")
    return NetworkCheckResult(status="down", response_time_ms=None, message=failure_message)
async def run_tcp_check(config: TcpCheckConfig) -> NetworkCheckResult:
    """Open a TCP connection to ``config.host:config.port`` and report the outcome.

    The check is "up" as soon as the connection is established, and the
    response time covers only the connect phase. Closing the socket afterwards
    is best-effort: a reset or slow teardown by the peer does not turn a
    successful connect into a failure.

    Returns:
        A "down" result when the connection times out or fails with an
        ``OSError`` (refused, unreachable, DNS failure, ...), otherwise an
        "up" result with the connect time in milliseconds.
    """
    started = perf_counter()
    try:
        _reader, writer = await asyncio.wait_for(
            asyncio.open_connection(config.host, config.port),
            timeout=config.timeout_seconds,
        )
    # asyncio.TimeoutError is only an alias of the builtin TimeoutError from
    # Python 3.11 on, so both are named. This must come before OSError because
    # the builtin TimeoutError is an OSError subclass.
    except (TimeoutError, asyncio.TimeoutError):
        return NetworkCheckResult(
            status="down",
            response_time_ms=None,
            message=f"TCP connection failed: timed out after {config.timeout_seconds:g}s",
        )
    except OSError as exc:
        return NetworkCheckResult(status="down", response_time_ms=None, message=f"TCP connection failed: {exc}")

    # Measure before teardown so the close handshake is not counted as latency.
    response_time_ms = int((perf_counter() - started) * 1000)

    writer.close()
    try:
        # Bound the wait so a peer that never completes the close cannot stall the worker.
        await asyncio.wait_for(writer.wait_closed(), timeout=config.timeout_seconds)
    except (TimeoutError, asyncio.TimeoutError, OSError):
        # Deliberately ignored: the connect already succeeded, which is what this check measures.
        pass

    return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="TCP connection succeeded")
def _run_ping_check_sync(host: str, timeout_seconds: float) -> int:
    """Send one ICMP echo request to ``host`` and wait for the matching reply.

    Blocking; intended to be run in a worker thread.

    Args:
        host: Hostname or IPv4 address to ping.
        timeout_seconds: Total time budget for the reply, in seconds.

    Returns:
        Round-trip time in whole milliseconds.

    Raises:
        PermissionError: If the process may not open a raw ICMP socket.
        TimeoutError: If no matching reply arrives within ``timeout_seconds``.
        OSError: For resolution or other socket failures.
    """
    address = _resolve_ipv4(host)
    identifier = os.getpid() & 0xFFFF
    sequence = 1
    packet = _build_icmp_echo_request(identifier, sequence)

    with socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP) as sock:
        sock.settimeout(timeout_seconds)
        started = perf_counter()
        deadline = started + timeout_seconds
        sock.sendto(packet, (address, 0))
        while True:
            # A raw ICMP socket also receives unrelated ICMP traffic. Shrink
            # the timeout to the time left so such packets cannot extend the
            # wait beyond the configured budget.
            remaining = deadline - perf_counter()
            if remaining <= 0:
                raise TimeoutError("timed out")
            sock.settimeout(remaining)
            response, sender = sock.recvfrom(1024)
            # The identifier/sequence pair is the same for every ping sent by
            # this process, so also require the reply to come from the pinged
            # address; this rejects late replies from an earlier check.
            if sender[0] == address and _matches_icmp_echo_reply(response, identifier, sequence):
                return int((perf_counter() - started) * 1000)
def _resolve_ipv4(host: str) -> str:
    """Resolve ``host`` to an IPv4 address string, using the first result.

    Raises:
        OSError: If resolution fails or yields no IPv4 address.
    """
    candidates = socket.getaddrinfo(host, None, socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP)
    if len(candidates) == 0:
        raise OSError("Could not resolve an IPv4 address")
    # Each entry is (family, type, proto, canonname, sockaddr); sockaddr is (ip, port).
    _family, _socktype, _proto, _canonname, sockaddr = candidates[0]
    return str(sockaddr[0])
def _build_icmp_echo_request(identifier: int, sequence: int) -> bytes:
    """Build an ICMP echo request (type 8, code 0) carrying a fixed payload.

    The checksum is computed over the header with a zero checksum field plus
    the payload, then the header is packed again with the real checksum.
    """
    body = b"OrbitalWard ping"

    def _pack_header(checksum_field: int) -> bytes:
        # type=8 (echo request), code=0, checksum, identifier, sequence.
        return struct.pack("!BBHHH", 8, 0, checksum_field, identifier, sequence)

    final_checksum = _icmp_checksum(_pack_header(0) + body)
    return _pack_header(final_checksum) + body
def _matches_icmp_echo_reply(response: bytes, identifier: int, sequence: int) -> bool:
    """Return True if ``response`` is an ICMP echo reply for this identifier and sequence.

    ``response`` is expected to start with the IPv4 header; the ICMP header is
    located using the header-length nibble of the first byte.
    """
    # Anything shorter than a minimal IPv4 header (20) plus ICMP header (8) cannot match.
    if len(response) < 28:
        return False
    icmp_offset = (response[0] & 0x0F) * 4
    if len(response) < icmp_offset + 8:
        return False
    icmp_type, _code, _checksum, reply_identifier, reply_sequence = struct.unpack_from("!BBHHH", response, icmp_offset)
    # Type 0 is an echo reply.
    return (icmp_type, reply_identifier, reply_sequence) == (0, identifier, sequence)
def _icmp_checksum(data: bytes) -> int:
    """Return the 16-bit Internet checksum (RFC 1071) of ``data``.

    ``data`` is treated as big-endian 16-bit words, zero-padded to an even
    length. The words are summed, the carries are folded back into the low 16
    bits until none remain, and the one's complement of the result is
    returned.
    """
    if len(data) % 2:
        data += b"\x00"
    total = sum((data[index] << 8) | data[index + 1] for index in range(0, len(data), 2))
    # A single fold can itself carry out of 16 bits (e.g. 0x2FFFF -> 0x10001),
    # so keep folding until the value fits.
    while total >> 16:
        total = (total & 0xFFFF) + (total >> 16)
    return ~total & 0xFFFF
+37 -13
View File
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
import httpx
from app.collectors.website import WebsiteCheckConfig, run_website_check
from app.collectors.network import PingCheckConfig, TcpCheckConfig, run_ping_check, run_tcp_check
from app.config import settings
from app.db import session_scope
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, NotificationChannel
@@ -33,7 +34,7 @@ class Scheduler:
async def tick(self) -> None:
try:
with session_scope() as db:
due_monitors = self._load_due_website_monitors(db)
due_monitors = self._load_due_monitors(db)
for monitor in due_monitors:
await self._run_monitor(db, monitor)
db.commit()
@@ -43,9 +44,11 @@ class Scheduler:
def stop(self) -> None:
self._stopped.set()
def _load_due_website_monitors(self, db: Session) -> list[Monitor]:
def _load_due_monitors(self, db: Session) -> list[Monitor]:
now = datetime.now(UTC)
monitors = db.scalars(select(Monitor).where(Monitor.monitor_type == "http").order_by(Monitor.id).limit(50)).all()
monitors = db.scalars(
select(Monitor).where(Monitor.monitor_type.in_(["http", "ping", "tcp"])).order_by(Monitor.id).limit(50)
).all()
due: list[Monitor] = []
for monitor in monitors:
if monitor.last_checked_at is None:
@@ -57,16 +60,7 @@ class Scheduler:
return due
async def _run_monitor(self, db: Session, monitor: Monitor) -> None:
config = WebsiteCheckConfig(
url=monitor.target,
expected_status=int(monitor.config.get("expected_status", 200)),
expected_text=monitor.config.get("expected_text") or None,
unexpected_text=monitor.config.get("unexpected_text") or None,
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
)
result = await run_website_check(config)
result = await self._collect_monitor_result(monitor)
now = datetime.now(UTC)
monitor.status = result.status
@@ -93,6 +87,36 @@ class Scheduler:
logger.info("Checked %s: %s (%s ms)", monitor.name, result.status, result.response_time_ms)
async def _collect_monitor_result(self, monitor: Monitor):
    """Run the collector matching ``monitor.monitor_type`` and return its result.

    Supported types are "http", "ping" and "tcp". For TCP monitors the host
    and port are read from ``monitor.config``; when either is missing they are
    parsed from the "host:port" target instead, so a monitor with an
    incomplete config still checks the right endpoint.

    Raises:
        ValueError: If the monitor type is unsupported, or a TCP monitor has
            no usable host or port in either its config or its target.
    """
    options = monitor.config or {}

    if monitor.monitor_type == "http":
        config = WebsiteCheckConfig(
            url=monitor.target,
            expected_status=int(options.get("expected_status", 200)),
            expected_text=options.get("expected_text") or None,
            unexpected_text=options.get("unexpected_text") or None,
            timeout_seconds=float(options.get("timeout_seconds", 10)),
            check_tls_expiry=bool(options.get("check_tls_expiry", False)),
            tls_warning_days=int(options.get("tls_warning_days", 30)),
        )
        return await run_website_check(config)

    if monitor.monitor_type == "ping":
        config = PingCheckConfig(
            host=monitor.target,
            timeout_seconds=float(options.get("timeout_seconds", 5)),
        )
        return await run_ping_check(config)

    if monitor.monitor_type == "tcp":
        # Split on the last colon; without a colon the whole target is the host.
        target_host, separator, target_port = monitor.target.rpartition(":")
        if not separator:
            target_host, target_port = monitor.target, ""
        host = options.get("host") or target_host
        port = options.get("port") or target_port
        if not host or not port:
            raise ValueError(f"TCP monitor {monitor.name!r} has no usable host/port (target={monitor.target!r})")
        config = TcpCheckConfig(
            host=str(host),
            port=int(port),
            timeout_seconds=float(options.get("timeout_seconds", 5)),
        )
        return await run_tcp_check(config)

    raise ValueError(f"Unsupported monitor type: {monitor.monitor_type}")
async def _evaluate_rule(self, db: Session, monitor: Monitor, rule: AlertRule, now: datetime, message: str) -> None:
open_incident = db.scalar(
select(Incident).where(