Add ping and TCP monitor types
Adds ping and TCP monitor creation APIs, worker collectors, network checks UI, dashboard monitor status support, and progress documentation.
This commit is contained in:
@@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
|
||||
from app.auth.dependencies import get_current_user, require_role
|
||||
from app.db.session import get_db
|
||||
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, User
|
||||
from app.schemas.core import CheckResultRead, MonitorCreate, MonitorRead, MonitorUpdate, WebsiteMonitorCreate
|
||||
from app.schemas.core import CheckResultRead, MonitorCreate, MonitorRead, MonitorUpdate, PingMonitorCreate, TcpMonitorCreate, WebsiteMonitorCreate
|
||||
|
||||
router = APIRouter(prefix="/monitors", tags=["monitors"])
|
||||
|
||||
@@ -80,6 +80,93 @@ def create_website_monitor(
|
||||
return monitor
|
||||
|
||||
|
||||
@router.post("/ping", response_model=MonitorRead)
def create_ping_monitor(
    payload: PingMonitorCreate,
    _: User = Depends(require_role("admin")),
    db: Session = Depends(get_db),
) -> Monitor:
    """Create a ping monitor, optionally with a host asset and an alert rule.

    The route depends on ``require_role("admin")``. Everything is added to the
    same session and committed once at the end.

    Args:
        payload: Validated request body describing the ping monitor.
        _: Resolved user from the role dependency (unused in the body).
        db: SQLAlchemy session supplied by ``get_db``.

    Returns:
        The refreshed ``Monitor`` row, serialized through ``MonitorRead``.
    """
    linked_asset_id: int | None = None
    if payload.create_asset:
        host_asset = Asset(
            name=payload.name,
            asset_type="host",
            address=payload.host,
            status="unknown",
            extra={},
        )
        db.add(host_asset)
        # Flush before reading the id that the monitor row will reference.
        db.flush()
        linked_asset_id = host_asset.id

    ping_monitor = Monitor(
        asset_id=linked_asset_id,
        name=payload.name,
        monitor_type="ping",
        target=payload.host,
        config={"timeout_seconds": payload.timeout_seconds},
        interval_seconds=payload.interval_seconds,
        status="unknown",
    )
    db.add(ping_monitor)
    # Flush before reading the monitor id used by the alert rule below.
    db.flush()

    if payload.alert_enabled:
        failure_rule = AlertRule(
            monitor_id=ping_monitor.id,
            name=f"{payload.name} ping failure",
            severity=payload.alert_severity,
            condition={"type": "status_not_up"},
            failure_threshold=payload.failure_threshold,
            cooldown_seconds=300,
            is_enabled=True,
        )
        db.add(failure_rule)

    db.commit()
    db.refresh(ping_monitor)
    return ping_monitor
|
||||
|
||||
|
||||
@router.post("/tcp", response_model=MonitorRead)
def create_tcp_monitor(
    payload: TcpMonitorCreate,
    _: User = Depends(require_role("admin")),
    db: Session = Depends(get_db),
) -> Monitor:
    """Create a TCP port monitor, optionally with an asset and an alert rule.

    The monitor target (and the asset address, when an asset is created) is
    stored as ``"<host>:<port>"``; host and port are also kept separately in
    the monitor config. The route depends on ``require_role("admin")``.

    Args:
        payload: Validated request body describing the TCP monitor.
        _: Resolved user from the role dependency (unused in the body).
        db: SQLAlchemy session supplied by ``get_db``.

    Returns:
        The refreshed ``Monitor`` row, serialized through ``MonitorRead``.
    """
    linked_asset_id: int | None = None
    endpoint = f"{payload.host}:{payload.port}"

    if payload.create_asset:
        service_asset = Asset(
            name=payload.name,
            asset_type="tcp_service",
            address=endpoint,
            status="unknown",
            extra={},
        )
        db.add(service_asset)
        # Flush before reading the id that the monitor row will reference.
        db.flush()
        linked_asset_id = service_asset.id

    check_config = {
        "host": payload.host,
        "port": payload.port,
        "timeout_seconds": payload.timeout_seconds,
    }
    tcp_monitor = Monitor(
        asset_id=linked_asset_id,
        name=payload.name,
        monitor_type="tcp",
        target=endpoint,
        config=check_config,
        interval_seconds=payload.interval_seconds,
        status="unknown",
    )
    db.add(tcp_monitor)
    # Flush before reading the monitor id used by the alert rule below.
    db.flush()

    if payload.alert_enabled:
        failure_rule = AlertRule(
            monitor_id=tcp_monitor.id,
            name=f"{payload.name} TCP connection failure",
            severity=payload.alert_severity,
            condition={"type": "status_not_up"},
            failure_threshold=payload.failure_threshold,
            cooldown_seconds=300,
            is_enabled=True,
        )
        db.add(failure_rule)

    db.commit()
    db.refresh(tcp_monitor)
    return tcp_monitor
|
||||
|
||||
|
||||
@router.get("/{monitor_id}", response_model=MonitorRead)
|
||||
def get_monitor(monitor_id: int, _: User = Depends(get_current_user), db: Session = Depends(get_db)) -> Monitor:
|
||||
monitor = db.get(Monitor, monitor_id)
|
||||
@@ -110,7 +197,7 @@ def update_monitor(
|
||||
@router.delete("/{monitor_id}", status_code=204)
|
||||
def delete_monitor(
|
||||
monitor_id: int,
|
||||
cleanup_orphan_website_asset: bool = True,
|
||||
cleanup_orphan_asset: bool = True,
|
||||
_: User = Depends(require_role("admin")),
|
||||
db: Session = Depends(get_db),
|
||||
) -> None:
|
||||
@@ -129,10 +216,10 @@ def delete_monitor(
|
||||
db.delete(monitor)
|
||||
db.flush()
|
||||
|
||||
if cleanup_orphan_website_asset and asset_id is not None:
|
||||
if cleanup_orphan_asset and asset_id is not None:
|
||||
remaining = db.scalar(select(func.count(Monitor.id)).where(Monitor.asset_id == asset_id))
|
||||
asset = db.get(Asset, asset_id)
|
||||
if remaining == 0 and asset is not None and asset.asset_type == "website":
|
||||
if remaining == 0 and asset is not None and asset.asset_type in {"website", "host", "tcp_service"}:
|
||||
db.delete(asset)
|
||||
|
||||
db.commit()
|
||||
|
||||
@@ -73,6 +73,29 @@ class WebsiteMonitorCreate(BaseModel):
|
||||
failure_threshold: int = Field(default=3, ge=1, le=20)
|
||||
|
||||
|
||||
class PingMonitorCreate(BaseModel):
    """Request body for creating a ping monitor."""

    # Display name; 1-160 characters.
    name: str = Field(min_length=1, max_length=160)
    # Hostname or address to ping; 1-255 characters.
    host: str = Field(min_length=1, max_length=255)
    # Per-check timeout, 1-60 seconds.
    timeout_seconds: int = Field(default=5, ge=1, le=60)
    # Seconds between checks; at least 10.
    interval_seconds: int = Field(default=60, ge=10)
    # When true, an asset row is created alongside the monitor.
    create_asset: bool = True
    # When true, an alert rule is created for the monitor.
    alert_enabled: bool = True
    # Severity copied onto the alert rule (free-form string, not validated here).
    alert_severity: str = "warning"
    # Failure threshold copied onto the alert rule; 1-20.
    failure_threshold: int = Field(default=3, ge=1, le=20)
|
||||
|
||||
|
||||
class TcpMonitorCreate(BaseModel):
    """Request body for creating a TCP port monitor."""

    # Display name; 1-160 characters.
    name: str = Field(min_length=1, max_length=160)
    # Hostname or address to connect to; 1-255 characters.
    host: str = Field(min_length=1, max_length=255)
    # TCP port, 1-65535.
    port: int = Field(ge=1, le=65535)
    # Per-check timeout, 1-60 seconds.
    timeout_seconds: int = Field(default=5, ge=1, le=60)
    # Seconds between checks; at least 10.
    interval_seconds: int = Field(default=60, ge=10)
    # When true, an asset row is created alongside the monitor.
    create_asset: bool = True
    # When true, an alert rule is created for the monitor.
    alert_enabled: bool = True
    # Severity copied onto the alert rule (free-form string, not validated here).
    alert_severity: str = "warning"
    # Failure threshold copied onto the alert rule; 1-20.
    failure_threshold: int = Field(default=3, ge=1, le=20)
|
||||
|
||||
|
||||
class CheckResultRead(BaseModel):
|
||||
id: int
|
||||
monitor_id: int
|
||||
|
||||
@@ -30,6 +30,7 @@ OrbitalWard is a secure monitoring appliance focused on the v0.1 vertical slice:
|
||||
- Website monitor create/edit/delete flow.
|
||||
- HTTP status and expected-text checks.
|
||||
- Optional TLS certificate expiry checks for HTTPS monitors.
|
||||
- Ping and TCP port monitor create/edit/delete flow.
|
||||
- Alert rules, incident opening/resolution, acknowledge, silence, and webhook notifications.
|
||||
- Generic webhook, Mattermost, and Zoom Team Chat notification channels.
|
||||
- Saved webhook URLs encrypted at rest and not returned to the UI.
|
||||
@@ -43,6 +44,8 @@ After the rename and TLS expiry work, these checks passed in Docker:
|
||||
- `docker compose -f docker-compose.dev.yml exec -T frontend npm run typecheck`
|
||||
- `docker compose -f docker-compose.dev.yml exec -T worker python -m compileall app`
|
||||
- Backend health returned `{"status":"ok","service":"orbitalward-backend"}`.
|
||||
- Direct worker probes for TCP and ICMP ping checks passed inside the Docker network.
|
||||
- API probe created and deleted one ping monitor and one TCP monitor successfully.
|
||||
|
||||
The final Compose project uses `orbitalward-*` containers, images, network, and volumes.
|
||||
|
||||
@@ -72,7 +75,7 @@ Issue source docs:
|
||||
- `docs/progress.md`
|
||||
- `docs/roadmap.md`
|
||||
|
||||
Current completed items include TLS expiry monitor support, HTTP/website checks, basic alert evaluation, incident actions, and webhook notification channels. Next recommended work starts with ping and TCP port monitors.
|
||||
Current completed items include TLS expiry monitor support, HTTP/website checks, ping and TCP port checks, basic alert evaluation, incident actions, and webhook notification channels. The next recommended implementation issue is alert rule editing UI.
|
||||
|
||||
## Guardrails
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ Completed in the initial scaffold:
|
||||
- React frontend skeleton with authenticated layout.
|
||||
- Worker skeleton with working HTTP website monitor polling.
|
||||
- Website monitor create/edit/delete UI.
|
||||
- Ping and TCP port monitor collectors and UI.
|
||||
- Basic alert evaluation, incidents, acknowledge, and silence actions.
|
||||
- Generic webhook, Mattermost, and Zoom Team Chat channel foundations.
|
||||
- Encrypted webhook URL storage.
|
||||
|
||||
+15
-10
@@ -24,6 +24,13 @@ Implemented website-monitor slice:
|
||||
- Incidents can be acknowledged and silenced from the UI.
|
||||
- Deleting a monitor resolves any open incidents tied to that monitor.
|
||||
|
||||
Implemented network-monitor slice:
|
||||
|
||||
- Create, edit, and delete ping and TCP port monitors from the UI.
|
||||
- Worker performs ICMP ping checks and TCP connection checks.
|
||||
- Ping and TCP monitors use the same alert rule, incident, recovery, and notification flow as website monitors.
|
||||
- Dashboard monitor status includes website, ping, and TCP monitors.
|
||||
|
||||
Implemented notification slice:
|
||||
|
||||
- Create, edit, test, and delete notification channels from the UI.
|
||||
@@ -43,7 +50,6 @@ Implemented notification slice:
|
||||
- Alert rule editing UI is not implemented.
|
||||
- Notification routing/policies are not implemented; all enabled webhook channels receive incident notifications.
|
||||
- Email/SMTP notifications are not implemented yet.
|
||||
- Ping and TCP checks are not implemented yet.
|
||||
- Graphing exists only as placeholders; metric visualization is not implemented.
|
||||
- Worker scheduling is simple polling, not a Redis queue yet.
|
||||
- Tests are still minimal and need meaningful backend/worker/frontend coverage.
|
||||
@@ -51,15 +57,14 @@ Implemented notification slice:
|
||||
|
||||
## Recommended Next Work
|
||||
|
||||
1. Add ping and TCP port monitors.
|
||||
2. Add alert rule editing UI and richer alert conditions.
|
||||
3. Add notification policy/routing controls.
|
||||
4. Add email/SMTP notification channel.
|
||||
5. Add audit event writes for auth, monitor, credential, notification, and incident actions.
|
||||
6. Build credential vault UI with masked secret handling.
|
||||
7. Add user administration UI.
|
||||
8. Add graphs for website response time and monitor status history.
|
||||
9. Add backend and worker tests for the website-monitor and notification flows.
|
||||
1. Add alert rule editing UI and richer alert conditions.
|
||||
2. Add notification policy/routing controls.
|
||||
3. Add email/SMTP notification channel.
|
||||
4. Add audit event writes for auth, monitor, credential, notification, and incident actions.
|
||||
5. Build credential vault UI with masked secret handling.
|
||||
6. Add user administration UI.
|
||||
7. Add graphs for website response time and monitor status history.
|
||||
8. Add backend and worker tests for the website-monitor and notification flows.
|
||||
|
||||
## Operational Notes
|
||||
|
||||
|
||||
@@ -6,6 +6,8 @@ import type {
|
||||
NotificationChannel,
|
||||
NotificationChannelCreate,
|
||||
NotificationChannelUpdate,
|
||||
PingMonitorCreate,
|
||||
TcpMonitorCreate,
|
||||
User,
|
||||
WebsiteMonitorCreate,
|
||||
} from "../types/api";
|
||||
@@ -61,6 +63,16 @@ export const api = {
|
||||
method: "POST",
|
||||
body: JSON.stringify(payload),
|
||||
}),
|
||||
createPingMonitor: (token: string, payload: PingMonitorCreate) =>
|
||||
request<Monitor>("/monitors/ping", token, {
|
||||
method: "POST",
|
||||
body: JSON.stringify(payload),
|
||||
}),
|
||||
createTcpMonitor: (token: string, payload: TcpMonitorCreate) =>
|
||||
request<Monitor>("/monitors/tcp", token, {
|
||||
method: "POST",
|
||||
body: JSON.stringify(payload),
|
||||
}),
|
||||
updateMonitor: (token: string, monitorId: number, payload: MonitorUpdate) =>
|
||||
request<Monitor>(`/monitors/${monitorId}`, token, {
|
||||
method: "PATCH",
|
||||
|
||||
@@ -7,6 +7,7 @@ import { AlertsPage } from "../pages/AlertsPage";
|
||||
import { DashboardPage } from "../pages/DashboardPage";
|
||||
import { ListPage } from "../pages/ListPage";
|
||||
import { LoginPage } from "../pages/LoginPage";
|
||||
import { NetworkChecksPage } from "../pages/NetworkChecksPage";
|
||||
import { NotificationsPage } from "../pages/NotificationsPage";
|
||||
import { WebsitesPage } from "../pages/WebsitesPage";
|
||||
import type { Asset, Incident, Monitor } from "../types/api";
|
||||
@@ -79,6 +80,9 @@ export function App() {
|
||||
{page === "websites" ? (
|
||||
<WebsitesPage token={auth.token} monitors={monitors} onCreated={refreshData} />
|
||||
) : null}
|
||||
{page === "network-checks" ? (
|
||||
<NetworkChecksPage token={auth.token} monitors={monitors} onChanged={refreshData} />
|
||||
) : null}
|
||||
{page === "alerts" ? (
|
||||
<AlertsPage token={auth.token} incidents={incidents} selectedIncidentId={selectedIncidentId} onChanged={refreshData} />
|
||||
) : null}
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
KeyRound,
|
||||
LogOut,
|
||||
Network,
|
||||
PlugZap,
|
||||
Radar,
|
||||
Settings,
|
||||
Shield,
|
||||
@@ -20,6 +21,7 @@ const navigation = [
|
||||
{ id: "dashboard", label: "Dashboard", icon: Gauge },
|
||||
{ id: "assets", label: "Assets", icon: Network },
|
||||
{ id: "websites", label: "Websites", icon: Globe },
|
||||
{ id: "network-checks", label: "Network Checks", icon: PlugZap },
|
||||
{ id: "alerts", label: "Alerts", icon: Bell },
|
||||
{ id: "discovery", label: "Discovery", icon: Radar },
|
||||
{ id: "graphs", label: "Graphs", icon: Activity },
|
||||
|
||||
@@ -11,7 +11,6 @@ interface DashboardPageProps {
|
||||
export function DashboardPage({ assets, monitors, incidents }: DashboardPageProps) {
|
||||
const attentionMonitors = monitors.filter((monitor) => monitor.status !== "up" && monitor.status !== "unknown").length;
|
||||
const activeIncidents = incidents.filter((incident) => incident.status === "open").length;
|
||||
const websites = monitors.filter((monitor) => monitor.monitor_type === "http");
|
||||
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
@@ -26,29 +25,30 @@ export function DashboardPage({ assets, monitors, incidents }: DashboardPageProp
|
||||
<StatusTile icon={CheckCircle2} label="Overall Status" value={activeIncidents ? "Attention" : "Healthy"} tone={activeIncidents ? "warn" : "ok"} />
|
||||
<StatusTile icon={AlertTriangle} label="Active Incidents" value={String(activeIncidents)} tone={activeIncidents ? "warn" : "ok"} />
|
||||
<StatusTile icon={Server} label="Assets" value={String(assets.length)} />
|
||||
<StatusTile icon={Globe2} label="Websites" value={String(websites.length)} />
|
||||
<StatusTile icon={Globe2} label="Monitors" value={String(monitors.length)} />
|
||||
</section>
|
||||
|
||||
<section className="grid gap-4 xl:grid-cols-[minmax(0,1fr)_360px]">
|
||||
<div className="rounded-md border border-line bg-[#0d131c]">
|
||||
<div className="flex items-center justify-between border-b border-line p-4">
|
||||
<h2 className="text-base font-semibold">Website Monitors</h2>
|
||||
<h2 className="text-base font-semibold">Monitor Status</h2>
|
||||
<span className="text-sm text-slate-400">{attentionMonitors} need attention</span>
|
||||
</div>
|
||||
<div className="divide-y divide-line">
|
||||
{websites.length ? (
|
||||
websites.map((monitor) => (
|
||||
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_120px_110px] md:items-center">
|
||||
{monitors.length ? (
|
||||
monitors.map((monitor) => (
|
||||
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_90px_120px_110px] md:items-center">
|
||||
<div>
|
||||
<div className="font-medium">{monitor.name}</div>
|
||||
<div className="truncate text-sm text-slate-400">{monitor.target}</div>
|
||||
</div>
|
||||
<div className="text-sm uppercase text-slate-400">{monitor.monitor_type}</div>
|
||||
<div className="text-sm text-slate-400">{monitor.interval_seconds}s interval</div>
|
||||
<StatusBadge status={monitor.status} />
|
||||
</div>
|
||||
))
|
||||
) : (
|
||||
<div className="p-6 text-sm text-slate-400">No website monitors yet.</div>
|
||||
<div className="p-6 text-sm text-slate-400">No monitors yet.</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -0,0 +1,250 @@
|
||||
import { FormEvent, useState } from "react";
|
||||
import { Activity, Edit3, PlugZap, Plus, RefreshCw, Trash2, X } from "lucide-react";
|
||||
|
||||
import { api } from "../api/client";
|
||||
import { Button } from "../components/Button";
|
||||
import type { Monitor } from "../types/api";
|
||||
|
||||
/** Props accepted by the Network Checks page. */
interface NetworkChecksPageProps {
  /** Auth token forwarded to every API call made by the page. */
  token: string;
  /** All monitors; the page filters this list down to ping and TCP entries. */
  monitors: Monitor[];
  /** Invoked after a successful save or delete, and by the Refresh button. */
  onChanged: () => Promise<void>;
}
|
||||
|
||||
type NetworkCheckType = "ping" | "tcp";
|
||||
|
||||
export function NetworkChecksPage({ token, monitors, onChanged }: NetworkChecksPageProps) {
|
||||
const networkChecks = monitors.filter((monitor) => monitor.monitor_type === "ping" || monitor.monitor_type === "tcp");
|
||||
const [checkType, setCheckType] = useState<NetworkCheckType>("ping");
|
||||
const [name, setName] = useState("");
|
||||
const [host, setHost] = useState("");
|
||||
const [port, setPort] = useState(443);
|
||||
const [timeoutSeconds, setTimeoutSeconds] = useState(5);
|
||||
const [intervalSeconds, setIntervalSeconds] = useState(60);
|
||||
const [failureThreshold, setFailureThreshold] = useState(3);
|
||||
const [alertEnabled, setAlertEnabled] = useState(true);
|
||||
const [editingMonitorId, setEditingMonitorId] = useState<number | null>(null);
|
||||
const [submitting, setSubmitting] = useState(false);
|
||||
const [deletingId, setDeletingId] = useState<number | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
/**
 * Form submit handler: updates the monitor being edited, or creates a new
 * ping/TCP monitor from the current form state, then resets the form and
 * asks the parent to reload its data.
 */
async function handleSubmit(event: FormEvent) {
  event.preventDefault();
  setSubmitting(true);
  setError(null);

  const isTcp = checkType === "tcp";

  try {
    if (editingMonitorId) {
      // Edits go through the generic monitor update endpoint.
      await api.updateMonitor(token, editingMonitorId, {
        name,
        target: isTcp ? `${host}:${port}` : host,
        interval_seconds: intervalSeconds,
        config: isTcp ? { host, port, timeout_seconds: timeoutSeconds } : { timeout_seconds: timeoutSeconds },
      });
    } else {
      // Fields shared by both create payloads; spread last so the key order matches the API types.
      const sharedSettings = {
        timeout_seconds: timeoutSeconds,
        interval_seconds: intervalSeconds,
        create_asset: true,
        alert_enabled: alertEnabled,
        alert_severity: "warning",
        failure_threshold: failureThreshold,
      };
      if (isTcp) {
        await api.createTcpMonitor(token, { name, host, port, ...sharedSettings });
      } else {
        await api.createPingMonitor(token, { name, host, ...sharedSettings });
      }
    }
    resetForm();
    await onChanged();
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : "Could not save network check";
    setError(message);
  } finally {
    setSubmitting(false);
  }
}
|
||||
|
||||
/**
 * Load an existing monitor into the form for editing.
 *
 * For TCP monitors the host and port come from `monitor.config` when present;
 * otherwise both are parsed from the `"host:port"` target. The target is split
 * at its LAST colon so hosts that themselves contain colons keep their full
 * value. A missing or out-of-range port falls back to 443.
 */
function startEdit(monitor: Monitor) {
  const nextType: NetworkCheckType = monitor.monitor_type === "tcp" ? "tcp" : "ping";

  // Derive fallbacks from the stored target in case config lacks host/port.
  const separatorIndex = monitor.target.lastIndexOf(":");
  const targetHost = separatorIndex >= 0 ? monitor.target.slice(0, separatorIndex) : monitor.target;
  const targetPort = separatorIndex >= 0 ? Number(monitor.target.slice(separatorIndex + 1)) : Number.NaN;

  const candidatePort = Number(monitor.config?.port ?? targetPort);
  const portIsValid = Number.isInteger(candidatePort) && candidatePort >= 1 && candidatePort <= 65535;

  setEditingMonitorId(monitor.id);
  setCheckType(nextType);
  setName(monitor.name);
  setHost(nextType === "tcp" ? String(monitor.config?.host ?? targetHost) : monitor.target);
  setPort(portIsValid ? candidatePort : 443);
  setTimeoutSeconds(Number(monitor.config?.timeout_seconds ?? 5));
  setIntervalSeconds(monitor.interval_seconds);
  // Alert settings are not editable here; restore their defaults.
  setAlertEnabled(true);
  setFailureThreshold(3);
  setError(null);
}
|
||||
|
||||
/** Leave edit mode and restore every form field to its initial value. */
function resetForm() {
  // Mode
  setEditingMonitorId(null);
  setCheckType("ping");

  // Target fields
  setName("");
  setHost("");
  setPort(443);

  // Timing
  setTimeoutSeconds(5);
  setIntervalSeconds(60);

  // Alerting
  setAlertEnabled(true);
  setFailureThreshold(3);
}
|
||||
|
||||
/**
 * Delete a monitor through the API and ask the parent to reload its data.
 * While the request is in flight the monitor's id is held in `deletingId`.
 */
async function deleteMonitor(monitorId: number) {
  setDeletingId(monitorId);
  setError(null);

  try {
    await api.deleteMonitor(token, monitorId);
    await onChanged();
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : "Could not delete network check";
    setError(message);
  } finally {
    setDeletingId(null);
  }
}
|
||||
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
<div className="flex flex-col justify-between gap-4 md:flex-row md:items-end">
|
||||
<div>
|
||||
<h1 className="text-3xl font-semibold">Network Checks</h1>
|
||||
<p className="mt-2 text-sm text-slate-400">ICMP ping checks and TCP port availability checks.</p>
|
||||
</div>
|
||||
<Button variant="ghost" onClick={onChanged}>
|
||||
<RefreshCw size={16} />
|
||||
Refresh
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
<section className="grid gap-5 xl:grid-cols-[420px_minmax(0,1fr)]">
|
||||
<form className="space-y-4 rounded-md border border-line bg-[#0d131c] p-5" onSubmit={handleSubmit}>
|
||||
<div className="flex items-center gap-2">
|
||||
{checkType === "tcp" ? <PlugZap size={18} className="text-pulse" /> : <Activity size={18} className="text-pulse" />}
|
||||
<h2 className="text-base font-semibold">{editingMonitorId ? "Edit Network Check" : "Add Network Check"}</h2>
|
||||
</div>
|
||||
|
||||
<div className="grid grid-cols-2 rounded-md border border-line bg-slate-950 p-1">
|
||||
<button className={modeClass(checkType === "ping")} disabled={Boolean(editingMonitorId)} onClick={() => setCheckType("ping")} type="button">
|
||||
Ping
|
||||
</button>
|
||||
<button className={modeClass(checkType === "tcp")} disabled={Boolean(editingMonitorId)} onClick={() => setCheckType("tcp")} type="button">
|
||||
TCP
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Name</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={name} onChange={(event) => setName(event.target.value)} required />
|
||||
</label>
|
||||
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Host</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={host} onChange={(event) => setHost(event.target.value)} placeholder="router.local or 192.168.1.1" required />
|
||||
</label>
|
||||
|
||||
<div className="grid gap-3 sm:grid-cols-3">
|
||||
{checkType === "tcp" ? (
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Port</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={port} onChange={(event) => setPort(Number(event.target.value))} min={1} max={65535} type="number" />
|
||||
</label>
|
||||
) : null}
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Timeout</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={timeoutSeconds} onChange={(event) => setTimeoutSeconds(Number(event.target.value))} min={1} max={60} type="number" />
|
||||
</label>
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Interval</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={intervalSeconds} onChange={(event) => setIntervalSeconds(Number(event.target.value))} min={10} type="number" />
|
||||
</label>
|
||||
</div>
|
||||
|
||||
{!editingMonitorId ? (
|
||||
<div className="flex items-center justify-between rounded-md border border-line bg-slate-950 px-3 py-2">
|
||||
<span className="text-sm text-slate-300">Alert on repeated failures</span>
|
||||
<input className="h-5 w-5 accent-teal-400" checked={alertEnabled} onChange={(event) => setAlertEnabled(event.target.checked)} type="checkbox" />
|
||||
</div>
|
||||
) : null}
|
||||
|
||||
{!editingMonitorId ? (
|
||||
<label className="block space-y-2">
|
||||
<span className="text-sm text-slate-300">Failure Threshold</span>
|
||||
<input className="h-10 w-full rounded-md border border-line bg-slate-950 px-3 text-sm outline-none ring-pulse/40 focus:ring-2" value={failureThreshold} onChange={(event) => setFailureThreshold(Number(event.target.value))} min={1} max={20} type="number" />
|
||||
</label>
|
||||
) : null}
|
||||
|
||||
{error ? <div className="rounded-md border border-red-500/40 bg-red-950/40 p-3 text-sm text-red-200">{error}</div> : null}
|
||||
|
||||
<div className="flex gap-2">
|
||||
{editingMonitorId ? (
|
||||
<Button className="flex-1" onClick={resetForm} type="button" variant="ghost">
|
||||
<X size={16} />
|
||||
Cancel
|
||||
</Button>
|
||||
) : null}
|
||||
<Button className="flex-1" disabled={submitting} type="submit">
|
||||
<Plus size={16} />
|
||||
{submitting ? "Saving..." : editingMonitorId ? "Save Check" : "Create Check"}
|
||||
</Button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<div className="rounded-md border border-line bg-[#0d131c]">
|
||||
<div className="border-b border-line p-4">
|
||||
<h2 className="text-base font-semibold">Configured Network Checks</h2>
|
||||
</div>
|
||||
<div className="divide-y divide-line">
|
||||
{networkChecks.length ? (
|
||||
networkChecks.map((monitor) => (
|
||||
<div key={monitor.id} className="grid gap-2 p-4 md:grid-cols-[1fr_90px_110px_170px] md:items-center">
|
||||
<div>
|
||||
<div className="font-medium">{monitor.name}</div>
|
||||
<div className="truncate text-sm text-slate-400">{monitor.target}</div>
|
||||
</div>
|
||||
<span className="text-sm uppercase text-slate-400">{monitor.monitor_type}</span>
|
||||
<Status status={monitor.status} />
|
||||
<div className="flex items-center justify-between gap-3">
|
||||
<div className="text-sm text-slate-400">{monitor.last_checked_at ? new Date(monitor.last_checked_at).toLocaleTimeString() : "Not checked"}</div>
|
||||
<Button aria-label={`Edit ${monitor.name}`} className="h-8 w-8 px-0" onClick={() => startEdit(monitor)} title="Edit check" type="button" variant="ghost">
|
||||
<Edit3 size={15} />
|
||||
</Button>
|
||||
<Button aria-label={`Delete ${monitor.name}`} className="h-8 w-8 px-0" disabled={deletingId === monitor.id} onClick={() => deleteMonitor(monitor.id)} title="Delete check" type="button" variant="ghost">
|
||||
<Trash2 size={15} />
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
))
|
||||
) : (
|
||||
<div className="p-6 text-sm text-slate-400">No network checks yet.</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Build the class list for a Ping/TCP mode toggle button. */
function modeClass(active: boolean) {
  const baseClasses = "h-8 rounded-md text-sm transition disabled:opacity-70";
  const toneClasses = active ? "bg-slate-800 text-white" : "text-slate-400 hover:bg-slate-900 hover:text-white";
  return `${baseClasses} ${toneClasses}`;
}
|
||||
|
||||
/** Fixed-width status pill; colors depend on the status string. */
function Status({ status }: { status: string }) {
  // A Map (not a plain object) so only these exact keys ever match.
  const toneByStatus = new Map<string, string>([
    ["up", "border-teal-500/40 bg-teal-950/40 text-teal-200"],
    ["down", "border-red-500/40 bg-red-950/40 text-red-200"],
    ["warning", "border-amber-500/40 bg-amber-950/40 text-amber-200"],
  ]);
  // Any other status gets the neutral slate styling.
  const classes = toneByStatus.get(status) ?? "border-slate-600 bg-slate-900 text-slate-300";

  return <span className={`inline-flex h-7 w-24 items-center justify-center rounded-md border text-xs font-medium ${classes}`}>{status}</span>;
}
|
||||
@@ -94,3 +94,26 @@ export interface WebsiteMonitorCreate {
|
||||
alert_severity: string;
|
||||
failure_threshold: number;
|
||||
}
|
||||
|
||||
/** Request body sent when creating a ping monitor. */
export interface PingMonitorCreate {
  /** Display name of the monitor. */
  name: string;
  /** Hostname or address to ping. */
  host: string;
  /** Per-check timeout, in seconds. */
  timeout_seconds: number;
  /** Seconds between checks. */
  interval_seconds: number;
  /** Whether an asset should be created alongside the monitor. */
  create_asset: boolean;
  /** Whether an alert rule should be created for the monitor. */
  alert_enabled: boolean;
  /** Severity assigned to the alert rule. */
  alert_severity: string;
  /** Failure threshold assigned to the alert rule. */
  failure_threshold: number;
}
|
||||
|
||||
/** Request body sent when creating a TCP port monitor. */
export interface TcpMonitorCreate {
  /** Display name of the monitor. */
  name: string;
  /** Hostname or address to connect to. */
  host: string;
  /** TCP port to connect to. */
  port: number;
  /** Per-check timeout, in seconds. */
  timeout_seconds: number;
  /** Seconds between checks. */
  interval_seconds: number;
  /** Whether an asset should be created alongside the monitor. */
  create_asset: boolean;
  /** Whether an alert rule should be created for the monitor. */
  alert_enabled: boolean;
  /** Severity assigned to the alert rule. */
  alert_severity: string;
  /** Failure threshold assigned to the alert rule. */
  failure_threshold: number;
}
|
||||
|
||||
@@ -0,0 +1,108 @@
|
||||
import asyncio
|
||||
import os
|
||||
import socket
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
from time import perf_counter
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class NetworkCheckResult:
|
||||
status: str
|
||||
response_time_ms: int | None
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PingCheckConfig:
    """Immutable settings for one ping check."""

    # Hostname or address to ping.
    host: str
    # How long to wait for a reply, in seconds.
    timeout_seconds: float = 5.0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TcpCheckConfig:
    """Immutable settings for one TCP connection check."""

    # Hostname or address to connect to.
    host: str
    # TCP port to connect to.
    port: int
    # How long to wait for the connection, in seconds.
    timeout_seconds: float = 5.0
|
||||
|
||||
|
||||
async def run_ping_check(config: PingCheckConfig) -> NetworkCheckResult:
    """Run one ICMP ping check in a worker thread and report the outcome.

    The blocking raw-socket ping is delegated to ``asyncio.to_thread``.
    Permission errors, timeouts and other ``OSError``s are reported as a
    "down" result rather than raised.

    Args:
        config: Host and timeout for the ping.

    Returns:
        An "up" result with the round-trip time in milliseconds, or a "down"
        result whose message names the failure.
    """
    try:
        elapsed_ms = await asyncio.to_thread(_run_ping_check_sync, config.host, config.timeout_seconds)
    # Order matters: PermissionError and TimeoutError are OSError subclasses.
    except PermissionError:
        failure_message = "ICMP ping requires raw socket permission"
    except TimeoutError:
        failure_message = "Ping timed out"
    except OSError as exc:
        failure_message = f"Ping failed: {exc}"
    else:
        return NetworkCheckResult(status="up", response_time_ms=elapsed_ms, message="Ping check passed")

    return NetworkCheckResult(status="down", response_time_ms=None, message=failure_message)
|
||||
|
||||
|
||||
async def run_tcp_check(config: TcpCheckConfig) -> NetworkCheckResult:
    """Check that a TCP connection to ``config.host:config.port`` can be opened.

    The response time covers connection establishment only. Closing the
    socket afterwards is best-effort and bounded by the same timeout, so a
    peer that resets or stalls during close does not turn a successful
    connect into a "down" result.

    Args:
        config: Host, port and timeout for the connection attempt.

    Returns:
        An "up" result with the connect time in milliseconds, or a "down"
        result whose message names the failure.
    """
    started = perf_counter()
    try:
        _reader, writer = await asyncio.wait_for(
            asyncio.open_connection(config.host, config.port),
            timeout=config.timeout_seconds,
        )
    except (TimeoutError, asyncio.TimeoutError):
        # str() of a timeout raised by wait_for is empty, so spell it out.
        return NetworkCheckResult(
            status="down",
            response_time_ms=None,
            message=f"TCP connection failed: timed out after {config.timeout_seconds}s",
        )
    except OSError as exc:
        return NetworkCheckResult(status="down", response_time_ms=None, message=f"TCP connection failed: {exc}")

    # Measure before closing so teardown time is not counted.
    response_time_ms = int((perf_counter() - started) * 1000)

    writer.close()
    try:
        await asyncio.wait_for(writer.wait_closed(), timeout=config.timeout_seconds)
    except (TimeoutError, asyncio.TimeoutError, OSError):
        # Deliberately ignored: the connect already succeeded, and that is
        # the only thing this check measures.
        pass

    return NetworkCheckResult(status="up", response_time_ms=response_time_ms, message="TCP connection succeeded")
|
||||
|
||||
|
||||
def _run_ping_check_sync(host: str, timeout_seconds: float) -> int:
    """Send one ICMP echo request and block until the matching reply arrives.

    ``timeout_seconds`` is enforced as an overall deadline. The raw socket can
    receive ICMP packets that do not match this request, so the time left is
    recomputed before every receive rather than restarting on each packet.

    Args:
        host: Hostname or IPv4 address to ping.
        timeout_seconds: Maximum total time to wait for the reply.

    Returns:
        Round-trip time in whole milliseconds.

    Raises:
        TimeoutError: No matching reply arrived before the deadline.
        PermissionError: The process may not open a raw ICMP socket.
        OSError: Name resolution or another socket operation failed.
    """
    address = _resolve_ipv4(host)
    identifier = os.getpid() & 0xFFFF  # ICMP identifier is a 16-bit field
    sequence = 1
    packet = _build_icmp_echo_request(identifier, sequence)

    with socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP) as sock:
        sock.settimeout(timeout_seconds)
        started = perf_counter()
        deadline = started + timeout_seconds
        sock.sendto(packet, (address, 0))

        while True:
            remaining = deadline - perf_counter()
            if remaining <= 0:
                raise TimeoutError("timed out waiting for ICMP echo reply")
            # Shrink the socket timeout so non-matching packets cannot extend the wait.
            sock.settimeout(remaining)
            response, _ = sock.recvfrom(1024)
            if _matches_icmp_echo_reply(response, identifier, sequence):
                return int((perf_counter() - started) * 1000)
|
||||
|
||||
|
||||
def _resolve_ipv4(host: str) -> str:
    """Resolve ``host`` to the first IPv4 address returned by ``getaddrinfo``.

    Raises:
        OSError: The lookup failed or produced no results.
    """
    candidates = socket.getaddrinfo(
        host,
        None,
        family=socket.AF_INET,
        type=socket.SOCK_RAW,
        proto=socket.IPPROTO_ICMP,
    )
    if len(candidates) == 0:
        raise OSError("Could not resolve an IPv4 address")

    # Each entry is (family, type, proto, canonname, sockaddr); sockaddr[0] is the address.
    _family, _socktype, _proto, _canonname, sockaddr = candidates[0]
    return str(sockaddr[0])
|
||||
|
||||
|
||||
def _build_icmp_echo_request(identifier: int, sequence: int) -> bytes:
    """Build an ICMP echo request (type 8, code 0) with a fixed payload.

    The checksum is computed over the header with a zeroed checksum field
    plus the payload, then the header is packed again with the real value.
    """
    body = b"OrbitalWard ping"

    def pack_header(checksum_field: int) -> bytes:
        # type=8, code=0, checksum, identifier, sequence
        return struct.pack("!BBHHH", 8, 0, checksum_field, identifier, sequence)

    checksum_field = _icmp_checksum(pack_header(0) + body)
    return pack_header(checksum_field) + body
|
||||
|
||||
|
||||
def _matches_icmp_echo_reply(response: bytes, identifier: int, sequence: int) -> bool:
    """Return True when ``response`` is the echo reply for our request.

    ``response`` is a raw IPv4 packet: the IP header length is read from the
    low nibble of the first byte, and the 8-byte ICMP header follows it. The
    packet matches when the ICMP type is 0 (echo reply) and both identifier
    and sequence equal the values that were sent.
    """
    # 20-byte minimal IP header + 8-byte ICMP header.
    if len(response) < 28:
        return False

    icmp_start = (response[0] & 0x0F) * 4
    if len(response) < icmp_start + 8:
        return False

    icmp_type, _code, _checksum, reply_identifier, reply_sequence = struct.unpack_from("!BBHHH", response, icmp_start)
    return (icmp_type, reply_identifier, reply_sequence) == (0, identifier, sequence)
|
||||
|
||||
|
||||
def _icmp_checksum(data: bytes) -> int:
|
||||
if len(data) % 2:
|
||||
data += b"\x00"
|
||||
|
||||
checksum = 0
|
||||
for index in range(0, len(data), 2):
|
||||
checksum += (data[index] << 8) + data[index + 1]
|
||||
checksum = (checksum & 0xFFFF) + (checksum >> 16)
|
||||
|
||||
return ~checksum & 0xFFFF
|
||||
+37
-13
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
|
||||
import httpx
|
||||
|
||||
from app.collectors.website import WebsiteCheckConfig, run_website_check
|
||||
from app.collectors.network import PingCheckConfig, TcpCheckConfig, run_ping_check, run_tcp_check
|
||||
from app.config import settings
|
||||
from app.db import session_scope
|
||||
from app.models import AlertRule, Asset, CheckResult, Incident, Monitor, NotificationChannel
|
||||
@@ -33,7 +34,7 @@ class Scheduler:
|
||||
async def tick(self) -> None:
|
||||
try:
|
||||
with session_scope() as db:
|
||||
due_monitors = self._load_due_website_monitors(db)
|
||||
due_monitors = self._load_due_monitors(db)
|
||||
for monitor in due_monitors:
|
||||
await self._run_monitor(db, monitor)
|
||||
db.commit()
|
||||
@@ -43,9 +44,11 @@ class Scheduler:
|
||||
def stop(self) -> None:
    """Request shutdown by setting the scheduler's ``_stopped`` flag."""
    # NOTE(review): presumably the run loop polls ``_stopped`` to exit; confirm.
    self._stopped.set()
|
||||
|
||||
def _load_due_website_monitors(self, db: Session) -> list[Monitor]:
|
||||
def _load_due_monitors(self, db: Session) -> list[Monitor]:
|
||||
now = datetime.now(UTC)
|
||||
monitors = db.scalars(select(Monitor).where(Monitor.monitor_type == "http").order_by(Monitor.id).limit(50)).all()
|
||||
monitors = db.scalars(
|
||||
select(Monitor).where(Monitor.monitor_type.in_(["http", "ping", "tcp"])).order_by(Monitor.id).limit(50)
|
||||
).all()
|
||||
due: list[Monitor] = []
|
||||
for monitor in monitors:
|
||||
if monitor.last_checked_at is None:
|
||||
@@ -57,16 +60,7 @@ class Scheduler:
|
||||
return due
|
||||
|
||||
async def _run_monitor(self, db: Session, monitor: Monitor) -> None:
|
||||
config = WebsiteCheckConfig(
|
||||
url=monitor.target,
|
||||
expected_status=int(monitor.config.get("expected_status", 200)),
|
||||
expected_text=monitor.config.get("expected_text") or None,
|
||||
unexpected_text=monitor.config.get("unexpected_text") or None,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
|
||||
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
|
||||
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
|
||||
)
|
||||
result = await run_website_check(config)
|
||||
result = await self._collect_monitor_result(monitor)
|
||||
now = datetime.now(UTC)
|
||||
|
||||
monitor.status = result.status
|
||||
@@ -93,6 +87,36 @@ class Scheduler:
|
||||
|
||||
logger.info("Checked %s: %s (%s ms)", monitor.name, result.status, result.response_time_ms)
|
||||
|
||||
async def _collect_monitor_result(self, monitor: Monitor):
|
||||
if monitor.monitor_type == "http":
|
||||
config = WebsiteCheckConfig(
|
||||
url=monitor.target,
|
||||
expected_status=int(monitor.config.get("expected_status", 200)),
|
||||
expected_text=monitor.config.get("expected_text") or None,
|
||||
unexpected_text=monitor.config.get("unexpected_text") or None,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 10)),
|
||||
check_tls_expiry=bool(monitor.config.get("check_tls_expiry", False)),
|
||||
tls_warning_days=int(monitor.config.get("tls_warning_days", 30)),
|
||||
)
|
||||
return await run_website_check(config)
|
||||
|
||||
if monitor.monitor_type == "ping":
|
||||
config = PingCheckConfig(
|
||||
host=monitor.target,
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
|
||||
)
|
||||
return await run_ping_check(config)
|
||||
|
||||
if monitor.monitor_type == "tcp":
|
||||
config = TcpCheckConfig(
|
||||
host=str(monitor.config.get("host") or monitor.target),
|
||||
port=int(monitor.config.get("port")),
|
||||
timeout_seconds=float(monitor.config.get("timeout_seconds", 5)),
|
||||
)
|
||||
return await run_tcp_check(config)
|
||||
|
||||
raise ValueError(f"Unsupported monitor type: {monitor.monitor_type}")
|
||||
|
||||
async def _evaluate_rule(self, db: Session, monitor: Monitor, rule: AlertRule, now: datetime, message: str) -> None:
|
||||
open_incident = db.scalar(
|
||||
select(Incident).where(
|
||||
|
||||
Reference in New Issue
Block a user