From 2d9448a45adeb9033392167e76580c908a1e18c5 Mon Sep 17 00:00:00 2001 From: Kures <14836932+Kures@users.noreply.github.com> Date: Fri, 8 May 2026 00:32:06 +0300 Subject: [PATCH 1/3] feat(observability): add Grafana dashboards, Prometheus scrape config + AlertManager rules Partially addresses P1-03 from reference/todo.md ("structured observability: request IDs, metrics endpoint, OTEL spans"). The metrics endpoint already ships in src/metrics.zig and src/api.zig:70; this contributes the operator side as a coherent stack. Contents: - dashboards/grafana/nullboiler-overview.json - high-level ops view (8 panels) - dashboards/grafana/nullboiler-workers.json - per-fleet worker view (7 panels) - dashboards/prometheus/prometheus.yml - minimal scrape config - dashboards/alerts/nullboiler.rules.yml - 8 AlertManager rules - dashboards/README.md - quick-start, panel index, alert table Targets Grafana 10.x and 11.x (schemaVersion 39). PromQL is plain rate() over the 11 counters exposed by /metrics, with clamp_min to avoid divide-by-zero on idle clusters. Both dashboards prompt for the Prometheus datasource via DS_PROMETHEUS template variable so they import cleanly into existing setups. Alert rules pair 1:1 with the dashboards: thresholds match the colour bands on the panels so dashboard and pager tell the same story. Validates clean with `promtool check rules`. Side benefit: the Workers dashboard's "Failure ratios over time" panel visualises Gap 3 from nullclaw/docs/integration-analysis.md (HIGH PRIORITY) when a stock nullclaw worker is wired up via /webhook. See "Diagnosing integration gaps" in dashboards/README.md. 
Future work (not part of this PR): - histogram metrics for HTTP and dispatch latency - per-worker labels on dispatch counters - recording rules + Grafana alerting integration --- dashboards/README.md | 214 ++++++ dashboards/alerts/nullboiler.rules.yml | 204 ++++++ dashboards/grafana/nullboiler-overview.json | 703 ++++++++++++++++++++ dashboards/grafana/nullboiler-workers.json | 593 +++++++++++++++++ dashboards/prometheus/prometheus.yml | 30 + 5 files changed, 1744 insertions(+) create mode 100644 dashboards/README.md create mode 100644 dashboards/alerts/nullboiler.rules.yml create mode 100644 dashboards/grafana/nullboiler-overview.json create mode 100644 dashboards/grafana/nullboiler-workers.json create mode 100644 dashboards/prometheus/prometheus.yml diff --git a/dashboards/README.md b/dashboards/README.md new file mode 100644 index 0000000..a374989 --- /dev/null +++ b/dashboards/README.md @@ -0,0 +1,214 @@ +# NullBoiler observability stack (P1-03) + +Ready-to-import Grafana dashboards plus a minimal Prometheus scrape +config for the existing `/metrics` endpoint. + +This contributes the **operator side** of `reference/todo.md` P1-03 +("Structured observability: request IDs, metrics endpoint, OTEL +spans"). The endpoint and counters already ship in `src/metrics.zig`; +this directory makes them visible. + +> **Side benefit:** the panels also visualise the integration gaps +> documented in +> [`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md). +> When a NullClaw worker is wired up via `/webhook` (Gap 3 — HIGH +> PRIORITY in that document), the *Worker dispatch failure ratio* +> panel goes red while the *Health-check failure ratio* stays green, +> isolating the contract mismatch (sync `{status:"ok",response:"..."}` +> expected, async `{status:"received"}` returned). See +> [Diagnosing integration gaps](#diagnosing-integration-gaps) below for +> the exact panel pattern. 
+ +## Contents + +``` +dashboards/ +├── README.md this file +├── grafana/ +│ ├── nullboiler-overview.json high-level operations view +│ └── nullboiler-workers.json per-fleet worker health view +├── prometheus/ +│ └── prometheus.yml minimal scrape config +└── alerts/ + └── nullboiler.rules.yml 8 AlertManager rules paired 1:1 with the dashboards +``` + +## What each dashboard answers + +### `nullboiler-overview.json` + +Open this first when investigating "is something wrong?". + +| Panel | Question it answers | +|---|---| +| HTTP requests/sec | Is anyone talking to us right now? | +| Runs created/sec | Is work flowing into the orchestrator? | +| Worker dispatch failure ratio (5m) | What share of dispatches are blowing up? | +| Callback failures/sec | Are run-lifecycle webhooks reaching consumers? | +| Run & step throughput | Mix of created / replayed / claimed / retried over time | +| Worker dispatch (success vs failure) | Stacked-area dispatch outcomes | +| Callbacks (sent vs failed) | Webhook delivery reliability | +| Reliability ratios | Idempotent replay ratio + step retry ratio with thresholds | + +### `nullboiler-workers.json` + +Open this when the Overview shows elevated dispatch failure ratio and +you need to localize the bad worker. + +| Panel | Question it answers | +|---|---| +| Health checks/sec | Are health probes running? | +| Health-check failure ratio (5m) | Are workers responding to probes? 
| +| Dispatch success/sec, failure/sec | Per-second outcomes | +| Health-check rate (probe vs failure) | Probes timeline | +| Dispatch outcomes (stacked bars) | Discrete dispatch outcomes | +| Failure ratios over time | The signal the circuit breaker reacts to | + +## Metrics exposed by NullBoiler + +From `src/metrics.zig`, all counters (no histograms or labels yet): + +| Counter | Meaning | +|---|---| +| `nullboiler_http_requests_total` | All HTTP requests handled by the API | +| `nullboiler_runs_created_total` | Runs successfully accepted by `POST /runs` | +| `nullboiler_runs_idempotent_replays_total` | Idempotent replays of an existing run | +| `nullboiler_steps_claimed_total` | Steps dispatched to workers | +| `nullboiler_steps_retry_scheduled_total` | Steps scheduled for retry | +| `nullboiler_worker_dispatch_success_total` | Worker dispatches that succeeded | +| `nullboiler_worker_dispatch_failure_total` | Worker dispatches that failed | +| `nullboiler_worker_health_checks_total` | Health probes performed | +| `nullboiler_worker_health_failures_total` | Health probes that failed | +| `nullboiler_callback_sent_total` | Run-lifecycle webhook callbacks sent | +| `nullboiler_callback_failed_total` | Run-lifecycle webhook callbacks failed | + +## Quick start (Docker) + +```bash +docker run -d --name prom \ + -p 9090:9090 \ + -v "$(pwd)/dashboards/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \ + prom/prometheus + +docker run -d --name grafana \ + -p 3030:3000 \ + -e GF_AUTH_ANONYMOUS_ENABLED=true \ + -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin \ + grafana/grafana + +# Add Prometheus datasource pointing at http://host.docker.internal:9090 +# Then: Dashboards -> Import -> upload nullboiler-overview.json and nullboiler-workers.json +``` + +Open Grafana at http://localhost:3030, point both dashboards at the +Prometheus datasource, and they will populate as soon as NullBoiler +starts handling traffic. 
+ +## Quick start (existing Prometheus + Grafana) + +1. Add the scrape stanza from `prometheus/prometheus.yml` to your + existing `prometheus.yml`. Reload Prometheus. +2. Import each `dashboards/grafana/*.json` via *Dashboards → Import → Upload JSON*. +3. Select your Prometheus datasource from the *Datasource* dropdown + (the `DS_PROMETHEUS` template variable) at the top of each dashboard. +4. (optional) Wire up alerts: + ```yaml + # in prometheus.yml + rule_files: + - /etc/prometheus/alerts/nullboiler.rules.yml + ``` + and copy `dashboards/alerts/nullboiler.rules.yml` into that path. + +## Alert rules + +`alerts/nullboiler.rules.yml` ships 8 rules grouped under +`nullboiler.health` and `nullboiler.flow`: + +| Alert | Severity | Fires when | +|---|---|---| +| `NullBoilerInstanceDown` | critical | `up == 0` for 2m | +| `NullBoilerDispatchFailureRatioHigh` | warning | dispatch failure ratio > 30% for 5m | +| `NullBoilerDispatchFailureRatioCritical` | critical | dispatch failure ratio > 80% for 2m | +| `NullBoilerWorkerHealthDegraded` | warning | health-check failure ratio > 20% for 5m | +| `NullBoilerCallbackDeliveryDegraded` | warning | callback failure ratio > 10% for 10m | +| `NullBoilerStepRetryRateElevated` | info | retry/claim ratio > 20% for 10m | +| `NullBoilerNoTrafficForExtendedPeriod` | info | no HTTP traffic for 30m | +| `NullBoilerIdempotentReplayRatioVeryHigh` | info | replay ratio > 95% for 15m | + +Alert thresholds are paired with the colour bands on the corresponding +Grafana panels but are not always numerically identical — if you tune one, +review the other so the dashboard and the pager tell the same story. The +Critical-severity alerts are intended for paging; everything else is ticket-bait. + +> For example, `NullBoilerWorkerHealthDegraded` fires at +> 20% (alert) while the dashboard's health-ratio stat shows yellow at +> 1% and red at 10%. The alert sits above the dashboard's red band on +> purpose — the dashboard is meant to surface single-probe blips +> visually, while the pager should only fire on a sustained pattern. 
+ +Validate locally: + +```bash +docker run --rm --entrypoint=promtool \ + -v "$(pwd)/dashboards/alerts:/rules:ro" prom/prometheus \ + check rules /rules/nullboiler.rules.yml +# SUCCESS: 8 rules found +``` + +## Verification + +The dashboards target Grafana 10.x and 11.x (`schemaVersion: 39`). The +PromQL is plain `rate()` over counters with `clamp_min` to avoid +divide-by-zero on idle clusters. + +To smoke-test the metrics endpoint without Grafana: + +```bash +curl -s http://localhost:8080/metrics | head -30 +``` + +You should see eleven `# TYPE ... counter` blocks and their numeric +values. Zero values are valid — counters start at zero. + +## Diagnosing integration gaps + +A non-obvious value of these dashboards: they make ecosystem-level +integration gaps **visually obvious** without reading logs. + +The cleanest example today is **Gap 3** in +[`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md) +("Worker Endpoint for nullboiler Dispatch — HIGH PRIORITY"). When a +plain `nullclaw gateway` is registered as a NullBoiler worker: + +- `/health` succeeds → `nullboiler_worker_health_failures_total` stays low +- `/webhook` returns `{"status":"received"}` instead of the documented + `{"status":"ok","response":"..."}` → + `nullboiler_worker_dispatch_failure_total` increments on every step + +In the **Workers** dashboard's *Failure ratios over time* panel this +shows up as **dispatch failure ratio at ~100% (red)** sitting on top +of **health-check failure ratio near 0% (green)** — a one-glance +diagnosis that the worker is reachable but its response contract is +broken. + +This is exactly the visual signal NullBoiler maintainers would want +when triaging field reports about worker dispatches; it surfaces the +gap that `integration-analysis.md` predicts but does not yet +mitigate at runtime. + +## Future work + +- Histograms for HTTP latency and worker dispatch duration (would + enable percentile panels). 
+- Per-worker labels on the dispatch counters (would enable + per-worker breakdown panels — currently the workers dashboard shows + fleet-wide aggregates). On a fleet of N workers with the metrics + unlabeled, a single bad worker pulling 1/N of dispatches produces a + ~1/N failure ratio — below the 30% warning threshold for N ≥ 4. + Per-worker labels resolve this. +- Recording rules + Grafana-native alerting (the Prometheus alerting rules + in `alerts/` are the floor — a recording-rule layer would precompute + the ratios and avoid PromQL duplication between dashboards and + alerts). + +These are not required for P1-03 but are natural follow-ups. diff --git a/dashboards/alerts/nullboiler.rules.yml b/dashboards/alerts/nullboiler.rules.yml new file mode 100644 index 0000000..bfc72f9 --- /dev/null +++ b/dashboards/alerts/nullboiler.rules.yml @@ -0,0 +1,204 @@ +# Prometheus alerting rules for NullBoiler (delivered via Alertmanager). +# +# Drop into your Prometheus config alongside the scrape stanza from +# `dashboards/prometheus/prometheus.yml`: +# +# rule_files: +# - /etc/prometheus/alerts/nullboiler.rules.yml +# +# Then point Prometheus at your Alertmanager (`alerting:` section) and +# route alerts labelled `service: nullboiler` to your on-call channel. +# +# These rules pair 1:1 with the panels in `dashboards/grafana/`. If you +# tune a threshold here, mirror it in the corresponding dashboard panel +# threshold band so the dashboard and the alert tell the same story. + +groups: + - name: nullboiler.health + interval: 30s + rules: + + # ── Liveness ────────────────────────────────────────────────── + + - alert: NullBoilerInstanceDown + expr: up{job="nullboiler"} == 0 + for: 2m + labels: + severity: critical + service: nullboiler + annotations: + summary: "NullBoiler instance is down" + description: | + Prometheus has not scraped {{ $labels.instance }} for 2 + minutes. Either the orchestrator process died or the + `/metrics` endpoint stopped responding. 
+ runbook: "https://github.com/nullclaw/nullboiler/blob/main/dashboards/README.md" + + # ── Worker dispatch (Gap 3 territory) ───────────────────────── + + - alert: NullBoilerDispatchFailureRatioHigh + expr: | + ( + rate(nullboiler_worker_dispatch_failure_total[5m]) + / + clamp_min( + rate(nullboiler_worker_dispatch_success_total[5m]) + + rate(nullboiler_worker_dispatch_failure_total[5m]), + 0.001 + ) + ) > 0.30 + for: 5m + labels: + severity: warning + service: nullboiler + annotations: + summary: "More than 30% of worker dispatches are failing (5m)" + description: | + Sustained dispatch failure ratio above 30% indicates a + broken contract between NullBoiler and at least one + worker. Cross-check the *Workers* dashboard's + "Failure ratios over time" panel: if `health-check + failure ratio` is also high, the worker is unreachable; + if only dispatch is failing, the worker is reachable but + its `/webhook` is returning the wrong shape (see + integration-analysis.md Gap 3). + dashboard: "http://grafana/d/nullboiler-workers" + + - alert: NullBoilerDispatchFailureRatioCritical + expr: | + ( + rate(nullboiler_worker_dispatch_failure_total[5m]) + / + clamp_min( + rate(nullboiler_worker_dispatch_success_total[5m]) + + rate(nullboiler_worker_dispatch_failure_total[5m]), + 0.001 + ) + ) > 0.80 + for: 2m + labels: + severity: critical + service: nullboiler + annotations: + summary: "Worker dispatches almost entirely failing (>80% for 2m)" + description: | + Effectively no work is reaching workers. Page on-call. + + # ── Worker health probes ────────────────────────────────────── + + - alert: NullBoilerWorkerHealthDegraded + expr: | + ( + rate(nullboiler_worker_health_failures_total[5m]) + / + clamp_min( + rate(nullboiler_worker_health_checks_total[5m]), + 0.001 + ) + ) > 0.20 + for: 5m + labels: + severity: warning + service: nullboiler + annotations: + summary: "Worker fleet health probes failing >20% (5m)" + description: | + At least one worker is failing health probes. 
Look at + `GET /workers` for `consecutive_failures > 0` and + `last_error_text` to localise. + + # ── Run lifecycle webhooks ──────────────────────────────────── + + - alert: NullBoilerCallbackDeliveryDegraded + expr: | + ( + rate(nullboiler_callback_failed_total[5m]) + / + clamp_min( + rate(nullboiler_callback_sent_total[5m]) + + rate(nullboiler_callback_failed_total[5m]), + 0.001 + ) + ) > 0.10 + for: 10m + labels: + severity: warning + service: nullboiler + annotations: + summary: "Run-lifecycle callbacks failing >10% (10m)" + description: | + Consumers depending on `step.completed` / `run.completed` + webhooks may be missing notifications. Check + `last_error_text` on the failing callback configs. + + # ── Step retries (signal of upstream flake) ─────────────────── + + - alert: NullBoilerStepRetryRateElevated + expr: | + ( + rate(nullboiler_steps_retry_scheduled_total[5m]) + / + clamp_min( + rate(nullboiler_steps_claimed_total[5m]), + 0.001 + ) + ) > 0.20 + for: 10m + labels: + severity: info + service: nullboiler + annotations: + summary: "More than 20% of steps are being retried (10m)" + description: | + High retry rate usually means transient errors at the + worker layer. Once retry policy gains exponential backoff + + jitter (reference/todo.md P1-01), this alert can be + tightened. + + - name: nullboiler.flow + interval: 30s + rules: + + # ── Throughput sanity ───────────────────────────────────────── + + - alert: NullBoilerNoTrafficForExtendedPeriod + expr: | + (rate(nullboiler_http_requests_total[15m]) == 0) + or absent(nullboiler_http_requests_total) + for: 30m + labels: + severity: info + service: nullboiler + annotations: + summary: "NullBoiler has received no HTTP traffic for 30m" + description: | + The orchestrator is up but idle. Often benign (between + workloads); fires on a clearly-broken upstream pipeline. 
+ Liveness itself is covered by NullBoilerInstanceDown — the + `absent()` clause here catches the edge case of a target + that never produced a sample (e.g. immediately after a + restart that bypassed the first scrape). + + # ── Idempotency replays ─────────────────────────────────────── + + - alert: NullBoilerIdempotentReplayRatioVeryHigh + expr: | + ( + rate(nullboiler_runs_idempotent_replays_total[15m]) + / + clamp_min( + rate(nullboiler_runs_created_total[15m]) + + rate(nullboiler_runs_idempotent_replays_total[15m]), + 0.001 + ) + ) > 0.95 + for: 15m + labels: + severity: info + service: nullboiler + annotations: + summary: "95%+ of POST /runs are idempotent replays (15m)" + description: | + A producer is hammering POST /runs with the same + idempotency key. This is harmless (replays are cheap) but + usually points to a misconfigured retry loop upstream. diff --git a/dashboards/grafana/nullboiler-overview.json b/dashboards/grafana/nullboiler-overview.json new file mode 100644 index 0000000..582f37e --- /dev/null +++ b/dashboards/grafana/nullboiler-overview.json @@ -0,0 +1,703 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "High-level operational view of NullBoiler — request volume, run throughput, dispatch reliability, and idempotency replay ratio. 
Pair with the per-worker dashboard for deeper drill-downs.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_http_requests_total[5m])", + "legendFormat": "req/s", + "range": true, + "refId": "A" + } + ], + "title": "HTTP requests/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_created_total[5m])", + "legendFormat": "runs/s", + "range": true, + "refId": "A" + } + ], + "title": "Runs created/sec", + 
"type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min(rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m]), 0.001)", + "legendFormat": "failure ratio", + "range": true, + "refId": "A" + } + ], + "title": "Worker dispatch failure ratio (5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_callback_failed_total[5m])", + 
"legendFormat": "failed/s", + "range": true, + "refId": "A" + } + ], + "title": "Callback failures/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_created_total[5m])", + "legendFormat": "runs created", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_idempotent_replays_total[5m])", + "legendFormat": "idempotent replays", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_claimed_total[5m])", + "legendFormat": "steps claimed", + "range": true, + "refId": "C" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_retry_scheduled_total[5m])", + "legendFormat": "steps retried", + "range": true, + "refId": "D" + } + ], + "title": "Run & step throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", + "legendFormat": "failure", + "range": true, + "refId": "B" + } + ], + "title": "Worker dispatch (success vs failure, stacked)", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_callback_sent_total[5m])", + "legendFormat": "sent", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_callback_failed_total[5m])", + "legendFormat": "failed", + "range": true, + "refId": "B" + } + ], + "title": "Callbacks (run lifecycle webhook delivery)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "thresholdsStyle": { + "mode": "line+area" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.3 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 8, + "options": { + "legend": { + "calcs": 
["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_idempotent_replays_total[5m]) / clamp_min(rate(nullboiler_runs_created_total[5m]) + rate(nullboiler_runs_idempotent_replays_total[5m]), 0.001)", + "legendFormat": "idempotent replay ratio", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_retry_scheduled_total[5m]) / clamp_min(rate(nullboiler_steps_claimed_total[5m]), 0.001)", + "legendFormat": "step retry ratio", + "range": true, + "refId": "B" + } + ], + "title": "Reliability ratios (replay & retry)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["nullboiler", "orchestration", "p1-03"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NullBoiler — Overview", + "uid": "nullboiler-overview", + "version": 1, + "weekStart": "" +} diff --git a/dashboards/grafana/nullboiler-workers.json b/dashboards/grafana/nullboiler-workers.json new file mode 100644 index 0000000..74b8afd --- /dev/null +++ b/dashboards/grafana/nullboiler-workers.json @@ -0,0 +1,593 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 
211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Worker-side health for NullBoiler — health-check pass/fail rates and dispatch reliability. Use this when the Overview dashboard shows elevated dispatch failure ratio and you need to localize the bad worker.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_checks_total[5m])", + "legendFormat": "checks/s", + "range": true, + "refId": "A" + } + ], + "title": "Health checks/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + 
"textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)", + "legendFormat": "failure ratio", + "range": true, + "refId": "A" + } + ], + "title": "Health-check failure ratio (5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success/s", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch success/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", 
+ "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", + "legendFormat": "failure/s", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch failure/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_checks_total[5m])", + "legendFormat": "checks", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m])", + "legendFormat": "failures", + "range": true, + "refId": "B" + } + ], + "title": "Health-check rate (probe vs failure)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 0, + "showPoints": "never", + "stacking": { + 
"group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", + "legendFormat": "failure", + "range": true, + "refId": "B" + } + ], + "title": "Dispatch outcomes (stacked bars)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "thresholdsStyle": { + "mode": "line+area" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.2 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + 
"h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min((rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m])), 0.001)", + "legendFormat": "dispatch failure ratio", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)", + "legendFormat": "health-check failure ratio", + "range": true, + "refId": "B" + } + ], + "title": "Failure ratios over time (drives circuit breaker)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["nullboiler", "workers", "p1-03"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NullBoiler — Workers", + "uid": "nullboiler-workers", + "version": 1, + "weekStart": "" +} diff --git a/dashboards/prometheus/prometheus.yml b/dashboards/prometheus/prometheus.yml new file mode 100644 index 0000000..f64a3c1 --- /dev/null +++ b/dashboards/prometheus/prometheus.yml @@ -0,0 +1,30 @@ +# Minimal Prometheus scrape config for a local NullBoiler instance. 
+# Drop into your existing Prometheus alongside other scrape jobs. +# +# Pair with the Grafana dashboards in ../grafana/. + +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + external_labels: + monitor: nullboiler-local + +scrape_configs: + - job_name: nullboiler + metrics_path: /metrics + scheme: http + static_configs: + - targets: ["nullboiler:8080"] # docker-compose service name + labels: + service: nullboiler + env: local + + # Uncomment when running outside docker-compose: + # - job_name: nullboiler-host + # metrics_path: /metrics + # static_configs: + # - targets: ["127.0.0.1:8080"] + # labels: + # service: nullboiler + # env: local From 72b7cadae5e2e043ee7403b8726a0376c0909223 Mon Sep 17 00:00:00 2001 From: Kures <14836932+Kures@users.noreply.github.com> Date: Fri, 8 May 2026 18:57:23 +0300 Subject: [PATCH 2/3] feat(observability): add 4 gauge panels for live-state metrics Companion to the upcoming feat/observability-gauges PR which adds the runs_in_flight, steps_in_flight, workers_healthy, and drain_mode gauges to /metrics. These panels light up once that PR lands; until then they show 'No data' against any image that does not yet expose the gauges. Layout: 4 stat panels in a new bottom row at gridPos y=20. Workers healthy uses a red-on-0 threshold so a dead worker pool is visible at a glance. Note: this commit also touches a few previously-inline arrays (e.g. "calcs": ["lastNotNull"]) that the JSON formatter expanded to multi-line format. No semantic change; reviewers can collapse those hunks visually. 
--- dashboards/grafana/nullboiler-overview.json | 298 +++++++++++++++++++- 1 file changed, 289 insertions(+), 9 deletions(-) diff --git a/dashboards/grafana/nullboiler-overview.json b/dashboards/grafana/nullboiler-overview.json index 582f37e..8b6b769 100644 --- a/dashboards/grafana/nullboiler-overview.json +++ b/dashboards/grafana/nullboiler-overview.json @@ -59,7 +59,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -118,7 +120,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -184,7 +188,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -246,7 +252,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -331,7 +339,11 @@ "id": 5, "options": { "legend": { - "calcs": ["lastNotNull", "mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -464,7 +476,11 @@ "id": 6, "options": { "legend": { - "calcs": ["lastNotNull", "mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -539,7 +555,11 @@ "id": 7, "options": { "legend": { - "calcs": ["lastNotNull", "mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -627,7 +647,11 @@ "id": 8, "options": { "legend": { - "calcs": ["lastNotNull", "mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -663,11 +687,267 @@ ], "title": "Reliability ratios 
(replay & retry)", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_runs_in_flight", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Runs in-flight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 20 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_steps_in_flight", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Steps in-flight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 20 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_workers_healthy", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Workers healthy", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 20 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_drain_mode", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Drain mode", + "type": "stat" } ], "refresh": "30s", "schemaVersion": 39, - "tags": ["nullboiler", "orchestration", "p1-03"], + "tags": [ + "nullboiler", + "orchestration", + "p1-03" + ], "templating": { 
"list": [ { From 13ab1f585f9b728ed647b48381da5a4a7eb8f137 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Wed, 13 May 2026 10:10:49 -0300 Subject: [PATCH 3/3] fix(observability): wire dashboard gauges --- dashboards/README.md | 58 +++++++++++++-------- dashboards/alerts/nullboiler.rules.yml | 6 +-- dashboards/prometheus/prometheus.yml | 9 ++-- docs/openapi.yaml | 8 +++ src/api.zig | 18 ++++++- src/metrics.zig | 70 ++++++++++++++++++++++++++ src/store.zig | 42 ++++++++++++++++ 7 files changed, 181 insertions(+), 30 deletions(-) diff --git a/dashboards/README.md b/dashboards/README.md index a374989..491dbac 100644 --- a/dashboards/README.md +++ b/dashboards/README.md @@ -30,7 +30,7 @@ dashboards/ ├── prometheus/ │ └── prometheus.yml minimal scrape config └── alerts/ - └── nullboiler.rules.yml 8 AlertManager rules paired 1:1 with the dashboards + └── nullboiler.rules.yml 8 Prometheus alerting rules paired 1:1 with the dashboards ``` ## What each dashboard answers @@ -49,6 +49,10 @@ Open this first when investigating "is something wrong?". | Worker dispatch (success vs failure) | Stacked-area dispatch outcomes | | Callbacks (sent vs failed) | Webhook delivery reliability | | Reliability ratios | Idempotent replay ratio + step retry ratio with thresholds | +| Runs in-flight | Current number of running workflow runs | +| Steps in-flight | Current number of running steps | +| Workers healthy | Workers currently marked `active` | +| Drain mode | Whether the API is rejecting new runs for drain | ### `nullboiler-workers.json` @@ -66,27 +70,33 @@ you need to localize the bad worker. 
## Metrics exposed by NullBoiler -From `src/metrics.zig`, all counters (no histograms or labels yet): +From `src/metrics.zig` and the `GET /metrics` handler, all exported +metrics (no histograms or labels yet): -| Counter | Meaning | -|---|---| -| `nullboiler_http_requests_total` | All HTTP requests handled by the API | -| `nullboiler_runs_created_total` | Runs successfully accepted by `POST /runs` | -| `nullboiler_runs_idempotent_replays_total` | Idempotent replays of an existing run | -| `nullboiler_steps_claimed_total` | Steps dispatched to workers | -| `nullboiler_steps_retry_scheduled_total` | Steps scheduled for retry | -| `nullboiler_worker_dispatch_success_total` | Worker dispatches that succeeded | -| `nullboiler_worker_dispatch_failure_total` | Worker dispatches that failed | -| `nullboiler_worker_health_checks_total` | Health probes performed | -| `nullboiler_worker_health_failures_total` | Health probes that failed | -| `nullboiler_callback_sent_total` | Run-lifecycle webhook callbacks sent | -| `nullboiler_callback_failed_total` | Run-lifecycle webhook callbacks failed | - -## Quick start (docker-compose) +| Metric | Type | Meaning | +|---|---|---| +| `nullboiler_http_requests_total` | counter | All HTTP requests handled by the API | +| `nullboiler_runs_created_total` | counter | Runs successfully accepted by `POST /runs` | +| `nullboiler_runs_idempotent_replays_total` | counter | Idempotent replays of an existing run | +| `nullboiler_steps_claimed_total` | counter | Steps dispatched to workers | +| `nullboiler_steps_retry_scheduled_total` | counter | Steps scheduled for retry | +| `nullboiler_worker_dispatch_success_total` | counter | Worker dispatches that succeeded | +| `nullboiler_worker_dispatch_failure_total` | counter | Worker dispatches that failed | +| `nullboiler_worker_health_checks_total` | counter | Health probes performed | +| `nullboiler_worker_health_failures_total` | counter | Health probes that failed | +| 
`nullboiler_callback_sent_total` | counter | Run-lifecycle webhook callbacks sent | +| `nullboiler_callback_failed_total` | counter | Run-lifecycle webhook callbacks failed | +| `nullboiler_runs_in_flight` | gauge | Workflow runs currently in `running` status | +| `nullboiler_steps_in_flight` | gauge | Steps currently in `running` status | +| `nullboiler_workers_healthy` | gauge | Workers currently in `active` status | +| `nullboiler_drain_mode` | gauge | `1` when drain mode is enabled, otherwise `0` | + +## Quick start (Docker sidecars) ```bash docker run -d --name prom \ -p 9090:9090 \ + --add-host=host.docker.internal:host-gateway \ -v "$(pwd)/dashboards/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \ prom/prometheus @@ -101,8 +111,11 @@ docker run -d --name grafana \ ``` Open Grafana at http://localhost:3030, point both dashboards at the -Prometheus datasource, and they will populate as soon as NullBoiler -starts handling traffic. +Prometheus datasource, and they will populate as soon as a NullBoiler +process running on the host at port 8080 starts handling traffic. If Prometheus +and NullBoiler run in the same docker-compose network, switch the +target in `prometheus/prometheus.yml` to the commented +`nullboiler:8080` example. ## Quick start (existing Prometheus + Grafana) @@ -167,8 +180,9 @@ To smoke-test the metrics endpoint without Grafana: curl -s http://localhost:8080/metrics | head -30 ``` -You should see eleven `# TYPE ... counter` blocks and their numeric -values. Empty values are valid — counters start at zero. +You should see fifteen `# TYPE ...` blocks and their numeric values. +Zero values are valid — counters start at zero, and gauges reflect the +current store state and drain flag. ## Diagnosing integration gaps @@ -206,7 +220,7 @@ mitigate at runtime. unlabeled, a single bad worker pulling 1/N of dispatches produces a ~1/N failure ratio — below the 30% warning threshold for N ≥ 4. Per-worker labels resolve this. 
-- Recording rules + Grafana-native alerting (the AlertManager rules +- Recording rules + Grafana-native alerting (the Prometheus alerting rules in `alerts/` are the floor — a recording-rule layer would precompute the ratios and avoid PromQL duplication between dashboards and alerts). diff --git a/dashboards/alerts/nullboiler.rules.yml b/dashboards/alerts/nullboiler.rules.yml index bfc72f9..ed7e0fe 100644 --- a/dashboards/alerts/nullboiler.rules.yml +++ b/dashboards/alerts/nullboiler.rules.yml @@ -1,4 +1,4 @@ -# Prometheus AlertManager rules for NullBoiler. +# Prometheus alerting rules for NullBoiler. # # Drop into your Prometheus config alongside the scrape stanza from # `dashboards/prometheus/prometheus.yml`: @@ -6,8 +6,8 @@ # rule_files: # - /etc/prometheus/alerts/nullboiler.rules.yml # -# Then point AlertManager at the rules and route the `nullboiler` group -# to your on-call channel. +# Then configure Alertmanager routes for the `nullboiler` group and send +# them to your on-call channel. # # These rules pair 1:1 with the panels in `dashboards/grafana/`. 
If you # tune a threshold here, mirror it in the corresponding dashboard panel diff --git a/dashboards/prometheus/prometheus.yml b/dashboards/prometheus/prometheus.yml index f64a3c1..22a1b99 100644 --- a/dashboards/prometheus/prometheus.yml +++ b/dashboards/prometheus/prometheus.yml @@ -15,16 +15,17 @@ scrape_configs: metrics_path: /metrics scheme: http static_configs: - - targets: ["nullboiler:8080"] # docker-compose service name + - targets: ["host.docker.internal:8080"] # host NullBoiler from Docker sidecar labels: service: nullboiler env: local - # Uncomment when running outside docker-compose: - # - job_name: nullboiler-host + # Use this target instead when Prometheus and NullBoiler share a + # docker-compose network and the orchestrator service is named "nullboiler": + # - job_name: nullboiler-compose # metrics_path: /metrics # static_configs: - # - targets: ["127.0.0.1:8080"] + # - targets: ["nullboiler:8080"] # labels: # service: nullboiler # env: local diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 75a9b3e..e1bd415 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -138,6 +138,14 @@ paths: # TYPE nullboiler_runs_created_total counter nullboiler_runs_created_total 42 # ... 
9 more counters + # TYPE nullboiler_runs_in_flight gauge + nullboiler_runs_in_flight 2 + # TYPE nullboiler_steps_in_flight gauge + nullboiler_steps_in_flight 5 + # TYPE nullboiler_workers_healthy gauge + nullboiler_workers_healthy 3 + # TYPE nullboiler_drain_mode gauge + nullboiler_drain_mode 0 /admin/drain: post: diff --git a/src/api.zig b/src/api.zig index b5ee09e..22e8e47 100644 --- a/src/api.zig +++ b/src/api.zig @@ -282,7 +282,13 @@ fn handleHealth(ctx: *Context) HttpResponse { fn handleMetrics(ctx: *Context) HttpResponse { const m = ctx.metrics orelse return plainResponse(200, "nullboiler_metrics_disabled 1\n"); - const body = m.renderPrometheus(ctx.allocator) catch return plainResponse(500, "nullboiler_metrics_render_error 1\n"); + const gauges = metrics_mod.Metrics.GaugeSnapshot{ + .runs_in_flight = ctx.store.countRunsByStatus("running") catch 0, + .steps_in_flight = ctx.store.countAllStepsByStatus("running") catch 0, + .workers_healthy = ctx.store.countWorkersByStatus("active") catch 0, + .drain_mode = if (ctx.drain_mode) |drain| @intFromBool(drain.load(.acquire)) else 0, + }; + const body = m.renderPrometheusWithGauges(ctx.allocator, gauges) catch return plainResponse(500, "nullboiler_metrics_render_error 1\n"); return plainResponse(200, body); } @@ -2632,16 +2638,26 @@ test "API: metrics endpoint returns text format" { defer arena.deinit(); var metrics = metrics_mod.Metrics{}; + var drain_mode = std.atomic.Value(bool).init(true); + try store.insertRun("run-active", null, "running", "{\"steps\":[]}", "{}", "[]"); + try store.insertStep("step-active", "run-active", "node-a", "task", "running", "{}", 1, null, null, null); + try store.insertWorker("worker-active", "http://localhost:3000/webhook", "", "webhook", null, "[]", 1, "registered"); var ctx = Context{ .store = &store, .allocator = arena.allocator(), .metrics = &metrics, + .drain_mode = &drain_mode, }; const resp = handleRequest(&ctx, "GET", "/metrics", ""); try std.testing.expectEqual(@as(u16, 200), 
resp.status_code); try std.testing.expect(std.mem.startsWith(u8, resp.content_type, "text/plain")); try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_http_requests_total") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "# TYPE nullboiler_runs_in_flight gauge") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_runs_in_flight 1") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_steps_in_flight 1") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_workers_healthy 1") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_drain_mode 1") != null); } test "API: list runs supports workflow_id filter" { diff --git a/src/metrics.zig b/src/metrics.zig index 25aa02b..5225be9 100644 --- a/src/metrics.zig +++ b/src/metrics.zig @@ -18,6 +18,76 @@ pub const Metrics = struct { } pub fn renderPrometheus(self: *const Metrics, allocator: std.mem.Allocator) ![]const u8 { + return self.renderPrometheusWithGauges(allocator, null); + } + + pub const GaugeSnapshot = struct { + runs_in_flight: i64, + steps_in_flight: i64, + workers_healthy: i64, + drain_mode: i64, + }; + + pub fn renderPrometheusWithGauges( + self: *const Metrics, + allocator: std.mem.Allocator, + gauges: ?GaugeSnapshot, + ) ![]const u8 { + if (gauges) |g| { + return std.fmt.allocPrint( + allocator, + \\# TYPE nullboiler_http_requests_total counter + \\nullboiler_http_requests_total {d} + \\# TYPE nullboiler_runs_created_total counter + \\nullboiler_runs_created_total {d} + \\# TYPE nullboiler_runs_idempotent_replays_total counter + \\nullboiler_runs_idempotent_replays_total {d} + \\# TYPE nullboiler_steps_claimed_total counter + \\nullboiler_steps_claimed_total {d} + \\# TYPE nullboiler_steps_retry_scheduled_total counter + \\nullboiler_steps_retry_scheduled_total {d} + \\# TYPE nullboiler_worker_dispatch_success_total counter + \\nullboiler_worker_dispatch_success_total {d} + \\# 
TYPE nullboiler_worker_dispatch_failure_total counter + \\nullboiler_worker_dispatch_failure_total {d} + \\# TYPE nullboiler_worker_health_checks_total counter + \\nullboiler_worker_health_checks_total {d} + \\# TYPE nullboiler_worker_health_failures_total counter + \\nullboiler_worker_health_failures_total {d} + \\# TYPE nullboiler_callback_sent_total counter + \\nullboiler_callback_sent_total {d} + \\# TYPE nullboiler_callback_failed_total counter + \\nullboiler_callback_failed_total {d} + \\# TYPE nullboiler_runs_in_flight gauge + \\nullboiler_runs_in_flight {d} + \\# TYPE nullboiler_steps_in_flight gauge + \\nullboiler_steps_in_flight {d} + \\# TYPE nullboiler_workers_healthy gauge + \\nullboiler_workers_healthy {d} + \\# TYPE nullboiler_drain_mode gauge + \\nullboiler_drain_mode {d} + \\ + , + .{ + self.http_requests_total.load(.monotonic), + self.runs_created_total.load(.monotonic), + self.runs_idempotent_replays_total.load(.monotonic), + self.steps_claimed_total.load(.monotonic), + self.steps_retry_scheduled_total.load(.monotonic), + self.worker_dispatch_success_total.load(.monotonic), + self.worker_dispatch_failure_total.load(.monotonic), + self.worker_health_checks_total.load(.monotonic), + self.worker_health_failures_total.load(.monotonic), + self.callback_sent_total.load(.monotonic), + self.callback_failed_total.load(.monotonic), + g.runs_in_flight, + g.steps_in_flight, + g.workers_healthy, + g.drain_mode, + }, + ); + } + return std.fmt.allocPrint( allocator, \\# TYPE nullboiler_http_requests_total counter diff --git a/src/store.zig b/src/store.zig index d9236e0..9944c04 100644 --- a/src/store.zig +++ b/src/store.zig @@ -342,6 +342,20 @@ pub const Store = struct { return list.toOwnedSlice(allocator); } + pub fn countWorkersByStatus(self: *Self, status: []const u8) !i64 { + const sql = "SELECT COUNT(*) FROM workers WHERE status = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return 
error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0; + return colInt(stmt, 0); + } + pub fn getWorker(self: *Self, allocator: std.mem.Allocator, id: []const u8) !?types.WorkerRow { const sql = "SELECT id, url, token, protocol, model, tags_json, max_concurrent, source, status, consecutive_failures, circuit_open_until_ms, last_error_text, last_health_ms, created_at_ms FROM workers WHERE id = ?"; var stmt: ?*c.sqlite3_stmt = null; @@ -661,6 +675,20 @@ pub const Store = struct { return list.toOwnedSlice(allocator); } + pub fn countRunsByStatus(self: *Self, status: []const u8) !i64 { + const sql = "SELECT COUNT(*) FROM runs WHERE status = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0; + return colInt(stmt, 0); + } + // ── Step CRUD ───────────────────────────────────────────────────── pub fn insertStep(self: *Self, id: []const u8, run_id: []const u8, def_step_id: []const u8, step_type: []const u8, status: []const u8, input_json: []const u8, max_attempts: i64, timeout_ms: ?i64, parent_step_id: ?[]const u8, item_index: ?i64) !void { @@ -859,6 +887,20 @@ pub const Store = struct { return colInt(stmt, 0); } + pub fn countAllStepsByStatus(self: *Self, status: []const u8) !i64 { + const sql = "SELECT COUNT(*) FROM steps WHERE status = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC); + + if 
(c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0; + return colInt(stmt, 0); + } + pub fn getChildSteps(self: *Self, allocator: std.mem.Allocator, parent_step_id: []const u8) ![]types.StepRow { const sql = "SELECT id, run_id, def_step_id, type, status, worker_id, input_json, output_json, error_text, attempt, max_attempts, timeout_ms, next_attempt_at_ms, parent_step_id, item_index, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, child_run_id, iteration_index FROM steps WHERE parent_step_id = ? ORDER BY item_index ASC"; var stmt: ?*c.sqlite3_stmt = null;