From 2d9448a45adeb9033392167e76580c908a1e18c5 Mon Sep 17 00:00:00 2001
From: Kures <14836932+Kures@users.noreply.github.com>
Date: Fri, 8 May 2026 00:32:06 +0300
Subject: [PATCH 1/3] feat(observability): add Grafana dashboards, Prometheus
 scrape config + AlertManager rules

Partially addresses P1-03 from reference/todo.md ("structured
observability: request IDs, metrics endpoint, OTEL spans"). The
metrics endpoint already ships in src/metrics.zig and src/api.zig:70;
this contributes the operator side as a coherent stack.

Contents:
- dashboards/grafana/nullboiler-overview.json   - high-level ops view (8 panels)
- dashboards/grafana/nullboiler-workers.json    - per-fleet worker view (7 panels)
- dashboards/prometheus/prometheus.yml          - minimal scrape config
- dashboards/alerts/nullboiler.rules.yml        - 8 AlertManager rules
- dashboards/README.md                          - quick-start, panel index, alert table

Targets Grafana 10.x and 11.x (schemaVersion 39). PromQL is plain rate()
over the 11 counters exposed by /metrics, with clamp_min to avoid
divide-by-zero on idle clusters.

Both dashboards prompt for the Prometheus datasource via DS_PROMETHEUS
template variable so they import cleanly into existing setups.

Alert rules pair 1:1 with the dashboards: thresholds match the colour
bands on the panels so dashboard and pager tell the same story.
Validates clean with `promtool check rules`.

Side benefit: the Workers dashboard's "Failure ratios over time" panel
visualises Gap 3 from nullclaw/docs/integration-analysis.md
(HIGH PRIORITY) when a stock nullclaw worker is wired up via /webhook.
See "Diagnosing integration gaps" in dashboards/README.md.

Future work (not part of this PR):
- histogram metrics for HTTP and dispatch latency
- per-worker labels on dispatch counters
- recording rules + Grafana alerting integration
---
 dashboards/README.md                        | 214 ++++++
 dashboards/alerts/nullboiler.rules.yml      | 204 ++++++
 dashboards/grafana/nullboiler-overview.json | 703 ++++++++++++++++++++
 dashboards/grafana/nullboiler-workers.json  | 593 +++++++++++++++++
 dashboards/prometheus/prometheus.yml        |  30 +
 5 files changed, 1744 insertions(+)
 create mode 100644 dashboards/README.md
 create mode 100644 dashboards/alerts/nullboiler.rules.yml
 create mode 100644 dashboards/grafana/nullboiler-overview.json
 create mode 100644 dashboards/grafana/nullboiler-workers.json
 create mode 100644 dashboards/prometheus/prometheus.yml

diff --git a/dashboards/README.md b/dashboards/README.md
new file mode 100644
index 0000000..a374989
--- /dev/null
+++ b/dashboards/README.md
@@ -0,0 +1,214 @@
+# NullBoiler observability stack (P1-03)
+
+Ready-to-import Grafana dashboards plus a minimal Prometheus scrape
+config for the existing `/metrics` endpoint.
+
+This contributes the **operator side** of `reference/todo.md` P1-03
+("Structured observability: request IDs, metrics endpoint, OTEL
+spans"). The endpoint and counters already ship in `src/metrics.zig`;
+this directory makes them visible.
+
+> **Side benefit:** the panels also visualise the integration gaps
+> documented in
+> [`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md).
+> When a NullClaw worker is wired up via `/webhook` (Gap 3 — HIGH
+> PRIORITY in that document), the *Worker dispatch failure ratio*
+> panel goes red while the *Health-check failure ratio* stays green,
+> isolating the contract mismatch (sync `{status:"ok",response:"..."}`
+> expected, async `{status:"received"}` returned). See
+> [Diagnosing integration gaps](#diagnosing-integration-gaps) below for
+> the exact panel pattern.
+
+## Contents
+
+```
+dashboards/
+├── README.md                            this file
+├── grafana/
+│   ├── nullboiler-overview.json         high-level operations view
+│   └── nullboiler-workers.json          per-fleet worker health view
+├── prometheus/
+│   └── prometheus.yml                   minimal scrape config
+└── alerts/
+    └── nullboiler.rules.yml             8 AlertManager rules paired 1:1 with the dashboards
+```
+
+## What each dashboard answers
+
+### `nullboiler-overview.json`
+
+Open this first when investigating "is something wrong?".
+
+| Panel | Question it answers |
+|---|---|
+| HTTP requests/sec | Is anyone talking to us right now? |
+| Runs created/sec | Is work flowing into the orchestrator? |
+| Worker dispatch failure ratio (5m) | What share of dispatches are blowing up? |
+| Callback failures/sec | Are run-lifecycle webhooks reaching consumers? |
+| Run & step throughput | Mix of created / replayed / claimed / retried over time |
+| Worker dispatch (success vs failure) | Stacked-area dispatch outcomes |
+| Callbacks (sent vs failed) | Webhook delivery reliability |
+| Reliability ratios | Idempotent replay ratio + step retry ratio with thresholds |
+
+### `nullboiler-workers.json`
+
+Open this when the Overview shows an elevated dispatch failure ratio and
+you need to narrow down the cause (metrics are fleet-wide, not per-worker).
+
+| Panel | Question it answers |
+|---|---|
+| Health checks/sec | Are health probes running? |
+| Health-check failure ratio (5m) | Are workers responding to probes? |
+| Dispatch success/sec, failure/sec | Per-second outcomes |
+| Health-check rate (probe vs failure) | Probes timeline |
+| Dispatch outcomes (stacked bars) | Discrete dispatch outcomes |
+| Failure ratios over time | The signal the circuit breaker reacts to |
+
+## Metrics exposed by NullBoiler
+
+From `src/metrics.zig`, all counters (no histograms or labels yet):
+
+| Counter | Meaning |
+|---|---|
+| `nullboiler_http_requests_total` | All HTTP requests handled by the API |
+| `nullboiler_runs_created_total` | Runs successfully accepted by `POST /runs` |
+| `nullboiler_runs_idempotent_replays_total` | Idempotent replays of an existing run |
+| `nullboiler_steps_claimed_total` | Steps dispatched to workers |
+| `nullboiler_steps_retry_scheduled_total` | Steps scheduled for retry |
+| `nullboiler_worker_dispatch_success_total` | Worker dispatches that succeeded |
+| `nullboiler_worker_dispatch_failure_total` | Worker dispatches that failed |
+| `nullboiler_worker_health_checks_total` | Health probes performed |
+| `nullboiler_worker_health_failures_total` | Health probes that failed |
+| `nullboiler_callback_sent_total` | Run-lifecycle webhook callbacks sent |
+| `nullboiler_callback_failed_total` | Run-lifecycle webhook callbacks failed |
+
+## Quick start (Docker)
+
+```bash
+docker run -d --name prom \
+  -p 9090:9090 \
+  -v "$(pwd)/dashboards/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \
+  prom/prometheus
+
+docker run -d --name grafana \
+  -p 3030:3000 \
+  -e GF_AUTH_ANONYMOUS_ENABLED=true \
+  -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin \
+  grafana/grafana
+
+# Add Prometheus datasource pointing at http://host.docker.internal:9090
+# Then: Dashboards -> Import -> upload nullboiler-overview.json and nullboiler-workers.json
+```
+
+Open Grafana at http://localhost:3030, point both dashboards at the
+Prometheus datasource, and they will populate as soon as NullBoiler
+starts handling traffic.
+
+## Quick start (existing Prometheus + Grafana)
+
+1. Add the scrape stanza from `prometheus/prometheus.yml` to your
+   existing `prometheus.yml`. Reload Prometheus.
+2. Import each `dashboards/grafana/*.json` via *Dashboards → Import → Upload JSON*.
+3. When prompted, select your Prometheus datasource for the
+   `${DS_PROMETHEUS}` template variable.
+4. (optional) Wire up alerts:
+   ```yaml
+   # in prometheus.yml
+   rule_files:
+     - /etc/prometheus/alerts/nullboiler.rules.yml
+   ```
+   and copy `dashboards/alerts/nullboiler.rules.yml` into that path.
+
+## Alert rules
+
+`alerts/nullboiler.rules.yml` ships 8 rules grouped under
+`nullboiler.health` and `nullboiler.flow`:
+
+| Alert | Severity | Fires when |
+|---|---|---|
+| `NullBoilerInstanceDown` | critical | `up == 0` for 2m |
+| `NullBoilerDispatchFailureRatioHigh` | warning | dispatch failure ratio > 30% for 5m |
+| `NullBoilerDispatchFailureRatioCritical` | critical | dispatch failure ratio > 80% for 2m |
+| `NullBoilerWorkerHealthDegraded` | warning | health-check failure ratio > 20% for 5m |
+| `NullBoilerCallbackDeliveryDegraded` | warning | callback failure ratio > 10% for 10m |
+| `NullBoilerStepRetryRateElevated` | info | retry/claim ratio > 20% for 10m |
+| `NullBoilerNoTrafficForExtendedPeriod` | info | no HTTP traffic for 30m |
+| `NullBoilerIdempotentReplayRatioVeryHigh` | info | replay ratio > 95% for 15m |
+
+Thresholds are meant to track the colour bands on the corresponding
+Grafana panels — if you tune one, revisit the other so the dashboard
+and the pager tell the same story. The Critical-severity alerts are
+intended for paging; everything else is ticket-bait.
+
+> Known exceptions, where a stat panel changes colour before the alert
+> fires: `NullBoilerWorkerHealthDegraded` fires at 20% while the
+> health-ratio stat shows yellow at 1% and red at 10%, and the dispatch
+> alerts fire at 30% / 80% while the Overview stat shows yellow at 5% and red at 20%.
+> Panels are meant to surface blips; the pager should only fire on a sustained pattern.
+
+Validate locally:
+
+```bash
+docker run --rm --entrypoint=promtool \
+  -v "$(pwd)/dashboards/alerts:/rules:ro" prom/prometheus \
+  check rules /rules/nullboiler.rules.yml
+# SUCCESS: 8 rules found
+```
+
+## Verification
+
+The dashboards target Grafana 10.x and 11.x (`schemaVersion: 39`). The
+PromQL is plain `rate()` over counters with `clamp_min` to avoid
+divide-by-zero on idle clusters.
+
+To smoke-test the metrics endpoint without Grafana:
+
+```bash
+curl -s http://localhost:8080/metrics | head -30
+```
+
+You should see eleven `# TYPE ... counter` blocks and their numeric
+values. Zero values are valid — counters start at zero.
+
+## Diagnosing integration gaps
+
+A non-obvious value of these dashboards: they make ecosystem-level
+integration gaps **visually obvious** without reading logs.
+
+The cleanest example today is **Gap 3** in
+[`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md)
+("Worker Endpoint for nullboiler Dispatch — HIGH PRIORITY"). When a
+plain `nullclaw gateway` is registered as a NullBoiler worker:
+
+- `/health` succeeds → `nullboiler_worker_health_failures_total` stays low
+- `/webhook` returns `{"status":"received"}` instead of the documented
+  `{"status":"ok","response":"..."}` →
+  `nullboiler_worker_dispatch_failure_total` increments on every step
+
+In the **Workers** dashboard's *Failure ratios over time* panel this
+shows up as **dispatch failure ratio at ~100% (red)** sitting on top
+of **health-check failure ratio near 0% (green)** — a one-glance
+diagnosis that the worker is reachable but its response contract is
+broken.
+
+This is exactly the visual signal NullBoiler maintainers would want
+when triaging field reports about worker dispatches; it surfaces the
+gap that `integration-analysis.md` predicts but that is not yet
+mitigated at runtime.
+
+## Future work
+
+- Histograms for HTTP latency and worker dispatch duration (would
+  enable percentile panels).
+- Per-worker labels on the dispatch counters (would enable
+  per-worker breakdown panels — currently the workers dashboard shows
+  fleet-wide aggregates). On a fleet of N workers with the metrics
+  unlabeled, a single bad worker pulling 1/N of dispatches produces a
+  ~1/N failure ratio — below the 30% warning threshold for N ≥ 4.
+  Per-worker labels resolve this.
+- Recording rules + Grafana-native alerting (the AlertManager rules
+  in `alerts/` are the floor — a recording-rule layer would precompute
+  the ratios and avoid PromQL duplication between dashboards and
+  alerts).
+
+These are not required for P1-03 but are natural follow-ups.
diff --git a/dashboards/alerts/nullboiler.rules.yml b/dashboards/alerts/nullboiler.rules.yml
new file mode 100644
index 0000000..bfc72f9
--- /dev/null
+++ b/dashboards/alerts/nullboiler.rules.yml
@@ -0,0 +1,204 @@
+# Prometheus alerting rules for NullBoiler (alerts are routed by Alertmanager).
+#
+# Drop into your Prometheus config alongside the scrape stanza from
+# `dashboards/prometheus/prometheus.yml`:
+#
+#   rule_files:
+#     - /etc/prometheus/alerts/nullboiler.rules.yml
+#
+# Then configure Prometheus to send alerts to your Alertmanager and
+# route alerts labelled `service: nullboiler` to your on-call channel.
+#
+# These rules pair 1:1 with the panels in `dashboards/grafana/`. If you
+# tune a threshold here, mirror it in the corresponding dashboard panel
+# threshold band so the dashboard and the alert tell the same story.
+
+groups:
+  - name: nullboiler.health
+    interval: 30s
+    rules:
+
+      # ── Liveness ──────────────────────────────────────────────────
+
+      - alert: NullBoilerInstanceDown
+        expr: up{job="nullboiler"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          service: nullboiler
+        annotations:
+          summary: "NullBoiler instance is down"
+          description: |
+            Prometheus has not scraped {{ $labels.instance }} for 2
+            minutes. Either the orchestrator process died or the
+            `/metrics` endpoint stopped responding.
+          runbook: "https://github.com/nullclaw/nullboiler/blob/main/dashboards/README.md"
+
+      # ── Worker dispatch (Gap 3 territory) ─────────────────────────
+
+      - alert: NullBoilerDispatchFailureRatioHigh
+        expr: |
+          (
+              rate(nullboiler_worker_dispatch_failure_total[5m])
+            /
+              clamp_min(
+                  rate(nullboiler_worker_dispatch_success_total[5m])
+                + rate(nullboiler_worker_dispatch_failure_total[5m]),
+                  0.001
+              )
+          ) > 0.30
+        for: 5m
+        labels:
+          severity: warning
+          service: nullboiler
+        annotations:
+          summary: "More than 30% of worker dispatches are failing (5m)"
+          description: |
+            Sustained dispatch failure ratio above 30% indicates a
+            broken contract between NullBoiler and at least one
+            worker. Cross-check the *Workers* dashboard's
+            "Failure ratios over time" panel: if `health-check
+            failure ratio` is also high, the worker is unreachable;
+            if only dispatch is failing, the worker is reachable but
+            its `/webhook` is returning the wrong shape (see
+            integration-analysis.md Gap 3).
+          dashboard: "http://grafana/d/nullboiler-workers"
+
+      - alert: NullBoilerDispatchFailureRatioCritical
+        expr: |
+          (
+              rate(nullboiler_worker_dispatch_failure_total[5m])
+            /
+              clamp_min(
+                  rate(nullboiler_worker_dispatch_success_total[5m])
+                + rate(nullboiler_worker_dispatch_failure_total[5m]),
+                  0.001
+              )
+          ) > 0.80
+        for: 2m
+        labels:
+          severity: critical
+          service: nullboiler
+        annotations:
+          summary: "Worker dispatches almost entirely failing (>80% for 2m)"
+          description: |
+            Effectively no work is reaching workers. Page on-call.
+
+      # ── Worker health probes ──────────────────────────────────────
+
+      - alert: NullBoilerWorkerHealthDegraded
+        expr: |
+          (
+              rate(nullboiler_worker_health_failures_total[5m])
+            /
+              clamp_min(
+                  rate(nullboiler_worker_health_checks_total[5m]),
+                  0.001
+              )
+          ) > 0.20
+        for: 5m
+        labels:
+          severity: warning
+          service: nullboiler
+        annotations:
+          summary: "Worker fleet health probes failing >20% (5m)"
+          description: |
+            At least one worker is failing health probes. Look at
+            `GET /workers` for `consecutive_failures > 0` and
+            `last_error_text` to localise.
+
+      # ── Run lifecycle webhooks ────────────────────────────────────
+
+      - alert: NullBoilerCallbackDeliveryDegraded
+        expr: |
+          (
+              rate(nullboiler_callback_failed_total[5m])
+            /
+              clamp_min(
+                  rate(nullboiler_callback_sent_total[5m])
+                + rate(nullboiler_callback_failed_total[5m]),
+                  0.001
+              )
+          ) > 0.10
+        for: 10m
+        labels:
+          severity: warning
+          service: nullboiler
+        annotations:
+          summary: "Run-lifecycle callbacks failing >10% (10m)"
+          description: |
+            Consumers depending on `step.completed` / `run.completed`
+            webhooks may be missing notifications. Check
+            `last_error_text` on the failing callback configs.
+
+      # ── Step retries (signal of upstream flake) ───────────────────
+
+      - alert: NullBoilerStepRetryRateElevated
+        expr: |
+          (
+              rate(nullboiler_steps_retry_scheduled_total[5m])
+            /
+              clamp_min(
+                  rate(nullboiler_steps_claimed_total[5m]),
+                  0.001
+              )
+          ) > 0.20
+        for: 10m
+        labels:
+          severity: info
+          service: nullboiler
+        annotations:
+          summary: "More than 20% of steps are being retried (10m)"
+          description: |
+            High retry rate usually means transient errors at the
+            worker layer. Once retry policy gains exponential backoff
+            + jitter (reference/todo.md P1-01), this alert can be
+            tightened.
+
+  - name: nullboiler.flow
+    interval: 30s
+    rules:
+
+      # ── Throughput sanity ─────────────────────────────────────────
+
+      - alert: NullBoilerNoTrafficForExtendedPeriod
+        expr: |
+          (rate(nullboiler_http_requests_total[15m]) == 0)
+            or absent(nullboiler_http_requests_total)
+        for: 30m
+        labels:
+          severity: info
+          service: nullboiler
+        annotations:
+          summary: "NullBoiler has received no HTTP traffic for 30m"
+          description: |
+            The orchestrator is up but idle. Often benign (between
+            workloads); fires on a clearly-broken upstream pipeline.
+            Liveness itself is covered by NullBoilerInstanceDown — the
+            `absent()` clause here catches the edge case of a target
+            that never produced a sample (e.g. immediately after a
+            restart that bypassed the first scrape).
+
+      # ── Idempotency replays ───────────────────────────────────────
+
+      - alert: NullBoilerIdempotentReplayRatioVeryHigh
+        expr: |
+          (
+              rate(nullboiler_runs_idempotent_replays_total[15m])
+            /
+              clamp_min(
+                  rate(nullboiler_runs_created_total[15m])
+                + rate(nullboiler_runs_idempotent_replays_total[15m]),
+                  0.001
+              )
+          ) > 0.95
+        for: 15m
+        labels:
+          severity: info
+          service: nullboiler
+        annotations:
+          summary: "95%+ of POST /runs are idempotent replays (15m)"
+          description: |
+            A producer is hammering POST /runs with the same
+            idempotency key. This is harmless (replays are cheap) but
+            usually points to a misconfigured retry loop upstream.
diff --git a/dashboards/grafana/nullboiler-overview.json b/dashboards/grafana/nullboiler-overview.json
new file mode 100644
index 0000000..582f37e
--- /dev/null
+++ b/dashboards/grafana/nullboiler-overview.json
@@ -0,0 +1,703 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "High-level operational view of NullBoiler — request volume, run throughput, dispatch reliability, and idempotency replay ratio. Pair with the per-worker dashboard for deeper drill-downs.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "id": 1,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_http_requests_total[5m])",
+          "legendFormat": "req/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "HTTP requests/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_runs_created_total[5m])",
+          "legendFormat": "runs/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Runs created/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 0.05
+              },
+              {
+                "color": "red",
+                "value": 0.2
+              }
+            ]
+          },
+          "unit": "percentunit"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "id": 3,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min(rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m]), 0.001)",
+          "legendFormat": "failure ratio",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Worker dispatch failure ratio (5m)",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 0.001
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_callback_failed_total[5m])",
+          "legendFormat": "failed/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Callback failures/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "gradientMode": "opacity",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 4
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_runs_created_total[5m])",
+          "legendFormat": "runs created",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_runs_idempotent_replays_total[5m])",
+          "legendFormat": "idempotent replays",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_steps_claimed_total[5m])",
+          "legendFormat": "steps claimed",
+          "range": true,
+          "refId": "C"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_steps_retry_scheduled_total[5m])",
+          "legendFormat": "steps retried",
+          "range": true,
+          "refId": "D"
+        }
+      ],
+      "title": "Run & step throughput",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "showPoints": "never",
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "failure"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "success"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 4
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_success_total[5m])",
+          "legendFormat": "success",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])",
+          "legendFormat": "failure",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Worker dispatch (success vs failure, stacked)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "showPoints": "never"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_callback_sent_total[5m])",
+          "legendFormat": "sent",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_callback_failed_total[5m])",
+          "legendFormat": "failed",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Callbacks (run lifecycle webhook delivery)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "showPoints": "never",
+            "thresholdsStyle": {
+              "mode": "line+area"
+            }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 0.1
+              },
+              {
+                "color": "red",
+                "value": 0.3
+              }
+            ]
+          },
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_runs_idempotent_replays_total[5m]) / clamp_min(rate(nullboiler_runs_created_total[5m]) + rate(nullboiler_runs_idempotent_replays_total[5m]), 0.001)",
+          "legendFormat": "idempotent replay ratio",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_steps_retry_scheduled_total[5m]) / clamp_min(rate(nullboiler_steps_claimed_total[5m]), 0.001)",
+          "legendFormat": "step retry ratio",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Reliability ratios (replay & retry)",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "tags": ["nullboiler", "orchestration", "p1-03"],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "Prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "DS_PROMETHEUS",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "NullBoiler — Overview",
+  "uid": "nullboiler-overview",
+  "version": 1,
+  "weekStart": ""
+}
diff --git a/dashboards/grafana/nullboiler-workers.json b/dashboards/grafana/nullboiler-workers.json
new file mode 100644
index 0000000..74b8afd
--- /dev/null
+++ b/dashboards/grafana/nullboiler-workers.json
@@ -0,0 +1,593 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
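+  "description": "Worker-side health for NullBoiler — health-check pass/fail rates and dispatch reliability. Use this when the Overview dashboard shows an elevated dispatch failure ratio and you need to tell whether health checks or dispatches are failing. Metrics are fleet-wide aggregates (no per-worker labels yet), so this view cannot single out an individual worker.",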
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "id": 1,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_health_checks_total[5m])",
+          "legendFormat": "checks/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Health checks/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 0.01
+              },
+              {
+                "color": "red",
+                "value": 0.1
+              }
+            ]
+          },
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)",
+          "legendFormat": "failure ratio",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Health-check failure ratio (5m)",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "id": 3,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_success_total[5m])",
+          "legendFormat": "success/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Dispatch success/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 0.001
+              }
+            ]
+          },
+          "unit": "ops"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])",
+          "legendFormat": "failure/s",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Dispatch failure/sec",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "showPoints": "never"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "failures"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 4
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_health_checks_total[5m])",
+          "legendFormat": "checks",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_health_failures_total[5m])",
+          "legendFormat": "failures",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Health-check rate (probe vs failure)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "drawStyle": "bars",
+            "fillOpacity": 80,
+            "lineWidth": 0,
+            "showPoints": "never",
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "failure"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "success"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 4
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_success_total[5m])",
+          "legendFormat": "success",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])",
+          "legendFormat": "failure",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Dispatch outcomes (stacked bars)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "showPoints": "never",
+            "thresholdsStyle": {
+              "mode": "line+area"
+            }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 0.05
+              },
+              {
+                "color": "red",
+                "value": 0.2
+              }
+            ]
+          },
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 12
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": ["lastNotNull", "mean", "max"],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min((rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m])), 0.001)",
+          "legendFormat": "dispatch failure ratio",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)",
+          "legendFormat": "health-check failure ratio",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Failure ratios over time (drives circuit breaker)",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "tags": ["nullboiler", "workers", "p1-03"],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "Prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "DS_PROMETHEUS",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "NullBoiler — Workers",
+  "uid": "nullboiler-workers",
+  "version": 1,
+  "weekStart": ""
+}
diff --git a/dashboards/prometheus/prometheus.yml b/dashboards/prometheus/prometheus.yml
new file mode 100644
index 0000000..f64a3c1
--- /dev/null
+++ b/dashboards/prometheus/prometheus.yml
@@ -0,0 +1,30 @@
+# Minimal Prometheus scrape config for a local NullBoiler instance.
+# Drop into your existing Prometheus alongside other scrape jobs.
+#
+# Pair with the Grafana dashboards in ../grafana/.
+
+global:
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  evaluation_interval: 15s
+  external_labels:
+    monitor: nullboiler-local
+
+scrape_configs:
+  - job_name: nullboiler
+    metrics_path: /metrics
+    scheme: http
+    static_configs:
+      - targets: ["nullboiler:8080"]   # docker-compose service name
+        labels:
+          service: nullboiler
+          env: local
+
+  # Uncomment when running outside docker-compose:
+  # - job_name: nullboiler-host
+  #   metrics_path: /metrics
+  #   static_configs:
+  #     - targets: ["127.0.0.1:8080"]
+  #       labels:
+  #         service: nullboiler
+  #         env: local

From 72b7cadae5e2e043ee7403b8726a0376c0909223 Mon Sep 17 00:00:00 2001
From: Kures <14836932+Kures@users.noreply.github.com>
Date: Fri, 8 May 2026 18:57:23 +0300
Subject: [PATCH 2/3] feat(observability): add 4 gauge panels for live-state
 metrics

Companion to the upcoming feat/observability-gauges PR which adds the
runs_in_flight, steps_in_flight, workers_healthy, and drain_mode
gauges to /metrics. These panels light up once that PR lands; until
then they show 'No data' against any image that does not yet expose
the gauges.

Layout: 4 stat panels in a new bottom row at gridPos y=20. Workers
healthy uses a red-on-0 threshold so a dead worker pool is visible at
a glance.

Note: this commit also touches a few previously-inline arrays
(e.g. "calcs": ["lastNotNull"]) that the JSON formatter expanded to
multi-line format. No semantic change; reviewers can collapse those
hunks visually.
---
 dashboards/grafana/nullboiler-overview.json | 298 +++++++++++++++++++-
 1 file changed, 289 insertions(+), 9 deletions(-)

diff --git a/dashboards/grafana/nullboiler-overview.json b/dashboards/grafana/nullboiler-overview.json
index 582f37e..8b6b769 100644
--- a/dashboards/grafana/nullboiler-overview.json
+++ b/dashboards/grafana/nullboiler-overview.json
@@ -59,7 +59,9 @@
         "justifyMode": "auto",
         "orientation": "auto",
         "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
           "fields": "",
           "values": false
         },
@@ -118,7 +120,9 @@
         "justifyMode": "auto",
         "orientation": "auto",
         "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
           "fields": "",
           "values": false
         },
@@ -184,7 +188,9 @@
         "justifyMode": "auto",
         "orientation": "auto",
         "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
           "fields": "",
           "values": false
         },
@@ -246,7 +252,9 @@
         "justifyMode": "auto",
         "orientation": "auto",
         "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
           "fields": "",
           "values": false
         },
@@ -331,7 +339,11 @@
       "id": 5,
       "options": {
         "legend": {
-          "calcs": ["lastNotNull", "mean", "max"],
+          "calcs": [
+            "lastNotNull",
+            "mean",
+            "max"
+          ],
           "displayMode": "table",
           "placement": "bottom",
           "showLegend": true
@@ -464,7 +476,11 @@
       "id": 6,
       "options": {
         "legend": {
-          "calcs": ["lastNotNull", "mean", "max"],
+          "calcs": [
+            "lastNotNull",
+            "mean",
+            "max"
+          ],
           "displayMode": "table",
           "placement": "bottom",
           "showLegend": true
@@ -539,7 +555,11 @@
       "id": 7,
       "options": {
         "legend": {
-          "calcs": ["lastNotNull", "mean", "max"],
+          "calcs": [
+            "lastNotNull",
+            "mean",
+            "max"
+          ],
           "displayMode": "table",
           "placement": "bottom",
           "showLegend": true
@@ -627,7 +647,11 @@
       "id": 8,
       "options": {
         "legend": {
-          "calcs": ["lastNotNull", "mean", "max"],
+          "calcs": [
+            "lastNotNull",
+            "mean",
+            "max"
+          ],
           "displayMode": "table",
           "placement": "bottom",
           "showLegend": true
@@ -663,11 +687,267 @@
       ],
       "title": "Reliability ratios (replay & retry)",
       "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 20
+      },
+      "id": 9,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "nullboiler_runs_in_flight",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Runs in-flight",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 6,
+        "y": 20
+      },
+      "id": 10,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "nullboiler_steps_in_flight",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Steps in-flight",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 12,
+        "y": 20
+      },
+      "id": 11,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "nullboiler_workers_healthy",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Workers healthy",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 18,
+        "y": 20
+      },
+      "id": 12,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "nullboiler_drain_mode",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Drain mode",
+      "type": "stat"
     }
   ],
   "refresh": "30s",
   "schemaVersion": 39,
-  "tags": ["nullboiler", "orchestration", "p1-03"],
+  "tags": [
+    "nullboiler",
+    "orchestration",
+    "p1-03"
+  ],
   "templating": {
     "list": [
       {

From 13ab1f585f9b728ed647b48381da5a4a7eb8f137 Mon Sep 17 00:00:00 2001
From: Igor Somov <donprusne@gmail.com>
Date: Wed, 13 May 2026 10:10:49 -0300
Subject: [PATCH 3/3] fix(observability): wire dashboard gauges

---
 dashboards/README.md                   | 58 +++++++++++++--------
 dashboards/alerts/nullboiler.rules.yml |  6 +--
 dashboards/prometheus/prometheus.yml   |  9 ++--
 docs/openapi.yaml                      |  8 +++
 src/api.zig                            | 18 ++++++-
 src/metrics.zig                        | 70 ++++++++++++++++++++++++++
 src/store.zig                          | 42 ++++++++++++++++
 7 files changed, 181 insertions(+), 30 deletions(-)

diff --git a/dashboards/README.md b/dashboards/README.md
index a374989..491dbac 100644
--- a/dashboards/README.md
+++ b/dashboards/README.md
@@ -30,7 +30,7 @@ dashboards/
 ├── prometheus/
 │   └── prometheus.yml                   minimal scrape config
 └── alerts/
-    └── nullboiler.rules.yml             8 AlertManager rules paired 1:1 with the dashboards
+    └── nullboiler.rules.yml             8 Prometheus alerting rules paired 1:1 with the dashboards
 ```
 
 ## What each dashboard answers
@@ -49,6 +49,10 @@ Open this first when investigating "is something wrong?".
 | Worker dispatch (success vs failure) | Stacked-area dispatch outcomes |
 | Callbacks (sent vs failed) | Webhook delivery reliability |
 | Reliability ratios | Idempotent replay ratio + step retry ratio with thresholds |
+| Runs in-flight | Current number of running workflow runs |
+| Steps in-flight | Current number of running steps |
+| Workers healthy | Workers currently marked `active` |
+| Drain mode | Whether drain mode is enabled and the API is rejecting new runs |
 
 ### `nullboiler-workers.json`
 
@@ -66,27 +70,33 @@ you need to localize the bad worker.
 
 ## Metrics exposed by NullBoiler
 
-From `src/metrics.zig`, all counters (no histograms or labels yet):
+From `src/metrics.zig` and the `GET /metrics` handler, all exported
+metrics (no histograms or labels yet):
 
-| Counter | Meaning |
-|---|---|
-| `nullboiler_http_requests_total` | All HTTP requests handled by the API |
-| `nullboiler_runs_created_total` | Runs successfully accepted by `POST /runs` |
-| `nullboiler_runs_idempotent_replays_total` | Idempotent replays of an existing run |
-| `nullboiler_steps_claimed_total` | Steps dispatched to workers |
-| `nullboiler_steps_retry_scheduled_total` | Steps scheduled for retry |
-| `nullboiler_worker_dispatch_success_total` | Worker dispatches that succeeded |
-| `nullboiler_worker_dispatch_failure_total` | Worker dispatches that failed |
-| `nullboiler_worker_health_checks_total` | Health probes performed |
-| `nullboiler_worker_health_failures_total` | Health probes that failed |
-| `nullboiler_callback_sent_total` | Run-lifecycle webhook callbacks sent |
-| `nullboiler_callback_failed_total` | Run-lifecycle webhook callbacks failed |
-
-## Quick start (docker-compose)
+| Metric | Type | Meaning |
+|---|---|---|
+| `nullboiler_http_requests_total` | counter | All HTTP requests handled by the API |
+| `nullboiler_runs_created_total` | counter | Runs successfully accepted by `POST /runs` |
+| `nullboiler_runs_idempotent_replays_total` | counter | Idempotent replays of an existing run |
+| `nullboiler_steps_claimed_total` | counter | Steps dispatched to workers |
+| `nullboiler_steps_retry_scheduled_total` | counter | Steps scheduled for retry |
+| `nullboiler_worker_dispatch_success_total` | counter | Worker dispatches that succeeded |
+| `nullboiler_worker_dispatch_failure_total` | counter | Worker dispatches that failed |
+| `nullboiler_worker_health_checks_total` | counter | Health probes performed |
+| `nullboiler_worker_health_failures_total` | counter | Health probes that failed |
+| `nullboiler_callback_sent_total` | counter | Run-lifecycle webhook callbacks sent |
+| `nullboiler_callback_failed_total` | counter | Run-lifecycle webhook callbacks failed |
+| `nullboiler_runs_in_flight` | gauge | Workflow runs currently in `running` status |
+| `nullboiler_steps_in_flight` | gauge | Steps currently in `running` status |
+| `nullboiler_workers_healthy` | gauge | Workers currently in `active` status |
+| `nullboiler_drain_mode` | gauge | `1` when drain mode is enabled, otherwise `0` |
+
+## Quick start (Docker sidecars)
 
 ```bash
 docker run -d --name prom \
   -p 9090:9090 \
+  --add-host=host.docker.internal:host-gateway \
   -v "$(pwd)/dashboards/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \
   prom/prometheus
 
@@ -101,8 +111,11 @@ docker run -d --name grafana \
 ```
 
 Open Grafana at http://localhost:3030, point both dashboards at the
-Prometheus datasource, and they will populate as soon as NullBoiler
-starts handling traffic.
+Prometheus datasource, and they will populate as soon as a NullBoiler
+process running on the Docker host (port 8080) starts handling traffic.
+If Prometheus and NullBoiler run in the same docker-compose network,
+switch the target in `prometheus/prometheus.yml` to the commented
+`nullboiler:8080` example.
 
 ## Quick start (existing Prometheus + Grafana)
 
@@ -167,8 +180,9 @@ To smoke-test the metrics endpoint without Grafana:
 curl -s http://localhost:8080/metrics | head -30
 ```
 
-You should see eleven `# TYPE ... counter` blocks and their numeric
-values. Empty values are valid — counters start at zero.
+You should see fifteen `# TYPE ...` blocks and their numeric values.
+Zero values are valid — counters start at zero and gauges reflect the
+current store state.
 
 ## Diagnosing integration gaps
 
@@ -206,7 +220,7 @@ mitigate at runtime.
   unlabeled, a single bad worker pulling 1/N of dispatches produces a
   ~1/N failure ratio — below the 30% warning threshold for N ≥ 4.
   Per-worker labels resolve this.
-- Recording rules + Grafana-native alerting (the AlertManager rules
+- Recording rules + Grafana-native alerting (the Prometheus alerting rules
   in `alerts/` are the floor — a recording-rule layer would precompute
   the ratios and avoid PromQL duplication between dashboards and
   alerts).
diff --git a/dashboards/alerts/nullboiler.rules.yml b/dashboards/alerts/nullboiler.rules.yml
index bfc72f9..ed7e0fe 100644
--- a/dashboards/alerts/nullboiler.rules.yml
+++ b/dashboards/alerts/nullboiler.rules.yml
@@ -1,4 +1,4 @@
-# Prometheus AlertManager rules for NullBoiler.
+# Prometheus alerting rules for NullBoiler.
 #
 # Drop into your Prometheus config alongside the scrape stanza from
 # `dashboards/prometheus/prometheus.yml`:
@@ -6,8 +6,8 @@
 #   rule_files:
 #     - /etc/prometheus/alerts/nullboiler.rules.yml
 #
-# Then point AlertManager at the rules and route the `nullboiler` group
-# to your on-call channel.
+# Then configure Alertmanager to route alerts from the `nullboiler`
+# group to your on-call channel.
 #
 # These rules pair 1:1 with the panels in `dashboards/grafana/`. If you
 # tune a threshold here, mirror it in the corresponding dashboard panel
diff --git a/dashboards/prometheus/prometheus.yml b/dashboards/prometheus/prometheus.yml
index f64a3c1..22a1b99 100644
--- a/dashboards/prometheus/prometheus.yml
+++ b/dashboards/prometheus/prometheus.yml
@@ -15,16 +15,17 @@ scrape_configs:
     metrics_path: /metrics
     scheme: http
     static_configs:
-      - targets: ["nullboiler:8080"]   # docker-compose service name
+      - targets: ["host.docker.internal:8080"]   # host NullBoiler from Docker sidecar
         labels:
           service: nullboiler
           env: local
 
-  # Uncomment when running outside docker-compose:
-  # - job_name: nullboiler-host
+  # Use this target instead when Prometheus and NullBoiler share a
+  # docker-compose network and the orchestrator service is named "nullboiler":
+  # - job_name: nullboiler-compose
   #   metrics_path: /metrics
   #   static_configs:
-  #     - targets: ["127.0.0.1:8080"]
+  #     - targets: ["nullboiler:8080"]
   #       labels:
   #         service: nullboiler
   #         env: local
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 75a9b3e..e1bd415 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -138,6 +138,14 @@ paths:
                 # TYPE nullboiler_runs_created_total counter
                 nullboiler_runs_created_total 42
                 # ... 9 more counters
+                # TYPE nullboiler_runs_in_flight gauge
+                nullboiler_runs_in_flight 2
+                # TYPE nullboiler_steps_in_flight gauge
+                nullboiler_steps_in_flight 5
+                # TYPE nullboiler_workers_healthy gauge
+                nullboiler_workers_healthy 3
+                # TYPE nullboiler_drain_mode gauge
+                nullboiler_drain_mode 0
 
   /admin/drain:
     post:
diff --git a/src/api.zig b/src/api.zig
index b5ee09e..22e8e47 100644
--- a/src/api.zig
+++ b/src/api.zig
@@ -282,7 +282,13 @@ fn handleHealth(ctx: *Context) HttpResponse {
 
 fn handleMetrics(ctx: *Context) HttpResponse {
     const m = ctx.metrics orelse return plainResponse(200, "nullboiler_metrics_disabled 1\n");
-    const body = m.renderPrometheus(ctx.allocator) catch return plainResponse(500, "nullboiler_metrics_render_error 1\n");
+    const gauges = metrics_mod.Metrics.GaugeSnapshot{
+        .runs_in_flight = ctx.store.countRunsByStatus("running") catch 0,
+        .steps_in_flight = ctx.store.countAllStepsByStatus("running") catch 0,
+        .workers_healthy = ctx.store.countWorkersByStatus("active") catch 0,
+        .drain_mode = if (ctx.drain_mode) |drain| @intFromBool(drain.load(.acquire)) else 0,
+    };
+    const body = m.renderPrometheusWithGauges(ctx.allocator, gauges) catch return plainResponse(500, "nullboiler_metrics_render_error 1\n");
     return plainResponse(200, body);
 }
 
@@ -2632,16 +2638,26 @@ test "API: metrics endpoint returns text format" {
     defer arena.deinit();
 
     var metrics = metrics_mod.Metrics{};
+    var drain_mode = std.atomic.Value(bool).init(true);
+    try store.insertRun("run-active", null, "running", "{\"steps\":[]}", "{}", "[]");
+    try store.insertStep("step-active", "run-active", "node-a", "task", "running", "{}", 1, null, null, null);
+    try store.insertWorker("worker-active", "http://localhost:3000/webhook", "", "webhook", null, "[]", 1, "registered");
     var ctx = Context{
         .store = &store,
         .allocator = arena.allocator(),
         .metrics = &metrics,
+        .drain_mode = &drain_mode,
     };
 
     const resp = handleRequest(&ctx, "GET", "/metrics", "");
     try std.testing.expectEqual(@as(u16, 200), resp.status_code);
     try std.testing.expect(std.mem.startsWith(u8, resp.content_type, "text/plain"));
     try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_http_requests_total") != null);
+    try std.testing.expect(std.mem.indexOf(u8, resp.body, "# TYPE nullboiler_runs_in_flight gauge") != null);
+    try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_runs_in_flight 1") != null);
+    try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_steps_in_flight 1") != null);
+    try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_workers_healthy 1") != null);
+    try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_drain_mode 1") != null);
 }
 
 test "API: list runs supports workflow_id filter" {
diff --git a/src/metrics.zig b/src/metrics.zig
index 25aa02b..5225be9 100644
--- a/src/metrics.zig
+++ b/src/metrics.zig
@@ -18,6 +18,76 @@ pub const Metrics = struct {
     }
 
     pub fn renderPrometheus(self: *const Metrics, allocator: std.mem.Allocator) ![]const u8 {
+        return self.renderPrometheusWithGauges(allocator, null);
+    }
+
+    pub const GaugeSnapshot = struct {
+        runs_in_flight: i64,
+        steps_in_flight: i64,
+        workers_healthy: i64,
+        drain_mode: i64,
+    };
+
+    pub fn renderPrometheusWithGauges(
+        self: *const Metrics,
+        allocator: std.mem.Allocator,
+        gauges: ?GaugeSnapshot,
+    ) ![]const u8 {
+        if (gauges) |g| {
+            return std.fmt.allocPrint(
+                allocator,
+                \\# TYPE nullboiler_http_requests_total counter
+                \\nullboiler_http_requests_total {d}
+                \\# TYPE nullboiler_runs_created_total counter
+                \\nullboiler_runs_created_total {d}
+                \\# TYPE nullboiler_runs_idempotent_replays_total counter
+                \\nullboiler_runs_idempotent_replays_total {d}
+                \\# TYPE nullboiler_steps_claimed_total counter
+                \\nullboiler_steps_claimed_total {d}
+                \\# TYPE nullboiler_steps_retry_scheduled_total counter
+                \\nullboiler_steps_retry_scheduled_total {d}
+                \\# TYPE nullboiler_worker_dispatch_success_total counter
+                \\nullboiler_worker_dispatch_success_total {d}
+                \\# TYPE nullboiler_worker_dispatch_failure_total counter
+                \\nullboiler_worker_dispatch_failure_total {d}
+                \\# TYPE nullboiler_worker_health_checks_total counter
+                \\nullboiler_worker_health_checks_total {d}
+                \\# TYPE nullboiler_worker_health_failures_total counter
+                \\nullboiler_worker_health_failures_total {d}
+                \\# TYPE nullboiler_callback_sent_total counter
+                \\nullboiler_callback_sent_total {d}
+                \\# TYPE nullboiler_callback_failed_total counter
+                \\nullboiler_callback_failed_total {d}
+                \\# TYPE nullboiler_runs_in_flight gauge
+                \\nullboiler_runs_in_flight {d}
+                \\# TYPE nullboiler_steps_in_flight gauge
+                \\nullboiler_steps_in_flight {d}
+                \\# TYPE nullboiler_workers_healthy gauge
+                \\nullboiler_workers_healthy {d}
+                \\# TYPE nullboiler_drain_mode gauge
+                \\nullboiler_drain_mode {d}
+                \\
+            ,
+                .{
+                    self.http_requests_total.load(.monotonic),
+                    self.runs_created_total.load(.monotonic),
+                    self.runs_idempotent_replays_total.load(.monotonic),
+                    self.steps_claimed_total.load(.monotonic),
+                    self.steps_retry_scheduled_total.load(.monotonic),
+                    self.worker_dispatch_success_total.load(.monotonic),
+                    self.worker_dispatch_failure_total.load(.monotonic),
+                    self.worker_health_checks_total.load(.monotonic),
+                    self.worker_health_failures_total.load(.monotonic),
+                    self.callback_sent_total.load(.monotonic),
+                    self.callback_failed_total.load(.monotonic),
+                    g.runs_in_flight,
+                    g.steps_in_flight,
+                    g.workers_healthy,
+                    g.drain_mode,
+                },
+            );
+        }
+
         return std.fmt.allocPrint(
             allocator,
             \\# TYPE nullboiler_http_requests_total counter
diff --git a/src/store.zig b/src/store.zig
index d9236e0..9944c04 100644
--- a/src/store.zig
+++ b/src/store.zig
@@ -342,6 +342,20 @@ pub const Store = struct {
         return list.toOwnedSlice(allocator);
     }
 
+    pub fn countWorkersByStatus(self: *Self, status: []const u8) !i64 {
+        const sql = "SELECT COUNT(*) FROM workers WHERE status = ?";
+        var stmt: ?*c.sqlite3_stmt = null;
+        if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) {
+            return error.SqlitePrepareFailed;
+        }
+        defer _ = c.sqlite3_finalize(stmt);
+
+        _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC);
+
+        if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0;
+        return colInt(stmt, 0);
+    }
+
     pub fn getWorker(self: *Self, allocator: std.mem.Allocator, id: []const u8) !?types.WorkerRow {
         const sql = "SELECT id, url, token, protocol, model, tags_json, max_concurrent, source, status, consecutive_failures, circuit_open_until_ms, last_error_text, last_health_ms, created_at_ms FROM workers WHERE id = ?";
         var stmt: ?*c.sqlite3_stmt = null;
@@ -661,6 +675,20 @@ pub const Store = struct {
         return list.toOwnedSlice(allocator);
     }
 
+    pub fn countRunsByStatus(self: *Self, status: []const u8) !i64 {
+        const sql = "SELECT COUNT(*) FROM runs WHERE status = ?";
+        var stmt: ?*c.sqlite3_stmt = null;
+        if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) {
+            return error.SqlitePrepareFailed;
+        }
+        defer _ = c.sqlite3_finalize(stmt);
+
+        _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC);
+
+        if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0;
+        return colInt(stmt, 0);
+    }
+
     // ── Step CRUD ─────────────────────────────────────────────────────
 
     pub fn insertStep(self: *Self, id: []const u8, run_id: []const u8, def_step_id: []const u8, step_type: []const u8, status: []const u8, input_json: []const u8, max_attempts: i64, timeout_ms: ?i64, parent_step_id: ?[]const u8, item_index: ?i64) !void {
@@ -859,6 +887,20 @@ pub const Store = struct {
         return colInt(stmt, 0);
     }
 
+    pub fn countAllStepsByStatus(self: *Self, status: []const u8) !i64 {
+        const sql = "SELECT COUNT(*) FROM steps WHERE status = ?";
+        var stmt: ?*c.sqlite3_stmt = null;
+        if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) {
+            return error.SqlitePrepareFailed;
+        }
+        defer _ = c.sqlite3_finalize(stmt);
+
+        _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC);
+
+        if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return 0;
+        return colInt(stmt, 0);
+    }
+
     pub fn getChildSteps(self: *Self, allocator: std.mem.Allocator, parent_step_id: []const u8) ![]types.StepRow {
         const sql = "SELECT id, run_id, def_step_id, type, status, worker_id, input_json, output_json, error_text, attempt, max_attempts, timeout_ms, next_attempt_at_ms, parent_step_id, item_index, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, child_run_id, iteration_index FROM steps WHERE parent_step_id = ? ORDER BY item_index ASC";
         var stmt: ?*c.sqlite3_stmt = null;