diff --git a/dashboards/README.md b/dashboards/README.md new file mode 100644 index 0000000..a374989 --- /dev/null +++ b/dashboards/README.md @@ -0,0 +1,214 @@ +# NullBoiler observability stack (P1-03) + +Ready-to-import Grafana dashboards plus a minimal Prometheus scrape +config for the existing `/metrics` endpoint. + +This contributes the **operator side** of `reference/todo.md` P1-03 +("Structured observability: request IDs, metrics endpoint, OTEL +spans"). The endpoint and counters already ship in `src/metrics.zig`; +this directory makes them visible. + +> **Side benefit:** the panels also visualise the integration gaps +> documented in +> [`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md). +> When a NullClaw worker is wired up via `/webhook` (Gap 3 — HIGH +> PRIORITY in that document), the *Worker dispatch failure ratio* +> panel goes red while the *Health-check failure ratio* stays green, +> isolating the contract mismatch (sync `{status:"ok",response:"..."}` +> expected, async `{status:"received"}` returned). See +> [Diagnosing integration gaps](#diagnosing-integration-gaps) below for +> the exact panel pattern. + +## Contents + +``` +dashboards/ +├── README.md this file +├── grafana/ +│ ├── nullboiler-overview.json high-level operations view +│ └── nullboiler-workers.json per-fleet worker health view +├── prometheus/ +│ └── prometheus.yml minimal scrape config +└── alerts/ + └── nullboiler.rules.yml 8 AlertManager rules paired 1:1 with the dashboards +``` + +## What each dashboard answers + +### `nullboiler-overview.json` + +Open this first when investigating "is something wrong?". + +| Panel | Question it answers | +|---|---| +| HTTP requests/sec | Is anyone talking to us right now? | +| Runs created/sec | Is work flowing into the orchestrator? | +| Worker dispatch failure ratio (5m) | What share of dispatches are blowing up? 
| +| Callback failures/sec | Are run-lifecycle webhooks reaching consumers? | +| Run & step throughput | Mix of created / replayed / claimed / retried over time | +| Worker dispatch (success vs failure) | Stacked-area dispatch outcomes | +| Callbacks (sent vs failed) | Webhook delivery reliability | +| Reliability ratios | Idempotent replay ratio + step retry ratio with thresholds | + +### `nullboiler-workers.json` + +Open this when the Overview shows elevated dispatch failure ratio and +you need to localize the bad worker. + +| Panel | Question it answers | +|---|---| +| Health checks/sec | Are health probes running? | +| Health-check failure ratio (5m) | Are workers responding to probes? | +| Dispatch success/sec, failure/sec | Per-second outcomes | +| Health-check rate (probe vs failure) | Probes timeline | +| Dispatch outcomes (stacked bars) | Discrete dispatch outcomes | +| Failure ratios over time | The signal the circuit breaker reacts to | + +## Metrics exposed by NullBoiler + +From `src/metrics.zig`, all counters (no histograms or labels yet; the Overview's *Runs in-flight*, *Steps in-flight*, *Workers healthy* and *Drain mode* stats query gauges that are not in this list and show "No data" unless your build exports them): + +| Counter | Meaning | +|---|---| +| `nullboiler_http_requests_total` | All HTTP requests handled by the API | +| `nullboiler_runs_created_total` | Runs successfully accepted by `POST /runs` | +| `nullboiler_runs_idempotent_replays_total` | Idempotent replays of an existing run | +| `nullboiler_steps_claimed_total` | Steps dispatched to workers | +| `nullboiler_steps_retry_scheduled_total` | Steps scheduled for retry | +| `nullboiler_worker_dispatch_success_total` | Worker dispatches that succeeded | +| `nullboiler_worker_dispatch_failure_total` | Worker dispatches that failed | +| `nullboiler_worker_health_checks_total` | Health probes performed | +| `nullboiler_worker_health_failures_total` | Health probes that failed | +| `nullboiler_callback_sent_total` | Run-lifecycle webhook callbacks sent | +| `nullboiler_callback_failed_total` | Run-lifecycle webhook callbacks failed | + +## Quick start (Docker) + 
+```bash +docker run -d --name prom \ + -p 9090:9090 \ + -v "$(pwd)/dashboards/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \ + prom/prometheus + +docker run -d --name grafana \ + -p 3030:3000 \ + -e GF_AUTH_ANONYMOUS_ENABLED=true \ + -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin \ + grafana/grafana + +# Add Prometheus datasource pointing at http://host.docker.internal:9090 (on Linux, start grafana with --add-host=host.docker.internal:host-gateway) +# Then: Dashboards -> Import -> upload nullboiler-overview.json and nullboiler-workers.json +``` + +Open Grafana at http://localhost:3030, point both dashboards at the +Prometheus datasource, and they will populate as soon as NullBoiler +starts handling traffic. The anonymous-Admin Grafana flags above are for local evaluation only — do not expose that container. + +## Quick start (existing Prometheus + Grafana) + +1. Add the scrape stanza from `prometheus/prometheus.yml` to your + existing `prometheus.yml`. Reload Prometheus. +2. Import each `dashboards/grafana/*.json` via *Dashboards → Import → Upload JSON*. +3. When prompted, select your Prometheus datasource for the + `${DS_PROMETHEUS}` template variable. +4. (optional) Wire up alerts: + ```yaml + # in prometheus.yml + rule_files: + - /etc/prometheus/alerts/nullboiler.rules.yml + ``` + and copy `dashboards/alerts/nullboiler.rules.yml` into that path. 
+ +## Alert rules + +`alerts/nullboiler.rules.yml` ships 8 rules grouped under +`nullboiler.health` and `nullboiler.flow`: + +| Alert | Severity | Fires when | +|---|---|---| +| `NullBoilerInstanceDown` | critical | `up == 0` for 2m | +| `NullBoilerDispatchFailureRatioHigh` | warning | dispatch failure ratio > 30% for 5m | +| `NullBoilerDispatchFailureRatioCritical` | critical | dispatch failure ratio > 80% for 2m | +| `NullBoilerWorkerHealthDegraded` | warning | health-check failure ratio > 20% for 5m | +| `NullBoilerCallbackDeliveryDegraded` | warning | callback failure ratio > 10% for 10m | +| `NullBoilerStepRetryRateElevated` | info | retry/claim ratio > 20% for 10m | +| `NullBoilerNoTrafficForExtendedPeriod` | info | no HTTP traffic for 30m | +| `NullBoilerIdempotentReplayRatioVeryHigh` | info | replay ratio > 95% for 15m | + +Thresholds are meant to track the colour bands on the corresponding Grafana +panels — if you tune one, mirror the other so the dashboard and the +pager tell the same story. The Critical-severity alerts are intended +for paging; everything else is ticket-bait. + +> Known exceptions: `NullBoilerWorkerHealthDegraded` fires at +> 20% (alert) while the dashboard's health-ratio stat shows yellow at +> 1% and red at 10%. The alert sits above the dashboard's red band on +> purpose — the dashboard is meant to surface single-probe blips +> visually, while the pager should only fire on a sustained pattern. The Overview dispatch-failure stat is also tighter than its alerts (yellow at 5%, red at 20% vs. 30% / 80%). + +Validate locally: + +```bash +docker run --rm --entrypoint=promtool \ + -v "$(pwd)/dashboards/alerts:/rules:ro" prom/prometheus \ + check rules /rules/nullboiler.rules.yml +# SUCCESS: 8 rules found +``` + +## Verification + +The dashboards target Grafana 10.x and 11.x (`schemaVersion: 39`). The +PromQL is plain `rate()` over counters with `clamp_min` to avoid +divide-by-zero on idle clusters. 
+ +To smoke-test the metrics endpoint without Grafana: + +```bash +curl -s http://localhost:8080/metrics | head -30 +``` + +You should see eleven `# TYPE ... counter` blocks and their numeric +values. Zero values are valid — counters start at zero. + +## Diagnosing integration gaps + +A non-obvious value of these dashboards: they make ecosystem-level +integration gaps **visually obvious** without reading logs. + +The cleanest example today is **Gap 3** in +[`nullclaw/docs/integration-analysis.md`](https://github.com/nullclaw/nullclaw/blob/main/docs/integration-analysis.md) +("Worker Endpoint for nullboiler Dispatch — HIGH PRIORITY"). When a +plain `nullclaw gateway` is registered as a NullBoiler worker: + +- `/health` succeeds → `nullboiler_worker_health_failures_total` stays low +- `/webhook` returns `{"status":"received"}` instead of the documented + `{"status":"ok","response":"..."}` → + `nullboiler_worker_dispatch_failure_total` increments on every step + +In the **Workers** dashboard's *Failure ratios over time* panel this +shows up as **dispatch failure ratio at ~100% (red)** sitting on top +of **health-check failure ratio near 0% (green)** — a one-glance +diagnosis that the worker is reachable but its response contract is +broken. + +This is exactly the visual signal NullBoiler maintainers would want +when triaging field reports about worker dispatches; it surfaces the +gap that `integration-analysis.md` documents but that is not yet +mitigated at runtime. + +## Future work + +- Histograms for HTTP latency and worker dispatch duration (would + enable percentile panels). +- Per-worker labels on the dispatch counters (would enable + per-worker breakdown panels — currently the workers dashboard shows + fleet-wide aggregates). On a fleet of N workers with the metrics + unlabeled, a single bad worker pulling 1/N of dispatches produces a + ~1/N failure ratio — below the 30% warning threshold for N ≥ 4. + Per-worker labels resolve this. 
+- Recording rules + Grafana-native alerting (the Prometheus alerting rules + in `alerts/` are the floor — a recording-rule layer would precompute + the ratios and avoid PromQL duplication between dashboards and + alerts). + +These are not required for P1-03 but are natural follow-ups. diff --git a/dashboards/alerts/nullboiler.rules.yml b/dashboards/alerts/nullboiler.rules.yml new file mode 100644 index 0000000..bfc72f9 --- /dev/null +++ b/dashboards/alerts/nullboiler.rules.yml @@ -0,0 +1,204 @@ +# Prometheus alerting rules for NullBoiler. +# +# Drop into your Prometheus config alongside the scrape stanza from +# `dashboards/prometheus/prometheus.yml`: +# +# rule_files: +# - /etc/prometheus/alerts/nullboiler.rules.yml +# +# Then point Prometheus at your Alertmanager (`alerting:` block) and route +# alerts labelled `service: nullboiler` to your on-call channel. +# +# These rules pair 1:1 with the panels in `dashboards/grafana/`. If you +# tune a threshold here, mirror it in the corresponding dashboard panel +# threshold band so the dashboard and the alert tell the same story. + +groups: + - name: nullboiler.health + interval: 30s + rules: + + # ── Liveness ────────────────────────────────────────────────── + + - alert: NullBoilerInstanceDown + expr: up{job="nullboiler"} == 0 + for: 2m + labels: + severity: critical + service: nullboiler + annotations: + summary: "NullBoiler instance is down" + description: | + Prometheus has not scraped {{ $labels.instance }} for 2 + minutes. Either the orchestrator process died or the + `/metrics` endpoint stopped responding. 
+ runbook: "https://github.com/nullclaw/nullboiler/blob/main/dashboards/README.md" + + # ── Worker dispatch (Gap 3 territory) ───────────────────────── + + - alert: NullBoilerDispatchFailureRatioHigh + expr: | + ( + rate(nullboiler_worker_dispatch_failure_total[5m]) + / + clamp_min( + rate(nullboiler_worker_dispatch_success_total[5m]) + + rate(nullboiler_worker_dispatch_failure_total[5m]), + 0.001 + ) + ) > 0.30 + for: 5m + labels: + severity: warning + service: nullboiler + annotations: + summary: "More than 30% of worker dispatches are failing (5m)" + description: | + Sustained dispatch failure ratio above 30% indicates a + broken contract between NullBoiler and at least one + worker. Cross-check the *Workers* dashboard's + "Failure ratios over time" panel: if `health-check + failure ratio` is also high, the worker is unreachable; + if only dispatch is failing, the worker is reachable but + its `/webhook` is returning the wrong shape (see + integration-analysis.md Gap 3). + dashboard: "http://grafana/d/nullboiler-workers" + + - alert: NullBoilerDispatchFailureRatioCritical + expr: | + ( + rate(nullboiler_worker_dispatch_failure_total[5m]) + / + clamp_min( + rate(nullboiler_worker_dispatch_success_total[5m]) + + rate(nullboiler_worker_dispatch_failure_total[5m]), + 0.001 + ) + ) > 0.80 + for: 2m + labels: + severity: critical + service: nullboiler + annotations: + summary: "Worker dispatches almost entirely failing (>80% for 2m)" + description: | + Effectively no work is reaching workers. Page on-call. + + # ── Worker health probes ────────────────────────────────────── + + - alert: NullBoilerWorkerHealthDegraded + expr: | + ( + rate(nullboiler_worker_health_failures_total[5m]) + / + clamp_min( + rate(nullboiler_worker_health_checks_total[5m]), + 0.001 + ) + ) > 0.20 + for: 5m + labels: + severity: warning + service: nullboiler + annotations: + summary: "Worker fleet health probes failing >20% (5m)" + description: | + At least one worker is failing health probes. 
Look at + `GET /workers` for `consecutive_failures > 0` and + `last_error_text` to localise. + + # ── Run lifecycle webhooks ──────────────────────────────────── + + - alert: NullBoilerCallbackDeliveryDegraded + expr: | + ( + rate(nullboiler_callback_failed_total[5m]) + / + clamp_min( + rate(nullboiler_callback_sent_total[5m]) + + rate(nullboiler_callback_failed_total[5m]), + 0.001 + ) + ) > 0.10 + for: 10m + labels: + severity: warning + service: nullboiler + annotations: + summary: "Run-lifecycle callbacks failing >10% (10m)" + description: | + Consumers depending on `step.completed` / `run.completed` + webhooks may be missing notifications. Check + `last_error_text` on the failing callback configs. + + # ── Step retries (signal of upstream flake) ─────────────────── + + - alert: NullBoilerStepRetryRateElevated + expr: | + ( + rate(nullboiler_steps_retry_scheduled_total[5m]) + / + clamp_min( + rate(nullboiler_steps_claimed_total[5m]), + 0.001 + ) + ) > 0.20 + for: 10m + labels: + severity: info + service: nullboiler + annotations: + summary: "More than 20% of steps are being retried (10m)" + description: | + High retry rate usually means transient errors at the + worker layer. Once retry policy gains exponential backoff + + jitter (reference/todo.md P1-01), this alert can be + tightened. + + - name: nullboiler.flow + interval: 30s + rules: + + # ── Throughput sanity ───────────────────────────────────────── + + - alert: NullBoilerNoTrafficForExtendedPeriod + expr: | + (rate(nullboiler_http_requests_total[15m]) == 0) + or absent(nullboiler_http_requests_total) + for: 30m + labels: + severity: info + service: nullboiler + annotations: + summary: "NullBoiler has received no HTTP traffic for 30m" + description: | + The orchestrator is up but idle. Often benign (between + workloads); fires on a clearly-broken upstream pipeline. 
+ Liveness itself is covered by NullBoilerInstanceDown — the + `absent()` clause here catches the edge case of a target + that never produced a sample (e.g. immediately after a + restart that bypassed the first scrape). + + # ── Idempotency replays ─────────────────────────────────────── + + - alert: NullBoilerIdempotentReplayRatioVeryHigh + expr: | + ( + rate(nullboiler_runs_idempotent_replays_total[15m]) + / + clamp_min( + rate(nullboiler_runs_created_total[15m]) + + rate(nullboiler_runs_idempotent_replays_total[15m]), + 0.001 + ) + ) > 0.95 + for: 15m + labels: + severity: info + service: nullboiler + annotations: + summary: "95%+ of POST /runs are idempotent replays (15m)" + description: | + A producer is hammering POST /runs with the same + idempotency key. This is harmless (replays are cheap) but + usually points to a misconfigured retry loop upstream. diff --git a/dashboards/grafana/nullboiler-overview.json b/dashboards/grafana/nullboiler-overview.json new file mode 100644 index 0000000..8b6b769 --- /dev/null +++ b/dashboards/grafana/nullboiler-overview.json @@ -0,0 +1,983 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "High-level operational view of NullBoiler — request volume, run throughput, dispatch reliability, and idempotency replay ratio. 
Pair with the per-worker dashboard for deeper drill-downs.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_http_requests_total[5m])", + "legendFormat": "req/s", + "range": true, + "refId": "A" + } + ], + "title": "HTTP requests/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_created_total[5m])", + "legendFormat": "runs/s", + "range": true, + "refId": "A" + } + ], + "title": "Runs 
created/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min(rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m]), 0.001)", + "legendFormat": "failure ratio", + "range": true, + "refId": "A" + } + ], + "title": "Worker dispatch failure ratio (5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": 
"rate(nullboiler_callback_failed_total[5m])", + "legendFormat": "failed/s", + "range": true, + "refId": "A" + } + ], + "title": "Callback failures/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_created_total[5m])", + "legendFormat": "runs created", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_idempotent_replays_total[5m])", + "legendFormat": "idempotent replays", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_claimed_total[5m])", + 
"legendFormat": "steps claimed", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_retry_scheduled_total[5m])", + "legendFormat": "steps retried", + "range": true, + "refId": "D" + } + ], + "title": "Run & step throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", + "legendFormat": "failure", + "range": true, + "refId": "B" 
+ } + ], + "title": "Worker dispatch (success vs failure, stacked)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_callback_sent_total[5m])", + "legendFormat": "sent", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_callback_failed_total[5m])", + "legendFormat": "failed", + "range": true, + "refId": "B" + } + ], + "title": "Callbacks (run lifecycle webhook delivery)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "thresholdsStyle": { + "mode": "line+area" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.3 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 12, + "y": 12 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_runs_idempotent_replays_total[5m]) / clamp_min(rate(nullboiler_runs_created_total[5m]) + rate(nullboiler_runs_idempotent_replays_total[5m]), 0.001)", + "legendFormat": "idempotent replay ratio", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_steps_retry_scheduled_total[5m]) / clamp_min(rate(nullboiler_steps_claimed_total[5m]), 0.001)", + "legendFormat": "step retry ratio", + "range": true, + "refId": "B" + } + ], + "title": "Reliability ratios (replay & retry)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_runs_in_flight", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Runs in-flight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 20 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_steps_in_flight", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Steps in-flight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 20 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_workers_healthy", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Workers healthy", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 20 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nullboiler_drain_mode", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Drain mode", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "nullboiler", + "orchestration", + "p1-03" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NullBoiler — Overview", + "uid": "nullboiler-overview", + "version": 1, + "weekStart": "" +} diff --git a/dashboards/grafana/nullboiler-workers.json b/dashboards/grafana/nullboiler-workers.json new file mode 100644 index 0000000..74b8afd --- /dev/null +++ b/dashboards/grafana/nullboiler-workers.json @@ -0,0 +1,593 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Worker-side health 
for NullBoiler — health-check pass/fail rates and dispatch reliability. Use this when the Overview dashboard shows elevated dispatch failure ratio and you need to localize the bad worker.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_checks_total[5m])", + "legendFormat": "checks/s", + "range": true, + "refId": "A" + } + ], + "title": "Health checks/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)", + "legendFormat": "failure ratio", + "range": true, + "refId": "A" + } + ], + "title": "Health-check failure ratio (5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success/s", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch success/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", 
+ "legendFormat": "failure/s", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch failure/sec", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_checks_total[5m])", + "legendFormat": "checks", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m])", + "legendFormat": "failures", + "range": true, + "refId": "B" + } + ], + "title": "Health-check rate (probe vs failure)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 0, + "showPoints": "never", + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_success_total[5m])", + "legendFormat": "success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m])", + "legendFormat": "failure", + "range": true, + "refId": "B" + } + ], + "title": "Dispatch outcomes (stacked bars)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "thresholdsStyle": { + "mode": "line+area" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.2 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull", "mean", 
"max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_dispatch_failure_total[5m]) / clamp_min((rate(nullboiler_worker_dispatch_success_total[5m]) + rate(nullboiler_worker_dispatch_failure_total[5m])), 0.001)", + "legendFormat": "dispatch failure ratio", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(nullboiler_worker_health_failures_total[5m]) / clamp_min(rate(nullboiler_worker_health_checks_total[5m]), 0.001)", + "legendFormat": "health-check failure ratio", + "range": true, + "refId": "B" + } + ], + "title": "Failure ratios over time (drives circuit breaker)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["nullboiler", "workers", "p1-03"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NullBoiler — Workers", + "uid": "nullboiler-workers", + "version": 1, + "weekStart": "" +} diff --git a/dashboards/prometheus/prometheus.yml b/dashboards/prometheus/prometheus.yml new file mode 100644 index 0000000..f64a3c1 --- /dev/null +++ b/dashboards/prometheus/prometheus.yml @@ -0,0 +1,30 @@ +# Minimal Prometheus scrape config for a local NullBoiler instance. +# Drop into your existing Prometheus alongside other scrape jobs. 
+# +# Pair with the Grafana dashboards in ../grafana/. + +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + external_labels: + monitor: nullboiler-local + +scrape_configs: + - job_name: nullboiler + metrics_path: /metrics + scheme: http + static_configs: + - targets: ["nullboiler:8080"] # docker-compose service name + labels: + service: nullboiler + env: local + + # Uncomment when running outside docker-compose: + # - job_name: nullboiler-host + # metrics_path: /metrics + # static_configs: + # - targets: ["127.0.0.1:8080"] + # labels: + # service: nullboiler + # env: local