From c43c99c548a6f4c10f76fab6db9b76da9ca5f40c Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 23 Mar 2026 23:38:35 -0400 Subject: [PATCH] fix: remove dead metrics RBAC, document NetworkPolicy prereq, add scrape-down alert Follow-up from expert review non-blocking observations: - Remove metrics_auth_role, metrics_auth_role_binding, and metrics_reader_role -- these protected the old HTTPS metrics endpoint which was replaced by plain HTTP in the observability PR - Document the `metrics=enabled` namespace label prerequisite on the NetworkPolicy so operators know why Prometheus scrapes are blocked - Add ControllerMetricsDown alert that fires when Prometheus cannot scrape the controller for 5 minutes, ensuring other alerts aren't silently blind Made-with: Cursor --- config/monitoring/prometheus-rule.yaml | 10 ++++++++++ config/monitoring/service-monitor.yaml | 1 + .../network-policy/allow-metrics-traffic.yaml | 4 ++++ config/rbac/kustomization.yaml | 3 --- config/rbac/metrics_auth_role.yaml | 17 ----------------- config/rbac/metrics_auth_role_binding.yaml | 12 ------------ config/rbac/metrics_reader_role.yaml | 9 --------- 7 files changed, 15 insertions(+), 41 deletions(-) delete mode 100644 config/rbac/metrics_auth_role.yaml delete mode 100644 config/rbac/metrics_auth_role_binding.yaml delete mode 100644 config/rbac/metrics_reader_role.yaml diff --git a/config/monitoring/prometheus-rule.yaml b/config/monitoring/prometheus-rule.yaml index 4638287..ec8bc5a 100644 --- a/config/monitoring/prometheus-rule.yaml +++ b/config/monitoring/prometheus-rule.yaml @@ -80,3 +80,13 @@ spec: annotations: summary: "SeiNodeGroup reconcile substep {{ $labels.substep }} is slow" description: "p99 latency above 10s for 10 minutes." 
+ + - alert: ControllerMetricsDown + expr: up{job=~".*sei-k8s-controller.*"} == 0 + for: 5m + labels: + severity: critical + team: platform + annotations: + summary: "sei-k8s-controller metrics endpoint is down" + description: "Prometheus has been unable to scrape the controller for 5 minutes. All other controller alerts are blind." diff --git a/config/monitoring/service-monitor.yaml b/config/monitoring/service-monitor.yaml index 30a4373..e582a4b 100644 --- a/config/monitoring/service-monitor.yaml +++ b/config/monitoring/service-monitor.yaml @@ -8,6 +8,7 @@ metadata: app.kubernetes.io/name: sei-k8s-controller app.kubernetes.io/managed-by: kustomize spec: + jobLabel: app.kubernetes.io/name selector: matchLabels: control-plane: controller-manager diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml index 2ec5842..10325a4 100644 --- a/config/network-policy/allow-metrics-traffic.yaml +++ b/config/network-policy/allow-metrics-traffic.yaml @@ -1,3 +1,7 @@ +# Allows Prometheus to scrape the controller metrics endpoint. +# Prerequisite: the Prometheus namespace must carry the label +# kubectl label namespace $PROMETHEUS_NAMESPACE metrics=enabled +# Without this label, scrapes will be blocked by the namespaceSelector. 
apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 4df47d3..005c988 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -6,6 +6,3 @@ resources: - role_binding.yaml - leader_election_role.yaml - leader_election_role_binding.yaml - - metrics_auth_role.yaml - - metrics_auth_role_binding.yaml - - metrics_reader_role.yaml diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml deleted file mode 100644 index 32d2e4e..0000000 --- a/config/rbac/metrics_auth_role.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-auth-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml deleted file mode 100644 index e775d67..0000000 --- a/config/rbac/metrics_auth_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: metrics-auth-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: metrics-auth-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml deleted file mode 100644 index 51a75db..0000000 --- a/config/rbac/metrics_reader_role.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get