From 7b849d7c14e6060c34a58ee85c1c4e41fdb58d6a Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 9 Mar 2026 15:31:10 -0700 Subject: [PATCH 1/5] updates to pod and app alerts --- .../src/ptd/grafana_alerts/applications.yaml | 2 +- .../src/ptd/grafana_alerts/pods.yaml | 311 +----------------- 2 files changed, 2 insertions(+), 311 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml index 7f36bb7..db1a4bf 100644 --- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml @@ -63,7 +63,7 @@ groups: execErrState: Error for: 5m annotations: - summary: "🔴 CRITICAL: Loki WAL Disk Full" + summary: "🟡 WARNING: Loki WAL Disk Full" description: | Loki ingester experiencing WAL disk full failures diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 5ffda98..1da8db8 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -4,8 +4,6 @@ # deleteRules: # - orgId: 1 # uid: crash_loop_backoff -# - orgId: 1 -# uid: pod_error # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ # @@ -116,7 +114,7 @@ groups: execErrState: Error for: 5m annotations: - summary: "🔴 CRITICAL: Container Crash-Looping" + summary: "🟡 WARNING: Container Crash-Looping" description: | Container keeps crashing and restarting @@ -131,313 +129,6 @@ groups: Status: CrashLoopBackOff Duration: 5 minutes - labels: - opsgenie: "1" - isPaused: false - - uid: pod_error - title: Pod Error - condition: C - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: count - refId: B - settings: - mode: dropNN - type: reduce - - refId: C - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 5m - annotations: - summary: "🟡 WARNING: Pod Error" - description: | - Pod container terminated with an error - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Reason: {{ $labels.reason }} - Duration: 5 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: PodNotHealthy - title: Pod Not Healthy - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 60000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: A - hide: false - reducer: last - refId: B - type: reduce - - refId: C - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: B - hide: false - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 15m - annotations: - summary: "🟡 WARNING: Pod Not Healthy" - description: | - Pod has been in a non-running state - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Phase: {{ $labels.phase }} - Duration: 15 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: PodRestarts - title: Pod Restarts - condition: C - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: avg_over_time(increase(kube_pod_container_status_restarts_total{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"}[15m])[15m:1m]) > 5 - fullMetaSearch: false - includeNullMetadata: true - instant: false - intervalMs: 60000 - legendFormat: __auto - maxDataPoints: 43200 - range: true - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: A - hide: false - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - - refId: C - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: B - hide: false - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 15m - annotations: - summary: "🟡 WARNING: Pod Restarts" - description: | - Pod has restarted excessively - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Issue: > 5 restarts in 15 minutes - labels: opsgenie: "1" isPaused: false From 489251e019265c508ca035e4a670d84e610093d8 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Tue, 10 Mar 2026 15:14:06 -0700 Subject: [PATCH 2/5] updates to cloudwatch and rds alerts --- .../src/ptd/grafana_alerts/cloudwatch.yaml | 114 ++--------- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 181 +++++++++++++++--- 2 files changed, 171 insertions(+), 124 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index 076c603..d29c724 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -3,18 +3,23 @@ # apiVersion: 1 # deleteRules: # - orgId: 1 -# uid: fsx_capacity +# uid: fsx_capacity_warning +# - orgId: 1 +# uid: fsx_capacity_critical # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 +deleteRules: + - orgId: 1 + uid: fsx_capacity groups: - orgId: 1 name: Cloudwatch folder: Posit Alerts interval: 5m rules: - - uid: fsx_capacity - title: FSx Capacity + - uid: fsx_capacity_warning + title: FSx Capacity Warning condition: C data: - refId: A @@ -82,84 +87,9 @@ groups: labels: opsgenie: "1" isPaused: false - - uid: ec2_network_out_high - title: EC2 Network Out High - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - editorMode: code - # Network out threshold: 314572800 bytes/s (~300 MiB/s) - # Based on analysis of Loki->S3 traffic patterns from issue #2347 - # Instance-aware thresholds: Using the same threshold for all instances - # To set different thresholds by instance type, use: - # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="t3.xlarge"}[5m]) > 157286400 or - # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="m5.2xlarge"}[5m]) > 314572800 - expr: avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch"}[5m]) - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 3.145728e+08 # ~ 300 MiB/s - type: gt - operator: - type: and - query: - params: - - A - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: NoData - execErrState: Error - for: 5m - annotations: - summary: "🟡 WARNING: EC2 Network Out High" - description: | - EC2 instance has unusually high outbound network traffic - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Resource: {{ $labels.dimension_InstanceId }} - Region: {{ $labels.region }} - - ─── DETAILS ───────────────────────── - Metric: Network Out - Current: > 300 MiB/s - Threshold: 300 MiB/s - Duration: 5 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: ec2_network_packets_out_high - title: EC2 Network Packets Out High - condition: B + - uid: fsx_capacity_critical + title: FSx Capacity Critical + condition: C data: - refId: A relativeTimeRange: @@ -168,16 +98,14 @@ groups: datasourceUid: mimir model: editorMode: code - # Network packets out threshold: 400000 packets/s - # High packet rate can indicate network bottlenecks or unusual traffic patterns - expr: avg_over_time(aws_ec2_network_packets_out_average{job="integrations/cloudwatch"}[5m]) + expr: aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""} / aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""} instant: true intervalMs: 1000 legendFormat: __auto maxDataPoints: 43200 range: false refId: A - - refId: B + - refId: C relativeTimeRange: from: 600 to: 0 @@ -186,7 +114,7 @@ groups: conditions: - evaluator: params: - - 400000 + - 0.9 type: gt operator: type: and @@ -203,26 +131,26 @@ groups: expression: A intervalMs: 1000 maxDataPoints: 43200 - refId: B + refId: C type: threshold noDataState: NoData execErrState: Error for: 5m annotations: - summary: "🟡 WARNING: EC2 Network Packets Out High" + summary: "🔴 CRITICAL: FSx Storage Capacity Critical" description: | - EC2 instance has unusually high packet transmission rate + FSx file system storage capacity is critically low ─── WHERE ─────────────────────────── Tenant: {{ $labels.tenant_name }} Cluster: {{ $labels.cluster }} - Resource: {{ $labels.dimension_InstanceId }} + Resource: {{ $labels.dimension_FileSystemId }} Region: {{ $labels.region }} ─── DETAILS ───────────────────────── - Metric: Network Packets Out - Current: > 400,000 packets/s - Threshold: 400,000 packets/s + Metric: Storage Capacity + Current: > 90% used + Threshold: 90% Duration: 5 minutes labels: diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index c09e58a..63c4c49 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -6,7 +6,9 @@ # - orgId: 1 # uid: rds_cpu_utilization_high # - orgId: 1 -# uid: rds_free_storage_low +# uid: rds_free_storage_low_warning +# - orgId: 1 +# uid: rds_free_storage_low_critical # - orgId: 1 # uid: rds_freeable_memory_low # - orgId: 1 @@ -22,6 +24,8 @@ apiVersion: 1 deleteRules: - orgId: 1 uid: rds_read_latency_high + - orgId: 1 + uid: rds_free_storage_low groups: - orgId: 1 name: RDS @@ -74,18 +78,101 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + noDataState: NoData execErrState: Error for: 10m annotations: - description: RDS instance CPU utilization is above 80% for more than 10 minutes. - summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS CPU Utilization High" + description: | + RDS instance CPU utilization is elevated + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: CPU Utilization + Current: > 80% + Threshold: 80% + Duration: 10 minutes + + labels: + opsgenie: "1" + isPaused: false + - uid: rds_free_storage_low_warning + title: RDS Free Storage Low (Warning) + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_free_storage_space_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + # 10 GiB in bytes + - 10737418240 + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: "🟡 WARNING: RDS Free Storage Low" + description: | + RDS instance storage capacity is running low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Free Storage Space + Current: < 10 GiB free + Threshold: 10 GiB + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false - - uid: rds_free_storage_low - title: RDS Free Storage Low + - uid: rds_free_storage_low_critical + title: RDS Free Storage Low (Critical) condition: B data: - refId: A @@ -111,7 +198,8 @@ groups: conditions: - evaluator: params: - - 5368709120 # 5 GiB in bytes; calibrated for mid-size instances (100–500 GiB). Adjust for larger (e.g. 1 TiB) or smaller instances. + # 5 GiB in bytes + - 5368709120 type: lt operator: type: and @@ -130,19 +218,28 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk + noDataState: Alerting execErrState: Error for: 5m annotations: - description: RDS instance has less than 5 GiB of free storage space remaining. Note: on new cluster deployments where CloudWatch scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning. - summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🔴 CRITICAL: RDS Free Storage Critical" + description: | + RDS instance storage capacity is critically low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Free Storage Space + Current: < 5 GiB free + Threshold: 5 GiB + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning - # Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not - # expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute), - # so computing a usage percentage is not feasible without a separate exporter or recording - # rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation. isPaused: false - uid: rds_freeable_memory_low title: RDS Freeable Memory Low @@ -171,7 +268,8 @@ groups: conditions: - evaluator: params: - - 536870912 # 512 MiB in bytes; calibrated for db.r5.large (~16 GiB RAM). Adjust for other instance classes. + # 200 MiB in bytes + - 209715200 type: lt operator: type: and @@ -190,21 +288,29 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition + noDataState: Alerting execErrState: Error for: 10m annotations: - description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. Note: this threshold is calibrated for db.r5.large (~16 GiB RAM); it will fire continuously for small instances (e.g. db.t3.micro, db.t3.small). Adjust per instance class. - summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS Freeable Memory Low" + description: | + RDS instance freeable memory is low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Freeable Memory + Current: < 200 MiB free + Threshold: 200 MiB + Duration: 10 minutes + labels: opsgenie: "1" - severity: warning - instance_size_dependent: "true" # Silence this label for known-small instance classes - # Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not - # expose total instance RAM as a metric for RDS — it varies by instance type. PTD's default - # instance (db.t3.small, 2 GiB) would fire constantly at a 90%-used threshold under normal - # Postgres buffer cache load, making percentage-based alerting impractical here. - isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label + isPaused: false - uid: rds_database_connections_high title: RDS Database Connections High condition: B @@ -251,13 +357,26 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + noDataState: NoData execErrState: Error for: 5m annotations: - description: RDS instance has more than 80 active database connections for more than 5 minutes. - summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS Database Connections High" + description: | + RDS instance has high number of database connections + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Database Connections + Current: > 80 connections + Threshold: 80 + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false From 4fed4d321bcbd17bd33c58cca7a22e8df47f95ef Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 11 Mar 2026 11:31:05 -0700 Subject: [PATCH 3/5] add delete rules for old alerts --- python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml | 4 ++++ python-pulumi/src/ptd/grafana_alerts/pods.yaml | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index d29c724..f183dbf 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -12,6 +12,10 @@ apiVersion: 1 deleteRules: - orgId: 1 uid: fsx_capacity + - orgId: 1 + uid: ec2_network_out_high + - orgId: 1 + uid: ec2_network_packets_out_high groups: - orgId: 1 name: Cloudwatch diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 1da8db8..98e5c70 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -27,6 +27,13 @@ # To update the namespace filter, use find/replace on the regex pattern above. # ============================================================================= apiVersion: 1 +deleteRules: + - orgId: 1 + uid: pod_error + - orgId: 1 + uid: PodNotHealthy + - orgId: 1 + uid: PodRestarts groups: - orgId: 1 name: Pods From b475aa958cef5312cab1c29c2649cc05862e4690 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 11 Mar 2026 12:12:20 -0700 Subject: [PATCH 4/5] update lb, node, and rds alerts --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 76 ++++++++++++++++--- .../src/ptd/grafana_alerts/nodes.yaml | 14 +++- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 8 +- 3 files changed, 81 insertions(+), 17 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 8d6c216..30f2a91 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -75,11 +75,24 @@ groups: execErrState: Error for: 5m annotations: - description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures. - summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: ALB Target 5XX Errors High" + description: | + Application Load Balancer has elevated 5XX errors + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_LoadBalancer }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Target 5XX Count + Current: > 10 errors + Threshold: 10 + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false - uid: alb_unhealthy_targets title: ALB Unhealthy Targets @@ -131,11 +144,24 @@ groups: execErrState: Error for: 5m annotations: - description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. - summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: ALB Unhealthy Targets" + description: | + Application Load Balancer has unhealthy targets + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_LoadBalancer }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Unhealthy Host Count + Current: > 0 + Threshold: 0 + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false - uid: nlb_unhealthy_targets title: NLB Unhealthy Targets @@ -187,11 +213,24 @@ groups: execErrState: Error for: 5m annotations: - description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. - summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: NLB Unhealthy Targets" + description: | + Network Load Balancer has unhealthy targets + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_LoadBalancer }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Unhealthy Host Count + Current: > 0 + Threshold: 0 + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false - uid: alb_response_latency_high title: ALB Response Latency High @@ -243,9 +282,22 @@ groups: execErrState: Error for: 10m annotations: - description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation. - summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: ALB Response Latency High" + description: | + Application Load Balancer response time is elevated + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_LoadBalancer }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Target Response Time + Current: > 2 seconds + Threshold: 2 seconds + Duration: 10 minutes + labels: opsgenie: "1" - severity: warning isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml index bb6f91d..9953b88 100644 --- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml @@ -1,3 +1,15 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. +# Replace file contents with the following and apply in order to delete the alerts: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: node_not_ready +# - orgId: 1 +# uid: node_memory_pressure +# - orgId: 1 +# uid: node_disk_pressure +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 groups: - orgId: 1 @@ -55,7 +67,7 @@ groups: execErrState: Error for: 15m annotations: - summary: "🔴 CRITICAL: Node Not Ready" + summary: "🟡 WARNING: Node Not Ready" description: | Kubernetes node is not accepting workloads diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 63c4c49..7f501cb 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -268,8 +268,8 @@ groups: conditions: - evaluator: params: - # 200 MiB in bytes - - 209715200 + # 256 MiB in bytes + - 268435456 type: lt operator: type: and @@ -304,8 +304,8 @@ groups: ─── DETAILS ───────────────────────── Metric: Freeable Memory - Current: < 200 MiB free - Threshold: 200 MiB + Current: < 256 MiB free + Threshold: 256 MiB Duration: 10 minutes labels: From 8ce17cd091035cf4cefbe13dd7a4668bdf2b98da Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 11 Mar 2026 12:16:30 -0700 Subject: [PATCH 5/5] documentation updates --- docs/guides/monitoring.md | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index 91233de..2ce34d7 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -578,9 +578,27 @@ Component: [affected component] | Alert | Threshold | Duration | Description | |-------|-----------|----------|-------------| -| **FSx Capacity** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining | -| **EC2 Network Out High** | > 300 MiB/s | 5m | EC2 instance has sustained high network outbound traffic | -| **EC2 Network Packets Out High** | > 400,000 packets/s | 5m | EC2 instance has unusually high packet transmission rate | +| **FSx Capacity Warning** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining | +| **FSx Capacity Critical** | > 90% used | 5m | FSx storage instance has less than 10% capacity remaining | + +### RDS Alerts (AWS) + +| Alert | Threshold | Duration | Description | +|-------|-----------|----------|-------------| +| **RDS CPU Utilization High** | > 80% | 10m | RDS instance CPU utilization is elevated | +| **RDS Free Storage Low (Warning)** | < 10 GiB | 5m | RDS instance storage capacity is running low | +| **RDS Free Storage Low (Critical)** | < 5 GiB | 5m | RDS instance storage capacity is critically low | +| **RDS Freeable Memory Low** | < 256 MiB | 10m | RDS instance freeable memory is low | +| **RDS Database Connections High** | > 80 connections | 5m | RDS instance has high number of database connections | + +### Load Balancer Alerts (AWS) + +| Alert | Threshold | Duration | Description | +|-------|-----------|----------|-------------| +| **ALB Target 5XX Errors High** | > 10 errors | 5m | Application Load Balancer has elevated 5XX errors from targets | +| **ALB Unhealthy Targets** | > 0 unhealthy | 5m | Application Load Balancer has unhealthy targets | +| **NLB Unhealthy Targets** | > 0 unhealthy | 5m | Network Load Balancer has unhealthy targets | +| **ALB Response Latency High** | > 2 seconds | 10m | Application Load Balancer target response time is elevated | ### Health Check Alerts @@ -607,9 +625,6 @@ Component: [affected component] | Alert | Threshold | Duration | Description | |-------|-----------|----------|-------------| | **CrashLoopBackOff** | Any container in CrashLoopBackOff | 5m | Container is repeatedly crashing and restarting | -| **Pod Error** | Container terminated (reason != Completed) | 5m | Pod container terminated with an error (excludes user session pods) | -| **Pod Not Healthy** | Phase = Pending/Unknown/Failed | 15m | Pod has been in non-running state (excludes user session pods) | -| **Pod Restarts** | > 5 restarts in 15m | 15m | Pod has restarted excessively | | **Deployment Replicas Mismatch** | Desired != Available | 15m | Deployment does not have the expected number of available replicas | | **StatefulSet Replicas Mismatch** | Ready != Desired | 15m | StatefulSet does not have the expected number of ready replicas | @@ -631,7 +646,7 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven ``` **Example Failure Cascade**: -- Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `PodNotHealthy` alert fires in `posit-team` namespace +- Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `CrashLoopBackOff` or `DeploymentReplicaMismatch` alert fires in `posit-team` namespace - Traefik ingress pod crashes → Ingress routing breaks → HTTP health checks fail → `Healthchecks` alert fires - Alloy pod crashes → Metrics/logs stop flowing → No alerts fire (blind) → **Must alert on Alloy pod failures directly** @@ -640,11 +655,13 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven To add or modify alerts, edit the YAML files in `python-pulumi/src/ptd/grafana_alerts/`. Each file contains alerts grouped by category: - `applications.yaml` - Application-specific alerts (Loki, etc.) -- `cloudwatch.yaml` - AWS CloudWatch metric alerts +- `cloudwatch.yaml` - AWS CloudWatch metric alerts (FSx) - `healthchecks.yaml` - HTTP health check alerts +- `loadbalancer.yaml` - AWS load balancer alerts (ALB, NLB) - `mimir.yaml` - Metrics pipeline alerts - `nodes.yaml` - Kubernetes node alerts - `pods.yaml` - Kubernetes pod and workload alerts +- `rds.yaml` - AWS RDS database alerts To delete an alert, follow the instructions in the file header comments regarding the `deleteRules` syntax.