Merged
33 changes: 25 additions & 8 deletions docs/guides/monitoring.md
@@ -578,9 +578,27 @@ Component: [affected component]

| Alert | Threshold | Duration | Description |
|-------|-----------|----------|-------------|
- | **FSx Capacity** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining |
- | **EC2 Network Out High** | > 300 MiB/s | 5m | EC2 instance has sustained high network outbound traffic |
- | **EC2 Network Packets Out High** | > 400,000 packets/s | 5m | EC2 instance has unusually high packet transmission rate |
+ | **FSx Capacity Warning** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining |
+ | **FSx Capacity Critical** | > 90% used | 5m | FSx storage instance has less than 10% capacity remaining |
+
+ ### RDS Alerts (AWS)
+
+ | Alert | Threshold | Duration | Description |
+ |-------|-----------|----------|-------------|
+ | **RDS CPU Utilization High** | > 80% | 10m | RDS instance CPU utilization is elevated |
+ | **RDS Free Storage Low (Warning)** | < 10 GiB | 5m | RDS instance storage capacity is running low |
+ | **RDS Free Storage Low (Critical)** | < 5 GiB | 5m | RDS instance storage capacity is critically low |
+ | **RDS Freeable Memory Low** | < 256 MiB | 10m | RDS instance freeable memory is low |
+ | **RDS Database Connections High** | > 80 connections | 5m | RDS instance has a high number of database connections |
+
+ ### Load Balancer Alerts (AWS)
+
+ | Alert | Threshold | Duration | Description |
+ |-------|-----------|----------|-------------|
+ | **ALB Target 5XX Errors High** | > 10 errors | 5m | Application Load Balancer has elevated 5XX errors from targets |
+ | **ALB Unhealthy Targets** | > 0 unhealthy | 5m | Application Load Balancer has unhealthy targets |
+ | **NLB Unhealthy Targets** | > 0 unhealthy | 5m | Network Load Balancer has unhealthy targets |
+ | **ALB Response Latency High** | > 2 seconds | 10m | Application Load Balancer target response time is elevated |
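
The FSx capacity thresholds above map to a ratio query against the CloudWatch exporter metrics. A sketch of the warning rule's shape, based on the `cloudwatch.yaml` expression in this change (the critical rule is identical apart from its 0.9 threshold); this is not the full provisioned rule body:

```yaml
# Sketch of the FSx capacity warning rule's query. The expression divides
# used capacity by total capacity; the threshold condition then fires when
# the ratio exceeds 0.8 (warning) or 0.9 (critical), sustained for 5m.
expr: >
  aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""}
  /
  aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""}
```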

### Health Check Alerts

@@ -607,9 +625,6 @@ Component: [affected component]
| Alert | Threshold | Duration | Description |
|-------|-----------|----------|-------------|
| **CrashLoopBackOff** | Any container in CrashLoopBackOff | 5m | Container is repeatedly crashing and restarting |
- | **Pod Error** | Container terminated (reason != Completed) | 5m | Pod container terminated with an error (excludes user session pods) |
- | **Pod Not Healthy** | Phase = Pending/Unknown/Failed | 15m | Pod has been in non-running state (excludes user session pods) |
- | **Pod Restarts** | > 5 restarts in 15m | 15m | Pod has restarted excessively |
| **Deployment Replicas Mismatch** | Desired != Available | 15m | Deployment does not have the expected number of available replicas |
| **StatefulSet Replicas Mismatch** | Ready != Desired | 15m | StatefulSet does not have the expected number of ready replicas |
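
As a reference point, a `CrashLoopBackOff` condition of this kind is typically expressed against kube-state-metrics. The following is a hedged sketch, not the provisioned rule: the aggregation and label filters are assumptions, and the real rules additionally restrict matching to PTD-managed namespaces:

```yaml
# Sketch only: fires when any container has been waiting in
# CrashLoopBackOff for 5 minutes. The provisioned rules also apply a
# namespace filter (PTD-managed namespaces), omitted here.
expr: >
  max by (namespace, pod, container) (
    kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}
  ) > 0
for: 5m
```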

@@ -631,7 +646,7 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven
```

**Example Failure Cascade**:
- - Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `PodNotHealthy` alert fires in `posit-team` namespace
+ - Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `CrashLoopBackOff` or `DeploymentReplicaMismatch` alert fires in `posit-team` namespace
- Traefik ingress pod crashes → Ingress routing breaks → HTTP health checks fail → `Healthchecks` alert fires
- Alloy pod crashes → Metrics/logs stop flowing → No alerts fire (blind) → **Must alert on Alloy pod failures directly**

@@ -640,11 +655,13 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven
To add or modify alerts, edit the YAML files in `python-pulumi/src/ptd/grafana_alerts/`. Each file contains alerts grouped by category:

- `applications.yaml` - Application-specific alerts (Loki, etc.)
- - `cloudwatch.yaml` - AWS CloudWatch metric alerts
+ - `cloudwatch.yaml` - AWS CloudWatch metric alerts (FSx)
- `healthchecks.yaml` - HTTP health check alerts
+ - `loadbalancer.yaml` - AWS load balancer alerts (ALB, NLB)
- `mimir.yaml` - Metrics pipeline alerts
- `nodes.yaml` - Kubernetes node alerts
- `pods.yaml` - Kubernetes pod and workload alerts
+ - `rds.yaml` - AWS RDS database alerts

To delete an alert, follow the instructions in the file header comments regarding the `deleteRules` syntax.
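
For example, deleting the FSx capacity alerts would mean replacing the contents of `cloudwatch.yaml` with a bare `deleteRules` document of this shape, mirroring the header comments in that file (removing the ConfigMap alone does not delete provisioned alerts):

```yaml
apiVersion: 1
deleteRules:
  - orgId: 1
    uid: fsx_capacity_warning
  - orgId: 1
    uid: fsx_capacity_critical
```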

2 changes: 1 addition & 1 deletion python-pulumi/src/ptd/grafana_alerts/applications.yaml
@@ -63,7 +63,7 @@ groups:
execErrState: Error
for: 5m
annotations:
- summary: "🔴 CRITICAL: Loki WAL Disk Full"
+ summary: "🟡 WARNING: Loki WAL Disk Full"
description: |
Loki ingester experiencing WAL disk full failures

118 changes: 25 additions & 93 deletions python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -3,18 +3,27 @@
# apiVersion: 1
# deleteRules:
# - orgId: 1
- # uid: fsx_capacity
+ # uid: fsx_capacity_warning
+ # - orgId: 1
+ # uid: fsx_capacity_critical
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
apiVersion: 1
+ deleteRules:
+ - orgId: 1
+ uid: fsx_capacity
+ - orgId: 1
+ uid: ec2_network_out_high
+ - orgId: 1
+ uid: ec2_network_packets_out_high
groups:
- orgId: 1
name: Cloudwatch
folder: Posit Alerts
interval: 5m
rules:
- - uid: fsx_capacity
- title: FSx Capacity
+ - uid: fsx_capacity_warning
+ title: FSx Capacity Warning
condition: C
data:
- refId: A
@@ -82,84 +91,9 @@ groups:
labels:
opsgenie: "1"
isPaused: false
- - uid: ec2_network_out_high
- title: EC2 Network Out High
- condition: B
- data:
- - refId: A
- relativeTimeRange:
- from: 600
- to: 0
- datasourceUid: mimir
- model:
- editorMode: code
- # Network out threshold: 314572800 bytes/s (~300 MiB/s)
- # Based on analysis of Loki->S3 traffic patterns from issue #2347
- # Instance-aware thresholds: Using the same threshold for all instances
- # To set different thresholds by instance type, use:
- # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="t3.xlarge"}[5m]) > 157286400 or
- # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="m5.2xlarge"}[5m]) > 314572800
- expr: avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch"}[5m])
- instant: true
- intervalMs: 1000
- legendFormat: __auto
- maxDataPoints: 43200
- range: false
- refId: A
- - refId: B
- relativeTimeRange:
- from: 600
- to: 0
- datasourceUid: __expr__
- model:
- conditions:
- - evaluator:
- params:
- - 3.145728e+08 # ~ 300 MiB/s
- type: gt
- operator:
- type: and
- query:
- params:
- - A
- reducer:
- params: []
- type: last
- type: query
- datasource:
- type: __expr__
- uid: __expr__
- expression: A
- intervalMs: 1000
- maxDataPoints: 43200
- refId: B
- type: threshold
- noDataState: NoData
- execErrState: Error
- for: 5m
- annotations:
- summary: "🟡 WARNING: EC2 Network Out High"
- description: |
- EC2 instance has unusually high outbound network traffic
-
- ─── WHERE ───────────────────────────
- Tenant: {{ $labels.tenant_name }}
- Cluster: {{ $labels.cluster }}
- Resource: {{ $labels.dimension_InstanceId }}
- Region: {{ $labels.region }}
-
- ─── DETAILS ─────────────────────────
- Metric: Network Out
- Current: > 300 MiB/s
- Threshold: 300 MiB/s
- Duration: 5 minutes
-
- labels:
- opsgenie: "1"
- isPaused: false
- - uid: ec2_network_packets_out_high
- title: EC2 Network Packets Out High
- condition: B
+ - uid: fsx_capacity_critical
+ title: FSx Capacity Critical
+ condition: C
data:
- refId: A
relativeTimeRange:
@@ -168,16 +102,14 @@
datasourceUid: mimir
model:
editorMode: code
- # Network packets out threshold: 400000 packets/s
- # High packet rate can indicate network bottlenecks or unusual traffic patterns
- expr: avg_over_time(aws_ec2_network_packets_out_average{job="integrations/cloudwatch"}[5m])
+ expr: aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""} / aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- - refId: B
+ - refId: C
relativeTimeRange:
from: 600
to: 0
@@ -186,7 +118,7 @@
conditions:
- evaluator:
params:
- - 400000
+ - 0.9
type: gt
operator:
type: and
@@ -203,26 +135,26 @@
expression: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
+ refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
- summary: "🟡 WARNING: EC2 Network Packets Out High"
+ summary: "🔴 CRITICAL: FSx Storage Capacity Critical"
description: |
- EC2 instance has unusually high packet transmission rate
+ FSx file system storage capacity is critically low

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
- Resource: {{ $labels.dimension_InstanceId }}
+ Resource: {{ $labels.dimension_FileSystemId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
- Metric: Network Packets Out
- Current: > 400,000 packets/s
- Threshold: 400,000 packets/s
+ Metric: Storage Capacity
+ Current: > 90% used
+ Threshold: 90%
Duration: 5 minutes

labels:
76 changes: 64 additions & 12 deletions python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
@@ -75,11 +75,24 @@ groups:
execErrState: Error
for: 5m
annotations:
- description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures.
- summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+ summary: "🟡 WARNING: ALB Target 5XX Errors High"
+ description: |
+ Application Load Balancer has elevated 5XX errors
+
+ ─── WHERE ───────────────────────────
+ Tenant: {{ $labels.tenant_name }}
+ Cluster: {{ $labels.cluster }}
+ Resource: {{ $labels.dimension_LoadBalancer }}
+ Region: {{ $labels.region }}
+
+ ─── DETAILS ─────────────────────────
+ Metric: Target 5XX Count
+ Current: > 10 errors
+ Threshold: 10
+ Duration: 5 minutes
+
labels:
opsgenie: "1"
- severity: warning
isPaused: false
- uid: alb_unhealthy_targets
title: ALB Unhealthy Targets
@@ -131,11 +144,24 @@ groups:
execErrState: Error
for: 5m
annotations:
- description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
- summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+ summary: "🟡 WARNING: ALB Unhealthy Targets"
+ description: |
+ Application Load Balancer has unhealthy targets
+
+ ─── WHERE ───────────────────────────
+ Tenant: {{ $labels.tenant_name }}
+ Cluster: {{ $labels.cluster }}
+ Resource: {{ $labels.dimension_LoadBalancer }}
+ Region: {{ $labels.region }}
+
+ ─── DETAILS ─────────────────────────
+ Metric: Unhealthy Host Count
+ Current: > 0
+ Threshold: 0
+ Duration: 5 minutes
+
labels:
opsgenie: "1"
- severity: warning
isPaused: false
- uid: nlb_unhealthy_targets
title: NLB Unhealthy Targets
@@ -187,11 +213,24 @@ groups:
execErrState: Error
for: 5m
annotations:
- description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
- summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+ summary: "🟡 WARNING: NLB Unhealthy Targets"
+ description: |
+ Network Load Balancer has unhealthy targets
+
+ ─── WHERE ───────────────────────────
+ Tenant: {{ $labels.tenant_name }}
+ Cluster: {{ $labels.cluster }}
+ Resource: {{ $labels.dimension_LoadBalancer }}
+ Region: {{ $labels.region }}
+
+ ─── DETAILS ─────────────────────────
+ Metric: Unhealthy Host Count
+ Current: > 0
+ Threshold: 0
+ Duration: 5 minutes
+
labels:
opsgenie: "1"
- severity: warning
isPaused: false
- uid: alb_response_latency_high
title: ALB Response Latency High
@@ -243,9 +282,22 @@ groups:
execErrState: Error
for: 10m
annotations:
- description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation.
- summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+ summary: "🟡 WARNING: ALB Response Latency High"
+ description: |
+ Application Load Balancer response time is elevated
+
+ ─── WHERE ───────────────────────────
+ Tenant: {{ $labels.tenant_name }}
+ Cluster: {{ $labels.cluster }}
+ Resource: {{ $labels.dimension_LoadBalancer }}
+ Region: {{ $labels.region }}
+
+ ─── DETAILS ─────────────────────────
+ Metric: Target Response Time
+ Current: > 2 seconds
+ Threshold: 2 seconds
+ Duration: 10 minutes
+
labels:
opsgenie: "1"
- severity: warning
isPaused: false
14 changes: 13 additions & 1 deletion python-pulumi/src/ptd/grafana_alerts/nodes.yaml
@@ -1,3 +1,15 @@
+ # To delete these alerts, note that simply removing the ConfigMap that provisions them will not work.
+ # Replace the file contents with the following and apply in order to delete the alerts:
+ # apiVersion: 1
+ # deleteRules:
+ # - orgId: 1
+ # uid: node_not_ready
+ # - orgId: 1
+ # uid: node_memory_pressure
+ # - orgId: 1
+ # uid: node_disk_pressure
+ #
+ # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
apiVersion: 1
groups:
- orgId: 1
@@ -55,7 +67,7 @@ groups:
execErrState: Error
for: 15m
annotations:
- summary: "🔴 CRITICAL: Node Not Ready"
+ summary: "🟡 WARNING: Node Not Ready"
description: |
Kubernetes node is not accepting workloads
