From 7b849d7c14e6060c34a58ee85c1c4e41fdb58d6a Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Mon, 9 Mar 2026 15:31:10 -0700
Subject: [PATCH 1/5] drop three pod alerts; relabel crash-loop and Loki WAL as WARNING

---
 .../src/ptd/grafana_alerts/applications.yaml  |   2 +-
 .../src/ptd/grafana_alerts/pods.yaml          | 311 +-----------------
 2 files changed, 2 insertions(+), 311 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
index 7f36bb7..db1a4bf 100644
--- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
@@ -63,7 +63,7 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            summary: "🔴 CRITICAL: Loki WAL Disk Full"
+            summary: "🟡 WARNING: Loki WAL Disk Full"
             description: |
               Loki ingester experiencing WAL disk full failures
 
diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
index 5ffda98..1da8db8 100644
--- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
@@ -4,8 +4,6 @@
 # deleteRules:
 #   - orgId: 1
 #     uid: crash_loop_backoff
-#   - orgId: 1
-#     uid: pod_error
 #
 # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
 #
@@ -116,7 +114,7 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            summary: "🔴 CRITICAL: Container Crash-Looping"
+            summary: "🟡 WARNING: Container Crash-Looping"
             description: |
               Container keeps crashing and restarting
 
@@ -131,313 +129,6 @@ groups:
               Status:      CrashLoopBackOff
               Duration:    5 minutes
 
-          labels:
-            opsgenie: "1"
-          isPaused: false
-        - uid: pod_error
-          title: Pod Error
-          condition: C
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                datasource:
-                    type: prometheus
-                    uid: mimir
-                disableTextWrap: false
-                editorMode: code
-                expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""})
-                fullMetaSearch: false
-                includeNullMetadata: true
-                instant: true
-                intervalMs: 1000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-                useBackend: false
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params: []
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - B
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                intervalMs: 1000
-                maxDataPoints: 43200
-                reducer: count
-                refId: B
-                settings:
-                    mode: dropNN
-                type: reduce
-            - refId: C
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - C
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: B
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: C
-                type: threshold
-          noDataState: OK
-          execErrState: Error
-          for: 5m
-          annotations:
-            summary: "🟡 WARNING: Pod Error"
-            description: |
-              Pod container terminated with an error
-
-              ─── WHERE ───────────────────────────
-              Tenant:      {{ $labels.tenant_name }}
-              Cluster:     {{ $labels.cluster }}
-              Namespace:   {{ $labels.namespace }}
-              Pod:         {{ $labels.pod }}
-
-              ─── DETAILS ─────────────────────────
-              Reason:      {{ $labels.reason }}
-              Duration:    5 minutes
-
-          labels:
-            opsgenie: "1"
-          isPaused: false
-        - uid: PodNotHealthy
-          title: Pod Not Healthy
-          condition: B
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                datasource:
-                    type: prometheus
-                    uid: mimir
-                disableTextWrap: false
-                editorMode: code
-                expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0
-                fullMetaSearch: false
-                includeNullMetadata: true
-                instant: true
-                intervalMs: 60000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-                useBackend: false
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params: []
-                      reducer:
-                        params: []
-                        type: avg
-                      type: query
-                datasource:
-                    name: Expression
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                hide: false
-                reducer: last
-                refId: B
-                type: reduce
-            - refId: C
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params: []
-                      reducer:
-                        params: []
-                        type: avg
-                      type: query
-                datasource:
-                    name: Expression
-                    type: __expr__
-                    uid: __expr__
-                expression: B
-                hide: false
-                refId: C
-                type: threshold
-          noDataState: OK
-          execErrState: Error
-          for: 15m
-          annotations:
-            summary: "🟡 WARNING: Pod Not Healthy"
-            description: |
-              Pod has been in a non-running state
-
-              ─── WHERE ───────────────────────────
-              Tenant:      {{ $labels.tenant_name }}
-              Cluster:     {{ $labels.cluster }}
-              Namespace:   {{ $labels.namespace }}
-              Pod:         {{ $labels.pod }}
-
-              ─── DETAILS ─────────────────────────
-              Phase:       {{ $labels.phase }}
-              Duration:    15 minutes
-
-          labels:
-            opsgenie: "1"
-          isPaused: false
-        - uid: PodRestarts
-          title: Pod Restarts
-          condition: C
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                datasource:
-                    type: prometheus
-                    uid: mimir
-                disableTextWrap: false
-                editorMode: code
-                expr: avg_over_time(increase(kube_pod_container_status_restarts_total{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"}[15m])[15m:1m]) > 5
-                fullMetaSearch: false
-                includeNullMetadata: true
-                instant: false
-                intervalMs: 60000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: true
-                refId: A
-                useBackend: false
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params: []
-                      reducer:
-                        params: []
-                        type: avg
-                      type: query
-                datasource:
-                    name: Expression
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                hide: false
-                intervalMs: 1000
-                maxDataPoints: 43200
-                reducer: last
-                refId: B
-                type: reduce
-            - refId: C
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params: []
-                      reducer:
-                        params: []
-                        type: avg
-                      type: query
-                datasource:
-                    name: Expression
-                    type: __expr__
-                    uid: __expr__
-                expression: B
-                hide: false
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: C
-                type: threshold
-          noDataState: OK
-          execErrState: Error
-          for: 15m
-          annotations:
-            summary: "🟡 WARNING: Pod Restarts"
-            description: |
-              Pod has restarted excessively
-
-              ─── WHERE ───────────────────────────
-              Tenant:      {{ $labels.tenant_name }}
-              Cluster:     {{ $labels.cluster }}
-              Namespace:   {{ $labels.namespace }}
-              Pod:         {{ $labels.pod }}
-
-              ─── DETAILS ─────────────────────────
-              Issue:       > 5 restarts in 15 minutes
-
           labels:
             opsgenie: "1"
           isPaused: false

From 489251e019265c508ca035e4a670d84e610093d8 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Tue, 10 Mar 2026 15:14:06 -0700
Subject: [PATCH 2/5] split FSx/RDS storage alerts into warning and critical; drop EC2 network alerts

---
 .../src/ptd/grafana_alerts/cloudwatch.yaml    | 114 ++---------
 python-pulumi/src/ptd/grafana_alerts/rds.yaml | 181 +++++++++++++++---
 2 files changed, 171 insertions(+), 124 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
index 076c603..d29c724 100644
--- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -3,18 +3,23 @@
 # apiVersion: 1
 # deleteRules:
 #   - orgId: 1
-#     uid: fsx_capacity
+#     uid: fsx_capacity_warning
+#   - orgId: 1
+#     uid: fsx_capacity_critical
 #
 # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
 apiVersion: 1
+deleteRules:
+    - orgId: 1
+      uid: fsx_capacity
 groups:
     - orgId: 1
       name: Cloudwatch
       folder: Posit Alerts
       interval: 5m
       rules:
-        - uid: fsx_capacity
-          title: FSx Capacity
+        - uid: fsx_capacity_warning
+          title: FSx Capacity Warning
           condition: C
           data:
             - refId: A
@@ -82,84 +87,9 @@ groups:
           labels:
             opsgenie: "1"
           isPaused: false
-        - uid: ec2_network_out_high
-          title: EC2 Network Out High
-          condition: B
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                editorMode: code
-                # Network out threshold: 314572800 bytes/s (~300 MiB/s)
-                # Based on analysis of Loki->S3 traffic patterns from issue #2347
-                # Instance-aware thresholds: Using the same threshold for all instances
-                # To set different thresholds by instance type, use:
-                # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="t3.xlarge"}[5m]) > 157286400 or
-                # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="m5.2xlarge"}[5m]) > 314572800
-                expr: avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch"}[5m])
-                instant: true
-                intervalMs: 1000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 3.145728e+08 # ~ 300 MiB/s
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - A
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: B
-                type: threshold
-          noDataState: NoData
-          execErrState: Error
-          for: 5m
-          annotations:
-            summary: "🟡 WARNING: EC2 Network Out High"
-            description: |
-              EC2 instance has unusually high outbound network traffic
-
-              ─── WHERE ───────────────────────────
-              Tenant:      {{ $labels.tenant_name }}
-              Cluster:     {{ $labels.cluster }}
-              Resource:    {{ $labels.dimension_InstanceId }}
-              Region:      {{ $labels.region }}
-
-              ─── DETAILS ─────────────────────────
-              Metric:      Network Out
-              Current:     > 300 MiB/s
-              Threshold:   300 MiB/s
-              Duration:    5 minutes
-
-          labels:
-            opsgenie: "1"
-          isPaused: false
-        - uid: ec2_network_packets_out_high
-          title: EC2 Network Packets Out High
-          condition: B
+        - uid: fsx_capacity_critical
+          title: FSx Capacity Critical
+          condition: C
           data:
             - refId: A
               relativeTimeRange:
@@ -168,16 +98,14 @@ groups:
               datasourceUid: mimir
               model:
                 editorMode: code
-                # Network packets out threshold: 400000 packets/s
-                # High packet rate can indicate network bottlenecks or unusual traffic patterns
-                expr: avg_over_time(aws_ec2_network_packets_out_average{job="integrations/cloudwatch"}[5m])
+                expr: aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""} / aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""}
                 instant: true
                 intervalMs: 1000
                 legendFormat: __auto
                 maxDataPoints: 43200
                 range: false
                 refId: A
-            - refId: B
+            - refId: C
               relativeTimeRange:
                 from: 600
                 to: 0
@@ -186,7 +114,7 @@ groups:
                 conditions:
                     - evaluator:
                         params:
-                            - 400000
+                            - 0.9
                         type: gt
                       operator:
                         type: and
@@ -203,26 +131,26 @@ groups:
                 expression: A
                 intervalMs: 1000
                 maxDataPoints: 43200
-                refId: B
+                refId: C
                 type: threshold
           noDataState: NoData
           execErrState: Error
           for: 5m
           annotations:
-            summary: "🟡 WARNING: EC2 Network Packets Out High"
+            summary: "🔴 CRITICAL: FSx Storage Capacity Critical"
             description: |
-              EC2 instance has unusually high packet transmission rate
+              FSx file system storage capacity is critically low
 
               ─── WHERE ───────────────────────────
               Tenant:      {{ $labels.tenant_name }}
               Cluster:     {{ $labels.cluster }}
-              Resource:    {{ $labels.dimension_InstanceId }}
+              Resource:    {{ $labels.dimension_FileSystemId }}
               Region:      {{ $labels.region }}
 
               ─── DETAILS ─────────────────────────
-              Metric:      Network Packets Out
-              Current:     > 400,000 packets/s
-              Threshold:   400,000 packets/s
+              Metric:      Storage Capacity
+              Current:     > 90% used
+              Threshold:   90%
               Duration:    5 minutes
 
           labels:
diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
index c09e58a..63c4c49 100644
--- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
@@ -6,7 +6,9 @@
 #   - orgId: 1
 #     uid: rds_cpu_utilization_high
 #   - orgId: 1
-#     uid: rds_free_storage_low
+#     uid: rds_free_storage_low_warning
+#   - orgId: 1
+#     uid: rds_free_storage_low_critical
 #   - orgId: 1
 #     uid: rds_freeable_memory_low
 #   - orgId: 1
@@ -22,6 +24,8 @@ apiVersion: 1
 deleteRules:
     - orgId: 1
       uid: rds_read_latency_high
+    - orgId: 1
+      uid: rds_free_storage_low
 groups:
     - orgId: 1
       name: RDS
@@ -74,18 +78,101 @@ groups:
                 maxDataPoints: 43200
                 refId: B
                 type: threshold
-          noDataState: NoData  # Performance metric; silent suppression on scrape outage is acceptable
+          noDataState: NoData
           execErrState: Error
           for: 10m
           annotations:
-            description: RDS instance CPU utilization is above 80% for more than 10 minutes.
-            summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: RDS CPU Utilization High"
+            description: |
+              RDS instance CPU utilization is elevated
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_DBInstanceIdentifier }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      CPU Utilization
+              Current:     > 80%
+              Threshold:   80%
+              Duration:    10 minutes
+
+          labels:
+            opsgenie: "1"
+          isPaused: false
+        - uid: rds_free_storage_low_warning
+          title: RDS Free Storage Low (Warning)
+          condition: B
+          data:
+            - refId: A
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: mimir
+              model:
+                editorMode: code
+                expr: aws_rds_free_storage_space_average{job="integrations/cloudwatch"}
+                instant: true
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: A
+            - refId: B
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:
+                    - evaluator:
+                        params:
+                            # 10 GiB in bytes
+                            - 10737418240
+                        type: lt
+                      operator:
+                        type: and
+                      query:
+                        params:
+                            - A
+                      reducer:
+                        params: []
+                        type: last
+                      type: query
+                datasource:
+                    type: __expr__
+                    uid: __expr__
+                expression: A
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: B
+                type: threshold
+          noDataState: Alerting
+          execErrState: Error
+          for: 5m
+          annotations:
+            summary: "🟡 WARNING: RDS Free Storage Low"
+            description: |
+              RDS instance storage capacity is running low
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_DBInstanceIdentifier }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Free Storage Space
+              Current:     < 10 GiB free
+              Threshold:   10 GiB
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false
-        - uid: rds_free_storage_low
-          title: RDS Free Storage Low
+        - uid: rds_free_storage_low_critical
+          title: RDS Free Storage Low (Critical)
           condition: B
           data:
             - refId: A
@@ -111,7 +198,8 @@ groups:
                 conditions:
                     - evaluator:
                         params:
-                            - 5368709120  # 5 GiB in bytes; calibrated for mid-size instances (100–500 GiB). Adjust for larger (e.g. 1 TiB) or smaller instances.
+                            # 5 GiB in bytes
+                            - 5368709120
                         type: lt
                       operator:
                         type: and
@@ -130,19 +218,28 @@ groups:
                 maxDataPoints: 43200
                 refId: B
                 type: threshold
-          noDataState: Alerting  # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk
+          noDataState: Alerting
           execErrState: Error
           for: 5m
           annotations:
-            description: RDS instance has less than 5 GiB of free storage space remaining. Note: on new cluster deployments where CloudWatch scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning.
-            summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
+            summary: "🔴 CRITICAL: RDS Free Storage Critical"
+            description: |
+              RDS instance storage capacity is critically low
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_DBInstanceIdentifier }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Free Storage Space
+              Current:     < 5 GiB free
+              Threshold:   5 GiB
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
-            # Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not
-            # expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute),
-            # so computing a usage percentage is not feasible without a separate exporter or recording
-            # rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation.
           isPaused: false
         - uid: rds_freeable_memory_low
           title: RDS Freeable Memory Low
@@ -171,7 +268,8 @@ groups:
                 conditions:
                     - evaluator:
                         params:
-                            - 536870912  # 512 MiB in bytes; calibrated for db.r5.large (~16 GiB RAM). Adjust for other instance classes.
+                            # 200 MiB in bytes
+                            - 209715200
                         type: lt
                       operator:
                         type: and
@@ -190,21 +288,29 @@ groups:
                 maxDataPoints: 43200
                 refId: B
                 type: threshold
-          noDataState: Alerting  # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition
+          noDataState: Alerting
           execErrState: Error
           for: 10m
           annotations:
-            description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. Note: this threshold is calibrated for db.r5.large (~16 GiB RAM); it will fire continuously for small instances (e.g. db.t3.micro, db.t3.small). Adjust per instance class.
-            summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: RDS Freeable Memory Low"
+            description: |
+              RDS instance freeable memory is low
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_DBInstanceIdentifier }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Freeable Memory
+              Current:     < 200 MiB free
+              Threshold:   200 MiB
+              Duration:    10 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
-            instance_size_dependent: "true"  # Silence this label for known-small instance classes
-            # Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not
-            # expose total instance RAM as a metric for RDS — it varies by instance type. PTD's default
-            # instance (db.t3.small, 2 GiB) would fire constantly at a 90%-used threshold under normal
-            # Postgres buffer cache load, making percentage-based alerting impractical here.
-          isPaused: true  # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label
+          isPaused: false
         - uid: rds_database_connections_high
           title: RDS Database Connections High
           condition: B
@@ -251,13 +357,26 @@ groups:
                 maxDataPoints: 43200
                 refId: B
                 type: threshold
-          noDataState: NoData  # Performance metric; silent suppression on scrape outage is acceptable
+          noDataState: NoData
           execErrState: Error
           for: 5m
           annotations:
-            description: RDS instance has more than 80 active database connections for more than 5 minutes.
-            summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: RDS Database Connections High"
+            description: |
+              RDS instance has high number of database connections
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_DBInstanceIdentifier }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Database Connections
+              Current:     > 80 connections
+              Threshold:   80
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false

From 4fed4d321bcbd17bd33c58cca7a22e8df47f95ef Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 11 Mar 2026 11:31:05 -0700
Subject: [PATCH 3/5] add delete rules for old alerts

---
 python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml | 4 ++++
 python-pulumi/src/ptd/grafana_alerts/pods.yaml       | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
index d29c724..f183dbf 100644
--- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -12,6 +12,10 @@ apiVersion: 1
 deleteRules:
     - orgId: 1
       uid: fsx_capacity
+    - orgId: 1
+      uid: ec2_network_out_high
+    - orgId: 1
+      uid: ec2_network_packets_out_high
 groups:
     - orgId: 1
       name: Cloudwatch
diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
index 1da8db8..98e5c70 100644
--- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
@@ -27,6 +27,13 @@
 # To update the namespace filter, use find/replace on the regex pattern above.
 # =============================================================================
 apiVersion: 1
+deleteRules:
+    - orgId: 1
+      uid: pod_error
+    - orgId: 1
+      uid: PodNotHealthy
+    - orgId: 1
+      uid: PodRestarts
 groups:
     - orgId: 1
       name: Pods

From b475aa958cef5312cab1c29c2649cc05862e4690 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 11 Mar 2026 12:12:20 -0700
Subject: [PATCH 4/5] update lb, node, and rds alerts

---
 .../src/ptd/grafana_alerts/loadbalancer.yaml  | 76 ++++++++++++++++---
 .../src/ptd/grafana_alerts/nodes.yaml         | 14 +++-
 python-pulumi/src/ptd/grafana_alerts/rds.yaml |  8 +-
 3 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
index 8d6c216..30f2a91 100644
--- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
@@ -75,11 +75,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures.
-            summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: ALB Target 5XX Errors High"
+            description: |
+              Application Load Balancer has elevated 5XX errors
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_LoadBalancer }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Target 5XX Count
+              Current:     > 10 errors
+              Threshold:   10
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false
         - uid: alb_unhealthy_targets
           title: ALB Unhealthy Targets
@@ -131,11 +144,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
-            summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: ALB Unhealthy Targets"
+            description: |
+              Application Load Balancer has unhealthy targets
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_LoadBalancer }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Unhealthy Host Count
+              Current:     > 0
+              Threshold:   0
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false
         - uid: nlb_unhealthy_targets
           title: NLB Unhealthy Targets
@@ -187,11 +213,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
-            summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: NLB Unhealthy Targets"
+            description: |
+              Network Load Balancer has unhealthy targets
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_LoadBalancer }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Unhealthy Host Count
+              Current:     > 0
+              Threshold:   0
+              Duration:    5 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false
         - uid: alb_response_latency_high
           title: ALB Response Latency High
@@ -243,9 +282,22 @@ groups:
           execErrState: Error
           for: 10m
           annotations:
-            description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation.
-            summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: ALB Response Latency High"
+            description: |
+              Application Load Balancer response time is elevated
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_LoadBalancer }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Target Response Time
+              Current:     > 2 seconds
+              Threshold:   2 seconds
+              Duration:    10 minutes
+
           labels:
             opsgenie: "1"
-            severity: warning
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
index bb6f91d..9953b88 100644
--- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
@@ -1,3 +1,15 @@
+# To delete these alerts, simply removing the configMap that uses this method will not work.
+# Replace file contents with the following and apply in order to delete the alerts:
+# apiVersion: 1
+# deleteRules:
+#   - orgId: 1
+#     uid: node_not_ready
+#   - orgId: 1
+#     uid: node_memory_pressure
+#   - orgId: 1
+#     uid: node_disk_pressure
+#
+# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
 apiVersion: 1
 groups:
     - orgId: 1
@@ -55,7 +67,7 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: "🔴 CRITICAL: Node Not Ready"
+            summary: "🟡 WARNING: Node Not Ready"
             description: |
               Kubernetes node is not accepting workloads
 
diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
index 63c4c49..7f501cb 100644
--- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
@@ -268,8 +268,8 @@ groups:
                 conditions:
                     - evaluator:
                         params:
-                            # 200 MiB in bytes
-                            - 209715200
+                            # 256 MiB in bytes
+                            - 268435456
                         type: lt
                       operator:
                         type: and
@@ -304,8 +304,8 @@ groups:
 
               ─── DETAILS ─────────────────────────
               Metric:      Freeable Memory
-              Current:     < 200 MiB free
-              Threshold:   200 MiB
+              Current:     < 256 MiB free
+              Threshold:   256 MiB
               Duration:    10 minutes
 
           labels:

From 8ce17cd091035cf4cefbe13dd7a4668bdf2b98da Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 11 Mar 2026 12:16:30 -0700
Subject: [PATCH 5/5] documentation updates

---
 docs/guides/monitoring.md | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md
index 91233de..2ce34d7 100644
--- a/docs/guides/monitoring.md
+++ b/docs/guides/monitoring.md
@@ -578,9 +578,27 @@ Component:   [affected component]
 
 | Alert | Threshold | Duration | Description |
 |-------|-----------|----------|-------------|
-| **FSx Capacity** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining |
-| **EC2 Network Out High** | > 300 MiB/s | 5m | EC2 instance has sustained high network outbound traffic |
-| **EC2 Network Packets Out High** | > 400,000 packets/s | 5m | EC2 instance has unusually high packet transmission rate |
+| **FSx Capacity Warning** | > 80% used | 5m | FSx storage instance has less than 20% capacity remaining |
+| **FSx Capacity Critical** | > 90% used | 5m | FSx storage instance has less than 10% capacity remaining |
+
+### RDS Alerts (AWS)
+
+| Alert | Threshold | Duration | Description |
+|-------|-----------|----------|-------------|
+| **RDS CPU Utilization High** | > 80% | 10m | RDS instance CPU utilization is elevated |
+| **RDS Free Storage Low (Warning)** | < 10 GiB | 5m | RDS instance storage capacity is running low |
+| **RDS Free Storage Low (Critical)** | < 5 GiB | 5m | RDS instance storage capacity is critically low |
+| **RDS Freeable Memory Low** | < 256 MiB | 10m | RDS instance freeable memory is low |
+| **RDS Database Connections High** | > 80 connections | 5m | RDS instance has high number of database connections |
+
+### Load Balancer Alerts (AWS)
+
+| Alert | Threshold | Duration | Description |
+|-------|-----------|----------|-------------|
+| **ALB Target 5XX Errors High** | > 10 errors | 5m | Application Load Balancer has elevated 5XX errors from targets |
+| **ALB Unhealthy Targets** | > 0 unhealthy | 5m | Application Load Balancer has unhealthy targets |
+| **NLB Unhealthy Targets** | > 0 unhealthy | 5m | Network Load Balancer has unhealthy targets |
+| **ALB Response Latency High** | > 2 seconds | 10m | Application Load Balancer target response time is elevated |
 
 ### Health Check Alerts
 
@@ -607,9 +625,6 @@ Component:   [affected component]
 | Alert | Threshold | Duration | Description |
 |-------|-----------|----------|-------------|
 | **CrashLoopBackOff** | Any container in CrashLoopBackOff | 5m | Container is repeatedly crashing and restarting |
-| **Pod Error** | Container terminated (reason != Completed) | 5m | Pod container terminated with an error (excludes user session pods) |
-| **Pod Not Healthy** | Phase = Pending/Unknown/Failed | 15m | Pod has been in non-running state (excludes user session pods) |
-| **Pod Restarts** | > 5 restarts in 15m | 15m | Pod has restarted excessively |
 | **Deployment Replicas Mismatch** | Desired != Available | 15m | Deployment does not have the expected number of available replicas |
 | **StatefulSet Replicas Mismatch** | Ready != Desired | 15m | StatefulSet does not have the expected number of ready replicas |
 
@@ -631,7 +646,7 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven
 ```
 
 **Example Failure Cascade**:
-- Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `PodNotHealthy` alert fires in `posit-team` namespace
+- Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `CrashLoopBackOff` or `Deployment Replicas Mismatch` alert fires in `posit-team` namespace
 - Traefik ingress pod crashes → Ingress routing breaks → HTTP health checks fail → `Healthchecks` alert fires
 - Alloy pod crashes → Metrics/logs stop flowing → No alerts fire (blind) → **Must alert on Alloy pod failures directly**
 
@@ -640,11 +655,13 @@ Pod-related alerts are filtered to only monitor PTD-managed namespaces to preven
 To add or modify alerts, edit the YAML files in `python-pulumi/src/ptd/grafana_alerts/`. Each file contains alerts grouped by category:
 
 - `applications.yaml` - Application-specific alerts (Loki, etc.)
-- `cloudwatch.yaml` - AWS CloudWatch metric alerts
+- `cloudwatch.yaml` - AWS CloudWatch metric alerts (FSx)
 - `healthchecks.yaml` - HTTP health check alerts
+- `loadbalancer.yaml` - AWS load balancer alerts (ALB, NLB)
 - `mimir.yaml` - Metrics pipeline alerts
 - `nodes.yaml` - Kubernetes node alerts
 - `pods.yaml` - Kubernetes pod and workload alerts
+- `rds.yaml` - AWS RDS database alerts
 
 To delete an alert, follow the instructions in the file header comments regarding the `deleteRules` syntax.