Commit 258f6bb

Merge pull request #143 from posit-dev/azure-monitor-alerting

Add Azure Monitor alerting for cloud-level resources

2 parents: 6a0bc3c + ebf2319

11 files changed: 1385 additions & 54 deletions

python-pulumi/src/ptd/azure_roles.py

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 ACR_PULL_ROLE_DEFINITION_ID = "7f951dda-4ed3-4680-a7ca-43fe172d538d"
 CONTRIBUTOR_ROLE_DEFINITION_ID = "b24988ac-6180-42a0-ab88-20f7382dd24c"
 DNS_ZONE_CONTRIBUTOR_ROLE_DEFINITION_ID = "befefa01-2a29-4197-83a8-272ff33ce314"
+MONITORING_READER_ROLE_DEFINITION_ID = "43d0d8ad-25c7-4714-9337-8ba259a9fe05"
 NETWORK_CONTRIBUTOR_ROLE_DEFINITION_ID = "4d97b98b-1d4f-4787-a291-c67834d212e7"
 READER_ROLE_DEFINITION_ID = "acdd72a7-3385-48ef-bd42-f606fba81ae7"
 STORAGE_BLOB_DATA_CONTRIBUTOR_ROLE_DEFINITION_ID = "ba92f5b4-2d11-453d-a403-e96b0029c9fe"
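For context, 43d0d8ad-25c7-4714-9337-8ba259a9fe05 is the GUID of Azure's built-in Monitoring Reader role, which grants read-only access to monitoring data. Constants in this module only make sense once expanded into full role-definition resource IDs at assignment time. A minimal sketch of such an assignment with pulumi-azure-native, where the subscription and principal IDs are hypothetical placeholders (only the GUID constant comes from this commit):

import pulumi_azure_native as azure_native

from ptd.azure_roles import MONITORING_READER_ROLE_DEFINITION_ID

# Hypothetical IDs for illustration only; real values would come from stack config.
subscription_id = "00000000-0000-0000-0000-000000000000"
scraper_principal_id = "11111111-1111-1111-1111-111111111111"  # e.g. a managed identity

monitoring_reader = azure_native.authorization.RoleAssignment(
    "scraper-monitoring-reader",
    principal_id=scraper_principal_id,
    principal_type="ServicePrincipal",
    # Built-in role GUIDs must be expanded to a full role-definition resource ID.
    role_definition_id=(
        f"/subscriptions/{subscription_id}/providers/"
        f"Microsoft.Authorization/roleDefinitions/{MONITORING_READER_ROLE_DEFINITION_ID}"
    ),
    # Subscription-wide scope so Azure Monitor metrics for all resource groups are readable.
    scope=f"/subscriptions/{subscription_id}",
)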
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
# To delete these alerts, simply removing the ConfigMap that provisions this file will not work.
# Instead, replace the file contents with the following and apply it to delete the alerts
# (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
#   - orgId: 1
#     uid: azure_lb_health_probe_down
#   - orgId: 1
#     uid: azure_lb_data_path_down
#   - orgId: 1
#     uid: azure_lb_snat_port_exhaustion
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
  - orgId: 1
    name: Azure Load Balancer
    folder: Posit Alerts
    interval: 5m
    rules:
      - uid: azure_lb_health_probe_down
        title: Azure Load Balancer Health Probe Down
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_network_loadbalancers_dipavailability{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 100
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer backend health probe availability has been below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention.
          summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_lb_data_path_down
        title: Azure Load Balancer Data Path Down
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_network_loadbalancers_vipavailability{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 100
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer data path availability has been below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention.
          summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_lb_snat_port_exhaustion
        title: Azure Load Balancer SNAT Port Exhaustion
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: |
                (azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"}
                /
                azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"}) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer is using more than 80% of its allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway.
          summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
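Because Mimir serves the standard Prometheus HTTP query API, the SNAT expression above can be sanity-checked outside Grafana before the rule ships. A rough sketch, assuming a hypothetical in-cluster Mimir endpoint (Mimir's default Prometheus prefix is /prometheus, but the URL will differ per deployment); note that one instant query only shows the current percentage, whereas the alert also requires the condition to hold for 5m:

import requests

# Hypothetical endpoint; substitute the real Mimir (or any Prometheus-compatible) URL.
QUERY_URL = "http://mimir.monitoring.svc:8080/prometheus/api/v1/query"

SNAT_EXPR = (
    '(azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"}'
    " / "
    'azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"})'
    " * 100"
)

resp = requests.get(QUERY_URL, params={"query": SNAT_EXPR}, timeout=10)
resp.raise_for_status()

# Each result is one load balancer; value[1] is the SNAT usage percentage as a string.
for series in resp.json()["data"]["result"]:
    resource = series["metric"].get("resource", "<unknown>")
    pct = float(series["value"][1])
    marker = "  <- above the 80% alert threshold" if pct > 80 else ""
    print(f"{resource}: {pct:.1f}% SNAT ports used{marker}")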
Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
# To delete these alerts, simply removing the ConfigMap that provisions this file will not work.
# Instead, replace the file contents with the following and apply it to delete the alerts
# (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
#   - orgId: 1
#     uid: azure_netapp_capacity_high
#   - orgId: 1
#     uid: azure_netapp_read_latency_high
#   - orgId: 1
#     uid: azure_netapp_write_latency_high
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
  - orgId: 1
    name: Azure NetApp Files
    folder: Posit Alerts
    interval: 5m
    rules:
      - uid: azure_netapp_capacity_high
        title: Azure NetApp Files Capacity High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_volumeconsumedsizepercentage{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full volume
        execErrState: Error
        for: 10m
        annotations:
          description: "Azure NetApp Files volume capacity utilization has been above 80% for more than 10 minutes. Note: on new cluster deployments where Azure Monitor scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:10m window; this is expected during provisioning."
          summary: High capacity utilization on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_netapp_read_latency_high
        title: Azure NetApp Files Read Latency High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagereadlatency{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
        execErrState: Error
        for: 10m
        annotations:
          description: Azure NetApp Files volume read latency has been above 10ms for more than 10 minutes, indicating potential performance degradation.
          summary: High read latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_netapp_write_latency_high
        title: Azure NetApp Files Write Latency High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagewritelatency{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
        execErrState: Error
        for: 10m
        annotations:
          description: Azure NetApp Files volume write latency has been above 10ms for more than 10 minutes, indicating potential performance degradation.
          summary: High write latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
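Both provisioning files follow Grafana's alerting file-provisioning format, which explains the deletion caveat in their headers: Grafana only reconciles what the provisioning files tell it, so removing the ConfigMap leaves the rules behind and an explicit deleteRules document is needed. A minimal sketch of how files like these might be packaged for Grafana with Pulumi's Kubernetes provider; the resource name, namespace, file names, and the mount into /etc/grafana/provisioning/alerting are all assumptions, not taken from this diff:

from pathlib import Path

import pulumi_kubernetes as k8s

# Hypothetical names; the actual ConfigMap and namespace are defined elsewhere in the repo.
alerting_rules = k8s.core.v1.ConfigMap(
    "grafana-azure-alerting-rules",
    metadata=k8s.meta.v1.ObjectMetaArgs(
        name="grafana-azure-alerting-rules",
        namespace="monitoring",
    ),
    data={
        # Grafana picks these up when the ConfigMap is mounted at
        # /etc/grafana/provisioning/alerting (e.g. via the Helm chart's extraConfigmapMounts).
        "azure_load_balancer_alerts.yaml": Path("azure_load_balancer_alerts.yaml").read_text(),
        "azure_netapp_alerts.yaml": Path("azure_netapp_alerts.yaml").read_text(),
    },
)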
