Commit 258f6bb

Merge pull request #143 from posit-dev/azure-monitor-alerting

Add Azure Monitor alerting for cloud-level resources

2 parents: 6a0bc3c + ebf2319

11 files changed: 1385 additions & 54 deletions

python-pulumi/src/ptd/azure_roles.py

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 ACR_PULL_ROLE_DEFINITION_ID = "7f951dda-4ed3-4680-a7ca-43fe172d538d"
 CONTRIBUTOR_ROLE_DEFINITION_ID = "b24988ac-6180-42a0-ab88-20f7382dd24c"
 DNS_ZONE_CONTRIBUTOR_ROLE_DEFINITION_ID = "befefa01-2a29-4197-83a8-272ff33ce314"
+MONITORING_READER_ROLE_DEFINITION_ID = "43d0d8ad-25c7-4714-9337-8ba259a9fe05"
 NETWORK_CONTRIBUTOR_ROLE_DEFINITION_ID = "4d97b98b-1d4f-4787-a291-c67834d212e7"
 READER_ROLE_DEFINITION_ID = "acdd72a7-3385-48ef-bd42-f606fba81ae7"
 STORAGE_BLOB_DATA_CONTRIBUTOR_ROLE_DEFINITION_ID = "ba92f5b4-2d11-453d-a403-e96b0029c9fe"
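For context, 43d0d8ad-25c7-4714-9337-8ba259a9fe05 is the GUID of Azure's built-in Monitoring Reader role, which grants read-only access to monitoring data. Constants in this module only make sense once expanded into full role-definition resource IDs at assignment time. A minimal sketch of such an assignment with pulumi-azure-native, where the subscription and principal IDs are hypothetical placeholders (only the GUID constant comes from this commit):

import pulumi_azure_native as azure_native

from ptd.azure_roles import MONITORING_READER_ROLE_DEFINITION_ID

# Hypothetical IDs for illustration only; real values would come from stack config.
subscription_id = "00000000-0000-0000-0000-000000000000"
scraper_principal_id = "11111111-1111-1111-1111-111111111111"  # e.g. a managed identity

monitoring_reader = azure_native.authorization.RoleAssignment(
    "scraper-monitoring-reader",
    principal_id=scraper_principal_id,
    principal_type="ServicePrincipal",
    # Built-in role GUIDs must be expanded to a full role-definition resource ID.
    role_definition_id=(
        f"/subscriptions/{subscription_id}/providers/"
        f"Microsoft.Authorization/roleDefinitions/{MONITORING_READER_ROLE_DEFINITION_ID}"
    ),
    # Subscription-wide scope so Azure Monitor metrics for all resource groups are readable.
    scope=f"/subscriptions/{subscription_id}",
)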
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
# To delete these alerts, simply removing the ConfigMap that provisions this file will not work.
# Instead, replace the file contents with the following and apply it to delete the alerts
# (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
#   - orgId: 1
#     uid: azure_lb_health_probe_down
#   - orgId: 1
#     uid: azure_lb_data_path_down
#   - orgId: 1
#     uid: azure_lb_snat_port_exhaustion
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
  - orgId: 1
    name: Azure Load Balancer
    folder: Posit Alerts
    interval: 5m
    rules:
      - uid: azure_lb_health_probe_down
        title: Azure Load Balancer Health Probe Down
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_network_loadbalancers_dipavailability{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 100
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer backend health probe availability has been below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention.
          summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_lb_data_path_down
        title: Azure Load Balancer Data Path Down
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_network_loadbalancers_vipavailability{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 100
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer data path availability has been below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention.
          summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_lb_snat_port_exhaustion
        title: Azure Load Balancer SNAT Port Exhaustion
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: |
                (azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"}
                /
                azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"}) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: Azure Load Balancer is using more than 80% of its allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway.
          summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
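Because Mimir serves the standard Prometheus HTTP query API, the SNAT expression above can be sanity-checked outside Grafana before the rule ships. A rough sketch, assuming a hypothetical in-cluster Mimir endpoint (Mimir's default Prometheus prefix is /prometheus, but the URL will differ per deployment); note that one instant query only shows the current percentage, whereas the alert also requires the condition to hold for 5m:

import requests

# Hypothetical endpoint; substitute the real Mimir (or any Prometheus-compatible) URL.
QUERY_URL = "http://mimir.monitoring.svc:8080/prometheus/api/v1/query"

SNAT_EXPR = (
    '(azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"}'
    " / "
    'azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"})'
    " * 100"
)

resp = requests.get(QUERY_URL, params={"query": SNAT_EXPR}, timeout=10)
resp.raise_for_status()

# Each result is one load balancer; value[1] is the SNAT usage percentage as a string.
for series in resp.json()["data"]["result"]:
    resource = series["metric"].get("resource", "<unknown>")
    pct = float(series["value"][1])
    marker = "  <- above the 80% alert threshold" if pct > 80 else ""
    print(f"{resource}: {pct:.1f}% SNAT ports used{marker}")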
Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
# To delete these alerts, simply removing the ConfigMap that provisions this file will not work.
# Instead, replace the file contents with the following and apply it to delete the alerts
# (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
#   - orgId: 1
#     uid: azure_netapp_capacity_high
#   - orgId: 1
#     uid: azure_netapp_read_latency_high
#   - orgId: 1
#     uid: azure_netapp_write_latency_high
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
  - orgId: 1
    name: Azure NetApp Files
    folder: Posit Alerts
    interval: 5m
    rules:
      - uid: azure_netapp_capacity_high
        title: Azure NetApp Files Capacity High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_volumeconsumedsizepercentage{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full volume
        execErrState: Error
        for: 10m
        annotations:
          description: "Azure NetApp Files volume capacity utilization has been above 80% for more than 10 minutes. Note: on new cluster deployments where Azure Monitor scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:10m window; this is expected during provisioning."
          summary: High capacity utilization on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_netapp_read_latency_high
        title: Azure NetApp Files Read Latency High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagereadlatency{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
        execErrState: Error
        for: 10m
        annotations:
          description: Azure NetApp Files volume read latency has been above 10ms for more than 10 minutes, indicating potential performance degradation.
          summary: High read latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
      - uid: azure_netapp_write_latency_high
        title: Azure NetApp Files Write Latency High
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagewritelatency{job="integrations/azure"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
        execErrState: Error
        for: 10m
        annotations:
          description: Azure NetApp Files volume write latency has been above 10ms for more than 10 minutes, indicating potential performance degradation.
          summary: High write latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
        labels:
          opsgenie: "1"
        isPaused: false
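Both provisioning files follow Grafana's alerting file-provisioning format, which explains the deletion caveat in their headers: Grafana only reconciles what the provisioning files tell it, so removing the ConfigMap leaves the rules behind and an explicit deleteRules document is needed. A minimal sketch of how files like these might be packaged for Grafana with Pulumi's Kubernetes provider; the resource name, namespace, file names, and the mount into /etc/grafana/provisioning/alerting are all assumptions, not taken from this diff:

from pathlib import Path

import pulumi_kubernetes as k8s

# Hypothetical names; the actual ConfigMap and namespace are defined elsewhere in the repo.
alerting_rules = k8s.core.v1.ConfigMap(
    "grafana-azure-alerting-rules",
    metadata=k8s.meta.v1.ObjectMetaArgs(
        name="grafana-azure-alerting-rules",
        namespace="monitoring",
    ),
    data={
        # Grafana picks these up when the ConfigMap is mounted at
        # /etc/grafana/provisioning/alerting (e.g. via the Helm chart's extraConfigmapMounts).
        "azure_load_balancer_alerts.yaml": Path("azure_load_balancer_alerts.yaml").read_text(),
        "azure_netapp_alerts.yaml": Path("azure_netapp_alerts.yaml").read_text(),
    },
)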
