Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions infrastructure/modules/function-app/alerts.tf
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,42 @@ resource "azurerm_monitor_metric_alert" "function_5xx" {
]
}
}

# Health Check Failure Alert — fires when failed health check probes exceed threshold.
#
# Created only when alerting is enabled for the module, the health check alert is
# explicitly opted into, and a non-empty health check path is configured.
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "health_check_failures" {
  count = var.enable_alerting && var.health_check_alert_enabled && var.health_check_path != null && var.health_check_path != "" ? 1 : 0

  name = "${azurerm_linux_function_app.function_app.name}-health-check-failures"
  # Place the rule in the monitoring resource group when one is supplied,
  # otherwise fall back to the module's main resource group.
  resource_group_name = var.resource_group_name_monitoring != null ? var.resource_group_name_monitoring : var.resource_group_name
  location            = var.location
  description         = "Alert when health check failures for ${azurerm_linux_function_app.function_app.name} exceed ${var.health_check_alert_threshold} in ${var.health_check_alert_window_duration}."

  evaluation_frequency = var.health_check_alert_evaluation_frequency
  window_duration      = var.health_check_alert_window_duration
  scopes               = [var.log_analytics_workspace_id]
  severity             = 1 # Error

  criteria {
    # Match on the function app's full resource ID (case-insensitive equality)
    # rather than `has "<name>"`. `has` is a term match on the bare name, so it
    # would also count log rows from any other resource in the workspace whose
    # ID contains the same term (e.g. "<name>-staging", or a resource group
    # named after the app), inflating FailureCount with unrelated failures.
    # NOTE(review): exact matching also excludes deployment-slot logs
    # (".../sites/<name>/slots/<slot>") — confirm that is the desired scope.
    query = <<-KQL
      FunctionAppLogs
      | where _ResourceId =~ "${lower(azurerm_linux_function_app.function_app.id)}"
      | where Category == "Function.health"
      | where Level == "Error" or Message has "Failed"
      | summarize FailureCount = count()
    KQL

    # The query collapses to a single FailureCount value; compare that value
    # against the configured threshold.
    time_aggregation_method = "Total"
    metric_measure_column   = "FailureCount"
    operator                = "GreaterThan"
    threshold               = var.health_check_alert_threshold
  }

  action {
    action_groups = [var.action_group_id]
  }

  lifecycle {
    ignore_changes = [tags]
  }
}
24 changes: 24 additions & 0 deletions infrastructure/modules/function-app/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,30 @@ variable "enable_alerting" {
default = false
}

# Opt-in switch for the health check failure alert. The alert resource is only
# created when this, enable_alerting, and a non-empty health_check_path are all set.
variable "health_check_alert_enabled" {
  description = "Whether to create a Log Analytics alert for health check failures. Requires enable_alerting = true and health_check_path to be set."
  type        = bool
  default     = false
}

# Failure-count threshold for the health check alert. The alert rule compares the
# summarized failure count with "GreaterThan", so the alert fires when the count
# in the window is strictly greater than this value.
variable "health_check_alert_threshold" {
  type        = number
  description = "Number of health check failures in the evaluation window to trigger the alert."
  default     = 10

  # A negative threshold would make "count > threshold" always true and the
  # alert would fire permanently; reject it up front.
  validation {
    condition     = var.health_check_alert_threshold >= 0
    error_message = "The health_check_alert_threshold value must be zero or greater."
  }
}

# How often the health check scheduled query rule runs, as an ISO 8601 duration.
variable "health_check_alert_evaluation_frequency" {
  type        = string
  description = "How often the scheduled query rule is evaluated. ISO 8601 duration (e.g., PT5M)."
  default     = "PT5M"

  # Restrict to the durations the scheduled query rule accepts so a typo is
  # reported against this variable instead of surfacing later from the provider.
  validation {
    condition = contains(
      ["PT1M", "PT5M", "PT10M", "PT15M", "PT30M", "PT45M", "PT1H", "PT2H", "PT3H", "PT4H", "PT5H", "PT6H", "P1D"],
      var.health_check_alert_evaluation_frequency
    )
    error_message = "The health_check_alert_evaluation_frequency value must be one of: PT1M, PT5M, PT10M, PT15M, PT30M, PT45M, PT1H, PT2H, PT3H, PT4H, PT5H, PT6H, P1D."
  }
}

# Look-back window over which health check failures are counted, as an ISO 8601 duration.
variable "health_check_alert_window_duration" {
  type        = string
  description = "The time window over which health check failures are counted. ISO 8601 duration (e.g., PT5M)."
  default     = "PT5M"

  # Restrict to the durations the scheduled query rule accepts so a typo is
  # reported against this variable instead of surfacing later from the provider.
  validation {
    condition = contains(
      ["PT1M", "PT5M", "PT10M", "PT15M", "PT30M", "PT45M", "PT1H", "PT2H", "PT3H", "PT4H", "PT5H", "PT6H", "P1D", "P2D"],
      var.health_check_alert_window_duration
    )
    error_message = "The health_check_alert_window_duration value must be one of: PT1M, PT5M, PT10M, PT15M, PT30M, PT45M, PT1H, PT2H, PT3H, PT4H, PT5H, PT6H, P1D, P2D."
  }
}

variable "log_analytics_workspace_id" {
type = string
description = "id of the log analytics workspace to send resource logging to via diagnostic settings"
Expand Down
Loading