diff --git a/.openshift-tests-extension/openshift_payload_cluster-version-operator.json b/.openshift-tests-extension/openshift_payload_cluster-version-operator.json index e9be447240..39716e7bce 100644 --- a/.openshift-tests-extension/openshift_payload_cluster-version-operator.json +++ b/.openshift-tests-extension/openshift_payload_cluster-version-operator.json @@ -1,4 +1,18 @@ [ + { + "name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator should work with risks from alerts", + "labels": { + "Local": {}, + "OTA-1813": {}, + "Serial": {} + }, + "resources": { + "isolation": {} + }, + "source": "openshift:payload:cluster-version-operator", + "lifecycle": "blocking", + "environmentSelector": {} + }, { "name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator should work with accept risks", "labels": { diff --git a/cmd/cluster-version-operator-tests/main.go b/cmd/cluster-version-operator-tests/main.go index a2d00dd637..537ef43fb0 100644 --- a/cmd/cluster-version-operator-tests/main.go +++ b/cmd/cluster-version-operator-tests/main.go @@ -24,7 +24,7 @@ func main() { Name: "openshift/cluster-version-operator/conformance/parallel", Parents: []string{"openshift/conformance/parallel"}, Qualifiers: []string{ - `!(name.contains("[Serial]") || "Serial" in labels || name.contains("[Slow]"))`, + `!(name.contains("[Serial]") || "Serial" in labels || name.contains("[Slow]") || "Local" in labels)`, }, }) @@ -33,7 +33,7 @@ func main() { Name: "openshift/cluster-version-operator/conformance/serial", Parents: []string{"openshift/conformance/serial"}, Qualifiers: []string{ - `name.contains("[Serial]") || "Serial" in labels`, + `(name.contains("[Serial]") || "Serial" in labels) && !("Local" in labels)`, }, }) diff --git a/go.mod b/go.mod index 571d40c82a..0d3db44f24 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/operator-framework/api v0.17.1 github.com/operator-framework/operator-lifecycle-manager v0.22.0 github.com/pkg/errors v0.9.1 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.0 github.com/prometheus-operator/prometheus-operator/pkg/client v0.86.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 @@ -72,7 +73,6 @@ require ( github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.0 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/robfig/cron v1.2.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect diff --git a/pkg/clusterconditions/promql/alerts.go b/pkg/clusterconditions/promql/alerts.go new file mode 100644 index 0000000000..f7d8ea6868 --- /dev/null +++ b/pkg/clusterconditions/promql/alerts.go @@ -0,0 +1,89 @@ +package promql + +import ( + "context" + "fmt" + "sync" + + "github.com/prometheus/client_golang/api" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/config" + + "k8s.io/klog/v2" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" +) + +type Getter interface { + Get(ctx context.Context) prometheusv1.AlertsResult +} + +func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter { + p := NewPromQL(promQLTarget) + condition := p.Condition + v, ok := condition.(*PromQL) + if !ok { + panic("invalid condition type") + } + return &ocAlertGetter{promQL: v} +} + +type ocAlertGetter struct { + promQL *PromQL + + mutex sync.Mutex + cached prometheusv1.AlertsResult +} + +func (o *ocAlertGetter) Get(ctx context.Context) prometheusv1.AlertsResult { + if err := o.refresh(ctx); err != nil { + klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err) + } + return o.cached +} + +func (o *ocAlertGetter) refresh(ctx context.Context) error { + o.mutex.Lock() + defer o.mutex.Unlock() + + klog.Info("refresh alerts ...") + p := o.promQL + host, err := p.Host(ctx) + if err != nil { + return fmt.Errorf("failure determine thanos IP: %w", err) + } + p.url.Host = host + clientConfig := api.Config{Address: p.url.String()} + + if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil { + clientConfig.RoundTripper = roundTripper + } else { + return fmt.Errorf("creating PromQL round-tripper: %w", err) + } + + promqlClient, err := api.NewClient(clientConfig) + if err != nil { + return fmt.Errorf("creating PromQL client: %w", err) + } + + client := &statusCodeNotImplementedForPostClient{ + client: promqlClient, + } + + v1api := prometheusv1.NewAPI(client) + + queryContext := ctx + if p.QueryTimeout > 0 { + var cancel context.CancelFunc + queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout) + defer cancel() + } + + r, err := v1api.Alerts(queryContext) + if err != nil { + return fmt.Errorf("failed to get alerts: %w", err) + } + o.cached = r + klog.Infof("refreshed: %d alerts", len(o.cached.Alerts)) + return nil +} diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index b69323138a..178fd380ec 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -15,6 +15,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" @@ -102,6 +103,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) { needsConditionalUpdateEval = true } + // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval if !needsConditionalUpdateEval { klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) return nil @@ -162,6 +164,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 optrAvailableUpdates.AcceptRisks = acceptRisks optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks + optrAvailableUpdates.AlertGetter = optr.AlertGetter optrAvailableUpdates.evaluateConditionalUpdates(ctx) queueKey := optr.queueKey() @@ -215,6 +218,8 @@ type availableUpdates struct { // RiskConditions stores the condition for every risk (name, url, message, matchingRules). RiskConditions map[string][]metav1.Condition + + AlertGetter AlertGetter } func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool { @@ -320,6 +325,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks, AcceptRisks: optr.availableUpdates.AcceptRisks, RiskConditions: optr.availableUpdates.RiskConditions, + AlertGetter: optr.availableUpdates.AlertGetter, LastAttempt: optr.availableUpdates.LastAttempt, LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange, Current: *optr.availableUpdates.Current.DeepCopy(), @@ -512,6 +518,157 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran } } +type AlertGetter interface { + Get(ctx context.Context) prometheusv1.AlertsResult +} + +func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) []configv1.ConditionalUpdateRisk { + if u == nil || u.AlertGetter == nil { + return nil + } + alertsResult := u.AlertGetter.Get(ctx) + return alertsToRisks(alertsResult.Alerts) +} + +func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { + klog.V(2).Infof("Found %d alerts", len(alerts)) + risks := map[string]configv1.ConditionalUpdateRisk{} + for _, alert := range alerts { + var alertName string + if alertName = string(alert.Labels["alertname"]); alertName == "" { + continue + } + if alert.State == "pending" { + continue + } + + var summary string + if summary = string(alert.Annotations["summary"]); summary == "" { + summary = alertName + } + if !strings.HasSuffix(summary, ".") { + summary += "." + } + + var description string + alertMessage := string(alert.Annotations["message"]) + alertDescription := string(alert.Annotations["description"]) + switch { + case alertMessage != "" && alertDescription != "": + description += " The alert description is: " + alertDescription + " | " + alertMessage + case alertDescription != "": + description += " The alert description is: " + alertDescription + case alertMessage != "": + description += " The alert description is: " + alertMessage + default: + description += " The alert has no description." + } + + var runbook string + alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound" + if runbook = string(alert.Annotations["runbook_url"]); runbook == "" { + runbook = "" + } else { + alertURL = runbook + } + + details := fmt.Sprintf("%s%s %s", summary, description, runbook) + + severity := string(alert.Labels["severity"]) + if severity == "critical" { + if alertName == internal.AlertNamePodDisruptionBudgetLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + } + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if internal.HavePullWaiting.Has(alertName) || + internal.HaveNodes.Has(alertName) || + alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || + alertName == internal.AlertNameVMCannotBeEvicted { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) + if updatePrecheck == "true" { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + } + + klog.V(2).Infof("Got %d risks", len(risks)) + if len(risks) == 0 { + return nil + } + + var ret []configv1.ConditionalUpdateRisk + var keys []string + for k := range risks { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + ret = append(ret, risks[k]) + } + return ret +} + +func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk { + risk, ok := risks[riskName] + if !ok { + return configv1.ConditionalUpdateRisk{ + Name: riskName, + Message: message, + URL: url, + // Always as the alert is firing + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{condition}, + } + } + + if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { + c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) + if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { + c.LastTransitionTime = condition.LastTransitionTime + } + meta.SetStatusCondition(&risk.Conditions, *c) + } + + return risk +} + func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { if u == nil { return @@ -521,9 +678,14 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { risks := risksInOrder(riskVersions) u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry) + if u.ShouldReconcileAcceptRisks() { + u.attachAlertRisksToUpdates(u.evaluateAlertRisks(ctx)) + } + if err := sanityCheck(u.ConditionalUpdates); err != nil { klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err) } + for i, conditionalUpdate := range u.ConditionalUpdates { condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) @@ -576,6 +738,45 @@ func (u *availableUpdates) removeUpdate(image string) { } } +func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) { + if u == nil || len(alertRisks) == 0 { + return + } + if u.RiskConditions == nil { + u.RiskConditions = map[string][]metav1.Condition{} + } + for _, alertRisk := range alertRisks { + u.RiskConditions[alertRisk.Name] = alertRisk.Conditions + } + var conditionalUpdates []configv1.ConditionalUpdate + for _, update := range u.Updates { + conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{ + Release: update, + Risks: alertRisks, + }) + } + u.Updates = nil + for _, conditionalUpdate := range u.ConditionalUpdates { + for _, alertRisk := range alertRisks { + var found bool + for _, risk := range conditionalUpdate.Risks { + if alertRisk.Name == risk.Name { + found = true + break + } + } + if !found { + conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk) + } + } + conditionalUpdates = append(conditionalUpdates, conditionalUpdate) + } + sort.Slice(conditionalUpdates, func(i, j int) bool { + return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version + }) + u.ConditionalUpdates = conditionalUpdates +} + func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string { template := `Could not evaluate exposure to update risk %s (%v) %s description: %s @@ -616,10 +817,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ func newRecommendedReason(now, want string) string { switch { case now == recommendedReasonRisksNotExposed || - now == recommendedReasonAllExposedRisksAccepted || + now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed || + now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed || now == want: return want - case want == recommendedReasonRisksNotExposed: + case want == recommendedReasonRisksNotExposed || + now == recommendedReasonEvaluationFailed && want == recommendedReasonAllExposedRisksAccepted || + now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed): return now default: return recommendedReasonMultiple @@ -676,7 +880,6 @@ func evaluateConditionalUpdate( if len(errorMessages) > 0 { recommended.Message = strings.Join(errorMessages, "\n\n") } - return recommended } diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 525e9e136e..400f8a5185 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -2,11 +2,14 @@ package cvo import ( "context" + "encoding/json" "errors" "fmt" "net/http" "net/http/httptest" "net/url" + "os" + "path/filepath" "runtime" "testing" "time" @@ -14,6 +17,8 @@ import ( "github.com/blang/semver/v4" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -216,6 +221,7 @@ var availableUpdatesCmpOpts = []cmp.Option{ cmpopts.IgnoreTypes(time.Time{}), cmpopts.IgnoreInterfaces(struct { clusterconditions.ConditionRegistry + AlertGetter }{}), } @@ -950,3 +956,569 @@ func Test_loadRiskConditions(t *testing.T) { }) } } + +type mockAlertGetter struct { + ret prometheusv1.AlertsResult + jsonFile string + t *testing.T +} + +func (m *mockAlertGetter) Get(_ context.Context) prometheusv1.AlertsResult { + var ret prometheusv1.AlertsResult + if m.jsonFile != "" { + data, err := os.ReadFile(m.jsonFile) + if err != nil { + m.t.Fatal(err) + } + err = json.Unmarshal(data, &ret) + if err != nil { + m.t.Fatal(err) + } + return ret + } + return m.ret +} + +func Test_evaluateAlertConditions(t *testing.T) { + t1 := time.Now() + t2 := time.Now().Add(-3 * time.Minute) + t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } + tests := []struct { + name string + u *availableUpdates + expectedErr error + expectedAlertRisks []configv1.ConditionalUpdateRisk + }{ + { + name: "basic case", + u: &availableUpdates{ + AlertGetter: &mockAlertGetter{ + t: t, + ret: prometheusv1.AlertsResult{ + Alerts: []prometheusv1.Alert{ + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), + model.LabelName("severity"): model.LabelValue("critical"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("not-important"), + }, + State: prometheusv1.AlertStatePending, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t2, + }, + }, + }, + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/bbb.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t2), + }}, + }, + { + Name: "PodDisruptionBudgetLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/abc.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + }, + { + name: "from file", + u: &availableUpdates{ + AlertGetter: &mockAlertGetter{ + t: t, + jsonFile: filepath.Join("testdata", "alerts.json"), + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t3), + }}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := tt.u.evaluateAlertRisks(context.TODO()) + if diff := cmp.Diff(tt.expectedAlertRisks, actual); diff != "" { + t.Errorf("AlertRisks mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func Test_newRecommendedReason(t *testing.T) { + tests := []struct { + name string + now string + want string + expected string + }{ + { + name: "recommendedReasonRisksNotExposed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonAllExposedRisksAccepted, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonEvaluationFailed", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonMultiple", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonExposed", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonRisksNotExposed", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonAllExposedRisksAccepted, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonEvaluationFailed", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonMultiple", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonExposed", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonRisksNotExposed", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonMultiple", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonExposed", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonMultiple to recommendedReasonRisksNotExposed", + now: recommendedReasonMultiple, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonMultiple, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonEvaluationFailed", + now: recommendedReasonMultiple, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonExposed", + now: recommendedReasonMultiple, + want: recommendedReasonExposed, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonExposed to recommendedReasonRisksNotExposed", + now: recommendedReasonExposed, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonExposed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonEvaluationFailed", + now: recommendedReasonExposed, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonMultiple", + now: recommendedReasonExposed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := newRecommendedReason(tt.now, tt.want) + if diff := cmp.Diff(tt.expected, actual); diff != "" { + t.Errorf("newRecommendedReason mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func Test_attachAlertRisksToUpdates(t *testing.T) { + now := metav1.Now() + tests := []struct { + name string + alertRisks []configv1.ConditionalUpdateRisk + updates []configv1.Release + conditionUpdates []configv1.ConditionalUpdate + expected []configv1.Release + expectedConditionUpdates []configv1.ConditionalUpdate + expectedRiskConditions map[string][]metav1.Condition + }{ + { + name: "no alert risks", + }, + { + name: "basic case", + alertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: now, + }}, + }, + }, + updates: []configv1.Release{ + { + Version: "4.21.1", + }, + { + Version: "4.21.2", + }, + }, + conditionUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "Risk1", + }, + }, + }, + }, + expectedConditionUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: now, + }}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: now, + }}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "Risk1", + }, + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: now, + }}, + }, + }, + }, + }, + expectedRiskConditions: map[string][]metav1.Condition{ + "TestAlert": []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: now, + }}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + u := &availableUpdates{Updates: tt.updates, ConditionalUpdates: tt.conditionUpdates} + u.attachAlertRisksToUpdates(tt.alertRisks) + // attaching is idempotent + u.attachAlertRisksToUpdates(tt.alertRisks) + if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { + t.Errorf("available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionUpdates, u.ConditionalUpdates); diff != "" { + t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedRiskConditions, u.RiskConditions); diff != "" { + t.Errorf("risk conditions mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func Test_evaluateConditionalUpdates(t *testing.T) { + t1, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } + + conditionRegistry := clusterconditions.NewConditionRegistry() + conditionRegistry.Register("Always", &always.Always{}) + + tests := []struct { + name string + shouldReconcileAcceptRisks func() bool + updates []configv1.Release + conditionUpdates []configv1.ConditionalUpdate + expected []configv1.Release + expectedConditionUpdates []configv1.ConditionalUpdate + expectedRiskConditions map[string][]metav1.Condition + }{ + { + name: "basic case", + shouldReconcileAcceptRisks: func() bool { return true }, + updates: []configv1.Release{ + { + Version: "4.21.1", + }, + { + Version: "4.21.2", + }, + }, + expectedConditionUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + expectedRiskConditions: map[string][]metav1.Condition{ + "TestAlert": []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + u := &availableUpdates{Updates: tt.updates, + ConditionalUpdates: tt.conditionUpdates, + ShouldReconcileAcceptRisks: tt.shouldReconcileAcceptRisks, + ConditionRegistry: conditionRegistry, + AlertGetter: &mockAlertGetter{t: t, jsonFile: filepath.Join("testdata", "alerts.json")}, + } + u.evaluateConditionalUpdates(context.TODO()) + if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { + t.Errorf("available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionUpdates, u.ConditionalUpdates, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")); diff != "" { + t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedRiskConditions, u.RiskConditions); diff != "" { + t.Errorf("risk conditions mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 1a2afaf21c..9866fe80c3 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -43,6 +43,7 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcebuilder" "github.com/openshift/cluster-version-operator/lib/validation" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" "github.com/openshift/cluster-version-operator/pkg/customsignaturestore" "github.com/openshift/cluster-version-operator/pkg/cvo/configuration" @@ -207,6 +208,8 @@ type Operator struct { // configuration, if enabled, reconciles the ClusterVersionOperator configuration. configuration *configuration.ClusterVersionOperatorConfiguration + + AlertGetter AlertGetter } // New returns a new cluster version operator. @@ -267,6 +270,7 @@ func New( exclude: exclude, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), + AlertGetter: promql.NewAlertGetter(promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, requiredFeatureSet: featureSet, diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 2a9ce45fac..5a8ba48e16 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -2742,6 +2742,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) { optr.cvLister = &clientCVLister{client: optr.client} optr.cmConfigManagedLister = &cmConfigLister{} optr.eventRecorder = record.NewFakeRecorder(100) + optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version") var updateServiceURI string if tt.handler != nil { diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index afa8f95098..1687e8d060 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -227,7 +227,17 @@ func updateClusterVersionStatus( var riskNamesForDesiredImage []string if shouldReconcileAcceptRisks() { - cvStatus.ConditionalUpdates, riskNamesForDesiredImage = conditionalUpdateWithRiskNamesAndRiskConditions(cvStatus.ConditionalUpdates, getAvailableUpdates, desired.Image) + updates := getAvailableUpdates() + var riskConditions map[string][]metav1.Condition + conditionalUpdates := cvStatus.ConditionalUpdates + if updates != nil { + // updates.Updates may become updates.ConditionalUpdates if some alert becomes a risk + // Here we use the refreshed ones to generate cvStatus + riskConditions = updates.RiskConditions + conditionalUpdates = updates.ConditionalUpdates + cvStatus.AvailableUpdates = updates.Updates + } + cvStatus.ConditionalUpdates, riskNamesForDesiredImage = conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates, riskConditions, desired.Image) cvStatus.ConditionalUpdateRisks = conditionalUpdateRisks(cvStatus.ConditionalUpdates) } @@ -422,14 +432,9 @@ func updateClusterVersionStatus( } } -func conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates []configv1.ConditionalUpdate, getAvailableUpdates func() *availableUpdates, desiredImage string) ([]configv1.ConditionalUpdate, []string) { +func conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates []configv1.ConditionalUpdate, riskConditions map[string][]metav1.Condition, desiredImage string) ([]configv1.ConditionalUpdate, []string) { var result []configv1.ConditionalUpdate var riskNamesForDesiredImage []string - var riskConditions map[string][]metav1.Condition - updates := getAvailableUpdates() - if updates != nil { - riskConditions = updates.RiskConditions - } for _, conditionalUpdate := range conditionalUpdates { riskNames := sets.New[string]() var risks []configv1.ConditionalUpdateRisk @@ -489,6 +494,7 @@ func conditionalUpdateRisks(conditionalUpdates []configv1.ConditionalUpdate) []c sort.Slice(result, func(i, j int) bool { return result[i].Name < result[j].Name }) + klog.V(2).Infof("Got %d conditional update risks", len(result)) return result } diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index 1c7aa1983a..36a9dbf562 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -977,6 +977,10 @@ var ( ) func Test_conditionalUpdateWithRiskNamesAndRiskConditions(t *testing.T) { + t1, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } tests := []struct { name string conditionalUpdates []configv1.ConditionalUpdate @@ -1075,13 +1079,143 @@ func Test_conditionalUpdateWithRiskNamesAndRiskConditions(t *testing.T) { }}, expectedNames: []string{"Risk1", "Risk2"}, }, + { + name: "alert", + desiredImage: "not-important", + availableUpdates: &availableUpdates{ + RiskConditions: map[string][]metav1.Condition{ + "TestAlert": []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + conditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + expected: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + RiskNames: []string{"TestAlert"}, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + RiskNames: []string{"TestAlert"}, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: "False", + Reason: "TestAlert", + Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - getAvailableUpdates := func() *availableUpdates { - return tt.availableUpdates + var riskConditions map[string][]metav1.Condition + if tt.availableUpdates != nil { + riskConditions = tt.availableUpdates.RiskConditions } - actual, actualNames := conditionalUpdateWithRiskNamesAndRiskConditions(tt.conditionalUpdates, getAvailableUpdates, tt.desiredImage) + actual, actualNames := conditionalUpdateWithRiskNamesAndRiskConditions(tt.conditionalUpdates, riskConditions, tt.desiredImage) if difference := cmp.Diff(tt.expected, actual, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")); difference != "" { t.Errorf("conditional updates differ from expected:\n%s", difference) } diff --git a/pkg/cvo/testdata/alerts.json b/pkg/cvo/testdata/alerts.json new file mode 100644 index 0000000000..236878ae2c --- /dev/null +++ b/pkg/cvo/testdata/alerts.json @@ -0,0 +1,137 @@ +{ + "alerts": [ + { + "labels": { + "alertname": "ClusterNotUpgradeable", + "condition": "Upgradeable", + "endpoint": "metrics", + "name": "version", + "namespace": "openshift-cluster-version", + "severity": "info" + }, + "annotations": { + "description": "In most cases, you will still be able to apply patch releases. Reason MultipleReasons. For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.ci-ln-rwc3xi2-76ef8.aws-2.ci.openshift.org/settings/cluster/.", + "summary": "One or more cluster operators have been blocking minor or major version cluster updates for at least an hour." + }, + "state": "pending", + "activeAt": "2026-03-04T00:06:31.900150556Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "UpdateAvailable", + "channel": "simple", + "namespace": "openshift-cluster-version", + "severity": "info", + "upstream": "https://fauxinnati-fauxinnati.apps.ota-stage.q2z4.p1.openshiftapps.com/api/upgrades_info/graph" + }, + "annotations": { + "description": "For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.ci-ln-rwc3xi2-76ef8.aws-2.ci.openshift.org/settings/cluster/.", + "summary": "Your upstream update recommendation service recommends you update your cluster." + }, + "state": "firing", + "activeAt": "2026-03-04T00:36:33.844767047Z", + "value": "2e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "TestAlert", + "container": "cluster-version-operator", + "endpoint": "metrics", + "instance": "10.0.61.171:9099", + "job": "cluster-version-operator", + "namespace": "openshift-cluster-version", + "openShiftUpdatePrecheck": "true", + "pod": "cluster-version-operator-dcb5d56cc-5jc94", + "service": "cluster-version-operator", + "severity": "critical" + }, + "annotations": { + "description": "Test description.", + "summary": "Test summary." + }, + "state": "firing", + "activeAt": "2026-03-04T00:38:19.02109776Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "InsightsRecommendationActive", + "container": "insights-operator", + "description": "Enabling the **TechPreviewNoUpgrade** feature set on your cluster\ncan not be undone and prevents minor version updates. Please do\nnot enable this feature set on production clusters.\n", + "endpoint": "https", + "info_link": "https://console.redhat.com/openshift/insights/advisor/clusters/efe476a4-97ad-4c07-bf46-d3da03a5ff6a?first=ccx_rules_ocp.external.rules.upgrade_is_blocked_due_to_tpfg%7CTECH_PREVIEW_NO_UPGRADE_FEATURE_SET_IS_ENABLED", + "instance": "10.129.0.36:8443", + "job": "metrics", + "namespace": "openshift-insights", + "pod": "insights-operator-65f69d8d84-l6f69", + "service": "metrics", + "severity": "info", + "total_risk": "Important" + }, + "annotations": { + "description": "Insights recommendation \"Enabling the **TechPreviewNoUpgrade** feature set on your cluster\ncan not be undone and prevents minor version updates. Please do\nnot enable this feature set on production clusters.\n\" with total risk \"Important\" was detected on the cluster. More information is available at https://console.redhat.com/openshift/insights/advisor/clusters/efe476a4-97ad-4c07-bf46-d3da03a5ff6a?first=ccx_rules_ocp.external.rules.upgrade_is_blocked_due_to_tpfg%7CTECH_PREVIEW_NO_UPGRADE_FEATURE_SET_IS_ENABLED.", + "summary": "An Insights recommendation is active for this cluster." + }, + "state": "firing", + "activeAt": "2026-03-04T00:07:08.820367488Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "TechPreviewNoUpgrade", + "container": "kube-apiserver-operator", + "endpoint": "https", + "instance": "10.129.0.11:8443", + "job": "kube-apiserver-operator", + "name": "TechPreviewNoUpgrade", + "namespace": "openshift-kube-apiserver-operator", + "pod": "kube-apiserver-operator-75c85dd77-zg8fc", + "service": "metrics", + "severity": "warning" + }, + "annotations": { + "description": "Cluster has enabled Technology Preview features that cannot be undone and will prevent upgrades. The TechPreviewNoUpgrade feature set is not recommended on production clusters.", + "summary": "Cluster has enabled tech preview features that will prevent upgrades." + }, + "state": "firing", + "activeAt": "2026-03-04T00:06:25.787986891Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "Watchdog", + "namespace": "openshift-monitoring", + "severity": "none" + }, + "annotations": { + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." + }, + "state": "firing", + "activeAt": "2026-03-04T00:06:23.165306226Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "AlertmanagerReceiversNotConfigured", + "namespace": "openshift-monitoring", + "severity": "warning" + }, + "annotations": { + "description": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur. Check the OpenShift documentation to learn how to configure notifications with Alertmanager.", + "summary": "Receivers (notification integrations) are not configured on Alertmanager" + }, + "state": "firing", + "activeAt": "2026-03-04T00:06:46.778977652Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + } + ] +} diff --git a/pkg/external/constants.go b/pkg/external/constants.go index 4cbdd95e67..21f343058c 100644 --- a/pkg/external/constants.go +++ b/pkg/external/constants.go @@ -1,6 +1,8 @@ package external -import "github.com/openshift/cluster-version-operator/pkg/internal" +import ( + "github.com/openshift/cluster-version-operator/pkg/internal" +) // The constants defined here are used by the components, e.g., e2e tests that have no access // to github.com/openshift/cluster-version-operator/pkg/internal @@ -18,4 +20,13 @@ const ( // ConditionalUpdateConditionTypeRecommended is a type of the condition present on a conditional update // that indicates whether the conditional update is recommended or not ConditionalUpdateConditionTypeRecommended = internal.ConditionalUpdateConditionTypeRecommended + + // ConditionalUpdateRiskConditionTypeApplies is a type of the condition present on a conditional update risk + // that indicates whether the conditional update risk applies to the cluster + ConditionalUpdateRiskConditionTypeApplies = internal.ConditionalUpdateRiskConditionTypeApplies ) + +// IsAlertConditionReason checks if the given reason is legit for a condition of a risk from an alert that applies +func IsAlertConditionReason(reason string) bool { + return internal.IsAlertConditionReason(reason) +} diff --git a/pkg/internal/constants.go b/pkg/internal/constants.go index 3c0dd28236..eac0f36af3 100644 --- a/pkg/internal/constants.go +++ b/pkg/internal/constants.go @@ -1,6 +1,10 @@ package internal import ( + "fmt" + "strings" + + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" configv1 "github.com/openshift/api/config/v1" @@ -89,4 +93,41 @@ const ( ConditionalUpdateRiskConditionTypeApplies = "Applies" ConditionalUpdateRiskConditionReasonInternalErrorFoundNoRiskCondition = "InternalErrorFoundNoRiskCondition" + + AlertNamePodDisruptionBudgetLimit = "PodDisruptionBudgetLimit" + AlertNamePodDisruptionBudgetAtLimit = "PodDisruptionBudgetAtLimit" + AlertNameKubeContainerWaiting = "KubeContainerWaiting" + + AlertNameKubeletHealthState = "KubeletHealthState" + AlertNameKubeNodeNotReady = "KubeNodeNotReady" + AlertNameKubeNodeReadinessFlapping = "KubeNodeReadinessFlapping" + AlertNameKubeNodeUnreachable = "KubeNodeUnreachable" + + AlertNameVirtHandlerDaemonSetRolloutFailing = "VirtHandlerDaemonSetRolloutFailing" + AlertNameVMCannotBeEvicted = "VMCannotBeEvicted" ) + +var ( + HavePodDisruptionBudget = sets.New[string](AlertNamePodDisruptionBudgetLimit, AlertNamePodDisruptionBudgetAtLimit) + HavePullWaiting = sets.New[string](AlertNameKubeContainerWaiting) + HaveNodes = sets.New[string]( + AlertNameKubeletHealthState, + AlertNameKubeNodeNotReady, + AlertNameKubeNodeReadinessFlapping, + AlertNameKubeNodeUnreachable, + ) +) + +const alertConditionReasonPrefix = "Alert:" + +func AlertConditionReason(state string) string { + return fmt.Sprintf("%s%s", alertConditionReasonPrefix, state) +} + +func IsAlertConditionReason(reason string) bool { + return strings.HasPrefix(reason, alertConditionReasonPrefix) +} + +func AlertConditionMessage(alertName, severity, state, impact, details string) string { + return fmt.Sprintf("%s alert %s %s, %s. %s", severity, alertName, state, impact, details) +} diff --git a/test/cvo/accept_risks.go b/test/cvo/accept_risks.go index d55c4b216e..dc0a1b4a6e 100644 --- a/test/cvo/accept_risks.go +++ b/test/cvo/accept_risks.go @@ -6,10 +6,13 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" prometheusoperatorv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned/typed/monitoring/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" @@ -65,6 +68,88 @@ var _ = g.Describe(`[Jira:"Cluster Version Operator"] cluster-version-operator`, } }) + g.It("should work with risks from alerts", g.Label("OTA-1813"), g.Label("Serial"), g.Label("Local"), func() { + // This test case relies on a public service util.FauxinnatiAPIURL + o.Expect(util.SkipIfNetworkRestricted(ctx, c, util.FauxinnatiAPIURL)).To(o.BeNil()) + + cv, err := configClient.ClusterVersions().Get(ctx, external.DefaultClusterVersionName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + g.By("Using fauxinnati as the upstream and its simple channel") + cv.Spec.Upstream = util.FauxinnatiAPIURL + cv.Spec.Channel = "simple" + + _, err = configClient.ClusterVersions().Update(ctx, cv, metav1.UpdateOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + needRecover = true + + g.By("Create a critical alert for testing") + prometheusRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testing", + Namespace: external.DefaultCVONamespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test", + Rules: []monitoringv1.Rule{ + { + Alert: "TestAlertFeatureE2ETestOTA1813", + Annotations: map[string]string{"summary": "Test summary.", "description": "Test description."}, + Expr: intstr.IntOrString{ + Type: intstr.String, + StrVal: `up{job="cluster-version-operator"} == 1`, + }, + Labels: map[string]string{"severity": "critical", "namespace": "openshift-cluster-version", "openShiftUpdatePrecheck": "true"}, + }, + }, + }, + }, + }, + } + created, err := monitoringClient.PrometheusRules(external.DefaultCVONamespace).Create(ctx, prometheusRule, metav1.CreateOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + defer func() { + err := monitoringClient.PrometheusRules(external.DefaultCVONamespace).Delete(ctx, created.Name, metav1.DeleteOptions{}) + if !errors.IsNotFound(err) { + o.Expect(err).To(o.BeNil()) + } + }() + + g.By("Checking if the risk shows up in ClusterVersion's status") + o.Expect(wait.PollUntilContextTimeout(ctx, 30*time.Second, 10*time.Minute, true, func(ctx context.Context) (done bool, err error) { + cv, err := configClient.ClusterVersions().Get(ctx, external.DefaultClusterVersionName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + for _, risk := range cv.Status.ConditionalUpdateRisks { + if risk.Name == "TestAlertFeatureE2ETestOTA1813" { + if c := meta.FindStatusCondition(risk.Conditions, external.ConditionalUpdateRiskConditionTypeApplies); c != nil { + if c.Status == metav1.ConditionTrue && external.IsAlertConditionReason(c.Reason) { + return true, nil + } + } + } + } + return false, nil + })).NotTo(o.HaveOccurred(), "no conditional update risk from alert found in ClusterVersion's status") + + g.By("Checking that no updates is recommended if alert is firing") + o.Expect(wait.PollUntilContextTimeout(ctx, 30*time.Second, 5*time.Minute, true, func(ctx context.Context) (done bool, err error) { + cv, err := configClient.ClusterVersions().Get(ctx, external.DefaultClusterVersionName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + if len(cv.Status.AvailableUpdates) > 0 { + return false, nil + } + for _, cu := range cv.Status.ConditionalUpdates { + condition := meta.FindStatusCondition(cu.Conditions, external.ConditionalUpdateConditionTypeRecommended) + if condition == nil || condition.Status == metav1.ConditionTrue || condition.Status == metav1.ConditionUnknown { + return false, nil + } + } + return true, nil + })).NotTo(o.HaveOccurred(), "still recommending updates while alert is firing") + }) + g.It("should work with accept risks", g.Label("Serial"), func() { // This test case relies on a public service util.FauxinnatiAPIURL o.Expect(util.SkipIfNetworkRestricted(ctx, c, util.FauxinnatiAPIURL)).To(o.BeNil())