From 8a9e0f89c0ab5b3f1a3b0d1344256ab49897fd4b Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Sat, 14 Feb 2026 00:00:59 -0500 Subject: [PATCH 1/9] Introduce a new metric cluster_version_risk_conditions Follow up [1]. Samples for `cluster_version_risk_conditions` are collected only when the operator's `shouldReconcileAcceptRisks()` returns true. This means that, e.g., on a cluster with TechPreview disabled, the metric is still defined but has no samples. [1]. https://github.com/openshift/cluster-version-operator/pull/1284#discussion_r2734925872 --- pkg/cvo/metrics.go | 28 ++++++++++ pkg/cvo/metrics_test.go | 112 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 21d2718df..ba1289b79 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -15,6 +15,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apiserver/pkg/server/dynamiccertificates" @@ -54,6 +55,7 @@ type operatorMetrics struct { capability *prometheus.GaugeVec clusterOperatorUp *prometheus.GaugeVec clusterOperatorConditions *prometheus.GaugeVec + clusterVersionRiskConditions *prometheus.GaugeVec clusterOperatorConditionTransitions *prometheus.GaugeVec clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec @@ -104,6 +106,10 @@ penultimate completed version for 'completed'. Name: "cluster_operator_conditions", Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.", }, []string{"name", "condition", "reason"}), + clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_version_risk_conditions", + Help: "Report the risk conditions for cluster versions. 
0 is False and 1 is True.", + }, []string{"name", "condition", "risk"}), clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_condition_transitions", Help: "Reports the number of times that a condition on a cluster operator changes status", @@ -489,6 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.capability.WithLabelValues("").Desc() ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc() + ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() @@ -510,6 +517,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, } } +func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) { + for _, risk := range risks { + for _, condition := range risk.Conditions { + if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies { + continue + } + + g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name) + if condition.Status == metav1.ConditionTrue { + g.Set(1) + } else { + g.Set(0) + } + ch <- g + } + } +} + // Collect collects metrics from the operator into the channel ch func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { current := m.optr.currentVersion() @@ -655,6 +680,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { } m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates) + if m.optr.shouldReconcileAcceptRisks() { + m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks) + } } g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version) diff --git 
a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 63b9c5f98..6d5f1ea2f 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -25,6 +25,7 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/openshift/library-go/pkg/crypto" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/internal" ) @@ -669,6 +670,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version") tt.optr.eventRecorder = record.NewFakeRecorder(100) if tt.optr.cvLister == nil { tt.optr.cvLister = &cvLister{} @@ -975,6 +977,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { } } +func Test_collectConditionalUpdateRisks(t *testing.T) { + type valueWithLabels struct { + value float64 + labels map[string]string + } + testCases := []struct { + name string + risks []configv1.ConditionalUpdateRisk + expected []valueWithLabels + }{ + { + name: "no conditional updates", + expected: []valueWithLabels{}, + }, + { + name: "unknown type", + risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskX", + Conditions: []metav1.Condition{{ + Type: internal.ConditionalUpdateConditionTypeRecommended, + Status: metav1.ConditionFalse, + Reason: "ReasonA", + Message: "Risk does not apply", + }}, + }, + }, + }, + { + name: "apply false", + risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskX", + Conditions: []metav1.Condition{{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionFalse, + Reason: "ReasonA", + Message: "Risk does not apply", + }}, + }, + }, + expected: []valueWithLabels{{ + labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + }}, + }, + { + name: "apply true", + risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskX", + Conditions: []metav1.Condition{{ + Type: 
internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: "ReasonA", + Message: "Risk does not apply", + }}, + }, + }, + expected: []valueWithLabels{{ + value: 1, + labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + }}, + }, + { + name: "apply unknown", + risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskX", + Conditions: []metav1.Condition{{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionUnknown, + Reason: "ReasonA", + Message: "Risk does not apply", + }}, + }, + }, + expected: []valueWithLabels{{ + labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + }}, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + optr := &Operator{} + m := newOperatorMetrics(optr) + ch := make(chan prometheus.Metric) + + go func() { + m.collectConditionalUpdateRisks(ch, tc.risks) + close(ch) + }() + + var collected []prometheus.Metric + for item := range ch { + collected = append(collected, item) + } + + if lenC, lenE := len(collected), len(tc.expected); lenC != lenE { + + t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected)) + } + for i := range tc.expected { + expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels) + } + }) + } +} + func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) { t.Helper() var d dto.Metric From 1bda8e5aa583fcf11331319cb5387122c570fcf1 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Sat, 14 Feb 2026 00:24:28 -0500 Subject: [PATCH 2/9] Add alert/RiskApplies The alert is defined in a new `PrometheusRule/cluster-version-operator-tech-preview` which is installed only on a TechPreview cluster. 
--- ...2_prometheusrule-TechPreviewNoUpgrade.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml new file mode 100644 index 000000000..3da424a22 --- /dev/null +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -0,0 +1,26 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + k8s-app: cluster-version-operator + name: cluster-version-operator-tech-preview + namespace: openshift-cluster-version + annotations: + kubernetes.io/description: Alerting rules for when cluster-version operator metrics call for administrator attention. + exclude.release.openshift.io/internal-openshift-hosted: "true" + include.release.openshift.io/self-managed-high-availability: "true" + release.openshift.io/feature-set: TechPreviewNoUpgrade +spec: + groups: + - name: cluster-version-tech-preview + rules: + - alert: RiskApplies + annotations: + summary: The cluster has been exposed to the conditional update risk for 10 minutes. + description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/RiskApplies.md + expr: | + max by (namespace, name, risk) (cluster_version_risk_conditions{job="cluster-version-operator", name="version", condition="Applies"} == 1) + for: 10m + labels: + severity: warning From b83e05fa2ecec4fde5510bcca5fc42b54f498f60 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 17 Feb 2026 08:28:55 -0500 Subject: [PATCH 3/9] Remove the name label on cluster_version_risk_conditions --- pkg/cvo/metrics.go | 8 ++++---- pkg/cvo/metrics_test.go | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index ba1289b79..7014c1663 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -108,8 +108,8 @@ penultimate completed version for 'completed'. }, []string{"name", "condition", "reason"}), clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_version_risk_conditions", - Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.", - }, []string{"name", "condition", "risk"}), + Help: "Report the risk conditions for the cluster version. 
0 is False and 1 is True.", + }, []string{"condition", "risk"}), clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_condition_transitions", Help: "Reports the number of times that a condition on a cluster operator changes status", @@ -495,7 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.capability.WithLabelValues("").Desc() ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc() - ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc() + ch <- m.clusterVersionRiskConditions.WithLabelValues("", "").Desc() ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() @@ -524,7 +524,7 @@ func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Met continue } - g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name) + g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name) if condition.Status == metav1.ConditionTrue { g.Set(1) } else { diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index 6d5f1ea2f..d2e520c83 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -1019,7 +1019,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) { }, }, expected: []valueWithLabels{{ - labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, }}, }, { @@ -1037,7 +1037,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) { }, expected: []valueWithLabels{{ value: 1, - labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, }}, }, { @@ 
-1054,7 +1054,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) { }, }, expected: []valueWithLabels{{ - labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"}, + labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, }}, }, } From f2d05910b99edf4c84dc9ae4e9eb407f2e980218 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 17 Feb 2026 08:30:15 -0500 Subject: [PATCH 4/9] Do not set the default value --- pkg/cvo/metrics.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 7014c1663..7ab423737 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -527,9 +527,8 @@ func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Met g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name) if condition.Status == metav1.ConditionTrue { g.Set(1) - } else { - g.Set(0) } + // We do not need to do g.Set(0) as it is done when g is initialized ch <- g } } From cde3e625c5711e052086a1b91e8179edd4b44035 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Mon, 23 Feb 2026 11:20:12 -0500 Subject: [PATCH 5/9] Rename the alert name to OpenShiftUpdateRiskApplies --- ...rsion-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml index 3da424a22..004b4a5df 100644 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -14,11 +14,11 @@ spec: groups: - name: cluster-version-tech-preview rules: - - alert: RiskApplies + - alert: OpenShiftUpdateRiskApplies annotations: summary: The cluster has been exposed to the conditional update risk for 10 minutes. 
description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/RiskApplies.md + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskApplies.md expr: | max by (namespace, name, risk) (cluster_version_risk_conditions{job="cluster-version-operator", name="version", condition="Applies"} == 1) for: 10m From aca2e3126f9ed56c605057d744dc48c4fc24155d Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Mon, 23 Feb 2026 11:32:03 -0500 Subject: [PATCH 6/9] refine the alert expression * Remove the "name" label which is removed in the metric * Use `last_over_time` to still get values after resetting --- ...version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml index 004b4a5df..cf14cff66 100644 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -20,7 +20,7 @@ spec: description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. 
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskApplies.md expr: | - max by (namespace, name, risk) (cluster_version_risk_conditions{job="cluster-version-operator", name="version", condition="Applies"} == 1) + max by (namespace, risk) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) == 1) for: 10m labels: severity: warning From 6780b9d6e50f33fb20365606653d6ad7695e39dc Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Mon, 23 Feb 2026 21:59:15 -0500 Subject: [PATCH 7/9] Set -1 when Applies=Unknown --- ...r_02_prometheusrule-TechPreviewNoUpgrade.yaml | 16 ++++++++-------- pkg/cvo/metrics.go | 13 ++++++++----- pkg/cvo/metrics_test.go | 7 ++++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml index cf14cff66..b3c934b21 100644 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -3,24 +3,24 @@ kind: PrometheusRule metadata: labels: k8s-app: cluster-version-operator - name: cluster-version-operator-tech-preview + name: cluster-version-operator-accept-risks namespace: openshift-cluster-version annotations: - kubernetes.io/description: Alerting rules for when cluster-version operator metrics call for administrator attention. + kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks. 
exclude.release.openshift.io/internal-openshift-hosted: "true" include.release.openshift.io/self-managed-high-availability: "true" - release.openshift.io/feature-set: TechPreviewNoUpgrade + release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks" spec: groups: - name: cluster-version-tech-preview rules: - - alert: OpenShiftUpdateRiskApplies + - alert: OpenShiftUpdateRiskMightApply annotations: - summary: The cluster has been exposed to the conditional update risk for 10 minutes. - description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskApplies.md + summary: The cluster might have been exposed to the conditional update risk for 10 minutes. + description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskMightApply.md expr: | - max by (namespace, risk) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) == 1) + max by (namespace, risk, reason) (abs(last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m])) == 1) for: 10m labels: severity: warning diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 7ab423737..30d04302f 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -108,8 +108,8 @@ penultimate completed version for 'completed'. 
}, []string{"name", "condition", "reason"}), clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_version_risk_conditions", - Help: "Report the risk conditions for the cluster version. 0 is False and 1 is True.", - }, []string{"condition", "risk"}), + Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.", + }, []string{"condition", "risk", "reason"}), clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_condition_transitions", Help: "Reports the number of times that a condition on a cluster operator changes status", @@ -495,7 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.capability.WithLabelValues("").Desc() ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc() - ch <- m.clusterVersionRiskConditions.WithLabelValues("", "").Desc() + ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() @@ -524,9 +524,12 @@ func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Met continue } - g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name) - if condition.Status == metav1.ConditionTrue { + g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason) + switch condition.Status { + case metav1.ConditionTrue: g.Set(1) + case metav1.ConditionUnknown: + g.Set(-1) } // We do not need to do g.Set(0) as it is done when g is initialized ch <- g diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index d2e520c83..f12852be3 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -1019,7 +1019,7 @@ 
func Test_collectConditionalUpdateRisks(t *testing.T) { }, }, expected: []valueWithLabels{{ - labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, + labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, }}, }, { @@ -1037,7 +1037,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) { }, expected: []valueWithLabels{{ value: 1, - labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, + labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, }}, }, { @@ -1054,7 +1054,8 @@ func Test_collectConditionalUpdateRisks(t *testing.T) { }, }, expected: []valueWithLabels{{ - labels: map[string]string{"condition": "Applies", "risk": "RiskX"}, + value: -1, + labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, }}, }, } From 73dc865083fc86aeab5b261c80e6f5f7e20f9afc Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 24 Feb 2026 13:56:57 -0500 Subject: [PATCH 8/9] Replace abs() == 1 with != 0 --- ...version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml index b3c934b21..23daf0741 100644 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -20,7 +20,7 @@ spec: description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. 
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskMightApply.md expr: | - max by (namespace, risk, reason) (abs(last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m])) == 1) + max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0) for: 10m labels: severity: warning From 3176f7954ae8761b8e9e45dfef280adbdcf90289 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Fri, 27 Feb 2026 09:28:53 -0500 Subject: [PATCH 9/9] Fix runbook_url --- ...version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml index 23daf0741..f4736ca85 100644 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml @@ -18,7 +18,7 @@ spec: annotations: summary: The cluster might have been exposed to the conditional update risk for 10 minutes. description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskMightApply.md + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-version-operator/OpenShiftUpdateRiskMightApply.md expr: | max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0) for: 10m