diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml
new file mode 100644
index 000000000..f4736ca85
--- /dev/null
+++ b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    k8s-app: cluster-version-operator
+  name: cluster-version-operator-accept-risks
+  namespace: openshift-cluster-version
+  annotations:
+    kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks.
+    exclude.release.openshift.io/internal-openshift-hosted: "true"
+    include.release.openshift.io/self-managed-high-availability: "true"
+    release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks"
+spec:
+  groups:
+  - name: cluster-version-tech-preview
+    rules:
+    - alert: OpenShiftUpdateRiskMightApply
+      annotations:
+        summary: The cluster might have been exposed to the conditional update risk for 10 minutes.
+        description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-version-operator/OpenShiftUpdateRiskMightApply.md
+      # A non-zero value covers both True (1) and Unknown (-1), so the alert
+      # also fires while the Applies condition is Unknown.
+      expr: |
+        max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0)
+      for: 10m
+      labels:
+        severity: warning
diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go
index 523d94d8c..f7e3e0434 100644
--- a/pkg/cvo/metrics.go
+++ b/pkg/cvo/metrics.go
@@ -14,6 +14,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apiserver/pkg/server/dynamiccertificates"
@@ -52,6 +53,7 @@ type operatorMetrics struct {
 	capability                                            *prometheus.GaugeVec
 	clusterOperatorUp                                     *prometheus.GaugeVec
 	clusterOperatorConditions                             *prometheus.GaugeVec
+	clusterVersionRiskConditions                          *prometheus.GaugeVec
 	clusterOperatorConditionTransitions                   *prometheus.GaugeVec
 	clusterInstaller                                      *prometheus.GaugeVec
 	clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -102,6 +104,10 @@ penultimate completed version for 'completed'.
 			Name: "cluster_operator_conditions",
 			Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
 		}, []string{"name", "condition", "reason"}),
+		clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Name: "cluster_version_risk_conditions",
+			Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.",
+		}, []string{"condition", "risk", "reason"}),
 		clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "cluster_operator_condition_transitions",
 			Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -487,6 +493,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
 	ch <- m.capability.WithLabelValues("").Desc()
 	ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
 	ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
+	ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
 	ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
 	ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
 	ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -508,6 +515,35 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
 	}
 }
 
+// collectConditionalUpdateRisks sends one cluster_version_risk_conditions gauge
+// to ch for every Applies condition found on the given risks: 1 when the
+// condition is True, -1 when it is Unknown and 0 otherwise. Conditions of any
+// other type are skipped.
+func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
+	for _, risk := range risks {
+		for _, condition := range risk.Conditions {
+			if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
+				continue
+			}
+
+			g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason)
+			// The vector hands back the same gauge for the same label values on
+			// every collection, so every status has to be written explicitly.
+			// Otherwise a risk whose status returns to False while its reason
+			// stays the same would keep reporting the previous 1 or -1.
+			switch condition.Status {
+			case metav1.ConditionTrue:
+				g.Set(1)
+			case metav1.ConditionUnknown:
+				g.Set(-1)
+			default:
+				g.Set(0)
+			}
+			ch <- g
+		}
+	}
+}
+
 // Collect collects metrics from the operator into the channel ch
 func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
 	current := m.optr.currentVersion()
@@ -653,6 +689,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
 		}
 
 		m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
+		if m.optr.shouldReconcileAcceptRisks() {
+			m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
+		}
 	}
 
 	g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)
diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go
index 8c5748072..224770821 100644
--- a/pkg/cvo/metrics_test.go
+++ b/pkg/cvo/metrics_test.go
@@ -23,6 +23,7 @@ import (
 	"k8s.io/apiserver/pkg/server/dynamiccertificates"
 	"k8s.io/client-go/tools/record"
 
+	"github.com/openshift/cluster-version-operator/pkg/featuregates"
 	"github.com/openshift/cluster-version-operator/pkg/internal"
 )
 
@@ -667,6 +668,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
+			tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
 			tt.optr.eventRecorder = record.NewFakeRecorder(100)
 			if tt.optr.cvLister == nil {
 				tt.optr.cvLister = &cvLister{}
@@ -973,6 +975,163 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
 	}
 }
 
+// Test_collectConditionalUpdateRisks checks the gauge emitted by a single
+// collection for each status of the Applies condition.
+func Test_collectConditionalUpdateRisks(t *testing.T) {
+	type valueWithLabels struct {
+		value  float64
+		labels map[string]string
+	}
+	testCases := []struct {
+		name     string
+		risks    []configv1.ConditionalUpdateRisk
+		expected []valueWithLabels
+	}{
+		{
+			name:     "no conditional update risks",
+			expected: []valueWithLabels{},
+		},
+		{
+			name: "unknown type",
+			risks: []configv1.ConditionalUpdateRisk{
+				{
+					Name: "RiskX",
+					Conditions: []metav1.Condition{{
+						Type:    internal.ConditionalUpdateConditionTypeRecommended,
+						Status:  metav1.ConditionFalse,
+						Reason:  "ReasonA",
+						Message: "Risk does not apply",
+					}},
+				},
+			},
+		},
+		{
+			name: "apply false",
+			risks: []configv1.ConditionalUpdateRisk{
+				{
+					Name: "RiskX",
+					Conditions: []metav1.Condition{{
+						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
+						Status:  metav1.ConditionFalse,
+						Reason:  "ReasonA",
+						Message: "Risk does not apply",
+					}},
+				},
+			},
+			expected: []valueWithLabels{{
+				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
+			}},
+		},
+		{
+			name: "apply true",
+			risks: []configv1.ConditionalUpdateRisk{
+				{
+					Name: "RiskX",
+					Conditions: []metav1.Condition{{
+						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
+						Status:  metav1.ConditionTrue,
+						Reason:  "ReasonA",
+						Message: "Risk applies",
+					}},
+				},
+			},
+			expected: []valueWithLabels{{
+				value:  1,
+				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
+			}},
+		},
+		{
+			name: "apply unknown",
+			risks: []configv1.ConditionalUpdateRisk{
+				{
+					Name: "RiskX",
+					Conditions: []metav1.Condition{{
+						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
+						Status:  metav1.ConditionUnknown,
+						Reason:  "ReasonA",
+						Message: "Could not evaluate whether the risk applies",
+					}},
+				},
+			},
+			expected: []valueWithLabels{{
+				value:  -1,
+				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
+			}},
+		},
+	}
+
+	for _, tc := range testCases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			optr := &Operator{}
+			m := newOperatorMetrics(optr)
+			ch := make(chan prometheus.Metric)
+
+			go func() {
+				m.collectConditionalUpdateRisks(ch, tc.risks)
+				close(ch)
+			}()
+
+			var collected []prometheus.Metric
+			for item := range ch {
+				collected = append(collected, item)
+			}
+
+			if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
+				t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
+			}
+			for i := range tc.expected {
+				expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
+			}
+		})
+	}
+}
+
+// Test_collectConditionalUpdateRisks_statusChange checks that a gauge reused
+// across collections follows the condition status instead of keeping the
+// value from an earlier collection.
+func Test_collectConditionalUpdateRisks_statusChange(t *testing.T) {
+	m := newOperatorMetrics(&Operator{})
+	labels := map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}
+
+	steps := []struct {
+		status metav1.ConditionStatus
+		value  float64
+	}{
+		{status: metav1.ConditionTrue, value: 1},
+		{status: metav1.ConditionFalse, value: 0},
+		{status: metav1.ConditionUnknown, value: -1},
+		{status: metav1.ConditionFalse, value: 0},
+	}
+
+	for _, step := range steps {
+		risks := []configv1.ConditionalUpdateRisk{{
+			Name: "RiskX",
+			Conditions: []metav1.Condition{{
+				Type:   internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status: step.status,
+				Reason: "ReasonA",
+			}},
+		}}
+		ch := make(chan prometheus.Metric)
+
+		go func() {
+			m.collectConditionalUpdateRisks(ch, risks)
+			close(ch)
+		}()
+
+		var collected []prometheus.Metric
+		for item := range ch {
+			collected = append(collected, item)
+		}
+
+		if len(collected) != 1 {
+			t.Fatalf("Expected 1 metric for status %s, got %d metrics\nGot metrics: %s", step.status, len(collected), spew.Sdump(collected))
+		}
+		expectMetric(t, collected[0], step.value, labels)
+	}
+}
+
 func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
 	t.Helper()
 	var d dto.Metric