Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# PrometheusRule shipped behind the ClusterUpdateAcceptRisks feature gate.
# Alerts when a conditional update risk evaluates as applying to this cluster.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    k8s-app: cluster-version-operator
  name: cluster-version-operator-accept-risks
  namespace: openshift-cluster-version
  annotations:
    kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks.
    exclude.release.openshift.io/internal-openshift-hosted: "true"
    include.release.openshift.io/self-managed-high-availability: "true"
    release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks"
spec:
  groups:
  - name: cluster-version-tech-preview
    rules:
    - alert: OpenShiftUpdateRiskMightApply
      annotations:
        summary: The cluster might have been exposed to the conditional update risk for 10 minutes.
        # The double-braced form renders to a plain {{ $labels.* }} template after
        # the release-manifest templating pass, so PromQL label substitution works.
        description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-version-operator/OpenShiftUpdateRiskMightApply.md
      # Fires when the "Applies" risk condition is non-zero (1=True, -1=Unknown);
      # last_over_time bridges gaps of up to 5m between scrapes.
      expr: |
        max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0)
      for: 10m
      labels:
        severity: warning
30 changes: 30 additions & 0 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/server/dynamiccertificates"
Expand Down Expand Up @@ -52,6 +53,7 @@ type operatorMetrics struct {
capability *prometheus.GaugeVec
clusterOperatorUp *prometheus.GaugeVec
clusterOperatorConditions *prometheus.GaugeVec
clusterVersionRiskConditions *prometheus.GaugeVec
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
Expand Down Expand Up @@ -102,6 +104,10 @@ penultimate completed version for 'completed'.
Name: "cluster_operator_conditions",
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
}, []string{"name", "condition", "reason"}),
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_version_risk_conditions",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not worth holding on, but personally it bugs me that cluster_operator_conditions uses a plural name when each time-series will only be talking about a single condition at a time. I'd have preferred cluster_operator_condition there and cluster_version_risk_condition here. Too late to be worth changing cluster_operator_conditions at this point. Maybe that means we go plural again just for CVO-metric-weirdness consistency. Or maybe we can pivot to singular before going GA.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upstream has the same preference on Singulars, such as kube_node_status_condition and kube_deployment_status_condition from CHANGELOG.md.

$ rg '_condition' -c
CHANGELOG.md:5
pkg/customresourcestate/registry_factory_test.go:2
internal/store/persistentvolumeclaim_test.go:39
internal/store/node.go:1
internal/store/persistentvolumeclaim.go:1
internal/store/horizontalpodautoscaler.go:1
internal/store/namespace.go:1
internal/store/deployment_test.go:23
internal/store/horizontalpodautoscaler_test.go:10
internal/store/node_test.go:36
internal/store/certificatesigningrequest_test.go:30
internal/store/namespace_test.go:11
internal/store/certificatesigningrequest.go:1
internal/store/deployment.go:1
docs/metrics/auth/certificatesigningrequest-metrics.md:1
docs/metrics/workload/deployment-metrics.md:1
docs/metrics/workload/horizontalpodautoscaler-metrics.md:1
docs/metrics/storage/persistentvolumeclaim-metrics.md:1
docs/metrics/cluster/node-metrics.md:1
docs/metrics/cluster/namespace-metrics.md:1

$ rg '_conditions' -c
pkg/customresourcestate/registry_factory_test.go:2

Maybe that means we go plural again just for CVO-metric-weirdness consistency. Or maybe we can pivot to singular before going GA.

Works for me.
If I get time after completing the features, I will make a pull to rename both.

Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.",
}, []string{"condition", "risk", "reason"}),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we know all reason values in advance?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, there are 3 of them.

riskConditionReasonEvaluationFailed = "EvaluationFailed"
riskConditionReasonMatch = "Match"
riskConditionReasonNotMatch = "NotMatch"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm so right now we have only the following combinations?

  • riskConditionReasonEvaluationFailed => value = -1
  • riskConditionReasonMatch => value = 1
  • riskConditionReasonNotMatch => value = 0

In that case, the reason label seems to be redundant with the metric's value?

Another option would be to expose all possible reasons and report 1 to represent the actual reason (similar to a pattern used by kube-state-metrics, e.g. kube_deployment_status_condition)

cluster_version_risk_conditions{condition="Applies",reason="EvaluationFailed",risk="XXX"} 0
cluster_version_risk_conditions{condition="Applies",reason="Match",risk="XXX"} 0
cluster_version_risk_conditions{condition="Applies",reason="NotMatch",risk="XXX"} 1
cluster_version_risk_conditions{condition="Applies",reason="EvaluationFailed",risk="YYY"} 0
cluster_version_risk_conditions{condition="Applies",reason="Match",risk="YYY"} 1
cluster_version_risk_conditions{condition="Applies",reason="NotMatch",risk="YYYX"} 0

Or

cluster_version_risk_conditions{condition="Applies",status="True",risk="XXX"} 0
cluster_version_risk_conditions{condition="Applies",status="False",risk="XXX"} 1
cluster_version_risk_conditions{condition="Applies",status="Unknown",risk="XXX"} 0
cluster_version_risk_conditions{condition="Applies",status="True",risk="YYY"} 1
cluster_version_risk_conditions{condition="Applies",status="False",risk="YYY"} 0
cluster_version_risk_conditions{condition="Applies",status="Unknown",risk="YYYX"} 0

Intermittent series are harder to reason about (https://prometheus.io/docs/practices/instrumentation/#avoid-missing-metrics)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case, the reason label seems to be redundant with the metric's value?

Yes. But we might add more in the future.

Another option would be to expose all possible reasons and report 1 to represent the actual reason

Then we would have to use the reason label in our alert expression (something like cluster_version_risk_conditions{condition="Applies",reason!="NotMatch"} == 1) to filter out the deviation of interest, correct?

I think it would work too.

Comparing to the current implementation:

cluster_version_risk_conditions{condition="Applies",reason="NotMatch",risk="XXX"} 0
cluster_version_risk_conditions{condition="Applies",reason="Match",risk="YYY"} 1

The suggested way would have more samples (#samples =#risks X #reasons) than the above (#samples=#risks). Do I understand it correctly?

But it is not missing (as described in avoid-missing-metrics), it is just having fewer samples.

I want to keep the implementation close to the existing ones if you feel OK. It is already challenging enough for me to follow. Switching the method for this one would just make the reading even harder in the future.

Let me know if I misunderstood the point.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

riskConditionReasonEvaluationFailed is not very granular. While I have no problem for that for this initial work, I expect we will want to shard that one into failure modes (could not contact the PromQL service, contacted but auth rejected, got a non-200 back, got a 200 back but no metric hits, etc., etc.) to help us figure out what needs to change to improve evaluation.

Another option would be to expose all possible reasons and report 1 to represent the actual reason...

I find that approach harder to consume. And https://prometheus.io/docs/practices/instrumentation/#avoid-missing-metrics :

Time series that are not present until something happens are difficult to deal with, as the usual simple operations are no longer sufficient to correctly handle them. To avoid this, export a default value such as 0 for any time series you know may exist in advance.

isn't really selling me on that approach. I agree that it's good to have a signal for a given (riskName, conditionName) tuple. I don't see a problem with that flipping between multiple time-series as the current reason changes. What kind of simple operations would having multiple time-series make harder?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm advocating for a solution which is already used and documented by kube-state-metrics hence familiar for our users. Another advantage declaring all series upfront is that it's easier for users to understand the metric (you don't have to guess which label values can exist).

I don't see a problem with that flipping between multiple time-series as the current reason changes. What kind of simple operations would having multiple time-series make harder?

Taking the current alert expression: max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0)

If the condition status is Unknown but the reason value flips between "authn error" and "no metrics returned" then the alert would be flapping. With the approach I suggested, I'd use this

avg_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies",status="False"}[5m]) < 1

Which would avoid alert flapping upon transient scrape failures or blips in risk evaluation.

Re propagating the reason as a metric label to express more failure scenarios: my 2 cents would be that the PromQL evaluator should have its own instrumentation (e.g. #1292).

Just to wrap up: I don't claim to enforce anything here, I'm only providing free advice and it's fine if you dismiss my suggestions :) Having said that, upstream instrumentation practices didn't grow out of nowhere and non-predictable series are for sure harder to work with in the end (not only for alerting but also for users).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another advantage declaring all series upfront is that it's easier for users to understand the metric (you don't have to guess which label values can exist).

We can list labels in the metric description if reason-discovery is important. I'm currently not clear on why anyone would need reason-discovery, vs. just trying to human-ingest a current-reason string that's presented to them.

Which would avoid alert flapping upon transient scrape failures or blips in risk evaluation.

But you also lose the information about which reason in alert labels (which get shipped up through Telemetry by default), and flipping between reasons seems like an unlikely corner case. And if we are concerned with flipping between reasons under the alert-for cook window, we can give up on reason in the alert without giving up on reason in the underlying metric, just by dropping it from the max by (...) aggregation:

max by (namespace, risk) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0)

Re propagating the reason as a metric label to express more failure scenarios: my 2 cents would be that the PromQL evaluator should have its own instrumentation (e.g. #1292).

#1292 would tell us there were problems with PromQL evaluation, but will not currently tell us which PromQL is failing to evaluate. The risk label on this pull's alert will tell us which risk is failing (although without a reason, we won't know for sure that the failure mechanism is PromQL related).

Having said that, upstream instrumentation practices didn't grow out of nowhere...

I'm not asserting the upstream approach is wrong, just that the benefit is not yet clear to me ;) I think having a hint at the reason will likely help responding humans, both in-cluster admins and Red-Hat-side update-service maintainers, figure out what needs to be fixed. For in-cluster admins, we don't need the reason in the expr, we can figure it out via a subquery in the description (like we already do when looking up console_url). For Red-Hat-side update-service maintainers, if the reason isn't in the alert, we can still get it out of Insights ClusterVersion uploads. But all of that just needs the reason in some metric, and would work with either the time-series-for-all-possible-reasons approach and the only-time-series-for-the-current-reason approach.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks to both of you for the feedback.
For now, I will keep the current implementation (i will do another round of testing with the latest code).
I will keep this conversation open so that it is visible if/when we are not happy with the metrics/alert and start to figure out why it is like it and do a revisit for improvement.

clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_operator_condition_transitions",
Help: "Reports the number of times that a condition on a cluster operator changes status",
Expand Down Expand Up @@ -487,6 +493,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- m.capability.WithLabelValues("").Desc()
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
Expand All @@ -508,6 +515,26 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
}
}

// collectConditionalUpdateRisks emits one cluster_version_risk_conditions
// sample per risk for the "Applies" condition. The gauge encodes the
// condition status as 1 (True), -1 (Unknown), or 0 (False — the GaugeVec
// default, so no explicit Set is needed for that case).
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
	for i := range risks {
		for _, cond := range risks[i].Conditions {
			// Only the Applies condition is exported; other condition
			// types on a risk are ignored.
			if cond.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
				continue
			}
			gauge := m.clusterVersionRiskConditions.WithLabelValues(cond.Type, risks[i].Name, cond.Reason)
			switch cond.Status {
			case metav1.ConditionTrue:
				gauge.Set(1)
			case metav1.ConditionUnknown:
				gauge.Set(-1)
			}
			ch <- gauge
		}
	}
}

// Collect collects metrics from the operator into the channel ch
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
current := m.optr.currentVersion()
Expand Down Expand Up @@ -653,6 +680,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
}

m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
if m.optr.shouldReconcileAcceptRisks() {
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
}
}

g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)
Expand Down
113 changes: 113 additions & 0 deletions pkg/cvo/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"k8s.io/apiserver/pkg/server/dynamiccertificates"
"k8s.io/client-go/tools/record"

"github.com/openshift/cluster-version-operator/pkg/featuregates"
"github.com/openshift/cluster-version-operator/pkg/internal"
)

Expand Down Expand Up @@ -667,6 +668,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
tt.optr.eventRecorder = record.NewFakeRecorder(100)
if tt.optr.cvLister == nil {
tt.optr.cvLister = &cvLister{}
Expand Down Expand Up @@ -973,6 +975,117 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
}
}

// Test_collectConditionalUpdateRisks verifies that collectConditionalUpdateRisks
// exports cluster_version_risk_conditions only for the "Applies" condition and
// maps condition status to the expected gauge value (True=1, Unknown=-1, False=0).
func Test_collectConditionalUpdateRisks(t *testing.T) {
	type valueWithLabels struct {
		value  float64
		labels map[string]string
	}
	testCases := []struct {
		name     string
		risks    []configv1.ConditionalUpdateRisk
		expected []valueWithLabels
	}{
		{
			name:     "no conditional updates",
			expected: []valueWithLabels{},
		},
		{
			// Conditions whose type is not "Applies" must produce no metrics.
			name: "unknown type",
			risks: []configv1.ConditionalUpdateRisk{
				{
					Name: "RiskX",
					Conditions: []metav1.Condition{{
						Type:    internal.ConditionalUpdateConditionTypeRecommended,
						Status:  metav1.ConditionFalse,
						Reason:  "ReasonA",
						Message: "Risk does not apply",
					}},
				},
			},
		},
		{
			name: "apply false",
			risks: []configv1.ConditionalUpdateRisk{
				{
					Name: "RiskX",
					Conditions: []metav1.Condition{{
						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
						Status:  metav1.ConditionFalse,
						Reason:  "ReasonA",
						Message: "Risk does not apply",
					}},
				},
			},
			// value defaults to 0: False is never Set explicitly.
			expected: []valueWithLabels{{
				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
			}},
		},
		{
			name: "apply true",
			risks: []configv1.ConditionalUpdateRisk{
				{
					Name: "RiskX",
					Conditions: []metav1.Condition{{
						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
						Status:  metav1.ConditionTrue,
						Reason:  "ReasonA",
						Message: "Risk applies",
					}},
				},
			},
			expected: []valueWithLabels{{
				value:  1,
				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
			}},
		},
		{
			name: "apply unknown",
			risks: []configv1.ConditionalUpdateRisk{
				{
					Name: "RiskX",
					Conditions: []metav1.Condition{{
						Type:    internal.ConditionalUpdateRiskConditionTypeApplies,
						Status:  metav1.ConditionUnknown,
						Reason:  "ReasonA",
						Message: "Cannot determine whether the risk applies",
					}},
				},
			},
			expected: []valueWithLabels{{
				value:  -1,
				labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
			}},
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			optr := &Operator{}
			m := newOperatorMetrics(optr)
			ch := make(chan prometheus.Metric)

			// Collect in a goroutine; the unbuffered channel is drained below.
			go func() {
				m.collectConditionalUpdateRisks(ch, tc.risks)
				close(ch)
			}()

			var collected []prometheus.Metric
			for item := range ch {
				collected = append(collected, item)
			}

			if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
				t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
			}
			for i := range tc.expected {
				expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
			}
		})
	}
}

func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
t.Helper()
var d dto.Metric
Expand Down