From 3a9d623756973c245f74b4107d130ebb07eaa84c Mon Sep 17 00:00:00 2001 From: Mark Old Date: Thu, 26 Feb 2026 14:41:20 -0800 Subject: [PATCH] Properly set Available and Progressing conditions The Pod Identity Webhook controller will now report Available: False when there are no pods available on the deployment, and Progressing: True when not all pods match the latest deployment spec (motivating factor: when the image on the deployment spec has been updated) --- pkg/operator/cleanup/status.go | 4 +- .../credentialsrequest_controller.go | 4 +- pkg/operator/credentialsrequest/status.go | 12 +- pkg/operator/loglevel/status.go | 4 +- .../podidentitywebhook_controller.go | 83 +++++++++++-- pkg/operator/secretannotator/status/status.go | 3 +- pkg/operator/status/status_controller.go | 9 +- pkg/operator/status/status_controller_test.go | 113 ++++++++++-------- 8 files changed, 156 insertions(+), 76 deletions(-) diff --git a/pkg/operator/cleanup/status.go b/pkg/operator/cleanup/status.go index 82cea22924..c571a915fd 100644 --- a/pkg/operator/cleanup/status.go +++ b/pkg/operator/cleanup/status.go @@ -1,6 +1,8 @@ package cleanup import ( + "context" + log "github.com/sirupsen/logrus" configv1 "github.com/openshift/api/config/v1" @@ -9,7 +11,7 @@ import ( var _ status.Handler = &ReconcileStaleCredentialsRequest{} -func (r *ReconcileStaleCredentialsRequest) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { +func (r *ReconcileStaleCredentialsRequest) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { return []configv1.ClusterOperatorStatusCondition{}, nil } diff --git a/pkg/operator/credentialsrequest/credentialsrequest_controller.go b/pkg/operator/credentialsrequest/credentialsrequest_controller.go index 1dc5c34aa4..8be2808ec7 100644 --- a/pkg/operator/credentialsrequest/credentialsrequest_controller.go +++ b/pkg/operator/credentialsrequest/credentialsrequest_controller.go @@ -493,9 +493,9 @@ type ReconcileSecretMissingLabel struct { mutatingClient corev1client.SecretsGetter } -func (r *ReconcileSecretMissingLabel) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { +func (r *ReconcileSecretMissingLabel) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { var secrets corev1.SecretList - if err := r.cachedClient.List(context.TODO(), &secrets); err != nil { + if err := r.cachedClient.List(ctx, &secrets); err != nil { return nil, err } var missing int diff --git a/pkg/operator/credentialsrequest/status.go b/pkg/operator/credentialsrequest/status.go index a433dc0b15..573b57cd64 100644 --- a/pkg/operator/credentialsrequest/status.go +++ b/pkg/operator/credentialsrequest/status.go @@ -29,8 +29,8 @@ const ( var _ status.Handler = &ReconcileCredentialsRequest{} -func (r *ReconcileCredentialsRequest) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { - _, credRequests, mode, err := r.getOperatorState(logger) +func (r *ReconcileCredentialsRequest) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { + _, credRequests, mode, err := r.getOperatorState(ctx, logger) if err != nil { return []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("failed to get operator state: %v", err) } @@ -43,7 +43,7 @@ func (r *ReconcileCredentialsRequest) GetConditions(logger log.FieldLogger) ([]c } func (r *ReconcileCredentialsRequest) GetRelatedObjects(logger log.FieldLogger) ([]configv1.ObjectReference, error) { - _, credRequests, _, err := r.getOperatorState(logger) + _, credRequests, _, err := r.getOperatorState(context.TODO(), logger) if err != nil { return []configv1.ObjectReference{}, fmt.Errorf("failed to get operator state: %v", err) } @@ -56,10 +56,10 @@ func (r *ReconcileCredentialsRequest) Name() string { // getOperatorState gets and returns the resources necessary to compute the // operator's current state. -func (r *ReconcileCredentialsRequest) getOperatorState(logger log.FieldLogger) (*corev1.Namespace, []minterv1.CredentialsRequest, operatorv1.CloudCredentialsMode, error) { +func (r *ReconcileCredentialsRequest) getOperatorState(ctx context.Context, logger log.FieldLogger) (*corev1.Namespace, []minterv1.CredentialsRequest, operatorv1.CloudCredentialsMode, error) { ns := &corev1.Namespace{} - if err := r.Client.Get(context.TODO(), types.NamespacedName{Name: minterv1.CloudCredOperatorNamespace}, ns); err != nil { + if err := r.Client.Get(ctx, types.NamespacedName{Name: minterv1.CloudCredOperatorNamespace}, ns); err != nil { if apierrors.IsNotFound(err) { return nil, nil, operatorv1.CloudCredentialsModeDefault, nil } @@ -72,7 +72,7 @@ func (r *ReconcileCredentialsRequest) getOperatorState(logger log.FieldLogger) ( // central list to live. Other credentials requests in other namespaces will not affect status, // but they will still work fine. credRequestList := &minterv1.CredentialsRequestList{} - err := r.Client.List(context.TODO(), credRequestList, client.InNamespace(minterv1.CloudCredOperatorNamespace)) + err := r.Client.List(ctx, credRequestList, client.InNamespace(minterv1.CloudCredOperatorNamespace)) if err != nil { return nil, nil, operatorv1.CloudCredentialsModeDefault, fmt.Errorf( "failed to list CredentialsRequests: %v", err) diff --git a/pkg/operator/loglevel/status.go b/pkg/operator/loglevel/status.go index 70928f9f09..6b9e96077b 100644 --- a/pkg/operator/loglevel/status.go +++ b/pkg/operator/loglevel/status.go @@ -1,6 +1,8 @@ package loglevel import ( + "context" + log "github.com/sirupsen/logrus" configv1 "github.com/openshift/api/config/v1" @@ -9,7 +11,7 @@ import ( var _ status.Handler = &ReconcileCloudCredentialConfig{} -func (r *ReconcileCloudCredentialConfig) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { +func (r *ReconcileCloudCredentialConfig) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { return []configv1.ClusterOperatorStatusCondition{}, nil } diff --git a/pkg/operator/podidentity/podidentitywebhook_controller.go b/pkg/operator/podidentity/podidentitywebhook_controller.go index 134f51e01f..e289c22bd6 100644 --- a/pkg/operator/podidentity/podidentitywebhook_controller.go +++ b/pkg/operator/podidentity/podidentitywebhook_controller.go @@ -37,12 +37,14 @@ import ( ) const ( - controllerName = "pod-identity" - deploymentName = "cloud-credential-operator" - operatorNamespace = "openshift-cloud-credential-operator" - retryInterval = 10 * time.Second - reasonStaticResourceReconcileFailed = "StaticResourceReconcileFailed" - pdb = "v4.1.0/common/poddisruptionbudget.yaml" + controllerName = "pod-identity" + deploymentName = "cloud-credential-operator" + operatorNamespace = "openshift-cloud-credential-operator" + retryInterval = 10 * time.Second + reasonStaticResourceReconcileFailed = "StaticResourceReconcileFailed" + reasonPodIdentityWebhookPodsStillUpdating = "PodIdentityWebhookPodsStillUpdating" + reasonPodIdentityWebhookPodsNotAvailable = "PodIdentityWebhookPodsNotAvailable" + pdb = "v4.1.0/common/poddisruptionbudget.yaml" ) var ( @@ -384,10 +386,75 @@ func (r *staticResourceReconciler) ReconcileResources(ctx context.Context) error return nil } +type webhookPodStatus struct { + desiredReplicas int32 + availableReplicas int32 + updatedReplicas int32 +} + +func (wps *webhookPodStatus) Available() bool { + return wps.availableReplicas > 0 +} + +func (wps *webhookPodStatus) Progressing() bool { + return wps.updatedReplicas != wps.desiredReplicas +} + +func (r *staticResourceReconciler) CheckPodStatus(ctx context.Context) (*webhookPodStatus, error) { + deployment := &appsv1.Deployment{} + err := r.client.Get(ctx, client.ObjectKey{ + Name: "pod-identity-webhook", + Namespace: "openshift-cloud-credential-operator", + }, deployment) + if err != nil { + return nil, err + } + + if deployment.Spec.Replicas == nil { + return &webhookPodStatus{}, nil + } + + deploymentStatus := &webhookPodStatus{ + desiredReplicas: *deployment.Spec.Replicas, + availableReplicas: deployment.Status.AvailableReplicas, + updatedReplicas: deployment.Status.UpdatedReplicas, + } + + return deploymentStatus, nil +} + var _ status.Handler = &staticResourceReconciler{} -func (r *staticResourceReconciler) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { - return r.conditions, nil +func (r *staticResourceReconciler) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { + podStatus, err := r.CheckPodStatus(ctx) + if err != nil { + logger.WithError(err).Errorf("checking pod identity webhook status failed") + return nil, err + } + + conditions := r.conditions + if !podStatus.Available() { + conditions = append(conditions, configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorAvailable, + Status: configv1.ConditionFalse, + Reason: reasonPodIdentityWebhookPodsNotAvailable, + Message: "No pod identity webhook pods are available\n", + }) + } + if podStatus.Progressing() { + conditions = append(conditions, configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: reasonPodIdentityWebhookPodsStillUpdating, + Message: fmt.Sprintf( + "Waiting for pod identity webhook deployment rollout to finish: %d out of %d new replica(s) have been updated...\n", + podStatus.updatedReplicas, + podStatus.desiredReplicas, + ), + }) + } + + return conditions, nil } func (r *staticResourceReconciler) GetRelatedObjects(logger log.FieldLogger) ([]configv1.ObjectReference, error) { diff --git a/pkg/operator/secretannotator/status/status.go b/pkg/operator/secretannotator/status/status.go index 367079f65a..aaccecb714 100644 --- a/pkg/operator/secretannotator/status/status.go +++ b/pkg/operator/secretannotator/status/status.go @@ -1,6 +1,7 @@ package status import ( + "context" "fmt" log "github.com/sirupsen/logrus" @@ -28,7 +29,7 @@ func NewSecretStatusHandler(kubeClient client.Client) *SecretStatusHandler { var _ status.Handler = &SecretStatusHandler{} -func (s *SecretStatusHandler) GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { +func (s *SecretStatusHandler) GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { conditions := []configv1.ClusterOperatorStatusCondition{} mode, conflict, err := utils.GetOperatorConfiguration(s.kubeClient, logger) diff --git a/pkg/operator/status/status_controller.go b/pkg/operator/status/status_controller.go index 24ea90f478..6bf2abcdad 100644 --- a/pkg/operator/status/status_controller.go +++ b/pkg/operator/status/status_controller.go @@ -44,7 +44,7 @@ const ( // Handler produces conditions and related objects to be reflected // in the cloud-credential-operator ClusterOperatorStatus type Handler interface { - GetConditions(logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) + GetConditions(ctx context.Context, logger log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) GetRelatedObjects(logger log.FieldLogger) ([]configv1.ObjectReference, error) Name() string } @@ -179,7 +179,7 @@ func (r *ReconcileStatus) Reconcile(ctx context.Context, request reconcile.Reque metrics.MetricControllerReconcileTime.WithLabelValues(controllerName).Observe(dur.Seconds()) }() - err := syncStatus(r.Client, r.Logger) + err := syncStatus(ctx, r.Client, r.Logger) return reconcile.Result{ RequeueAfter: defaultRequeuePeriod, }, err @@ -188,7 +188,7 @@ func (r *ReconcileStatus) Reconcile(ctx context.Context, request reconcile.Reque // syncStatus is written in a way so that if we expose this function it would allow // external controllers to trigger a static sync. But for now we will make this an internal // function until the need arises to expose it. -func syncStatus(kubeClient client.Client, logger log.FieldLogger) error { +func syncStatus(ctx context.Context, kubeClient client.Client, logger log.FieldLogger) error { log.Info("reconciling clusteroperator status") co := &configv1.ClusterOperator{} @@ -209,7 +209,7 @@ func syncStatus(kubeClient client.Client, logger log.FieldLogger) error { conditions := []configv1.ClusterOperatorStatusCondition{} relatedObjects := []configv1.ObjectReference{} for handlerName, handler := range statusHandlers { - handlerConditions, err := handler.GetConditions(logger) + handlerConditions, err := handler.GetConditions(ctx, logger) logger.WithFields(log.Fields{ "handlerconditions": handlerConditions, "statushandler": handlerName, @@ -261,6 +261,7 @@ func syncStatus(kubeClient client.Client, logger log.FieldLogger) error { progressing, _ := findClusterOperatorCondition(co.Status.Conditions, configv1.OperatorProgressing) // We know this should be there. progressing.LastTransitionTime = metav1.Now() + progressing.Status = configv1.ConditionTrue } // ClusterOperator should already exist (from the manifest payload), but recreate it if needed diff --git a/pkg/operator/status/status_controller_test.go b/pkg/operator/status/status_controller_test.go index e288a357dd..14fb5bde73 100644 --- a/pkg/operator/status/status_controller_test.go +++ b/pkg/operator/status/status_controller_test.go @@ -165,11 +165,14 @@ func TestConditionsEqual(t *testing.T) { } for _, tc := range testCases { - actual := conditionEqual(tc.a, tc.b) - if actual != tc.expected { - t.Fatalf("%q: expected %v, got %v", tc.description, - tc.expected, actual) - } + t.Run(tc.description, func(t *testing.T) { + + actual := conditionEqual(tc.a, tc.b) + if actual != tc.expected { + t.Fatalf("%q: expected %v, got %v", tc.description, + tc.expected, actual) + } + }) } } @@ -293,43 +296,45 @@ func TestConditions(t *testing.T) { } for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + + basicClusterOperator := testBasicClusterOperator() + // Make sure we have a clean ClusterOperator and CCO config for each test run + objects := []runtime.Object{ + basicClusterOperator, + testOperatorConfig(""), + } + + fakeClient := fake.NewClientBuilder(). + WithStatusSubresource(basicClusterOperator). + WithRuntimeObjects(objects...).Build() + + r := &ReconcileStatus{ + Client: fakeClient, + Logger: log.WithField("controller", controllerName), + platform: configv1.AWSPlatformType, + } - basicClusterOperator := testBasicClusterOperator() - // Make sure we have a clean ClusterOperator and CCO config for each test run - objects := []runtime.Object{ - basicClusterOperator, - testOperatorConfig(""), - } - - fakeClient := fake.NewClientBuilder(). - WithStatusSubresource(basicClusterOperator). - WithRuntimeObjects(objects...).Build() - - r := &ReconcileStatus{ - Client: fakeClient, - Logger: log.WithField("controller", controllerName), - platform: configv1.AWSPlatformType, - } - - for _, handler := range test.statusHandlers { - AddHandler(handler.Name(), handler) - } - defer clearHandlers() - - _, err := r.Reconcile(context.TODO(), reconcile.Request{}) - - require.NoError(t, err, "unexpected error") - - for _, condition := range test.expectedConditions { - co := getClusterOperator(fakeClient) - assert.NotNil(t, co) - foundCondition, _ := findClusterOperatorCondition(co.Status.Conditions, condition.conditionType) - require.NotNil(t, foundCondition) - assert.Equal(t, string(condition.status), string(foundCondition.Status), "condition %s had unexpected status", condition.conditionType) - if condition.reason != "" { - assert.Exactly(t, condition.reason, foundCondition.Reason) + for _, handler := range test.statusHandlers { + AddHandler(handler.Name(), handler) } - } + defer clearHandlers() + + _, err := r.Reconcile(context.TODO(), reconcile.Request{}) + + require.NoError(t, err, "unexpected error") + + for _, condition := range test.expectedConditions { + co := getClusterOperator(fakeClient) + assert.NotNil(t, co) + foundCondition, _ := findClusterOperatorCondition(co.Status.Conditions, condition.conditionType) + require.NotNil(t, foundCondition) + assert.Equal(t, string(condition.status), string(foundCondition.Status), "condition %s had unexpected status", condition.conditionType) + if condition.reason != "" { + assert.Exactly(t, condition.reason, foundCondition.Reason) + } + } + }) } } @@ -438,21 +443,23 @@ func TestSortedStatus(t *testing.T) { } for _, test := range tests { - sortStatusArrays(&test.status) - assert.ElementsMatchf(t, test.status.Conditions, test.expected.Conditions, "conditions = %v, want %v", test.status.Conditions, test.expected.Conditions) - assert.ElementsMatchf(t, test.status.RelatedObjects, test.expected.RelatedObjects, "conditions = %v, want %v", test.status.RelatedObjects, test.expected.RelatedObjects) + t.Run(test.name, func(t *testing.T) { + sortStatusArrays(&test.status) + assert.ElementsMatchf(t, test.status.Conditions, test.expected.Conditions, "conditions = %v, want %v", test.status.Conditions, test.expected.Conditions) + assert.ElementsMatchf(t, test.status.RelatedObjects, test.expected.RelatedObjects, "conditions = %v, want %v", test.status.RelatedObjects, test.expected.RelatedObjects) - for i, expected := range test.expected.Conditions { - assert.Equal(t, expected, test.status.Conditions[i]) - } + for i, expected := range test.expected.Conditions { + assert.Equal(t, expected, test.status.Conditions[i]) + } - for i, expected := range test.expected.RelatedObjects { - assert.Equal(t, expected, test.expected.RelatedObjects[i]) - } + for i, expected := range test.expected.RelatedObjects { + assert.Equal(t, expected, test.expected.RelatedObjects[i]) + } - if !reflect.DeepEqual(test.status, test.expected) { - t.Error("status' are not equal") - } + if !reflect.DeepEqual(test.status, test.expected) { + t.Error("status' are not equal") + } + }) } } @@ -534,7 +541,7 @@ func newHandler(name string, conditions []configv1.ClusterOperatorStatusConditio } } -func (h *miniHandler) GetConditions(log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { +func (h *miniHandler) GetConditions(context.Context, log.FieldLogger) ([]configv1.ClusterOperatorStatusCondition, error) { return h.conditions, nil }