Skip to content

Commit 3f7c7bf

Browse files
committed
fix(dvcr): make garbage collection completion more reliable
Use the dvcr garbage collection secret as the source of truth for postponed VI/CVI/VD imports and requeue postponed resources when the secret is created, updated, or deleted. Split deployment status persistence from garbage collection secret deletion to avoid losing the final GC result on deployment update conflicts with hook-triggered Helm reconciliations. It should fix problem when VI/CVI/VD stuck in "Postponed" state. Add timeout handling for running garbage collection and provisioner wait stages, and keep the cleanup result safe until it is persisted to deployment status. Minor: - Handle nil garbage collection secret in dvcr-cleaner when adding cleanup completion annotation. - Refactor dvcr gc secret watcher: filter events for secret early in predicate functions. Signed-off-by: Ivan Mikheykin <ivan.mikheykin@flant.com>
1 parent 5cdbf7e commit 3f7c7bf

17 files changed

Lines changed: 290 additions & 36 deletions

File tree

images/dvcr-artifact/cmd/dvcr-cleaner/cmd/gc.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,10 @@ func annotateGarbageCollectionSecretOnCleanupDone(ctx context.Context, result ma
380380
return err
381381
}
382382

383+
if secret == nil {
384+
return fmt.Errorf("garbage collection secret not found")
385+
}
386+
383387
if secret.Annotations == nil {
384388
secret.Annotations = make(map[string]string)
385389
}

images/virtualization-artifact/pkg/controller/cvi/cvi_controller.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import (
3333
"github.com/deckhouse/deckhouse/pkg/log"
3434
"github.com/deckhouse/virtualization-controller/pkg/controller/cvi/internal"
3535
"github.com/deckhouse/virtualization-controller/pkg/controller/cvi/internal/source"
36-
"github.com/deckhouse/virtualization-controller/pkg/controller/dvcr-garbage-collection/postponehandler"
36+
"github.com/deckhouse/virtualization-controller/pkg/controller/dvcr-garbage-collection/postponeimporter"
3737
"github.com/deckhouse/virtualization-controller/pkg/controller/gc"
3838
"github.com/deckhouse/virtualization-controller/pkg/controller/indexer"
3939
"github.com/deckhouse/virtualization-controller/pkg/controller/service"
@@ -81,7 +81,7 @@ func NewController(
8181

8282
reconciler := NewReconciler(
8383
mgr.GetClient(),
84-
postponehandler.New[*v1alpha2.ClusterVirtualImage](dvcrService, recorder),
84+
postponeimporter.NewHandler[*v1alpha2.ClusterVirtualImage](dvcrService, recorder),
8585
internal.NewDatasourceReadyHandler(sources),
8686
internal.NewLifeCycleHandler(sources, mgr.GetClient()),
8787
internal.NewImagePresenceHandler(dvcr.NewImageChecker(mgr.GetClient(), dvcrSettings)),
@@ -100,7 +100,7 @@ func NewController(
100100
return nil, err
101101
}
102102

103-
err = reconciler.SetupController(ctx, mgr, cviController)
103+
err = reconciler.SetupController(ctx, mgr, cviController, log)
104104
if err != nil {
105105
return nil, err
106106
}

images/virtualization-artifact/pkg/controller/cvi/cvi_reconciler.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ import (
3030
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3131
"sigs.k8s.io/controller-runtime/pkg/source"
3232

33+
"github.com/deckhouse/deckhouse/pkg/log"
3334
"github.com/deckhouse/virtualization-controller/pkg/controller/cvi/internal/watcher"
35+
"github.com/deckhouse/virtualization-controller/pkg/controller/dvcr-garbage-collection/postponeimporter"
3436
"github.com/deckhouse/virtualization-controller/pkg/controller/reconciler"
3537
"github.com/deckhouse/virtualization-controller/pkg/controller/watchers"
3638
"github.com/deckhouse/virtualization/api/core/v1alpha2"
@@ -81,7 +83,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
8183
return rec.Reconcile(ctx)
8284
}
8385

84-
func (r *Reconciler) SetupController(_ context.Context, mgr manager.Manager, ctr controller.Controller) error {
86+
func (r *Reconciler) SetupController(_ context.Context, mgr manager.Manager, ctr controller.Controller, logger *log.Logger) error {
8587
if err := ctr.Watch(
8688
source.Kind(
8789
mgr.GetCache(),
@@ -102,6 +104,7 @@ func (r *Reconciler) SetupController(_ context.Context, mgr manager.Manager, ctr
102104
watcher.NewVirtualMachineWatcher(),
103105
watcher.NewVirtualDiskWatcher(mgrClient),
104106
watcher.NewVirtualDiskSnapshotWatcher(mgrClient),
107+
postponeimporter.NewWatcher[*v1alpha2.ClusterVirtualImage](mgrClient, logger),
105108
} {
106109
err := w.Watch(mgr, ctr)
107110
if err != nil {

images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/condition/deployment.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,15 @@ func UpdateGarbageCollectionCondition(deploy *appsv1.Deployment, reason dvcrdepl
7171
}
7272
deploy.Status.Conditions = filteredConditions
7373
}
74+
75+
func GetGarbageCollectionCondition(deploy *appsv1.Deployment) appsv1.DeploymentCondition {
76+
if deploy == nil {
77+
return appsv1.DeploymentCondition{}
78+
}
79+
for _, cond := range deploy.Status.Conditions {
80+
if cond.Type == dvcrdeploymentcondition.GarbageCollectionType {
81+
return cond
82+
}
83+
}
84+
return appsv1.DeploymentCondition{}
85+
}

images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/internal/life_cycle.go

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ type LifeCycleHandler struct {
3737
provisioningLister dvcrtypes.ProvisioningLister
3838
}
3939

40+
var (
41+
gcStatusPollingInterval = time.Second * 20
42+
resultPersistRetryInterval = time.Second
43+
)
44+
4045
func NewLifeCycleHandler(client client.Client, dvcrService dvcrtypes.DVCRService, provisioningLister dvcrtypes.ProvisioningLister) *LifeCycleHandler {
4146
return &LifeCycleHandler{
4247
client: client,
@@ -74,11 +79,17 @@ func (h LifeCycleHandler) Handle(ctx context.Context, req reconcile.Request, dep
7479
return reconcile.Result{}, fmt.Errorf("fetch garbage collection secret: %w", err)
7580
}
7681
if secret == nil || secret.GetDeletionTimestamp() != nil {
77-
// Secret is gone, no action required.
82+
// Secret is gone, nothing to clean up:
83+
// - Keep deployment conditions for informational purposes.
84+
// - Postponed CVI/VI/VD will start importers themselves.
7885
return reconcile.Result{}, nil
7986
}
8087

8188
if h.dvcrService.IsGarbageCollectionDone(secret) {
89+
if h.dvcrService.IsGarbageCollectionResultPersisted(secret, deploy) {
90+
return reconcile.Result{}, h.dvcrService.DeleteGarbageCollectionSecret(ctx)
91+
}
92+
8293
// Extract error or success message from the result.
8394
reason, msg, err := h.dvcrService.ParseGarbageCollectionResult(secret)
8495
if err != nil {
@@ -87,17 +98,34 @@ func (h LifeCycleHandler) Handle(ctx context.Context, req reconcile.Request, dep
8798
dvcrcondition.UpdateGarbageCollectionCondition(deploy, reason, "%s", msg)
8899
// Put full result JSON into annotation on deployment.
89100
annotations.AddAnnotation(deploy, annotations.AnnDVCRGarbageCollectionResult, h.dvcrService.GetGarbageCollectionResult(secret))
90-
// It is now possible to delete a secret.
91-
return reconcile.Result{}, h.dvcrService.DeleteGarbageCollectionSecret(ctx)
101+
// Requeue to delete secret only after deployment update succeeds.
102+
return reconcile.Result{RequeueAfter: resultPersistRetryInterval}, nil
92103
}
93104

94105
if h.dvcrService.IsGarbageCollectionStarted(secret) {
106+
hasCreationTimestamp := !secret.GetCreationTimestamp().Time.IsZero()
107+
waitDuration := time.Since(secret.GetCreationTimestamp().Time)
108+
if hasCreationTimestamp && waitDuration > dvcrtypes.GarbageCollectionTimeout {
109+
if h.dvcrService.IsGarbageCollectionResultPersisted(secret, deploy) {
110+
return reconcile.Result{}, h.dvcrService.DeleteGarbageCollectionSecret(ctx)
111+
}
112+
113+
dvcrcondition.UpdateGarbageCollectionCondition(deploy,
114+
dvcrdeploymentcondition.Error,
115+
"Wait for garbage collection more than %s timeout: %s elapsed, garbage collection canceled",
116+
dvcrtypes.GarbageCollectionTimeout.String(),
117+
waitDuration.Truncate(time.Second).String(),
118+
)
119+
annotations.AddAnnotation(deploy, annotations.AnnDVCRGarbageCollectionResult, "")
120+
return reconcile.Result{RequeueAfter: resultPersistRetryInterval}, nil
121+
}
122+
95123
dvcrcondition.UpdateGarbageCollectionCondition(deploy,
96124
dvcrdeploymentcondition.InProgress,
97125
"Wait for garbage collection to finish.",
98126
)
99127
// Wait for done annotation appears on secret.
100-
return reconcile.Result{}, nil
128+
return reconcile.Result{RequeueAfter: gcStatusPollingInterval}, nil
101129
}
102130

103131
// No special annotation, check for provisioners to finish.
@@ -116,30 +144,35 @@ func (h LifeCycleHandler) Handle(ctx context.Context, req reconcile.Request, dep
116144
dvcrdeploymentcondition.InProgress,
117145
"Wait for garbage collection to finish.",
118146
)
119-
return reconcile.Result{}, nil
147+
return reconcile.Result{RequeueAfter: gcStatusPollingInterval}, nil
120148
}
121149

122150
// Cancel garbage collection if wait for provisioners for too long.
123151
hasCreationTimestamp := !secret.GetCreationTimestamp().Time.IsZero()
124152
waitDuration := time.Since(secret.GetCreationTimestamp().Time)
125153
if hasCreationTimestamp && waitDuration > dvcrtypes.WaitProvisionersTimeout {
154+
if h.dvcrService.IsGarbageCollectionResultPersisted(secret, deploy) {
155+
return reconcile.Result{}, h.dvcrService.DeleteGarbageCollectionSecret(ctx)
156+
}
157+
126158
// Wait for provisioners timed out: report error and stop garbage collection.
127159
dvcrcondition.UpdateGarbageCollectionCondition(deploy,
128160
dvcrdeploymentcondition.Error,
129-
"Wait for resources provisioners more than %s timeout: %s elapsed, garbage collection canceled",
161+
"Wait for %d resources provisioners to finish more than %s timeout: %s elapsed, garbage collection canceled",
162+
remainInProvisioning,
130163
dvcrtypes.WaitProvisionersTimeout.String(),
131-
waitDuration.String(),
164+
waitDuration.Truncate(time.Second).String(),
132165
)
133166
annotations.AddAnnotation(deploy, annotations.AnnDVCRGarbageCollectionResult, "")
134-
return reconcile.Result{}, h.dvcrService.DeleteGarbageCollectionSecret(ctx)
167+
return reconcile.Result{RequeueAfter: resultPersistRetryInterval}, nil
135168
}
136169

137170
// Use requeue to wait for provisioners to finish.
138171
dvcrcondition.UpdateGarbageCollectionCondition(deploy,
139172
dvcrdeploymentcondition.InProgress,
140173
"Wait for cvi/vi/vd finish provisioning: %d resources remain.", remainInProvisioning,
141174
)
142-
return reconcile.Result{RequeueAfter: time.Second * 20}, nil
175+
return reconcile.Result{RequeueAfter: gcStatusPollingInterval}, nil
143176
}
144177

145178
return reconcile.Result{}, nil

images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/internal/service/provisioning_lister.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ func (p *ProvisioningLister) ListClusterVirtualImagesInProvisioning(ctx context.
8080

8181
var provisioning []v1alpha2.ClusterVirtualImage
8282
for _, cvi := range cviList.Items {
83+
// Ignore if Terminating state.
84+
if !cvi.GetDeletionTimestamp().IsZero() {
85+
continue
86+
}
8387
cond, exists := conditions.GetCondition(cvicondition.ReadyType, cvi.Status.Conditions)
8488
if exists && cond.Status == metav1.ConditionFalse && cond.Reason == cvicondition.Provisioning.String() {
8589
provisioning = append(provisioning, cvi)
@@ -97,6 +101,10 @@ func (p *ProvisioningLister) ListVirtualImagesInProvisioning(ctx context.Context
97101

98102
var provisioning []v1alpha2.VirtualImage
99103
for _, vi := range viList.Items {
104+
// Ignore if Terminating state.
105+
if !vi.GetDeletionTimestamp().IsZero() {
106+
continue
107+
}
100108
cond, exists := conditions.GetCondition(vicondition.ReadyType, vi.Status.Conditions)
101109
if exists && cond.Status == metav1.ConditionFalse && cond.Reason == vicondition.Provisioning.String() {
102110
provisioning = append(provisioning, vi)
@@ -118,6 +126,10 @@ func (p *ProvisioningLister) ListVirtualDisksInProvisioning(ctx context.Context)
118126
if !vdHasDVCRStage(&vd) {
119127
continue
120128
}
129+
// Ignore if Terminating state.
130+
if !vd.GetDeletionTimestamp().IsZero() {
131+
continue
132+
}
121133
cond, exists := conditions.GetCondition(vdcondition.ReadyType, vd.Status.Conditions)
122134
if exists && cond.Status == metav1.ConditionFalse && cond.Reason == vdcondition.Provisioning.String() {
123135
provisioning = append(provisioning, vd)

images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/internal/watcher/garbage_collection_secret.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"reflect"
2222

2323
corev1 "k8s.io/api/core/v1"
24+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2425
"sigs.k8s.io/controller-runtime/pkg/client"
2526
"sigs.k8s.io/controller-runtime/pkg/controller"
2627
"sigs.k8s.io/controller-runtime/pkg/event"
@@ -50,16 +51,29 @@ func (w *DVCRGarbageCollectionSecretWatcher) Watch(mgr manager.Manager, ctr cont
5051
mgr.GetCache(),
5152
&corev1.Secret{},
5253
handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, secret *corev1.Secret) []reconcile.Request {
53-
if secret.GetNamespace() == dvcrtypes.ModuleNamespace && secret.GetName() == dvcrtypes.DVCRGarbageCollectionSecretName {
54-
return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(secret)}}
55-
}
56-
return nil
54+
return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(secret)}}
5755
}),
5856
predicate.TypedFuncs[*corev1.Secret]{
57+
CreateFunc: func(e event.TypedCreateEvent[*corev1.Secret]) bool {
58+
return IsDVCRGarbageCollectionSecret(e.Object)
59+
},
5960
UpdateFunc: func(e event.TypedUpdateEvent[*corev1.Secret]) bool {
61+
if !IsDVCRGarbageCollectionSecret(e.ObjectNew) {
62+
return false
63+
}
6064
return !reflect.DeepEqual(e.ObjectNew.GetAnnotations(), e.ObjectOld.GetAnnotations())
6165
},
66+
DeleteFunc: func(e event.TypedDeleteEvent[*corev1.Secret]) bool {
67+
return IsDVCRGarbageCollectionSecret(e.Object)
68+
},
6269
},
6370
),
6471
)
6572
}
73+
74+
func IsDVCRGarbageCollectionSecret(secret metav1.Object) bool {
75+
if secret == nil {
76+
return false
77+
}
78+
return secret.GetNamespace() == dvcrtypes.ModuleNamespace && secret.GetName() == dvcrtypes.DVCRGarbageCollectionSecretName
79+
}

images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/postponehandler/postpone.go renamed to images/virtualization-artifact/pkg/controller/dvcr-garbage-collection/postponeimporter/postpone_handler.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package postponehandler
17+
package postponeimporter
1818

1919
import (
2020
"context"
@@ -42,13 +42,13 @@ type DVCRService interface {
4242

4343
var PostponePeriod = time.Second * 15
4444

45-
type Postpone[object client.Object] struct {
45+
type PostponeHandler[object client.Object] struct {
4646
dvcrService DVCRService
4747
recorder eventrecord.EventRecorderLogger
4848
}
4949

50-
func New[T client.Object](dvcrService DVCRService, recorder eventrecord.EventRecorderLogger) *Postpone[T] {
51-
return &Postpone[T]{
50+
func NewHandler[T client.Object](dvcrService DVCRService, recorder eventrecord.EventRecorderLogger) *PostponeHandler[T] {
51+
return &PostponeHandler[T]{
5252
dvcrService: dvcrService,
5353
recorder: recorder,
5454
}
@@ -57,7 +57,7 @@ func New[T client.Object](dvcrService DVCRService, recorder eventrecord.EventRec
5757
// Handle sets Ready condition to Provisioning for newly created resources
5858
// if dvcr is in the garbage collection mode.
5959
// Applicable for ClusterVirtualImage, VirtualImage, and VirtualDisk.
60-
func (p *Postpone[T]) Handle(ctx context.Context, obj T) (reconcile.Result, error) {
60+
func (p *PostponeHandler[T]) Handle(ctx context.Context, obj T) (reconcile.Result, error) {
6161
conditionsPtr := conditions.NewConditionsAccessor(obj).Conditions()
6262

6363
readyCondition, readyConditionPresent := conditions.GetCondition(getReadyType(obj), *conditionsPtr)
@@ -109,7 +109,7 @@ func (p *Postpone[T]) Handle(ctx context.Context, obj T) (reconcile.Result, erro
109109
return reconcile.Result{RequeueAfter: PostponePeriod}, reconciler.ErrStopHandlerChain
110110
}
111111

112-
func (p *Postpone[T]) Name() string {
112+
func (p *PostponeHandler[T]) Name() string {
113113
return "postpone-on-dvcr-garbage-collection-handler"
114114
}
115115

0 commit comments

Comments
 (0)