
Commit 5d109d8

clustercache: do not use RequeueAfter when hitting ErrClusterNotConnected and return error instead
1 parent a090145 commit 5d109d8
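
In short: instead of intercepting clustercache.ErrClusterNotConnected in each Reconcile and converting it into a fixed ctrl.Result{RequeueAfter: time.Minute}, the controllers now return the error and let controller-runtime requeue the request through the workqueue's rate limiter. A minimal runnable sketch of the before/after shape (result and errClusterNotConnected below are stand-ins for ctrl.Result and clustercache.ErrClusterNotConnected, not the real types):

package main

import (
	"errors"
	"fmt"
	"time"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = errors.New("connection to the workload cluster is down")

// Stand-in for ctrl.Result.
type result struct{ RequeueAfter time.Duration }

// before shows the pattern this commit removes from every controller:
// swallow the error and ask for a fixed one-minute requeue.
func before(reconcile func() (result, error)) (result, error) {
	res, err := reconcile()
	if errors.Is(err, errClusterNotConnected) {
		return result{RequeueAfter: time.Minute}, nil
	}
	return res, err
}

// after shows the new pattern: return the error and let the workqueue's
// rate limiter schedule the retry with backoff.
func after(reconcile func() (result, error)) (result, error) {
	return reconcile()
}

func main() {
	failing := func() (result, error) { return result{}, errClusterNotConnected }

	res, err := before(failing)
	fmt.Printf("before: %+v, err=%v\n", res, err)

	res, err = after(failing)
	fmt.Printf("after:  %+v, err=%v\n", res, err)
}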

9 files changed: +24 -136 lines changed

bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go

+1 -6

@@ -279,12 +279,7 @@ func (r *KubeadmConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 		return ctrl.Result{}, nil
 	}

-	res, err := r.reconcile(ctx, scope, cluster, config, configOwner)
-	if err != nil && errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope, cluster, config, configOwner)
 }

 func (r *KubeadmConfigReconciler) reconcile(ctx context.Context, scope *Scope, cluster *clusterv1.Cluster, config *bootstrapv1.KubeadmConfig, configOwner *bsutil.ConfigOwner) (ctrl.Result, error) {

controlplane/kubeadm/internal/controllers/controller.go

+2 -12

@@ -255,21 +255,11 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.

 	if !kcp.ObjectMeta.DeletionTimestamp.IsZero() {
 		// Handle deletion reconciliation loop.
-		res, err = r.reconcileDelete(ctx, controlPlane)
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return r.reconcileDelete(ctx, controlPlane)
 	}

 	// Handle normal reconciliation loop.
-	res, err = r.reconcile(ctx, controlPlane)
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, controlPlane)
 }

 // initControlPlaneScope initializes the control plane scope; this includes also checking for orphan machines and

exp/addons/internal/controllers/clusterresourceset_controller.go

+1 -16

@@ -183,17 +183,9 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 	}

 	errs := []error{}
-	errClusterNotConnectedOccurred := false
 	for _, cluster := range clusters {
 		if err := r.ApplyClusterResourceSet(ctx, cluster, clusterResourceSet); err != nil {
-			// Requeue if the reconcile failed because connection to workload cluster was down.
-			if errors.Is(err, clustercache.ErrClusterNotConnected) {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-				errClusterNotConnectedOccurred = true
-			} else {
-				// Append the error if the error is not ErrClusterLocked.
-				errs = append(errs, err)
-			}
+			errs = append(errs, err)
 		}
 	}

@@ -218,13 +210,6 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 		return ctrl.Result{}, kerrors.NewAggregate(errs)
 	}

-	// Requeue if ErrClusterNotConnected was returned for one of the clusters.
-	if errClusterNotConnectedOccurred {
-		// Requeue after a minute to not end up in exponential delayed requeue which
-		// could take up to 16m40s.
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-
 	return ctrl.Result{}, nil
 }
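
The "16m40s" in the removed comment is the 1000s cap of the per-item exponential backoff in client-go's default controller rate limiter (starting at 5ms), which controller-runtime builds on; after this commit that backoff is exactly what paces retries while a cluster connection is down. A runnable sketch, assuming the default rate limiter is in use:

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Per-item exponential backoff: 5ms, 10ms, 20ms, ... capped at 1000s,
	// which is the 16m40s the removed comment refers to.
	limiter := workqueue.DefaultControllerRateLimiter()
	for i := 0; i < 20; i++ {
		fmt.Printf("retry %2d after %v\n", i+1, limiter.When("cluster-a"))
	}
}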

exp/internal/controllers/machinepool_controller.go

+2 -15

@@ -228,28 +228,15 @@ func (r *MachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request)

 	// Handle deletion reconciliation loop.
 	if !mp.ObjectMeta.DeletionTimestamp.IsZero() {
-		err := r.reconcileDelete(ctx, cluster, mp)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		return ctrl.Result{}, err
+		return ctrl.Result{}, r.reconcileDelete(ctx, cluster, mp)
 	}

 	// Handle normal reconciliation loop.
 	scope := &scope{
 		cluster:     cluster,
 		machinePool: mp,
 	}
-	res, err := r.reconcile(ctx, scope)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope)
 }

 func (r *MachinePoolReconciler) reconcile(ctx context.Context, s *scope) (ctrl.Result, error) {

internal/controllers/machine/machine_controller.go

+9 -32

@@ -198,8 +198,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 		return ctrl.Result{}, err
 	}

-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName)))

 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, m, clusterv1.MachineFinalizer); err != nil || finalizerAdded {

@@ -208,7 +207,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re

 	// AddOwners adds the owners of Machine as k/v pairs to the logger.
 	// Specifically, it will add KubeadmControlPlane, MachineSet and MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, m)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, m)
 	if err != nil {
 		return ctrl.Result{}, err
 	}

@@ -273,23 +272,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 			r.reconcileDelete,
 		)

-		res, err := doReconcile(ctx, reconcileDelete, s)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return doReconcile(ctx, reconcileDelete, s)
 	}

 	// Handle normal reconciliation loop.
-	res, err := doReconcile(ctx, alwaysReconcile, s)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return doReconcile(ctx, alwaysReconcile, s)
 }

 func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {

@@ -816,14 +803,10 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro

 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing drain Node because connection to the workload cluster is down")
-			s.deletingReason = clusterv1.MachineDeletingDrainingNodeV1Beta2Reason
-			s.deletingMessage = "Requeuing drain Node because connection to the workload cluster is down"
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		log.Error(err, "Error creating a remote client for cluster while draining Node, won't retry")
-		return ctrl.Result{}, nil
+		log.V(5).Info("Waiting for Cluster connection to come up to drain the Node")
+		s.deletingReason = clusterv1.MachineDeletingDrainingNodeV1Beta2Reason
+		s.deletingMessage = "Waiting for Cluster connection to come up to drain the Node"
+		return ctrl.Result{}, err
 	}

 	node := &corev1.Node{}

@@ -989,15 +972,9 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct
 }

 func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
-	log := ctrl.LoggerFrom(ctx)
-
 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
-		}
-		log.Error(err, "Error creating a remote client for cluster while deleting Node, won't retry")
-		return nil
+		return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
 	}

 	node := &corev1.Node{
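
Note the behavior change in drainNode and deleteNode: previously, client-creation errors other than ErrClusterNotConnected were logged and dropped ("won't retry"); now every error is returned and retried with backoff. Since the Wrapf call suggests errors here is github.com/pkg/errors, the wrap keeps the sentinel reachable for callers. A runnable sketch with a stand-in sentinel (errClusterNotConnected is for illustration, not the real clustercache export):

package main

import (
	stderrors "errors"
	"fmt"

	"github.com/pkg/errors"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = stderrors.New("connection to the workload cluster is down")

func main() {
	// errors.Wrapf (github.com/pkg/errors) keeps the wrapped error in the
	// Unwrap chain, so callers can still match the sentinel after deleteNode
	// wraps it.
	err := errors.Wrapf(errClusterNotConnected, "failed deleting Node because connection to the workload cluster is down")
	fmt.Println(stderrors.Is(err, errClusterNotConnected)) // true
}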

internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

+1 -13

@@ -188,19 +188,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

-	result, err := r.reconcile(ctx, log, cluster, m)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		// Requeue immediately if any errors occurred
-		return ctrl.Result{}, err
-	}
-
-	return result, nil
+	return r.reconcile(ctx, log, cluster, m)
 }

 func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {

internal/controllers/machineset/machineset_controller.go

+3 -17

@@ -168,8 +168,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		return ctrl.Result{}, err
 	}

-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName)))

 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, machineSet, clusterv1.MachineSetFinalizer); err != nil || finalizerAdded {

@@ -178,7 +177,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct

 	// AddOwners adds the owners of MachineSet as k/v pairs to the logger.
 	// Specifically, it will add MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, machineSet)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, machineSet)
 	if err != nil {
 		return ctrl.Result{}, err
 	}

@@ -257,20 +256,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		wrapErrMachineSetReconcileFunc(r.syncReplicas, "failed to sync replicas"),
 	)

-	result, kerr := doReconcile(ctx, s, reconcileNormal)
-	if kerr != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(kerr, clustercache.ErrClusterNotConnected) {
-			if len(kerr.Errors()) > 1 {
-				log.Error(kerr, "Requeuing because connection to the workload cluster is down")
-			} else {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			}
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		err = kerr
-	}
-	return result, err
+	return doReconcile(ctx, s, reconcileNormal)
 }

 type scope struct {
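
The removed MachineSet block matched the sentinel inside the aggregate error returned by doReconcile. That match works because apimachinery's Aggregate implements Is by visiting every contained error, so anything inspecting the error returned after this change can still detect the condition. A runnable sketch with a stand-in sentinel:

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = errors.New("connection to the workload cluster is down")

func main() {
	// Aggregate.Is visits every contained error, so a sentinel buried in
	// the aggregate returned by doReconcile is still matchable.
	agg := kerrors.NewAggregate([]error{
		errors.New("failed to sync replicas"),
		errClusterNotConnected,
	})
	fmt.Println(errors.Is(agg, errClusterNotConnected)) // true
}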

internal/controllers/topology/cluster/cluster_controller.go

+1 -11

@@ -278,8 +278,6 @@ func (r *Reconciler) SetupForDryRun(recorder record.EventRecorder) {
 }

 func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
-	log := ctrl.LoggerFrom(ctx)
-
 	// Fetch the Cluster instance.
 	cluster := &clusterv1.Cluster{}
 	if err := r.Client.Get(ctx, req.NamespacedName, cluster); err != nil {

@@ -340,15 +338,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}

 	// Handle normal reconciliation loop.
-	result, err := r.reconcile(ctx, s)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-	}
-	return result, err
+	return r.reconcile(ctx, s)
 }

 // reconcile handles cluster reconciliation.

test/infrastructure/docker/internal/controllers/dockermachine_controller.go

+4 -14

@@ -27,6 +27,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"

@@ -70,8 +71,7 @@ type DockerMachineReconciler struct {
 // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch

 // Reconcile handles DockerMachine events.
-func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, rerr error) {
-	log := ctrl.LoggerFrom(ctx)
+func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 	ctx = container.RuntimeInto(ctx, r.ContainerRuntime)

 	// Fetch the DockerMachine instance.

@@ -150,10 +150,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	// Always attempt to Patch the DockerMachine object and status after each reconciliation.
 	defer func() {
 		if err := patchDockerMachine(ctx, patchHelper, dockerMachine); err != nil {
-			log.Error(err, "Failed to patch DockerMachine")
-			if rerr == nil {
-				rerr = err
-			}
+			reterr = kerrors.NewAggregate([]error{reterr, err})
 		}
 	}()

@@ -188,14 +185,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	}

 	// Handle non-deleted machines
-	res, err := r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
-	// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
-	// the current cluster because of concurrent access.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
 }

 func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMachine *infrav1.DockerMachine) error {
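
The deferred patch now aggregates the patch error with the named return instead of keeping only the first error. kerrors.NewAggregate drops nil entries and returns nil when nothing remains, which is what makes the unconditional call safe. A runnable sketch:

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	patchErr := errors.New("failed to patch DockerMachine")

	// NewAggregate filters out nil entries and returns nil for an empty
	// list, so every combination of (reterr, err) is handled correctly.
	fmt.Println(kerrors.NewAggregate([]error{nil, nil}))      // <nil>
	fmt.Println(kerrors.NewAggregate([]error{nil, patchErr})) // failed to patch DockerMachine
	fmt.Println(kerrors.NewAggregate([]error{
		errors.New("reconcile failed"),
		patchErr,
	})) // [reconcile failed, failed to patch DockerMachine]
}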
