
Commit 5d109d8

clustercache: do not use RequeueAfter when hitting ErrClusterNotConnected and return error instead
1 parent a090145 commit 5d109d8
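
In short: instead of intercepting clustercache.ErrClusterNotConnected in each Reconcile and converting it into a fixed ctrl.Result{RequeueAfter: time.Minute}, the controllers now return the error and let controller-runtime requeue the request through the workqueue's rate limiter. A minimal runnable sketch of the before/after shape (result and errClusterNotConnected below are stand-ins for ctrl.Result and clustercache.ErrClusterNotConnected, not the real types):

package main

import (
	"errors"
	"fmt"
	"time"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = errors.New("connection to the workload cluster is down")

// Stand-in for ctrl.Result.
type result struct{ RequeueAfter time.Duration }

// before shows the pattern this commit removes from every controller:
// swallow the error and ask for a fixed one-minute requeue.
func before(reconcile func() (result, error)) (result, error) {
	res, err := reconcile()
	if errors.Is(err, errClusterNotConnected) {
		return result{RequeueAfter: time.Minute}, nil
	}
	return res, err
}

// after shows the new pattern: return the error and let the workqueue's
// rate limiter schedule the retry with backoff.
func after(reconcile func() (result, error)) (result, error) {
	return reconcile()
}

func main() {
	failing := func() (result, error) { return result{}, errClusterNotConnected }

	res, err := before(failing)
	fmt.Printf("before: %+v, err=%v\n", res, err)

	res, err = after(failing)
	fmt.Printf("after:  %+v, err=%v\n", res, err)
}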

9 files changed: +24 -136 lines changed

bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go

+1 -6

@@ -279,12 +279,7 @@ func (r *KubeadmConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 		return ctrl.Result{}, nil
 	}

-	res, err := r.reconcile(ctx, scope, cluster, config, configOwner)
-	if err != nil && errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope, cluster, config, configOwner)
 }

 func (r *KubeadmConfigReconciler) reconcile(ctx context.Context, scope *Scope, cluster *clusterv1.Cluster, config *bootstrapv1.KubeadmConfig, configOwner *bsutil.ConfigOwner) (ctrl.Result, error) {

controlplane/kubeadm/internal/controllers/controller.go

+2 -12

@@ -255,21 +255,11 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.

 	if !kcp.ObjectMeta.DeletionTimestamp.IsZero() {
 		// Handle deletion reconciliation loop.
-		res, err = r.reconcileDelete(ctx, controlPlane)
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return r.reconcileDelete(ctx, controlPlane)
 	}

 	// Handle normal reconciliation loop.
-	res, err = r.reconcile(ctx, controlPlane)
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, controlPlane)
 }

 // initControlPlaneScope initializes the control plane scope; this includes also checking for orphan machines and

exp/addons/internal/controllers/clusterresourceset_controller.go

+1 -16

@@ -183,17 +183,9 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 	}

 	errs := []error{}
-	errClusterNotConnectedOccurred := false
 	for _, cluster := range clusters {
 		if err := r.ApplyClusterResourceSet(ctx, cluster, clusterResourceSet); err != nil {
-			// Requeue if the reconcile failed because connection to workload cluster was down.
-			if errors.Is(err, clustercache.ErrClusterNotConnected) {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-				errClusterNotConnectedOccurred = true
-			} else {
-				// Append the error if the error is not ErrClusterLocked.
-				errs = append(errs, err)
-			}
+			errs = append(errs, err)
 		}
 	}

@@ -218,13 +210,6 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 		return ctrl.Result{}, kerrors.NewAggregate(errs)
 	}

-	// Requeue if ErrClusterNotConnected was returned for one of the clusters.
-	if errClusterNotConnectedOccurred {
-		// Requeue after a minute to not end up in exponential delayed requeue which
-		// could take up to 16m40s.
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-
 	return ctrl.Result{}, nil
 }
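
The "16m40s" in the removed comment is the 1000s cap of the per-item exponential backoff in client-go's default controller rate limiter (starting at 5ms), which controller-runtime builds on; after this commit that backoff is exactly what paces retries while a cluster connection is down. A runnable sketch, assuming the default rate limiter is in use:

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Per-item exponential backoff: 5ms, 10ms, 20ms, ... capped at 1000s,
	// which is the 16m40s the removed comment refers to.
	limiter := workqueue.DefaultControllerRateLimiter()
	for i := 0; i < 20; i++ {
		fmt.Printf("retry %2d after %v\n", i+1, limiter.When("cluster-a"))
	}
}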

exp/internal/controllers/machinepool_controller.go

+2 -15

@@ -228,28 +228,15 @@ func (r *MachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request)

 	// Handle deletion reconciliation loop.
 	if !mp.ObjectMeta.DeletionTimestamp.IsZero() {
-		err := r.reconcileDelete(ctx, cluster, mp)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		return ctrl.Result{}, err
+		return ctrl.Result{}, r.reconcileDelete(ctx, cluster, mp)
 	}

 	// Handle normal reconciliation loop.
 	scope := &scope{
 		cluster:     cluster,
 		machinePool: mp,
 	}
-	res, err := r.reconcile(ctx, scope)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope)
 }

 func (r *MachinePoolReconciler) reconcile(ctx context.Context, s *scope) (ctrl.Result, error) {

internal/controllers/machine/machine_controller.go

+9 -32

@@ -198,8 +198,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 		return ctrl.Result{}, err
 	}

-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName)))

 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, m, clusterv1.MachineFinalizer); err != nil || finalizerAdded {

@@ -208,7 +207,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re

 	// AddOwners adds the owners of Machine as k/v pairs to the logger.
 	// Specifically, it will add KubeadmControlPlane, MachineSet and MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, m)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, m)
 	if err != nil {
 		return ctrl.Result{}, err
 	}

@@ -273,23 +272,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 			r.reconcileDelete,
 		)

-		res, err := doReconcile(ctx, reconcileDelete, s)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return doReconcile(ctx, reconcileDelete, s)
 	}

 	// Handle normal reconciliation loop.
-	res, err := doReconcile(ctx, alwaysReconcile, s)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return doReconcile(ctx, alwaysReconcile, s)
 }

 func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {

@@ -816,14 +803,10 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro

 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing drain Node because connection to the workload cluster is down")
-			s.deletingReason = clusterv1.MachineDeletingDrainingNodeV1Beta2Reason
-			s.deletingMessage = "Requeuing drain Node because connection to the workload cluster is down"
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		log.Error(err, "Error creating a remote client for cluster while draining Node, won't retry")
-		return ctrl.Result{}, nil
+		log.V(5).Info("Waiting for Cluster connection to come up to drain the Node")
+		s.deletingReason = clusterv1.MachineDeletingDrainingNodeV1Beta2Reason
+		s.deletingMessage = "Waiting for Cluster connection to come up to drain the Node"
+		return ctrl.Result{}, err
 	}

 	node := &corev1.Node{}

@@ -989,15 +972,9 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct
 }

 func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
-	log := ctrl.LoggerFrom(ctx)
-
 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
-		}
-		log.Error(err, "Error creating a remote client for cluster while deleting Node, won't retry")
-		return nil
+		return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
 	}

 	node := &corev1.Node{
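
Note the behavior change in drainNode and deleteNode: previously, client-creation errors other than ErrClusterNotConnected were logged and dropped ("won't retry"); now every error is returned and retried with backoff. Since the Wrapf call suggests errors here is github.com/pkg/errors, the wrap keeps the sentinel reachable for callers. A runnable sketch with a stand-in sentinel (errClusterNotConnected is for illustration, not the real clustercache export):

package main

import (
	stderrors "errors"
	"fmt"

	"github.com/pkg/errors"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = stderrors.New("connection to the workload cluster is down")

func main() {
	// errors.Wrapf (github.com/pkg/errors) keeps the wrapped error in the
	// Unwrap chain, so callers can still match the sentinel after deleteNode
	// wraps it.
	err := errors.Wrapf(errClusterNotConnected, "failed deleting Node because connection to the workload cluster is down")
	fmt.Println(stderrors.Is(err, errClusterNotConnected)) // true
}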

internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

+1 -13

@@ -188,19 +188,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

-	result, err := r.reconcile(ctx, log, cluster, m)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		// Requeue immediately if any errors occurred
-		return ctrl.Result{}, err
-	}
-
-	return result, nil
+	return r.reconcile(ctx, log, cluster, m)
 }

 func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {

internal/controllers/machineset/machineset_controller.go

+3 -17

@@ -168,8 +168,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		return ctrl.Result{}, err
 	}

-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName)))

 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, machineSet, clusterv1.MachineSetFinalizer); err != nil || finalizerAdded {

@@ -178,7 +177,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct

 	// AddOwners adds the owners of MachineSet as k/v pairs to the logger.
 	// Specifically, it will add MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, machineSet)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, machineSet)
 	if err != nil {
 		return ctrl.Result{}, err
 	}

@@ -257,20 +256,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		wrapErrMachineSetReconcileFunc(r.syncReplicas, "failed to sync replicas"),
 	)

-	result, kerr := doReconcile(ctx, s, reconcileNormal)
-	if kerr != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(kerr, clustercache.ErrClusterNotConnected) {
-			if len(kerr.Errors()) > 1 {
-				log.Error(kerr, "Requeuing because connection to the workload cluster is down")
-			} else {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			}
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		err = kerr
-	}
-	return result, err
+	return doReconcile(ctx, s, reconcileNormal)
 }

 type scope struct {
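
The removed MachineSet block matched the sentinel inside the aggregate error returned by doReconcile. That match works because apimachinery's Aggregate implements Is by visiting every contained error, so anything inspecting the error returned after this change can still detect the condition. A runnable sketch with a stand-in sentinel:

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

// Stand-in for clustercache.ErrClusterNotConnected; for illustration only.
var errClusterNotConnected = errors.New("connection to the workload cluster is down")

func main() {
	// Aggregate.Is visits every contained error, so a sentinel buried in
	// the aggregate returned by doReconcile is still matchable.
	agg := kerrors.NewAggregate([]error{
		errors.New("failed to sync replicas"),
		errClusterNotConnected,
	})
	fmt.Println(errors.Is(agg, errClusterNotConnected)) // true
}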

internal/controllers/topology/cluster/cluster_controller.go

+1 -11

@@ -278,8 +278,6 @@ func (r *Reconciler) SetupForDryRun(recorder record.EventRecorder) {
 }

 func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
-	log := ctrl.LoggerFrom(ctx)
-
 	// Fetch the Cluster instance.
 	cluster := &clusterv1.Cluster{}
 	if err := r.Client.Get(ctx, req.NamespacedName, cluster); err != nil {

@@ -340,15 +338,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}

 	// Handle normal reconciliation loop.
-	result, err := r.reconcile(ctx, s)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-	}
-	return result, err
+	return r.reconcile(ctx, s)
 }

 // reconcile handles cluster reconciliation.

test/infrastructure/docker/internal/controllers/dockermachine_controller.go

+4 -14

@@ -27,6 +27,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"

@@ -70,8 +71,7 @@ type DockerMachineReconciler struct {
 // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch

 // Reconcile handles DockerMachine events.
-func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, rerr error) {
-	log := ctrl.LoggerFrom(ctx)
+func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 	ctx = container.RuntimeInto(ctx, r.ContainerRuntime)

 	// Fetch the DockerMachine instance.

@@ -150,10 +150,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	// Always attempt to Patch the DockerMachine object and status after each reconciliation.
 	defer func() {
 		if err := patchDockerMachine(ctx, patchHelper, dockerMachine); err != nil {
-			log.Error(err, "Failed to patch DockerMachine")
-			if rerr == nil {
-				rerr = err
-			}
+			reterr = kerrors.NewAggregate([]error{reterr, err})
 		}
 	}()

@@ -188,14 +185,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	}

 	// Handle non-deleted machines
-	res, err := r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
-	// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
-	// the current cluster because of concurrent access.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
 }

 func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMachine *infrav1.DockerMachine) error {
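
The deferred patch now aggregates the patch error with the named return instead of keeping only the first error. kerrors.NewAggregate drops nil entries and returns nil when nothing remains, which is what makes the unconditional call safe. A runnable sketch:

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	patchErr := errors.New("failed to patch DockerMachine")

	// NewAggregate filters out nil entries and returns nil for an empty
	// list, so every combination of (reterr, err) is handled correctly.
	fmt.Println(kerrors.NewAggregate([]error{nil, nil}))      // <nil>
	fmt.Println(kerrors.NewAggregate([]error{nil, patchErr})) // failed to patch DockerMachine
	fmt.Println(kerrors.NewAggregate([]error{
		errors.New("reconcile failed"),
		patchErr,
	})) // [reconcile failed, failed to patch DockerMachine]
}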
