🌱 Drop retry when ready KCP conditions #11797

Merged
42 changes: 3 additions & 39 deletions controlplane/kubeadm/internal/workload_cluster_conditions.go
@@ -47,41 +47,8 @@ import (
// This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
// the condition to Unknown state without returning any error.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
shouldRetry := func() bool {
// if CP is scaling up or down.
if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
return true
}
// if CP machines are provisioning or deleting.
for _, m := range controlPlane.Machines {
if m.Status.NodeRef == nil {
return true
}
if !m.DeletionTimestamp.IsZero() {
return true
}
}
return false
}

if controlPlane.IsEtcdManaged() {
// Update etcd conditions.
// In case of well known temporary errors + control plane scaling up/down or rolling out, retry a few times.
// Note: it seems that reducing the number of them during every reconciles also improves stability,
// thus we are stopping doing retries (we only try once).
// However, we keep the code implementing retry support so we can easily revert this decision in a patch
// release if we need to.
maxRetry := 1
for i := range maxRetry {
retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
// if we should retry and there is a retry left, wait a bit.
if !retryableError || !shouldRetry() {
break
}
if i < maxRetry-1 {
time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
}
}
w.updateManagedEtcdConditions(ctx, controlPlane)
return
}
w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -97,7 +64,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
}

func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
// NOTE: This method uses control plane nodes only to get in contact with etcd but then it relies on etcd
// as ultimate source of truth for the list of members and for their health.
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -121,7 +88,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
Message: "Failed to get Nodes hosting the etcd cluster",
})
return retryableError
return
}

// Update conditions for etcd members on the nodes.
@@ -189,7 +156,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
// (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).

// While scaling up/down or rolling out new CP machines this error might happen.
retryableError = true
continue
}

@@ -214,7 +180,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
})

// While scaling up/down or rolling out new CP machines this error might happen because we are reading the list from different nodes at different time.
retryableError = true
continue
}

@@ -316,7 +281,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
note: "etcd member",
})
return retryableError
}

func unwrapAll(err error) error {
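For readers skimming the hunks above, here is a minimal sketch of how UpdateEtcdConditions reads once the retry machinery is dropped. It only restates the lines kept by this diff: the shouldRetry closure, the maxRetry loop, and the time.Sleep back-off are removed, and updateManagedEtcdConditions no longer returns a retryableError flag.

```go
// Sketch of the resulting function (post-PR); only the lines visible in the
// hunks above are assumed, the rest of the file is unchanged.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
	if controlPlane.IsEtcdManaged() {
		// Managed etcd: update conditions in a single pass, no retries.
		w.updateManagedEtcdConditions(ctx, controlPlane)
		return
	}
	// External etcd: conditions are handled separately.
	w.updateExternalEtcdConditions(ctx, controlPlane)
}
```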
40 changes: 13 additions & 27 deletions controlplane/kubeadm/internal/workload_cluster_conditions_test.go
@@ -170,7 +170,6 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
machines []*clusterv1.Machine
injectClient client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error.
injectEtcdClientGenerator etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error.
expectedRetryableError bool
expectedKCPCondition *clusterv1.Condition
expectedKCPV1Beta2Condition *metav1.Condition
expectedMachineConditions map[string]clusterv1.Conditions
@@ -188,8 +187,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
injectClient: &fakeClient{
listErr: errors.New("failed to list Nodes"),
},
expectedRetryableError: false,
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the Node which is hosting the etcd member"),
@@ -220,8 +218,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: nil,
expectedKCPCondition: nil,
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {},
},
@@ -251,8 +248,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: nil,
expectedKCPCondition: nil,
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {},
},
@@ -280,8 +276,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
expectedKCPV1Beta2Condition: &metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition,
Status: metav1.ConditionFalse,
@@ -305,8 +300,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
injectEtcdClientGenerator: &fakeEtcdClientGenerator{
forNodesErr: errors.New("failed to get client for node"),
},
expectedRetryableError: true,
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd Pod on the %s Node: failed to get client for node", "n1"),
@@ -346,8 +340,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Errors: []string{"some errors"},
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", "some errors"),
@@ -387,8 +380,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
},
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd member on the %s Node", "n1"),
@@ -437,8 +429,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
},
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", "NOSPACE"),
@@ -519,8 +510,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -607,8 +597,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -677,8 +666,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -765,8 +753,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -810,8 +797,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Machines: collections.FromMachines(tt.machines...),
}

retryableError := w.updateManagedEtcdConditions(ctx, controlPane)
g.Expect(retryableError).To(Equal(tt.expectedRetryableError))
w.updateManagedEtcdConditions(ctx, controlPane)

if tt.expectedKCPCondition != nil {
g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
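Correspondingly, a minimal sketch of the updated call site in TestUpdateManagedEtcdConditions, assuming the surrounding table-driven structure is unchanged: the expectedRetryableError field and its assertion disappear, and the method is now called purely for its side effects on conditions. (The controlPane variable name is spelled as in the test file.)

```go
// Before: retryableError := w.updateManagedEtcdConditions(ctx, controlPane)
//         g.Expect(retryableError).To(Equal(tt.expectedRetryableError))

// After: call for side effects only, then assert the resulting conditions.
w.updateManagedEtcdConditions(ctx, controlPane)

if tt.expectedKCPCondition != nil {
	g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
}
```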