🌱 Drop retry when ready KCP conditions #11797

Merged
42 changes: 3 additions & 39 deletions controlplane/kubeadm/internal/workload_cluster_conditions.go
@@ -47,41 +47,8 @@ import (
// This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
// the condition to Unknown state without returning any error.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
shouldRetry := func() bool {
// if CP is scaling up or down.
if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
return true
}
// if CP machines are provisioning or deleting.
for _, m := range controlPlane.Machines {
if m.Status.NodeRef == nil {
return true
}
if !m.DeletionTimestamp.IsZero() {
return true
}
}
return false
}

if controlPlane.IsEtcdManaged() {
// Update etcd conditions.
// In case of well known temporary errors + control plane scaling up/down or rolling out, retry a few times.
// Note: it seems that reducing the number of them during every reconciles also improves stability,
// thus we are stopping doing retries (we only try once).
// However, we keep the code implementing retry support so we can easily revert this decision in a patch
// release if we need to.
maxRetry := 1
for i := range maxRetry {
retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
// if we should retry and there is a retry left, wait a bit.
if !retryableError || !shouldRetry() {
break
}
if i < maxRetry-1 {
time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
}
}
w.updateManagedEtcdConditions(ctx, controlPlane)
return
}
w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -97,7 +64,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
}

func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
// NOTE: This method uses control plane nodes only to get in contact with etcd but then it relies on etcd
// as ultimate source of truth for the list of members and for their health.
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -121,7 +88,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
Message: "Failed to get Nodes hosting the etcd cluster",
})
return retryableError
return
}

// Update conditions for etcd members on the nodes.
@@ -189,7 +156,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
// (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).

// While scaling up/down or rolling out new CP machines this error might happen.
retryableError = true
continue
}

@@ -214,7 +180,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
})

// While scaling up/down or rolling out new CP machines this error might happen because we are reading the list from different nodes at different time.
retryableError = true
continue
}

@@ -316,7 +281,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
note: "etcd member",
})
return retryableError
}

func unwrapAll(err error) error {
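For readers skimming the hunks above, here is a minimal sketch of how UpdateEtcdConditions reads once the retry machinery is dropped. It only restates the lines kept by this diff: the shouldRetry closure, the maxRetry loop, and the time.Sleep back-off are removed, and updateManagedEtcdConditions no longer returns a retryableError flag.

```go
// Sketch of the resulting function (post-PR); only the lines visible in the
// hunks above are assumed, the rest of the file is unchanged.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
	if controlPlane.IsEtcdManaged() {
		// Managed etcd: update conditions in a single pass, no retries.
		w.updateManagedEtcdConditions(ctx, controlPlane)
		return
	}
	// External etcd: conditions are handled separately.
	w.updateExternalEtcdConditions(ctx, controlPlane)
}
```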
40 changes: 13 additions & 27 deletions controlplane/kubeadm/internal/workload_cluster_conditions_test.go
@@ -170,7 +170,6 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
machines []*clusterv1.Machine
injectClient client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error.
injectEtcdClientGenerator etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error.
expectedRetryableError bool
expectedKCPCondition *clusterv1.Condition
expectedKCPV1Beta2Condition *metav1.Condition
expectedMachineConditions map[string]clusterv1.Conditions
@@ -188,8 +187,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
injectClient: &fakeClient{
listErr: errors.New("failed to list Nodes"),
},
expectedRetryableError: false,
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the Node which is hosting the etcd member"),
@@ -220,8 +218,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: nil,
expectedKCPCondition: nil,
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {},
},
@@ -251,8 +248,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: nil,
expectedKCPCondition: nil,
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {},
},
@@ -280,8 +276,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Items: []corev1.Node{*fakeNode("n1")},
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
expectedKCPV1Beta2Condition: &metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition,
Status: metav1.ConditionFalse,
@@ -305,8 +300,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
injectEtcdClientGenerator: &fakeEtcdClientGenerator{
forNodesErr: errors.New("failed to get client for node"),
},
expectedRetryableError: true,
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd Pod on the %s Node: failed to get client for node", "n1"),
@@ -346,8 +340,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Errors: []string{"some errors"},
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", "some errors"),
@@ -387,8 +380,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
},
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd member on the %s Node", "n1"),
@@ -437,8 +429,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
},
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", "NOSPACE"),
@@ -519,8 +510,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -607,8 +597,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -677,8 +666,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: true,
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -765,8 +753,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
}
},
},
expectedRetryableError: false,
expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
expectedMachineConditions: map[string]clusterv1.Conditions{
"m1": {
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -810,8 +797,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
Machines: collections.FromMachines(tt.machines...),
}

retryableError := w.updateManagedEtcdConditions(ctx, controlPane)
g.Expect(retryableError).To(Equal(tt.expectedRetryableError))
w.updateManagedEtcdConditions(ctx, controlPane)

if tt.expectedKCPCondition != nil {
g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
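Correspondingly, a minimal sketch of the updated call site in TestUpdateManagedEtcdConditions, assuming the surrounding table-driven structure is unchanged: the expectedRetryableError field and its assertion disappear, and the method is now called purely for its side effects on conditions. (The controlPane variable name is spelled as in the test file.)

```go
// Before: retryableError := w.updateManagedEtcdConditions(ctx, controlPane)
//         g.Expect(retryableError).To(Equal(tt.expectedRetryableError))

// After: call for side effects only, then assert the resulting conditions.
w.updateManagedEtcdConditions(ctx, controlPane)

if tt.expectedKCPCondition != nil {
	g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
}
```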