Commit 2776d2f

Improve Node drain e2e test coverage
Signed-off-by: Stefan Büringer [email protected]
1 parent 518fce7 commit 2776d2f

9 files changed: +202 -103 lines

Makefile (-1)

@@ -600,7 +600,6 @@ generate-e2e-templates-main: $(KUSTOMIZE)
 echo "---" >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 $(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption/step2 --load-restrictor LoadRestrictionsNone >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 $(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
-$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
 $(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
 $(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
 $(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go (+2 -2)

@@ -317,8 +317,8 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr
 t.Machine,
 corev1.EventTypeNormal,
 EventDetectedUnhealthy,
-"Machine %v has unhealthy node %v",
-t.string(),
+"Machine %s has unhealthy Node %s",
+klog.KObj(t.Machine),
 t.nodeName(),
 )
 nextCheckTimes = append(nextCheckTimes, nextCheck)

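For context on the two-line change above: klog.KObj wraps a Kubernetes object so that formatting it with %s prints the compact "namespace/name" reference, whereas the old message formatted a custom t.string() helper with %v. A minimal, hypothetical sketch (not code from this repository; the Machine name is made up) of what the new formatting produces:

package klogexample

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// FormatEventMessage shows how klog.KObj renders a Machine in an event message.
func FormatEventMessage() string {
	machine := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "md-0-abc12"}}
	// klog.KObj returns an ObjectRef whose String() is "namespace/name",
	// so %s yields "default/md-0-abc12" instead of a full struct dump.
	return fmt.Sprintf("Machine %s has unhealthy Node %s", klog.KObj(machine), "worker-1")
}
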
test/e2e/config/docker.yaml (-2)

@@ -347,7 +347,6 @@ providers:
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
-- sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
 - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
@@ -408,7 +407,6 @@ variables:
 CNI: "./data/cni/kindnet/kindnet.yaml"
 KUBETEST_CONFIGURATION: "./data/kubetest/conformance.yaml"
 AUTOSCALER_WORKLOAD: "./data/autoscaler/autoscaler-to-workload-workload.yaml"
-NODE_DRAIN_TIMEOUT: "60s"
 # Enabling the feature flags by setting the env variables.
 # Note: EXP_CLUSTER_RESOURCE_SET & EXP_MACHINE_POOL are enabled per default with CAPI v1.7.0.
 # We still have to enable them here for clusterctl upgrade tests that use older versions.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml (-9)

This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml (-8)

This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml (-8)

This file was deleted.

test/e2e/node_drain_timeout.go (+154 -42)

@@ -30,10 +30,10 @@ import (
 "k8s.io/utils/ptr"
 
 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
-controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
 "sigs.k8s.io/cluster-api/test/framework"
 "sigs.k8s.io/cluster-api/test/framework/clusterctl"
 "sigs.k8s.io/cluster-api/util"
+"sigs.k8s.io/cluster-api/util/conditions"
 )
 
 // NodeDrainTimeoutSpecInput is the input for NodeDrainTimeoutSpec.
@@ -52,10 +52,8 @@ type NodeDrainTimeoutSpecInput struct {
 // able to identify the default.
 InfrastructureProvider *string
 
-// Flavor, if specified, must refer to a template that contains
-// a KubeadmControlPlane resource with spec.machineTemplate.nodeDrainTimeout
-// configured and a MachineDeployment resource that has
-// spec.template.spec.nodeDrainTimeout configured.
+// Flavor, if specified, must refer to a template that uses a Cluster with ClusterClass.
+// The cluster must use a KubeadmControlPlane and a MachineDeployment.
 // If not specified, "node-drain" is used.
 Flavor *string
 
@@ -66,13 +64,11 @@ type NodeDrainTimeoutSpecInput struct {
 
 func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 var (
-specName = "node-drain"
-input NodeDrainTimeoutSpecInput
-namespace *corev1.Namespace
-cancelWatches context.CancelFunc
-clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
-machineDeployments []*clusterv1.MachineDeployment
-controlplane *controlplanev1.KubeadmControlPlane
+specName = "node-drain"
+input NodeDrainTimeoutSpecInput
+namespace *corev1.Namespace
+cancelWatches context.CancelFunc
+clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
 )
 
 BeforeEach(func() {
@@ -97,6 +93,7 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
 if input.InfrastructureProvider != nil {
 infrastructureProvider = *input.InfrastructureProvider
 }
+
 controlPlaneReplicas := 3
 clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
 ClusterProxy: input.BootstrapClusterProxy,
@@ -118,52 +115,167 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
 WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 }, clusterResources)
 cluster := clusterResources.Cluster
-controlplane = clusterResources.ControlPlane
-machineDeployments = clusterResources.MachineDeployments
+controlplane := clusterResources.ControlPlane
+machineDeployments := clusterResources.MachineDeployments
 Expect(machineDeployments[0].Spec.Replicas).To(Equal(ptr.To[int32](1)))
 
-By("Add a deployment with unevictable pods and podDisruptionBudget to the workload cluster. The deployed pods cannot be evicted in the node draining process.")
+// This label will be added to all Machines so we can later create the unevictable Pods on the right Nodes.
+nodeOwnerLabelKey := "owner.node.cluster.x-k8s.io"
+
+By("Ensure Node label is set & NodeDrainTimeout is set to 0 (wait forever) on ControlPlane and MachineDeployment topologies")
+modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+if topology.Metadata.Labels == nil {
+topology.Metadata.Labels = map[string]string{}
+}
+topology.Metadata.Labels[nodeOwnerLabelKey] = "KubeadmControlPlane-" + controlplane.Name
+},
+WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+})
+modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+if topology.Metadata.Labels == nil {
+topology.Metadata.Labels = map[string]string{}
+}
+for _, md := range machineDeployments {
+if md.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] == topology.Name {
+topology.Metadata.Labels[nodeOwnerLabelKey] = "MachineDeployment-" + md.Name
+}
+}
+},
+WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+})
+
 workloadClusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name)
+By("Deploy Deployment with unevictable Pods on control plane Nodes.")
+
+cpDeploymentAndPDBName := fmt.Sprintf("%s-%s", "unevictable-pod-cp", util.RandomString(3))
 framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
 WorkloadClusterProxy: workloadClusterProxy,
-DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
+ControlPlane: controlplane,
+DeploymentName: cpDeploymentAndPDBName,
 Namespace: namespace.Name + "-unevictable-workload",
+NodeSelector: map[string]string{nodeOwnerLabelKey: "KubeadmControlPlane-" + controlplane.Name},
 WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
 })
-
-By("Scale the machinedeployment down to zero. If we didn't have the NodeDrainTimeout duration, the node drain process would block this operator.")
-// Because all the machines of a machinedeployment can be deleted at the same time, so we only prepare the interval for 1 replica.
-nodeDrainTimeoutMachineDeploymentInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), machineDeployments[0].Spec.Template.Spec.NodeDrainTimeout, 1)
+By("Deploy Deployment with unevictable Pods on MachineDeployment Nodes.")
+mdDeploymentAndPDBNames := map[string]string{}
 for _, md := range machineDeployments {
-framework.ScaleAndWaitMachineDeployment(ctx, framework.ScaleAndWaitMachineDeploymentInput{
-ClusterProxy: input.BootstrapClusterProxy,
-Cluster: cluster,
-MachineDeployment: md,
-WaitForMachineDeployments: nodeDrainTimeoutMachineDeploymentInterval,
-Replicas: 0,
+mdDeploymentAndPDBNames[md.Name] = fmt.Sprintf("%s-%s", "unevictable-pod-md", util.RandomString(3))
+framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
+WorkloadClusterProxy: workloadClusterProxy,
+MachineDeployment: md,
+DeploymentName: mdDeploymentAndPDBNames[md.Name],
+Namespace: namespace.Name + "-unevictable-workload",
+NodeSelector: map[string]string{nodeOwnerLabelKey: "MachineDeployment-" + md.Name},
+WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
 })
 }
 
-By("Deploy deployment with unevictable pods on control plane nodes.")
-framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
-WorkloadClusterProxy: workloadClusterProxy,
-ControlPlane: controlplane,
-DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
-Namespace: namespace.Name + "-unevictable-workload",
-WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
+By("Scale down the control plane to 1 and MachineDeployments to 0.")
+modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+topology.Replicas = ptr.To[int32](1)
+},
+WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+})
+modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+topology.Replicas = ptr.To[int32](0)
+},
+WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 })
 
-By("Scale down the controlplane of the workload cluster and make sure that nodes running workload can be deleted even the draining process is blocked.")
-// When we scale down the KCP, controlplane machines are by default deleted one by one, so it requires more time.
-nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), controlplane.Spec.MachineTemplate.NodeDrainTimeout, controlPlaneReplicas)
-framework.ScaleAndWaitControlPlane(ctx, framework.ScaleAndWaitControlPlaneInput{
-ClusterProxy: input.BootstrapClusterProxy,
-Cluster: cluster,
-ControlPlane: controlplane,
-Replicas: 1,
-WaitForControlPlane: nodeDrainTimeoutKCPInterval,
+By("Verify Node drains for control plane and MachineDeployment Machines are blocked")
+Eventually(func(g Gomega) {
+controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+Lister: input.BootstrapClusterProxy.GetClient(),
+ClusterName: cluster.Name,
+Namespace: cluster.Namespace,
+})
+var condition *clusterv1.Condition
+for _, machine := range controlPlaneMachines {
+condition = conditions.Get(&machine, clusterv1.DrainingSucceededCondition)
+if condition != nil {
+// We only expect to find the condition on one Machine (as KCP will only try to drain one Machine at a time)
+break
+}
+}
+g.Expect(condition).ToNot(BeNil())
+g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", cpDeploymentAndPDBName)))
+}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+for _, md := range machineDeployments {
+Eventually(func(g Gomega) {
+machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+Lister: input.BootstrapClusterProxy.GetClient(),
+ClusterName: cluster.Name,
+Namespace: cluster.Namespace,
+MachineDeployment: *md,
+})
+g.Expect(machines).To(HaveLen(1))
+condition := conditions.Get(&machines[0], clusterv1.DrainingSucceededCondition)
+g.Expect(condition).ToNot(BeNil())
+g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", mdDeploymentAndPDBNames[md.Name])))
+}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+}
+
+By("Set NodeDrainTimeout to 1s to unblock Node drain")
+// Note: This also verifies that KCP & MachineDeployments are still propagating changes to NodeDrainTimeout down to
+// Machines that already have a deletionTimestamp.
+drainTimeout := &metav1.Duration{Duration: time.Duration(1) * time.Second}
+modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+topology.NodeDrainTimeout = drainTimeout
+},
+WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+})
+modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+ClusterProxy: input.BootstrapClusterProxy,
+Cluster: cluster,
+ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+topology.NodeDrainTimeout = drainTimeout
+},
+WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 })
 
+By("Verify Node drains were unblocked")
+// When we scale down the KCP, controlplane machines are deleted one by one, so it requires more time
+// MD Machine deletion is done in parallel and will be faster.
+nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), drainTimeout, controlPlaneReplicas)
+Eventually(func(g Gomega) {
+// When all drains complete we only have 1 control plane & 0 MD replicas left.
+controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+Lister: input.BootstrapClusterProxy.GetClient(),
+ClusterName: cluster.Name,
+Namespace: cluster.Namespace,
+})
+g.Expect(controlPlaneMachines).To(HaveLen(1))
+
+for _, md := range machineDeployments {
+machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+Lister: input.BootstrapClusterProxy.GetClient(),
+ClusterName: cluster.Name,
+Namespace: cluster.Namespace,
+MachineDeployment: *md,
+})
+g.Expect(machines).To(BeEmpty())
+}
+}, nodeDrainTimeoutKCPInterval...).Should(Succeed())
+
 By("PASSED!")
 })

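A note on the mechanism the reworked spec leans on: framework.DeployUnevictablePod pairs a Deployment with a PodDisruptionBudget that can never be satisfied by the eviction API, which is what keeps the Node drains above blocked and produces the "Cannot evict pod as it would violate the pod's disruption budget" message asserted on the DrainingSucceededCondition. The sketch below is a hypothetical, self-contained illustration of that pattern, not the framework's actual implementation (createBlockingPDB and the "app" label are invented for the example):

package drainexample

import (
	"context"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/kubernetes"
)

// createBlockingPDB (hypothetical helper) creates a PodDisruptionBudget whose
// minAvailable equals the Deployment's replica count. No voluntary disruption is
// allowed, so every eviction issued during a Node drain is rejected and the drain
// only ends once the Machine's NodeDrainTimeout expires.
func createBlockingPDB(ctx context.Context, c kubernetes.Interface, namespace, name string, replicas int32) error {
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		Spec: policyv1.PodDisruptionBudgetSpec{
			// All replicas must stay available, so the eviction API returns
			// "would violate the pod's disruption budget" for every Pod.
			MinAvailable: &intstr.IntOrString{Type: intstr.Int, IntVal: replicas},
			// Must match the Pod labels of the Deployment it protects.
			Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": name}},
		},
	}
	_, err := c.PolicyV1().PodDisruptionBudgets(namespace).Create(ctx, pdb, metav1.CreateOptions{})
	return err
}

Once the spec lowers NodeDrainTimeout to 1s via the topology, the controllers stop waiting on these evictions and Machine deletion proceeds, which is what the final Eventually block verifies.
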
test/e2e/node_drain_timeout_test.go (+1)

@@ -32,6 +32,7 @@ var _ = Describe("When testing node drain timeout", func() {
 BootstrapClusterProxy: bootstrapClusterProxy,
 ArtifactFolder: artifactFolder,
 SkipCleanup: skipCleanup,
+Flavor: ptr.To("topology"),
 InfrastructureProvider: ptr.To("docker"),
 }
 })

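Since the spec's Flavor field is a *string with a documented default of "node-drain", the hard-coded "topology" here simply selects the ClusterClass-based template. A hypothetical sketch of how such an optional flavor is typically resolved (the framework's own code may differ):

package flavorexample

// resolveFlavor is an illustrative helper, assuming a *string flavor with a
// default of "node-drain" as documented in NodeDrainTimeoutSpecInput.
func resolveFlavor(flavor *string) string {
	if flavor != nil {
		return *flavor
	}
	return "node-drain"
}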