
Commit 4758f18

Improve Node drain e2e test coverage
Signed-off-by: Stefan Büringer [email protected]
1 parent 3232abc commit 4758f18

File tree

8 files changed: +193 -97 lines changed


Makefile

-1
@@ -600,7 +600,6 @@ generate-e2e-templates-main: $(KUSTOMIZE)
 	echo "---" >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption/step2 --load-restrictor LoadRestrictionsNone >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
-	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml

test/e2e/config/docker.yaml

-2
@@ -347,7 +347,6 @@ providers:
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
-    - sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
@@ -408,7 +407,6 @@ variables:
   CNI: "./data/cni/kindnet/kindnet.yaml"
   KUBETEST_CONFIGURATION: "./data/kubetest/conformance.yaml"
   AUTOSCALER_WORKLOAD: "./data/autoscaler/autoscaler-to-workload-workload.yaml"
-  NODE_DRAIN_TIMEOUT: "60s"
   # Enabling the feature flags by setting the env variables.
   # Note: EXP_CLUSTER_RESOURCE_SET & EXP_MACHINE_POOL are enabled per default with CAPI v1.7.0.
   # We still have to enable them here for clusterctl upgrade tests that use older versions.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml

-9
This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml

-8
This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml

-8
This file was deleted.

test/e2e/node_drain_timeout.go

+147 -38
@@ -30,10 +30,10 @@ import (
 	"k8s.io/utils/ptr"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
-	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
 	"sigs.k8s.io/cluster-api/test/framework"
 	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/conditions"
 )
 
 // NodeDrainTimeoutSpecInput is the input for NodeDrainTimeoutSpec.
@@ -66,13 +66,11 @@ type NodeDrainTimeoutSpecInput struct {
 
 func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 	var (
-		specName           = "node-drain"
-		input              NodeDrainTimeoutSpecInput
-		namespace          *corev1.Namespace
-		cancelWatches      context.CancelFunc
-		clusterResources   *clusterctl.ApplyClusterTemplateAndWaitResult
-		machineDeployments []*clusterv1.MachineDeployment
-		controlplane       *controlplanev1.KubeadmControlPlane
+		specName         = "node-drain"
+		input            NodeDrainTimeoutSpecInput
+		namespace        *corev1.Namespace
+		cancelWatches    context.CancelFunc
+		clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
 	)
 
 	BeforeEach(func() {
@@ -97,6 +95,7 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
 		if input.InfrastructureProvider != nil {
 			infrastructureProvider = *input.InfrastructureProvider
 		}
+
 		controlPlaneReplicas := 3
 		clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
 			ClusterProxy: input.BootstrapClusterProxy,
@@ -118,52 +117,162 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
 			WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 		}, clusterResources)
 		cluster := clusterResources.Cluster
-		controlplane = clusterResources.ControlPlane
-		machineDeployments = clusterResources.MachineDeployments
+		controlplane := clusterResources.ControlPlane
+		machineDeployments := clusterResources.MachineDeployments
 		Expect(machineDeployments[0].Spec.Replicas).To(Equal(ptr.To[int32](1)))
 
-		By("Add a deployment with unevictable pods and podDisruptionBudget to the workload cluster. The deployed pods cannot be evicted in the node draining process.")
+		// This label will be added to all Machines so we can later create the unevictable Pods on the right Nodes.
+		nodeLabelKey := "owner.node.cluster.x-k8s.io"
+
+		By("Ensure Node label is set & NodeDrainTimeout is set to 0 (wait forever) on ControlPlane and MachineDeployment topologies")
+		modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+				topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+				if topology.Metadata.Labels == nil {
+					topology.Metadata.Labels = map[string]string{}
+				}
+				topology.Metadata.Labels[nodeLabelKey] = "KubeadmControlPlane-" + controlplane.Name
+			},
+			WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+		})
+		modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+				topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+				if topology.Metadata.Labels == nil {
+					topology.Metadata.Labels = map[string]string{}
+				}
+				for _, md := range machineDeployments {
+					if md.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] == topology.Name {
+						topology.Metadata.Labels[nodeLabelKey] = "MachineDeployment-" + md.Name
+					}
+				}
+			},
+			WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+		})
+
 		workloadClusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name)
+		By("Deploy Deployment with unevictable pods on control plane Nodes.")
 		framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
 			WorkloadClusterProxy: workloadClusterProxy,
-			DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
+			ControlPlane: controlplane,
+			DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod-cp", util.RandomString(3)),
 			Namespace: namespace.Name + "-unevictable-workload",
+			NodeSelector: map[string]string{nodeLabelKey: "KubeadmControlPlane-" + controlplane.Name},
 			WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
 		})
-
-		By("Scale the machinedeployment down to zero. If we didn't have the NodeDrainTimeout duration, the node drain process would block this operator.")
-		// Because all the machines of a machinedeployment can be deleted at the same time, so we only prepare the interval for 1 replica.
-		nodeDrainTimeoutMachineDeploymentInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), machineDeployments[0].Spec.Template.Spec.NodeDrainTimeout, 1)
+		By("Deploy Deployment with unevictable pods on MachineDeployment Nodes.")
 		for _, md := range machineDeployments {
-			framework.ScaleAndWaitMachineDeployment(ctx, framework.ScaleAndWaitMachineDeploymentInput{
-				ClusterProxy: input.BootstrapClusterProxy,
-				Cluster: cluster,
-				MachineDeployment: md,
-				WaitForMachineDeployments: nodeDrainTimeoutMachineDeploymentInterval,
-				Replicas: 0,
+			framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
+				WorkloadClusterProxy: workloadClusterProxy,
+				MachineDeployment: md,
+				DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod-md", util.RandomString(3)),
+				Namespace: namespace.Name + "-unevictable-workload",
+				NodeSelector: map[string]string{nodeLabelKey: "MachineDeployment-" + md.Name},
+				WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
 			})
 		}
 
-		By("Deploy deployment with unevictable pods on control plane nodes.")
-		framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
-			WorkloadClusterProxy: workloadClusterProxy,
-			ControlPlane: controlplane,
-			DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
-			Namespace: namespace.Name + "-unevictable-workload",
-			WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
+		By("Scale down the control plane to 1 and MachineDeployments to 0.")
+		modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+				topology.Replicas = ptr.To[int32](1)
+			},
+			WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
 		})
+		modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+				topology.Replicas = ptr.To[int32](0)
+			},
+			WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+		})
+
+		By("Verify Node drains for control plane and MachineDeployment Machines are blocked")
+		Eventually(func(g Gomega) {
+			controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+				Lister: input.BootstrapClusterProxy.GetClient(),
+				ClusterName: cluster.Name,
+				Namespace: cluster.Namespace,
+			})
+			var condition *clusterv1.Condition
+			for _, machine := range controlPlaneMachines {
+				condition = conditions.Get(&machine, clusterv1.DrainingSucceededCondition)
+				if condition != nil {
+					// We only expect to find the condition on one Machine (as KCP will only try to drain one Machine)
+					break
+				}
+			}
+			g.Expect(condition).ToNot(BeNil())
+			g.Expect(condition.Message).To(ContainSubstring("Cannot evict pod as it would violate the pod's disruption budget"))
+
+		}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+		for _, md := range machineDeployments {
+			Eventually(func(g Gomega) {
+				machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+					Lister: input.BootstrapClusterProxy.GetClient(),
+					ClusterName: cluster.Name,
+					Namespace: cluster.Namespace,
+					MachineDeployment: *md,
+				})
+				g.Expect(machines).To(HaveLen(1))
+				condition := conditions.Get(&machines[0], clusterv1.DrainingSucceededCondition)
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Message).To(ContainSubstring("Cannot evict pod as it would violate the pod's disruption budget"))
+			}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+		}
 
-		By("Scale down the controlplane of the workload cluster and make sure that nodes running workload can be deleted even the draining process is blocked.")
-		// When we scale down the KCP, controlplane machines are by default deleted one by one, so it requires more time.
-		nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), controlplane.Spec.MachineTemplate.NodeDrainTimeout, controlPlaneReplicas)
-		framework.ScaleAndWaitControlPlane(ctx, framework.ScaleAndWaitControlPlaneInput{
-			ClusterProxy: input.BootstrapClusterProxy,
-			Cluster: cluster,
-			ControlPlane: controlplane,
-			Replicas: 1,
-			WaitForControlPlane: nodeDrainTimeoutKCPInterval,
+		By("Set NodeDrainTimeout to 1s to unblock drain")
+		// Note: This also verifies that KCP & MachineDeployments are still propagating changes to NodeDrainTimeout down to
+		// Machines that already have a deletionTimestamp.
+		drainTimeout := &metav1.Duration{Duration: time.Duration(1) * time.Second}
+		modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+				topology.NodeDrainTimeout = drainTimeout
+			},
+			WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+		})
+		modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+			ClusterProxy: input.BootstrapClusterProxy,
+			Cluster: cluster,
+			ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+				topology.NodeDrainTimeout = drainTimeout
+			},
+			WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 		})
 
+		By("Verify Node drains were unblocked")
+		// When we scale down the KCP, controlplane machines are deleted one by one, so it requires more time
+		// MD Machine deletion is done in parallel and will be faster.
+		nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), drainTimeout, controlPlaneReplicas)
+		Eventually(func(g Gomega) {
+			// When all drains complete we only have 1 control plane & 0 MD replicas left.
+			controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+				Lister: input.BootstrapClusterProxy.GetClient(),
+				ClusterName: cluster.Name,
+				Namespace: cluster.Namespace,
+			})
+			g.Expect(controlPlaneMachines).To(HaveLen(1))
+
+			for _, md := range machineDeployments {
+				machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+					Lister: input.BootstrapClusterProxy.GetClient(),
+					ClusterName: cluster.Name,
+					Namespace: cluster.Namespace,
+					MachineDeployment: *md,
+				})
+				g.Expect(machines).To(HaveLen(0))
+			}
+		}, nodeDrainTimeoutKCPInterval...).Should(Succeed())
+
 		By("PASSED!")
 	})
 
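Background on why the drains above block: DeployUnevictablePod pairs each Deployment with a PodDisruptionBudget (the old spec's comment called this out explicitly), so every eviction issued during a Node drain fails with "Cannot evict pod as it would violate the pod's disruption budget" until NodeDrainTimeout expires. The following is a minimal sketch of that pattern, assuming a budget that tolerates zero disruptions; the helper name, object name, and selector label are illustrative and not the framework's actual values.

// Hedged sketch of the unevictable-Pod pattern relied on by this spec.
// Names and the label selector are assumptions for illustration only.
package framework

import (
	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/utils/ptr"
)

// unevictablePodDisruptionBudget returns a PDB that blocks every eviction of the
// selected Pods: with minAvailable equal to the replica count, evicting any Pod
// would drop availability below the budget, so Node drains keep failing until
// the Machine's NodeDrainTimeout expires.
func unevictablePodDisruptionBudget(namespace string, replicas int32) *policyv1.PodDisruptionBudget {
	return &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "unevictable-pod-pdb", // hypothetical name
			Namespace: namespace,
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			// minAvailable == replicas: no eviction is ever allowed.
			MinAvailable: ptr.To(intstr.FromInt(int(replicas))),
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "unevictable-pod"},
			},
		},
	}
}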

test/e2e/node_drain_timeout_test.go

+1
@@ -32,6 +32,7 @@ var _ = Describe("When testing node drain timeout", func() {
 			BootstrapClusterProxy: bootstrapClusterProxy,
 			ArtifactFolder: artifactFolder,
 			SkipCleanup: skipCleanup,
+			Flavor: ptr.To("topology"),
 			InfrastructureProvider: ptr.To("docker"),
 		}
 	})
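The unblocking step in the spec relies on how Machine deletion interprets NodeDrainTimeout: a zero value means "wait for the drain forever", while a non-zero value lets deletion proceed once draining has been failing for longer than the configured duration. The sketch below is a hedged approximation of that check, not the Machine controller's actual implementation; the helper name is made up.

// Hedged sketch, approximating the timeout behavior the spec exercises.
package framework

import (
	"time"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// drainTimeoutExceeded reports whether a blocked drain may be abandoned.
func drainTimeoutExceeded(machine *clusterv1.Machine, now time.Time) bool {
	// A nil or zero NodeDrainTimeout means "wait for the drain forever".
	if machine.Spec.NodeDrainTimeout == nil || machine.Spec.NodeDrainTimeout.Duration <= 0 {
		return false
	}
	// The DrainingSucceeded condition is set while the controller is draining;
	// its last transition time approximates when draining started.
	condition := conditions.Get(machine, clusterv1.DrainingSucceededCondition)
	if condition == nil {
		return false
	}
	return now.Sub(condition.LastTransitionTime.Time) > machine.Spec.NodeDrainTimeout.Duration
}

With NodeDrainTimeout flipped from 0 to 1s, this check starts returning true almost immediately, which is why the blocked control plane and MachineDeployment Machines in the spec are then deleted and the Eventually assertions observe 1 control plane Machine and 0 MachineDeployment Machines.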
