
Commit eebff7b

Merge pull request #11127 from sbueringer/pr-improve-node-drain-e2e-test
🌱 Improve node drain e2e test
2 parents a8ae016 + dbe5b1a commit eebff7b

File tree

9 files changed: +446 −135 lines

Makefile (−1)

```diff
@@ -600,7 +600,6 @@ generate-e2e-templates-main: $(KUSTOMIZE)
 	echo "---" >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption/step2 --load-restrictor LoadRestrictionsNone >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
-	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
```

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go (+2 −2)

```diff
@@ -317,8 +317,8 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr
 			t.Machine,
 			corev1.EventTypeNormal,
 			EventDetectedUnhealthy,
-			"Machine %v has unhealthy node %v",
-			t.string(),
+			"Machine %s has unhealthy Node %s",
+			klog.KObj(t.Machine),
 			t.nodeName(),
 		)
 		nextCheckTimes = append(nextCheckTimes, nextCheck)
```
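For context on the message fix above: `klog.KObj` wraps an object in a reference whose `String()` renders as `namespace/name`, which is why the format verbs change from `%v` to `%s`. A minimal standalone sketch (the Machine object and Node name below are made up for illustration):

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	machine := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "worker-md-0-x7kqp"},
	}
	// klog.KObj returns a klog.ObjectRef that prints as "namespace/name",
	// so the recorded event reads e.g.
	// "Machine default/worker-md-0-x7kqp has unhealthy Node worker-node-1".
	fmt.Printf("Machine %s has unhealthy Node %s\n", klog.KObj(machine), "worker-node-1")
}
```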

test/e2e/config/docker.yaml (−2)

```diff
@@ -347,7 +347,6 @@ providers:
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
-      - sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
@@ -408,7 +407,6 @@ variables:
   CNI: "./data/cni/kindnet/kindnet.yaml"
   KUBETEST_CONFIGURATION: "./data/kubetest/conformance.yaml"
   AUTOSCALER_WORKLOAD: "./data/autoscaler/autoscaler-to-workload-workload.yaml"
-  NODE_DRAIN_TIMEOUT: "60s"
   # Enabling the feature flags by setting the env variables.
   # Note: EXP_CLUSTER_RESOURCE_SET & EXP_MACHINE_POOL are enabled per default with CAPI v1.7.0.
   # We still have to enable them here for clusterctl upgrade tests that use older versions.
```

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml (−9)

This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml (−8)

This file was deleted.

test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml (−8)

This file was deleted.

test/e2e/node_drain_timeout.go (+297 −42)

Large diffs are not rendered by default.

test/e2e/node_drain_timeout_test.go (+1)

```diff
@@ -32,6 +32,7 @@ var _ = Describe("When testing node drain timeout", func() {
 			BootstrapClusterProxy:  bootstrapClusterProxy,
 			ArtifactFolder:         artifactFolder,
 			SkipCleanup:            skipCleanup,
+			Flavor:                 ptr.To("topology"),
 			InfrastructureProvider: ptr.To("docker"),
 		}
 	})
```
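The test switches to the existing `topology` flavor now that the dedicated node-drain templates above are deleted. As a reminder of how a flavor resolves to a template, clusterctl uses a `cluster-template-<flavor>.yaml` naming scheme; the helper below is hypothetical, purely to illustrate that mapping:

```go
package main

import "fmt"

// templateNameForFlavor is a hypothetical helper mirroring clusterctl's
// naming convention: an empty flavor maps to cluster-template.yaml,
// anything else to cluster-template-<flavor>.yaml.
func templateNameForFlavor(flavor string) string {
	if flavor == "" {
		return "cluster-template.yaml"
	}
	return fmt.Sprintf("cluster-template-%s.yaml", flavor)
}

func main() {
	// With Flavor set to "topology", the test provisions the workload cluster
	// from cluster-template-topology.yaml instead of the deleted
	// cluster-template-node-drain.yaml.
	fmt.Println(templateNameForFlavor("topology")) // cluster-template-topology.yaml
}
```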

test/framework/deployment_helpers.go (+146 −63)

```diff
@@ -42,16 +42,14 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/apimachinery/pkg/util/intstr"
-	utilversion "k8s.io/apimachinery/pkg/util/version"
 	"k8s.io/apimachinery/pkg/util/wait"
-	"k8s.io/apimachinery/pkg/version"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/klog/v2"
-	"k8s.io/utils/ptr"
 	toolscache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
 	. "sigs.k8s.io/cluster-api/test/framework/ginkgoextensions"
 	"sigs.k8s.io/cluster-api/test/framework/internal/log"
@@ -493,115 +491,200 @@ func WaitForDNSUpgrade(ctx context.Context, input WaitForDNSUpgradeInput, interv
 type DeployUnevictablePodInput struct {
 	WorkloadClusterProxy ClusterProxy
 	ControlPlane         *controlplanev1.KubeadmControlPlane
+	MachineDeployment    *clusterv1.MachineDeployment
 	DeploymentName       string
 	Namespace            string
+	NodeSelector         map[string]string
 
 	WaitForDeploymentAvailableInterval []interface{}
 }
 
+// DeployUnevictablePod will deploy a Deployment on a ControlPlane or MachineDeployment.
+// It will deploy one Pod replica to each Machine and then deploy a PDB to ensure none of the Pods can be evicted.
 func DeployUnevictablePod(ctx context.Context, input DeployUnevictablePodInput) {
 	Expect(input.DeploymentName).ToNot(BeNil(), "Need a deployment name in DeployUnevictablePod")
 	Expect(input.Namespace).ToNot(BeNil(), "Need a namespace in DeployUnevictablePod")
 	Expect(input.WorkloadClusterProxy).ToNot(BeNil(), "Need a workloadClusterProxy in DeployUnevictablePod")
+	Expect((input.MachineDeployment == nil && input.ControlPlane != nil) ||
+		(input.MachineDeployment != nil && input.ControlPlane == nil)).To(BeTrue(), "Either MachineDeployment or ControlPlane must be set in DeployUnevictablePod")
 
 	EnsureNamespace(ctx, input.WorkloadClusterProxy.GetClient(), input.Namespace)
 
-	workloadDeployment := &appsv1.Deployment{
+	workloadDeployment := generateDeployment(generateDeploymentInput{
+		ControlPlane:      input.ControlPlane,
+		MachineDeployment: input.MachineDeployment,
+		Name:              input.DeploymentName,
+		Namespace:         input.Namespace,
+		NodeSelector:      input.NodeSelector,
+	})
+
+	workloadClient := input.WorkloadClusterProxy.GetClientSet()
+
+	AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
+		Namespace:  input.Namespace,
+		ClientSet:  workloadClient,
+		Deployment: workloadDeployment,
+	})
+
+	budget := &policyv1.PodDisruptionBudget{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      input.DeploymentName,
 			Namespace: input.Namespace,
 		},
+		Spec: policyv1.PodDisruptionBudgetSpec{
+			Selector: &metav1.LabelSelector{
+				MatchLabels: map[string]string{
+					"app":        "nonstop",
+					"deployment": input.DeploymentName,
+				},
+			},
+			// Setting MaxUnavailable to 0 means no Pods can be evicted / unavailable.
+			MaxUnavailable: &intstr.IntOrString{
+				Type:   intstr.Int,
+				IntVal: 0,
+			},
+		},
+	}
+
+	AddPodDisruptionBudget(ctx, AddPodDisruptionBudgetInput{
+		Namespace: input.Namespace,
+		ClientSet: workloadClient,
+		Budget:    budget,
+	})
+
+	WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
+		Getter:     input.WorkloadClusterProxy.GetClient(),
+		Deployment: workloadDeployment,
+	}, input.WaitForDeploymentAvailableInterval...)
+}
+
+type DeployEvictablePodInput struct {
+	WorkloadClusterProxy ClusterProxy
+	ControlPlane         *controlplanev1.KubeadmControlPlane
+	MachineDeployment    *clusterv1.MachineDeployment
+	DeploymentName       string
+	Namespace            string
+	NodeSelector         map[string]string
+
+	ModifyDeployment func(deployment *appsv1.Deployment)
+
+	WaitForDeploymentAvailableInterval []interface{}
+}
+
+// DeployEvictablePod will deploy a Deployment on a ControlPlane or MachineDeployment.
+// It will deploy one Pod replica to each Machine.
+func DeployEvictablePod(ctx context.Context, input DeployEvictablePodInput) {
+	Expect(input.DeploymentName).ToNot(BeNil(), "Need a deployment name in DeployUnevictablePod")
+	Expect(input.Namespace).ToNot(BeNil(), "Need a namespace in DeployUnevictablePod")
+	Expect(input.WorkloadClusterProxy).ToNot(BeNil(), "Need a workloadClusterProxy in DeployUnevictablePod")
+	Expect((input.MachineDeployment == nil && input.ControlPlane != nil) ||
+		(input.MachineDeployment != nil && input.ControlPlane == nil)).To(BeTrue(), "Either MachineDeployment or ControlPlane must be set in DeployUnevictablePod")
+
+	EnsureNamespace(ctx, input.WorkloadClusterProxy.GetClient(), input.Namespace)
+
+	workloadDeployment := generateDeployment(generateDeploymentInput{
+		ControlPlane:      input.ControlPlane,
+		MachineDeployment: input.MachineDeployment,
+		Name:              input.DeploymentName,
+		Namespace:         input.Namespace,
+		NodeSelector:      input.NodeSelector,
+	})
+
+	input.ModifyDeployment(workloadDeployment)
+
+	workloadClient := input.WorkloadClusterProxy.GetClientSet()
+
+	AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
+		Namespace:  input.Namespace,
+		ClientSet:  workloadClient,
+		Deployment: workloadDeployment,
+	})
+
+	WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
+		Getter:     input.WorkloadClusterProxy.GetClient(),
+		Deployment: workloadDeployment,
+	}, input.WaitForDeploymentAvailableInterval...)
+}
+
+type generateDeploymentInput struct {
+	ControlPlane      *controlplanev1.KubeadmControlPlane
+	MachineDeployment *clusterv1.MachineDeployment
+	Name              string
+	Namespace         string
+	NodeSelector      map[string]string
+}
+
+func generateDeployment(input generateDeploymentInput) *appsv1.Deployment {
+	workloadDeployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      input.Name,
+			Namespace: input.Namespace,
+		},
 		Spec: appsv1.DeploymentSpec{
-			Replicas: ptr.To[int32](4),
 			Selector: &metav1.LabelSelector{
 				MatchLabels: map[string]string{
-					"app": "nonstop",
+					"app":        "nonstop",
+					"deployment": input.Name,
 				},
 			},
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels: map[string]string{
-						"app": "nonstop",
+						"app":        "nonstop",
+						"deployment": input.Name,
 					},
 				},
 				Spec: corev1.PodSpec{
 					Containers: []corev1.Container{
 						{
-							Name:  "web",
+							Name:  "main",
 							Image: "registry.k8s.io/pause:3.10",
 						},
 					},
+					Affinity: &corev1.Affinity{
+						// Make sure only 1 Pod of this Deployment can run on the same Node.
+						PodAntiAffinity: &corev1.PodAntiAffinity{
+							RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
+								{
+									LabelSelector: &metav1.LabelSelector{
+										MatchExpressions: []metav1.LabelSelectorRequirement{
+											{
+												Key:      "deployment",
+												Operator: "In",
+												Values:   []string{input.Name},
+											},
+										},
+									},
+									TopologyKey: "kubernetes.io/hostname",
+								},
+							},
+						},
+					},
 				},
 			},
 		},
 	}
-	workloadClient := input.WorkloadClusterProxy.GetClientSet()
 
 	if input.ControlPlane != nil {
-		var serverVersion *version.Info
-		Eventually(func() error {
-			var err error
-			serverVersion, err = workloadClient.ServerVersion()
-			return err
-		}, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "failed to get server version")
-
-		// Use the control-plane label for Kubernetes version >= v1.20.0.
-		if utilversion.MustParseGeneric(serverVersion.String()).AtLeast(utilversion.MustParseGeneric("v1.20.0")) {
-			workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleControlPlane: ""}
-		} else {
-			workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleOldControlPlane: ""}
-		}
-
+		workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleControlPlane: ""}
 		workloadDeployment.Spec.Template.Spec.Tolerations = []corev1.Toleration{
-			{
-				Key:    nodeRoleOldControlPlane,
-				Effect: "NoSchedule",
-			},
 			{
 				Key:    nodeRoleControlPlane,
 				Effect: "NoSchedule",
 			},
 		}
+		workloadDeployment.Spec.Replicas = input.ControlPlane.Spec.Replicas
 	}
-	AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
-		Namespace:  input.Namespace,
-		ClientSet:  workloadClient,
-		Deployment: workloadDeployment,
-	})
-
-	budget := &policyv1.PodDisruptionBudget{
-		TypeMeta: metav1.TypeMeta{
-			Kind:       "PodDisruptionBudget",
-			APIVersion: "policy/v1",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      input.DeploymentName,
-			Namespace: input.Namespace,
-		},
-		Spec: policyv1.PodDisruptionBudgetSpec{
-			Selector: &metav1.LabelSelector{
-				MatchLabels: map[string]string{
-					"app": "nonstop",
-				},
-			},
-			MaxUnavailable: &intstr.IntOrString{
-				Type:   intstr.Int,
-				IntVal: 1,
-				StrVal: "1",
-			},
-		},
+	if input.MachineDeployment != nil {
+		workloadDeployment.Spec.Replicas = input.MachineDeployment.Spec.Replicas
 	}
 
-	AddPodDisruptionBudget(ctx, AddPodDisruptionBudgetInput{
-		Namespace: input.Namespace,
-		ClientSet: workloadClient,
-		Budget:    budget,
-	})
+	// Note: If set, the NodeSelector field overwrites the NodeSelector we set above for control plane nodes.
+	if input.NodeSelector != nil {
+		workloadDeployment.Spec.Template.Spec.NodeSelector = input.NodeSelector
+	}
 
-	WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
-		Getter:     input.WorkloadClusterProxy.GetClient(),
-		Deployment: workloadDeployment,
-	}, input.WaitForDeploymentAvailableInterval...)
+	return workloadDeployment
 }
 
 type AddDeploymentToWorkloadClusterInput struct {
```
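Taken together, the refactored helper can now target worker Machines as well as the control plane. A minimal usage sketch, assuming the usual e2e suite variables (`ctx`, `workloadClusterProxy`, `machineDeployment`, `e2eConfig`) plus a hypothetical deployment name and interval key:

```go
// One Pod replica lands on each Machine (enforced by the pod anti-affinity in
// generateDeployment), and the PDB with MaxUnavailable: 0 blocks every
// eviction, so a subsequent drain runs into the configured NodeDrainTimeout.
framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
	WorkloadClusterProxy: workloadClusterProxy,
	// Exactly one of MachineDeployment / ControlPlane must be set.
	MachineDeployment: machineDeployment,
	DeploymentName:    "unevictable-pod",      // hypothetical name
	Namespace:         "unevictable-workload", // hypothetical namespace
	// Interval key is an assumption for illustration.
	WaitForDeploymentAvailableInterval: e2eConfig.GetIntervals("node-drain", "wait-deployment-available"),
})
```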
