Commit b30bdb4

chrischdi authored and sbueringer committed
🌱 test: add PreWaitForControlplaneToBeUpgraded to ClusterUpgradeConformanceSpec (kubernetes-sigs#11145)
* test: add PreWaitForControlplaneToBeUpgraded to ClusterUpgradeConformanceSpec
* test: add template for kcp-pre-drain
* test: adjust multi-controlplane quickstart test to check for all nodes and kube-proxy being healthy via a pre-drain hook
* lint fix
* Review fixes
* review fixes
* review fixes
* review fix
1 parent 2ad269c commit b30bdb4

File tree

7 files changed: +184 -14 lines changed

Makefile

+1
@@ -602,6 +602,7 @@ generate-e2e-templates-main: $(KUSTOMIZE)
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
+	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-ipv6 --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-ipv6.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary.yaml

test/e2e/cluster_upgrade.go

+19 -1

@@ -66,6 +66,10 @@ type ClusterUpgradeConformanceSpecInput struct {
 	// Allows to inject a function to be run after test namespace is created.
 	// If not specified, this is a no-op.
 	PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)
+
+	// Allows to inject a function to be run before waiting for control-plane machines to be upgraded.
+	// If not specified, this is a no-op.
+	PreWaitForControlPlaneToBeUpgraded func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string)
 }

 // ClusterUpgradeConformanceSpec implements a spec that upgrades a cluster and runs the Kubernetes conformance suite.
@@ -93,6 +97,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust

 		clusterResources       *clusterctl.ApplyClusterTemplateAndWaitResult
 		kubetestConfigFilePath string
+
+		clusterName string
 	)

 	BeforeEach(func() {
@@ -142,6 +148,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 			infrastructureProvider = *input.InfrastructureProvider
 		}

+		clusterName = fmt.Sprintf("%s-%s", specName, util.RandomString(6))
+
 		clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
 			ClusterProxy: input.BootstrapClusterProxy,
 			ConfigCluster: clusterctl.ConfigClusterInput{
@@ -151,7 +159,7 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				InfrastructureProvider:   infrastructureProvider,
 				Flavor:                   ptr.Deref(input.Flavor, "upgrades"),
 				Namespace:                namespace.Name,
-				ClusterName:              fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
+				ClusterName:              clusterName,
 				KubernetesVersion:        input.E2EConfig.GetVariable(KubernetesVersionUpgradeFrom),
 				ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
 				WorkerMachineCount:       ptr.To[int64](workerMachineCount),
@@ -180,6 +188,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForDNSUpgrade:       input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForEtcdUpgrade:      input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
+				PreWaitForControlPlaneToBeUpgraded: func() {
+					if input.PreWaitForControlPlaneToBeUpgraded != nil {
+						input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
+					}
+				},
 			})
 		} else {
 			// Cluster is not using ClusterClass, upgrade via individual resources.
@@ -209,6 +222,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForDNSUpgrade:       input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForEtcdUpgrade:      input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
+				PreWaitForControlPlaneToBeUpgraded: func() {
+					if input.PreWaitForControlPlaneToBeUpgraded != nil {
+						input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
+					}
+				},
 			})

 			if workerMachineCount > 0 {
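
For orientation, here is a minimal sketch (not part of this commit) of how a spec could wire the new optional hook. It assumes the usual package-level variables of the e2e suite (ctx, e2eConfig, clusterctlConfigPath, bootstrapClusterProxy, artifactFolder, skipCleanup) and an illustrative hook body; apart from PreWaitForControlPlaneToBeUpgraded, the fields follow the existing ClusterUpgradeConformanceSpecInput.

package e2e

import (
	. "github.com/onsi/ginkgo/v2"
	"k8s.io/utils/ptr"

	"sigs.k8s.io/cluster-api/test/e2e/internal/log"
	"sigs.k8s.io/cluster-api/test/framework"
)

// Sketch: a spec that plugs a custom check into the new hook. The callback runs
// after the new Kubernetes version has been patched and before the spec waits
// for control-plane Machines to be upgraded.
var _ = Describe("Example: upgrade with a custom pre-wait check [ClusterClass]", func() {
	ClusterUpgradeConformanceSpec(ctx, func() ClusterUpgradeConformanceSpecInput {
		return ClusterUpgradeConformanceSpecInput{
			E2EConfig:                e2eConfig,
			ClusterctlConfigPath:     clusterctlConfigPath,
			BootstrapClusterProxy:    bootstrapClusterProxy,
			ArtifactFolder:           artifactFolder,
			SkipCleanup:              skipCleanup,
			SkipConformanceTests:     true,
			ControlPlaneMachineCount: ptr.To[int64](3),
			WorkerMachineCount:       ptr.To[int64](1),
			Flavor:                   ptr.To("topology"),
			PreWaitForControlPlaneToBeUpgraded: func(proxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) {
				// Illustrative only: custom assertions against the management or
				// workload cluster (via proxy.GetClient() / proxy.GetWorkloadCluster)
				// would go here.
				log.Logf("Running pre-wait checks for cluster %s/%s", workloadClusterNamespace, workloadClusterName)
			},
		}
	})
})

Leaving the field nil keeps the previous behaviour, since both call sites above guard the callback with a nil check.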

test/e2e/cluster_upgrade_test.go

+129 -2

@@ -21,7 +21,20 @@ package e2e

 import (
 	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/pkg/errors"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/klog/v2"
 	"k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/test/e2e/internal/log"
+	"sigs.k8s.io/cluster-api/test/framework"
+	"sigs.k8s.io/cluster-api/util/conditions"
+	"sigs.k8s.io/cluster-api/util/patch"
 )

 var _ = Describe("When upgrading a workload cluster using ClusterClass and testing K8S conformance [Conformance] [K8s-Upgrade] [ClusterClass]", func() {
@@ -58,6 +71,7 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass [ClusterC
 })

 var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA control plane [ClusterClass]", func() {
+	controlPlaneMachineCount := int64(3)
 	ClusterUpgradeConformanceSpec(ctx, func() ClusterUpgradeConformanceSpecInput {
 		return ClusterUpgradeConformanceSpecInput{
 			E2EConfig: e2eConfig,
@@ -69,9 +83,122 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA
 			// This test is run in CI in parallel with other tests. To keep the test duration reasonable
 			// the conformance tests are skipped.
 			SkipConformanceTests:     true,
-			ControlPlaneMachineCount: ptr.To[int64](3),
+			ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
 			WorkerMachineCount:       ptr.To[int64](1),
-			Flavor:                   ptr.To("topology"),
+			Flavor:                   ptr.To("kcp-pre-drain"),
+			PreWaitForControlPlaneToBeUpgraded: func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) {
+				log.Logf("Waiting for control-plane machines to have the upgraded Kubernetes version")
+
+				preDrainHook := "pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check"
+
+				cluster := &clusterv1.Cluster{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: workloadClusterNamespace,
+						Name:      workloadClusterName,
+					},
+				}
+				Expect(managementClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(cluster), cluster)).To(Succeed())
+
+				// This replaces the WaitForControlPlaneMachinesToBeUpgraded function to verify via a pre-drain hook
+				// that all static Pods, kube-proxy and all Nodes are becoming healthy before we let the upgrade
+				// process proceed by removing the pre-drain hook.
+				// This captures cases where static Pods, kube-proxy and all Nodes would only become healthy after
+				// all control plane Machines have been upgraded.
+				Eventually(func() (int64, error) {
+					machines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+						Lister:      managementClusterProxy.GetClient(),
+						ClusterName: cluster.Name,
+						Namespace:   cluster.Namespace,
+					})
+
+					// Collect information about:
+					// * how many control-plane machines are already upgraded and healthy
+					// * control-plane machines which are in deletion
+					// * workload cluster nodes
+					// * kube-proxy pods
+					var upgradedAndHealthy int64
+					deletingMachines := []clusterv1.Machine{}
+					for _, m := range machines {
+						if *m.Spec.Version == cluster.Spec.Topology.Version && conditions.IsTrue(&m, clusterv1.MachineNodeHealthyCondition) {
+							upgradedAndHealthy++
+						}
+						if !m.DeletionTimestamp.IsZero() {
+							deletingMachines = append(deletingMachines, m)
+						}
+					}
+
+					wlClient := managementClusterProxy.GetWorkloadCluster(ctx, workloadClusterNamespace, workloadClusterName).GetClient()
+					nodes := corev1.NodeList{}
+					if err := wlClient.List(ctx, &nodes); err != nil {
+						return 0, errors.Wrapf(err, "failed to list nodes in workload cluster")
+					}
+
+					kubeProxyPods := corev1.PodList{}
+					if err := wlClient.List(ctx, &kubeProxyPods, client.InNamespace(metav1.NamespaceSystem), client.MatchingLabels{"k8s-app": "kube-proxy"}); err != nil {
+						return 0, errors.Wrapf(err, "failed to list kube-proxy pods in workload cluster")
+					}
+
+					errList := []error{}
+
+					// Check all nodes to be Ready.
+					for _, node := range nodes.Items {
+						for _, condition := range node.Status.Conditions {
+							if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
+								errList = append(errList, errors.Errorf("expected the Ready condition for Node %s to be true but got %s instead: %s", node.GetName(), condition.Status, condition.Message))
+							}
+						}
+					}
+
+					// Check if the expected number of kube-proxy pods exist and all of them are healthy for all existing Nodes of the Cluster.
+					if len(nodes.Items) != len(kubeProxyPods.Items) {
+						errList = append(errList, errors.Errorf("expected %d kube-proxy pods to exist, got %d", len(nodes.Items), len(kubeProxyPods.Items)))
+					}
+					for _, pod := range kubeProxyPods.Items {
+						for _, condition := range pod.Status.Conditions {
+							if condition.Type == corev1.PodReady && condition.Status != corev1.ConditionTrue {
+								errList = append(errList, errors.Errorf("expected the Ready condition for Pod %s to be true but got %s instead: %s", pod.GetName(), condition.Status, condition.Message))
+							}
+						}
+					}
+
+					if err := kerrors.NewAggregate(errList); err != nil {
+						return 0, errors.Wrap(err, "blocking upgrade because cluster is not stable")
+					}
+
+					// At this stage all current machines are considered ok, so remove the pre-drain hook from a control-plane Machine to unblock the next step of the upgrade.
+					if len(deletingMachines) > 0 {
+						if len(deletingMachines) > 1 {
+							return 0, errors.Errorf("expected a maximum of 1 machine to be in deleting but got %d", len(deletingMachines))
+						}
+
+						m := &deletingMachines[0]
+
+						if m.Annotations[preDrainHook] != "true" {
+							return 0, errors.Errorf("machine %s is in deletion but does not have pre-drain hook %q", klog.KObj(m), preDrainHook)
+						}
+
+						// Remove the pre-drain hook from the machine.
+						patchHelper, err := patch.NewHelper(m, managementClusterProxy.GetClient())
+						if err != nil {
+							return 0, err
+						}
+						delete(m.Annotations, preDrainHook)
+
+						if err := patchHelper.Patch(ctx, m); err != nil {
+							return 0, err
+						}
+
+						// Return to enter the function again.
+						return 0, errors.Errorf("deletion of Machine %s was blocked by pre-drain hook", klog.KObj(m))
+					}
+
+					if int64(len(machines)) > upgradedAndHealthy {
+						return 0, errors.New("old Machines remain")
+					}
+
+					return upgradedAndHealthy, nil
+				}, e2eConfig.GetIntervals("k8s-upgrade-and-conformance", "wait-machine-upgrade")...).Should(Equal(controlPlaneMachineCount), "Timed out waiting for all control-plane machines in Cluster %s to be upgraded to kubernetes version %s", klog.KObj(cluster), cluster.Spec.Topology.Version)
+			},
 		}
 	})
 })
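
The hook above leans on Cluster API's pre-drain machine deletion hooks: as long as a Machine carries an annotation with the pre-drain.delete.hook.machine.cluster.x-k8s.io/ prefix, the Machine controller pauses before draining the Node, which gives the test a stable point to assert that Nodes and kube-proxy are healthy before the rollout continues. A minimal, hypothetical helper (the function name and signature are illustrative, not part of this commit) isolating the unblocking step:

package e2e

import (
	"context"

	"github.com/pkg/errors"
	"k8s.io/klog/v2"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/patch"
)

// removePreDrainHook removes a pre-drain deletion hook annotation from a Machine
// so the Machine controller can continue with drain and deletion. It mirrors the
// unblocking step used in the test above.
func removePreDrainHook(ctx context.Context, c client.Client, m *clusterv1.Machine, hook string) error {
	if m.Annotations[hook] != "true" {
		return errors.Errorf("machine %s does not have pre-drain hook %q", klog.KObj(m), hook)
	}

	patchHelper, err := patch.NewHelper(m, c)
	if err != nil {
		return err
	}

	// Dropping the annotation and patching the Machine persists the removal; the
	// Machine controller proceeds with the drain on its next reconcile.
	delete(m.Annotations, hook)
	return patchHelper.Patch(ctx, m)
}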

test/e2e/config/docker.yaml

+1
@@ -309,6 +309,7 @@ providers:
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
+      - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-pre-drain.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"

test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/cluster.yaml

+12

@@ -0,0 +1,12 @@
+---
+apiVersion: cluster.x-k8s.io/v1beta1
+kind: Cluster
+metadata:
+  name: '${CLUSTER_NAME}'
+spec:
+  topology:
+    class: quick-start
+    controlPlane:
+      metadata:
+        annotations:
+          pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check: "true"

test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/kustomization.yaml

+4

@@ -0,0 +1,4 @@
+resources:
+- ../cluster-template-upgrades
+patches:
+- path: cluster.yaml

test/framework/controlplane_helpers.go

+18 -11

@@ -306,17 +306,18 @@ func WaitForControlPlaneAndMachinesReady(ctx context.Context, input WaitForContr

 // UpgradeControlPlaneAndWaitForUpgradeInput is the input type for UpgradeControlPlaneAndWaitForUpgrade.
 type UpgradeControlPlaneAndWaitForUpgradeInput struct {
-	ClusterProxy                ClusterProxy
-	Cluster                     *clusterv1.Cluster
-	ControlPlane                *controlplanev1.KubeadmControlPlane
-	KubernetesUpgradeVersion    string
-	UpgradeMachineTemplate      *string
-	EtcdImageTag                string
-	DNSImageTag                 string
-	WaitForMachinesToBeUpgraded []interface{}
-	WaitForDNSUpgrade           []interface{}
-	WaitForKubeProxyUpgrade     []interface{}
-	WaitForEtcdUpgrade          []interface{}
+	ClusterProxy                       ClusterProxy
+	Cluster                            *clusterv1.Cluster
+	ControlPlane                       *controlplanev1.KubeadmControlPlane
+	KubernetesUpgradeVersion           string
+	UpgradeMachineTemplate             *string
+	EtcdImageTag                       string
+	DNSImageTag                        string
+	WaitForMachinesToBeUpgraded        []interface{}
+	WaitForDNSUpgrade                  []interface{}
+	WaitForKubeProxyUpgrade            []interface{}
+	WaitForEtcdUpgrade                 []interface{}
+	PreWaitForControlPlaneToBeUpgraded func()
 }

 // UpgradeControlPlaneAndWaitForUpgrade upgrades a KubeadmControlPlane and waits for it to be upgraded.
@@ -357,6 +358,12 @@ func UpgradeControlPlaneAndWaitForUpgrade(ctx context.Context, input UpgradeCont
 		return patchHelper.Patch(ctx, input.ControlPlane)
 	}, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch the new kubernetes version to KCP %s", klog.KObj(input.ControlPlane))

+	// Once we have patched the Kubernetes version we can run PreWaitForControlPlaneToBeUpgraded.
+	if input.PreWaitForControlPlaneToBeUpgraded != nil {
+		log.Logf("Calling PreWaitForControlPlaneToBeUpgraded")
+		input.PreWaitForControlPlaneToBeUpgraded()
+	}
+
 	log.Logf("Waiting for control-plane machines to have the upgraded kubernetes version")
 	WaitForControlPlaneMachinesToBeUpgraded(ctx, WaitForControlPlaneMachinesToBeUpgradedInput{
 		Lister: mgmtClient,
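
Used directly (outside the spec), the framework helper takes the same optional callback. A hedged sketch, assuming the surrounding test already has a cluster proxy, Cluster, KubeadmControlPlane and wait intervals in scope; the wrapper function itself is illustrative:

package e2e

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
	"sigs.k8s.io/cluster-api/test/framework"
)

// upgradeWithPreWait is a sketch only: it shows where the new
// PreWaitForControlPlaneToBeUpgraded callback fires when
// UpgradeControlPlaneAndWaitForUpgrade is called directly.
func upgradeWithPreWait(ctx context.Context, proxy framework.ClusterProxy, cluster *clusterv1.Cluster,
	controlPlane *controlplanev1.KubeadmControlPlane, version string, intervals []interface{}) {
	framework.UpgradeControlPlaneAndWaitForUpgrade(ctx, framework.UpgradeControlPlaneAndWaitForUpgradeInput{
		ClusterProxy:                proxy,
		Cluster:                     cluster,
		ControlPlane:                controlPlane,
		KubernetesUpgradeVersion:    version,
		WaitForMachinesToBeUpgraded: intervals,
		WaitForKubeProxyUpgrade:     intervals,
		WaitForDNSUpgrade:           intervals,
		WaitForEtcdUpgrade:          intervals,
		// Invoked right after the version has been patched into the KubeadmControlPlane
		// and before the helper starts waiting for Machines, kube-proxy, DNS and etcd.
		PreWaitForControlPlaneToBeUpgraded: func() {
			// Custom blocking checks go here; a nil callback keeps the old behaviour.
		},
	})
}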
