Skip to content

[release-1.8] 🌱 test: add PreWaitForControlplaneToBeUpgraded to ClusterUpgradeConformanceSpec #11303

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ generate-e2e-templates-main: $(KUSTOMIZE)
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-ipv6 --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-ipv6.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary.yaml
Expand Down
20 changes: 19 additions & 1 deletion test/e2e/cluster_upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ type ClusterUpgradeConformanceSpecInput struct {
// Allows to inject a function to be run after test namespace is created.
// If not specified, this is a no-op.
PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)

// Allows to inject a function to be run before checking control-plane machines to be upgraded.
// If not specified, this is a no-op.
PreWaitForControlPlaneToBeUpgraded func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string)
}

// ClusterUpgradeConformanceSpec implements a spec that upgrades a cluster and runs the Kubernetes conformance suite.
Expand Down Expand Up @@ -93,6 +97,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust

clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
kubetestConfigFilePath string

clusterName string
)

BeforeEach(func() {
Expand Down Expand Up @@ -142,6 +148,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
infrastructureProvider = *input.InfrastructureProvider
}

clusterName = fmt.Sprintf("%s-%s", specName, util.RandomString(6))

clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
ClusterProxy: input.BootstrapClusterProxy,
ConfigCluster: clusterctl.ConfigClusterInput{
Expand All @@ -151,7 +159,7 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
InfrastructureProvider: infrastructureProvider,
Flavor: ptr.Deref(input.Flavor, "upgrades"),
Namespace: namespace.Name,
ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
ClusterName: clusterName,
KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersionUpgradeFrom),
ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
WorkerMachineCount: ptr.To[int64](workerMachineCount),
Expand Down Expand Up @@ -180,6 +188,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForDNSUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForEtcdUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
PreWaitForControlPlaneToBeUpgraded: func() {
if input.PreWaitForControlPlaneToBeUpgraded != nil {
input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
}
},
})
} else {
// Cluster is not using ClusterClass, upgrade via individual resources.
Expand Down Expand Up @@ -209,6 +222,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForDNSUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForEtcdUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
PreWaitForControlPlaneToBeUpgraded: func() {
if input.PreWaitForControlPlaneToBeUpgraded != nil {
input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
}
},
})

if workerMachineCount > 0 {
Expand Down
131 changes: 129 additions & 2 deletions test/e2e/cluster_upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,20 @@ package e2e

import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/test/e2e/internal/log"
"sigs.k8s.io/cluster-api/test/framework"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
)

var _ = Describe("When upgrading a workload cluster using ClusterClass and testing K8S conformance [Conformance] [K8s-Upgrade] [ClusterClass]", func() {
Expand Down Expand Up @@ -58,6 +71,7 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass [ClusterC
})

var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA control plane [ClusterClass]", func() {
controlPlaneMachineCount := int64(3)
ClusterUpgradeConformanceSpec(ctx, func() ClusterUpgradeConformanceSpecInput {
return ClusterUpgradeConformanceSpecInput{
E2EConfig: e2eConfig,
Expand All @@ -69,9 +83,122 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA
// This test is run in CI in parallel with other tests. To keep the test duration reasonable
// the conformance tests are skipped.
SkipConformanceTests: true,
ControlPlaneMachineCount: ptr.To[int64](3),
ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
WorkerMachineCount: ptr.To[int64](1),
Flavor: ptr.To("topology"),
Flavor: ptr.To("kcp-pre-drain"),
// PreWaitForControlPlaneToBeUpgraded gates each control-plane Machine deletion during
// the upgrade behind a pre-drain deletion hook: the cluster must be stable (all Nodes
// Ready, one healthy kube-proxy Pod per Node) before the hook is removed and the
// rollout is allowed to continue with the next Machine.
PreWaitForControlPlaneToBeUpgraded: func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) {
	log.Logf("Waiting for control-plane machines to have the upgraded Kubernetes version")

	// Annotation key of the pre-drain deletion hook. Must match the annotation set on
	// control-plane machines by the "kcp-pre-drain" cluster template.
	preDrainHook := "pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check"

	// Fetch the workload Cluster from the management cluster so we can read its
	// topology version and use it for Machine lookups below.
	cluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: workloadClusterNamespace,
			Name:      workloadClusterName,
		},
	}
	Expect(managementClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(cluster), cluster)).To(Succeed())

	// This replaces the WaitForControlPlaneMachinesToBeUpgraded function to verify via a pre-drain hook
	// that all static Pods, kube-proxy and all Nodes are becoming healthy before we let the upgrade
	// process proceed by removing the pre-drain hook.
	// This captures cases where static Pods, kube-proxy and all Nodes would only become healthy after
	// all control plane Machines have been upgraded.
	Eventually(func() (int64, error) {
		machines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
			Lister:      managementClusterProxy.GetClient(),
			ClusterName: cluster.Name,
			Namespace:   cluster.Namespace,
		})

		// Collect information about:
		// * how many control-plane machines are already upgraded and healthy
		// * control-plane machines which are in deletion
		// * workload cluster nodes
		// * kube-proxy pods
		var upgradedAndHealthy int64
		deletingMachines := []clusterv1.Machine{}
		for _, m := range machines {
			// A Machine counts as upgraded when its version matches the topology's
			// target version and its Node is reported healthy.
			if *m.Spec.Version == cluster.Spec.Topology.Version && conditions.IsTrue(&m, clusterv1.MachineNodeHealthyCondition) {
				upgradedAndHealthy++
			}
			if !m.DeletionTimestamp.IsZero() {
				deletingMachines = append(deletingMachines, m)
			}
		}

		wlClient := managementClusterProxy.GetWorkloadCluster(ctx, workloadClusterNamespace, workloadClusterName).GetClient()
		nodes := corev1.NodeList{}
		if err := wlClient.List(ctx, &nodes); err != nil {
			return 0, errors.Wrapf(err, "failed to list nodes in workload cluster")
		}

		kubeProxyPods := corev1.PodList{}
		if err := wlClient.List(ctx, &kubeProxyPods, client.InNamespace(metav1.NamespaceSystem), client.MatchingLabels{"k8s-app": "kube-proxy"}); err != nil {
			return 0, errors.Wrapf(err, "failed to list kube-proxy pods in workload cluster")
		}

		errList := []error{}

		// Check all nodes to be Ready.
		for _, node := range nodes.Items {
			for _, condition := range node.Status.Conditions {
				if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
					errList = append(errList, errors.Errorf("expected the Ready condition for Node %s to be true but got %s instead: %s", node.GetName(), condition.Status, condition.Message))
				}
			}
		}

		// Check if the expected number of kube-proxy pods exist and all of them are healthy for all existing Nodes of the Cluster.
		if len(nodes.Items) != len(kubeProxyPods.Items) {
			errList = append(errList, errors.Errorf("expected %d kube-proxy pods to exist, got %d", len(nodes.Items), len(kubeProxyPods.Items)))
		}
		for _, pod := range kubeProxyPods.Items {
			for _, condition := range pod.Status.Conditions {
				if condition.Type == corev1.PodReady && condition.Status != corev1.ConditionTrue {
					errList = append(errList, errors.Errorf("expected the Ready condition for Pod %s to be true but got %s instead: %s", pod.GetName(), condition.Status, condition.Message))
				}
			}
		}

		// Returning an error keeps the pre-drain hook in place, blocking the upgrade
		// until the cluster becomes stable on a later poll.
		if err := kerrors.NewAggregate(errList); err != nil {
			return 0, errors.Wrap(err, "blocking upgrade because cluster is not stable")
		}

		// At this stage all current machines are considered ok, so remove the pre-drain webhook from a CP to unblock the next step of the upgrade.
		if len(deletingMachines) > 0 {
			// KCP rolls out control-plane machines one at a time, so more than one
			// Machine in deletion indicates something unexpected happened.
			if len(deletingMachines) > 1 {
				return 0, errors.Errorf("expected a maximum of 1 machine to be in deleting but got %d", len(deletingMachines))
			}

			m := &deletingMachines[0]

			if m.Annotations[preDrainHook] != "true" {
				return 0, errors.Errorf("machine %s is in deletion but does not have pre-drain hook %q", klog.KObj(m), preDrainHook)
			}

			// Removing pre-drain hook from machine.
			patchHelper, err := patch.NewHelper(m, managementClusterProxy.GetClient())
			if err != nil {
				return 0, err
			}
			delete(m.Annotations, preDrainHook)

			if err := patchHelper.Patch(ctx, m); err != nil {
				return 0, err
			}

			// Return to enter the function again.
			return 0, errors.Errorf("deletion of Machine %s was blocked by pre-drain hook", klog.KObj(m))
		}

		// Not all Machines have reached the target version yet; keep polling.
		if int64(len(machines)) > upgradedAndHealthy {
			return 0, errors.New("old Machines remain")
		}

		return upgradedAndHealthy, nil
	}, e2eConfig.GetIntervals("k8s-upgrade-and-conformance", "wait-machine-upgrade")...).Should(Equal(controlPlaneMachineCount), "Timed out waiting for all control-plane machines in Cluster %s to be upgraded to kubernetes version %s", klog.KObj(cluster), cluster.Spec.Topology.Version)
},
}
})
})
Expand Down
1 change: 1 addition & 0 deletions test/e2e/config/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ providers:
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-pre-drain.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Patch overlay for the "kcp-pre-drain" e2e cluster template (applied via the
# sibling kustomization.yaml on top of the standard "upgrades" template).
# It adds a pre-drain deletion hook annotation to all control-plane machines,
# which blocks Machine drain/deletion during the upgrade until the e2e test
# (PreWaitForControlPlaneToBeUpgraded) removes the annotation.
apiVersion: cluster.x-k8s.io/v1beta1
kind: Cluster
metadata:
  name: '${CLUSTER_NAME}'
spec:
  topology:
    class: quick-start
    controlPlane:
      metadata:
        annotations:
          # Pre-drain deletion hook; the key must match the hook name the e2e
          # test checks for and removes ("kcp-ready-check").
          pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Builds the "kcp-pre-drain" cluster template by layering the cluster.yaml
# patch (pre-drain hook annotation on control-plane machines) on top of the
# standard "upgrades" cluster template.
resources:
- ../cluster-template-upgrades
patches:
- path: cluster.yaml
29 changes: 18 additions & 11 deletions test/framework/controlplane_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,18 @@ func WaitForControlPlaneAndMachinesReady(ctx context.Context, input WaitForContr

// UpgradeControlPlaneAndWaitForUpgradeInput is the input type for UpgradeControlPlaneAndWaitForUpgrade.
type UpgradeControlPlaneAndWaitForUpgradeInput struct {
ClusterProxy ClusterProxy
Cluster *clusterv1.Cluster
ControlPlane *controlplanev1.KubeadmControlPlane
KubernetesUpgradeVersion string
UpgradeMachineTemplate *string
EtcdImageTag string
DNSImageTag string
WaitForMachinesToBeUpgraded []interface{}
WaitForDNSUpgrade []interface{}
WaitForKubeProxyUpgrade []interface{}
WaitForEtcdUpgrade []interface{}
ClusterProxy ClusterProxy
Cluster *clusterv1.Cluster
ControlPlane *controlplanev1.KubeadmControlPlane
KubernetesUpgradeVersion string
UpgradeMachineTemplate *string
EtcdImageTag string
DNSImageTag string
WaitForMachinesToBeUpgraded []interface{}
WaitForDNSUpgrade []interface{}
WaitForKubeProxyUpgrade []interface{}
WaitForEtcdUpgrade []interface{}
PreWaitForControlPlaneToBeUpgraded func()
}

// UpgradeControlPlaneAndWaitForUpgrade upgrades a KubeadmControlPlane and waits for it to be upgraded.
Expand Down Expand Up @@ -357,6 +358,12 @@ func UpgradeControlPlaneAndWaitForUpgrade(ctx context.Context, input UpgradeCont
return patchHelper.Patch(ctx, input.ControlPlane)
}, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch the new kubernetes version to KCP %s", klog.KObj(input.ControlPlane))

// Once we have patched the Kubernetes Cluster we can run PreWaitForControlPlaneToBeUpgraded.
if input.PreWaitForControlPlaneToBeUpgraded != nil {
log.Logf("Calling PreWaitForControlPlaneToBeUpgraded")
input.PreWaitForControlPlaneToBeUpgraded()
}

log.Logf("Waiting for control-plane machines to have the upgraded kubernetes version")
WaitForControlPlaneMachinesToBeUpgraded(ctx, WaitForControlPlaneMachinesToBeUpgradedInput{
Lister: mgmtClient,
Expand Down
Loading