From 5f891b6fc2b68678d4fac36d1b21de7adc5c9958 Mon Sep 17 00:00:00 2001 From: devtools-bot Date: Tue, 21 Jan 2025 09:39:40 +0800 Subject: [PATCH] Fix the failures in qe ci jobs --- hack/ci-integration.sh | 2 +- pkg/framework/framework.go | 16 ++++++++++++++++ pkg/framework/machinesets.go | 30 +++++++++++++++++++++++------- pkg/infra/spot.go | 20 ++++++++++++++++---- pkg/infra/webhooks.go | 24 ++++++++++++++++++++++++ 5 files changed, 80 insertions(+), 12 deletions(-) diff --git a/hack/ci-integration.sh b/hack/ci-integration.sh index ab5897894..9e91355b7 100755 --- a/hack/ci-integration.sh +++ b/hack/ci-integration.sh @@ -7,7 +7,7 @@ go run ./vendor/github.com/onsi/ginkgo/v2/ginkgo \ -v \ --timeout=115m \ --grace-period=5m \ - --fail-fast \ + --fail-fast=false \ --no-color \ --junit-report="junit_cluster_api_actuator_pkg_e2e.xml" \ --output-dir="${OUTPUT_DIR}" \ diff --git a/pkg/framework/framework.go b/pkg/framework/framework.go index f6e40b6ed..f39865b2d 100644 --- a/pkg/framework/framework.go +++ b/pkg/framework/framework.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" . "github.com/onsi/ginkgo/v2" @@ -313,3 +314,18 @@ func GetCredentialsFromCluster(oc *gatherer.CLI) ([]byte, []byte, string) { return accessKeyID, secureKey, clusterRegion } + +// IsCustomerVPC check if cluster is customer vpc cluster. +func IsCustomerVPC(oc *gatherer.CLI) bool { + installConfig, err := oc.WithoutNamespace().Run("get").Args("cm", "cluster-config-v1", "-n", "kube-system", "-o=jsonpath={.data.install-config}").Output() + Expect(err).NotTo(HaveOccurred(), "Failed to get install-config") + + switch platform { + case configv1.AWSPlatformType: + return strings.Contains(installConfig, "subnets:") + case configv1.AzurePlatformType: + return strings.Contains(installConfig, "virtualNetwork:") + default: + return false + } +} diff --git a/pkg/framework/machinesets.go b/pkg/framework/machinesets.go index 042b30f04..3fc2d1518 100644 --- a/pkg/framework/machinesets.go +++ b/pkg/framework/machinesets.go @@ -77,7 +77,7 @@ func BuildPerArchMachineSetParamsList(ctx context.Context, client runtimeclient. var params MachineSetParams for _, worker := range workers { - if arch, err = getArchitectureFromMachineSetNodes(ctx, client, worker); err != nil { + if arch, err = GetArchitectureFromMachineSetNodes(ctx, client, worker); err != nil { klog.Warningf("unable to get the architecture for the machine set %s: %v", worker.Name, err) continue } @@ -180,7 +180,7 @@ func CreateMachineSet(c runtimeclient.Client, params MachineSetParams) (*machine } // BuildMachineSetParamsList creates a list of MachineSetParams based on the given machineSetParams with modified instance type. -func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType) ([]MachineSetParams, error) { +func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType, arch string) ([]MachineSetParams, error) { baseMachineSetParams := machineSetParams baseProviderSpec := baseMachineSetParams.ProviderSpec.DeepCopy() @@ -189,7 +189,15 @@ func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platfor switch platform { case configv1.AWSPlatformType: // Using cheapest compute optimized instances that meet openshift minimum requirements (4 vCPU, 8GiB RAM) - alternativeInstanceTypes := []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"} + var alternativeInstanceTypes []string + + switch arch { + case "arm64": + alternativeInstanceTypes = []string{"m6g.large", "t4g.nano", "t4g.micro", "m6gd.xlarge"} + default: + alternativeInstanceTypes = []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"} + } + for _, instanceType := range alternativeInstanceTypes { updatedProviderSpec, err := updateProviderSpecAWSInstanceType(baseProviderSpec, instanceType) if err != nil { @@ -200,7 +208,15 @@ func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platfor output = append(output, baseMachineSetParams) } case configv1.AzurePlatformType: - alternativeVMSizes := []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"} + var alternativeVMSizes []string + + switch arch { + case "arm64": + alternativeVMSizes = []string{"Standard_D2ps_v5", "Standard_D3ps_v5", "Standard_D4ps_v5"} + default: + alternativeVMSizes = []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"} + } + for _, VMSize := range alternativeVMSizes { updatedProviderSpec, err := updateProviderSpecAzureVMSize(baseProviderSpec, VMSize) if err != nil { @@ -338,13 +354,13 @@ func GetWorkerMachineSets(ctx context.Context, client runtimeclient.Client) ([]* return result, nil } -// getArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines. -func getArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) { +// GetArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines. +func GetArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) { nodes, err := GetNodesFromMachineSet(ctx, client, machineSet) if err != nil || len(nodes) == 0 { klog.Warningf("error getting the machineSet's nodes or no nodes associated with %s. Using the capacity annotation", machineSet.Name) - for _, kv := range strings.Split(machineSet.Labels[labelsKey], ",") { + for _, kv := range strings.Split(machineSet.Annotations[labelsKey], ",") { if strings.Contains(kv, "kubernetes.io/arch") { return strings.Split(kv, "=")[1], nil } diff --git a/pkg/infra/spot.go b/pkg/infra/spot.go index d10c11c63..1a05a4d96 100644 --- a/pkg/infra/spot.go +++ b/pkg/infra/spot.go @@ -44,7 +44,7 @@ var _ = Describe("Running on Spot", framework.LabelMAPI, framework.LabelDisrupti var client runtimeclient.Client var machineSet *machinev1.MachineSet var platform configv1.PlatformType - + var arch string var delObjects map[string]runtimeclient.Object var gatherer *gatherer.StateGatherer @@ -97,11 +97,22 @@ var _ = Describe("Running on Spot", framework.LabelMAPI, framework.LabelDisrupti default: Skip(fmt.Sprintf("Platform %s does not support Spot, skipping.", platform)) } + oc, _ := framework.NewCLI() + if framework.IsCustomerVPC(oc) { + //The termination-simulator will hit network error on customer vpc cluster, cannot mark the node as terminating, skip for now. + Skip("Skip this test on customer vpc cluster.") + } By("Creating a Spot backed MachineSet", func() { machineSetReady := false machineSetParams := framework.BuildMachineSetParams(ctx, client, machinesCount) - machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform) + + workers, err := framework.GetWorkerMachineSets(ctx, client) + Expect(err).ToNot(HaveOccurred(), "listing Worker MachineSets should not error.") + + arch, err = framework.GetArchitectureFromMachineSetNodes(ctx, client, workers[0]) + Expect(err).NotTo(HaveOccurred(), "unable to get the architecture for the machine set") + machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform, arch) Expect(err).ToNot(HaveOccurred(), "Should be able to build list of MachineSet parameters") for i, machineSetParams := range machineSetParamsList { if i >= spotMachineSetMaxProvisioningRetryCount { @@ -376,8 +387,9 @@ func getMetadataMockDeployment(platform configv1.PlatformType) *appsv1.Deploymen Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "metadata-mock", - Image: "golang:1.14", + Name: "metadata-mock", + // This is a golang:1.22 image which is mirrored in https://quay.io/repository/openshifttest/golang, so that disconnected cluster can access. + Image: "quay.io/openshifttest/golang@sha256:8f1c43387f0a107535906c7ee918a9d46079cc7be5e80a18424e8558d8afc702", Command: []string{"/usr/local/go/bin/go"}, Args: []string{ "run", diff --git a/pkg/infra/webhooks.go b/pkg/infra/webhooks.go index 4c48a0463..d5bef2051 100644 --- a/pkg/infra/webhooks.go +++ b/pkg/infra/webhooks.go @@ -12,6 +12,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/klog" runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" configv1 "github.com/openshift/api/config/v1" @@ -112,6 +113,21 @@ var _ = Describe("Webhooks", framework.LabelMAPI, framework.LabelDisruptive, fun if err != nil { return err } + + failed := framework.FilterMachines([]*machinev1beta1.Machine{m}, framework.MachinePhaseFailed) + if len(failed) > 0 { + reason := "failureReason not present in Machine.status" + if m.Status.ErrorReason != nil { + reason = string(*m.Status.ErrorReason) + } + message := "failureMessage not present in Machine.status" + if m.Status.ErrorMessage != nil { + message = *m.Status.ErrorMessage + } + klog.Errorf("Failed machine: %s, Reason: %s, Message: %s", m.Name, reason, message) + } + Expect(len(failed)).To(Equal(0), "zero machines should be in a Failed phase") + running := framework.FilterRunningMachines([]*machinev1beta1.Machine{m}) if len(running) == 0 { return fmt.Errorf("machine not yet running") @@ -252,6 +268,9 @@ func minimalAzureProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1. OSDisk: machinev1beta1.OSDisk{ DiskSizeGB: fullProviderSpec.OSDisk.DiskSizeGB, }, + Vnet: fullProviderSpec.Vnet, + Subnet: fullProviderSpec.Subnet, + NetworkResourceGroup: fullProviderSpec.NetworkResourceGroup, }, }, }, nil @@ -270,6 +289,11 @@ func minimalGCPProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1.Pr Region: fullProviderSpec.Region, Zone: fullProviderSpec.Zone, ServiceAccounts: fullProviderSpec.ServiceAccounts, + NetworkInterfaces: []*machinev1beta1.GCPNetworkInterface{{ + Network: fullProviderSpec.NetworkInterfaces[0].Network, + Subnetwork: fullProviderSpec.NetworkInterfaces[0].Subnetwork, + ProjectID: fullProviderSpec.NetworkInterfaces[0].ProjectID, + }}, }, }, }, nil