From 36b69f6c7c447b5b28e32d2e23544ec3e6545683 Mon Sep 17 00:00:00 2001 From: devtools-bot Date: Tue, 21 Jan 2025 09:39:40 +0800 Subject: [PATCH] Fix the failures in qe ci jobs --- hack/ci-integration.sh | 2 +- pkg/framework/framework.go | 17 +++++++++++++++++ pkg/framework/machinesets.go | 30 +++++++++++++++++++++++------- pkg/infra/spot.go | 20 ++++++++++++++++---- pkg/infra/webhooks.go | 24 ++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 12 deletions(-) diff --git a/hack/ci-integration.sh b/hack/ci-integration.sh index ab5897894..9e91355b7 100755 --- a/hack/ci-integration.sh +++ b/hack/ci-integration.sh @@ -7,7 +7,7 @@ go run ./vendor/github.com/onsi/ginkgo/v2/ginkgo \ -v \ --timeout=115m \ --grace-period=5m \ - --fail-fast \ + --fail-fast=false \ --no-color \ --junit-report="junit_cluster_api_actuator_pkg_e2e.xml" \ --output-dir="${OUTPUT_DIR}" \ diff --git a/pkg/framework/framework.go b/pkg/framework/framework.go index fd003dbbe..5aca7df1f 100644 --- a/pkg/framework/framework.go +++ b/pkg/framework/framework.go @@ -6,8 +6,10 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" + . "github.com/onsi/gomega" configv1 "github.com/openshift/api/config/v1" cov1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" admissionregistrationv1 "k8s.io/api/admissionregistration/v1" @@ -271,3 +273,18 @@ func NewGatherer() (*gatherer.StateGatherer, error) { return gatherer.NewStateGatherer(context.Background(), cli, time.Now()), nil } + +// IsCustomerVPC check if cluster is customer vpc cluster. +func IsCustomerVPC(oc *gatherer.CLI) bool { + installConfig, err := oc.WithoutNamespace().Run("get").Args("cm", "cluster-config-v1", "-n", "kube-system", "-o=jsonpath={.data.install-config}").Output() + Expect(err).NotTo(HaveOccurred(), "Failed to get install-config") + + switch platform { + case configv1.AWSPlatformType: + return strings.Contains(installConfig, "subnets:") + case configv1.AzurePlatformType: + return strings.Contains(installConfig, "virtualNetwork:") + default: + return false + } +} diff --git a/pkg/framework/machinesets.go b/pkg/framework/machinesets.go index e821ef629..811695209 100644 --- a/pkg/framework/machinesets.go +++ b/pkg/framework/machinesets.go @@ -74,7 +74,7 @@ func BuildPerArchMachineSetParamsList(ctx context.Context, client runtimeclient. var params MachineSetParams for _, worker := range workers { - if arch, err = getArchitectureFromMachineSetNodes(ctx, client, worker); err != nil { + if arch, err = GetArchitectureFromMachineSetNodes(ctx, client, worker); err != nil { klog.Warningf("unable to get the architecture for the machine set %s: %v", worker.Name, err) continue } @@ -176,7 +176,7 @@ func CreateMachineSet(c runtimeclient.Client, params MachineSetParams) (*machine } // BuildMachineSetParamsList creates a list of MachineSetParams based on the given machineSetParams with modified instance type. -func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType) ([]MachineSetParams, error) { +func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType, arch string) ([]MachineSetParams, error) { baseMachineSetParams := machineSetParams baseProviderSpec := baseMachineSetParams.ProviderSpec.DeepCopy() @@ -185,7 +185,15 @@ func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platfor switch platform { case configv1.AWSPlatformType: // Using cheapest compute optimized instances that meet openshift minimum requirements (4 vCPU, 8GiB RAM) - alternativeInstanceTypes := []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"} + var alternativeInstanceTypes []string + + switch arch { + case "arm64": + alternativeInstanceTypes = []string{"m6g.large", "t4g.nano", "t4g.micro", "m6gd.xlarge"} + default: + alternativeInstanceTypes = []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"} + } + for _, instanceType := range alternativeInstanceTypes { updatedProviderSpec, err := updateProviderSpecAWSInstanceType(baseProviderSpec, instanceType) if err != nil { @@ -196,7 +204,15 @@ func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platfor output = append(output, baseMachineSetParams) } case configv1.AzurePlatformType: - alternativeVMSizes := []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"} + var alternativeVMSizes []string + + switch arch { + case "arm64": + alternativeVMSizes = []string{"Standard_D2ps_v5", "Standard_D3ps_v5", "Standard_D4ps_v5"} + default: + alternativeVMSizes = []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"} + } + for _, VMSize := range alternativeVMSizes { updatedProviderSpec, err := updateProviderSpecAzureVMSize(baseProviderSpec, VMSize) if err != nil { @@ -334,13 +350,13 @@ func GetWorkerMachineSets(ctx context.Context, client runtimeclient.Client) ([]* return result, nil } -// getArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines. -func getArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) { +// GetArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines. +func GetArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) { nodes, err := GetNodesFromMachineSet(ctx, client, machineSet) if err != nil || len(nodes) == 0 { klog.Warningf("error getting the machineSet's nodes or no nodes associated with %s. Using the capacity annotation", machineSet.Name) - for _, kv := range strings.Split(machineSet.Labels[labelsKey], ",") { + for _, kv := range strings.Split(machineSet.Annotations[labelsKey], ",") { if strings.Contains(kv, "kubernetes.io/arch") { return strings.Split(kv, "=")[1], nil } diff --git a/pkg/infra/spot.go b/pkg/infra/spot.go index f26bdcad4..cfdbebecc 100644 --- a/pkg/infra/spot.go +++ b/pkg/infra/spot.go @@ -44,7 +44,7 @@ var _ = Describe("Running on Spot", framework.LabelMAPI, framework.LabelDisrupti var client runtimeclient.Client var machineSet *machinev1.MachineSet var platform configv1.PlatformType - + var arch string var delObjects map[string]runtimeclient.Object var gatherer *gatherer.StateGatherer @@ -75,11 +75,22 @@ var _ = Describe("Running on Spot", framework.LabelMAPI, framework.LabelDisrupti default: Skip(fmt.Sprintf("Platform %s does not support Spot, skipping.", platform)) } + oc, _ := framework.NewCLI() + if framework.IsCustomerVPC(oc) { + //The termination-simulator will hit network error on customer vpc cluster, cannot mark the node as terminating, skip for now. + Skip("Skip this test on customer vpc cluster.") + } By("Creating a Spot backed MachineSet", func() { machineSetReady := false machineSetParams := framework.BuildMachineSetParams(ctx, client, machinesCount) - machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform) + + workers, err := framework.GetWorkerMachineSets(ctx, client) + Expect(err).ToNot(HaveOccurred(), "listing Worker MachineSets should not error.") + + arch, err = framework.GetArchitectureFromMachineSetNodes(ctx, client, workers[0]) + Expect(err).NotTo(HaveOccurred(), "unable to get the architecture for the machine set") + machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform, arch) Expect(err).ToNot(HaveOccurred(), "Should be able to build list of MachineSet parameters") for i, machineSetParams := range machineSetParamsList { if i >= spotMachineSetMaxProvisioningRetryCount { @@ -373,8 +384,9 @@ func getMetadataMockDeployment(platform configv1.PlatformType) *appsv1.Deploymen Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "metadata-mock", - Image: "golang:1.14", + Name: "metadata-mock", + // This is a golang:1.22 image which is mirrored in https://quay.io/repository/openshifttest/golang, so that disconnected cluster can access. + Image: "quay.io/openshifttest/golang@sha256:8f1c43387f0a107535906c7ee918a9d46079cc7be5e80a18424e8558d8afc702", Command: []string{"/usr/local/go/bin/go"}, Args: []string{ "run", diff --git a/pkg/infra/webhooks.go b/pkg/infra/webhooks.go index 109632bdc..8ab6374e8 100644 --- a/pkg/infra/webhooks.go +++ b/pkg/infra/webhooks.go @@ -12,6 +12,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/klog" runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" configv1 "github.com/openshift/api/config/v1" @@ -109,6 +110,21 @@ var _ = Describe("Webhooks", framework.LabelMAPI, framework.LabelDisruptive, fun if err != nil { return err } + + failed := framework.FilterMachines([]*machinev1beta1.Machine{m}, framework.MachinePhaseFailed) + if len(failed) > 0 { + reason := "failureReason not present in Machine.status" + if m.Status.ErrorReason != nil { + reason = string(*m.Status.ErrorReason) + } + message := "failureMessage not present in Machine.status" + if m.Status.ErrorMessage != nil { + message = *m.Status.ErrorMessage + } + klog.Errorf("Failed machine: %s, Reason: %s, Message: %s", m.Name, reason, message) + } + Expect(len(failed)).To(Equal(0), "zero machines should be in a Failed phase") + running := framework.FilterRunningMachines([]*machinev1beta1.Machine{m}) if len(running) == 0 { return fmt.Errorf("machine not yet running") @@ -249,6 +265,9 @@ func minimalAzureProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1. OSDisk: machinev1beta1.OSDisk{ DiskSizeGB: fullProviderSpec.OSDisk.DiskSizeGB, }, + Vnet: fullProviderSpec.Vnet, + Subnet: fullProviderSpec.Subnet, + NetworkResourceGroup: fullProviderSpec.NetworkResourceGroup, }, }, }, nil @@ -267,6 +286,11 @@ func minimalGCPProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1.Pr Region: fullProviderSpec.Region, Zone: fullProviderSpec.Zone, ServiceAccounts: fullProviderSpec.ServiceAccounts, + NetworkInterfaces: []*machinev1beta1.GCPNetworkInterface{{ + Network: fullProviderSpec.NetworkInterfaces[0].Network, + Subnetwork: fullProviderSpec.NetworkInterfaces[0].Subnetwork, + ProjectID: fullProviderSpec.NetworkInterfaces[0].ProjectID, + }}, }, }, }, nil