OCPQE-27285: Fix the failures in qe ci jobs #374

Merged · 1 commit · Feb 14, 2025
hack/ci-integration.sh (2 changes: 1 addition & 1 deletion)
@@ -7,7 +7,7 @@ go run ./vendor/github.com/onsi/ginkgo/v2/ginkgo \
-v \
--timeout=115m \
--grace-period=5m \
- --fail-fast \
+ --fail-fast=false \
Contributor:
Out of interest, why do we switch to false here?

Contributor (author):

Currently, if one case fails, the remaining cases are marked as [INTERRUPTED]. I changed this so the other test cases keep running even if one test fails.
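
(For reference, the CLI flag maps onto ginkgo/v2's programmatic suite configuration; a minimal sketch, with a hypothetical suite name and test function:)

```go
package e2e

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestE2E(t *testing.T) {
	RegisterFailHandler(Fail)

	// GinkgoConfiguration exposes the same knobs the CLI flags set.
	suiteConfig, reporterConfig := GinkgoConfiguration()
	// Equivalent to --fail-fast=false: a failing spec no longer aborts the
	// run, so later specs execute instead of being reported [INTERRUPTED].
	suiteConfig.FailFast = false

	RunSpecs(t, "Cluster API Actuator Suite", suiteConfig, reporterConfig)
}
```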

Contributor:

Are we sure that the failed test case won't interfere with the later test cases? Will it definitely be cleaned up properly?

I wouldn't want the failure to then impact the later tests somehow

Contributor (author):

I ran the disruptive cases locally on an Azure versioned-installer-fully_private_cluster-proxy cluster and didn't see the failed case interfere with the later test cases; I will check on AWS and GCP as well.

Contributor (author):

I ran them several times on AWS/GCP and didn't see a failed case interfere with the later test cases. Switching to false here looks safe.

--no-color \
--junit-report="junit_cluster_api_actuator_pkg_e2e.xml" \
--output-dir="${OUTPUT_DIR}" \
pkg/framework/framework.go (16 changes: 16 additions & 0 deletions)
@@ -7,6 +7,7 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
@@ -313,3 +314,18 @@ func GetCredentialsFromCluster(oc *gatherer.CLI) ([]byte, []byte, string) {

return accessKeyID, secureKey, clusterRegion
}

// IsCustomerVPC checks whether the cluster was installed into a customer-provided VPC (BYO network).
func IsCustomerVPC(oc *gatherer.CLI) bool {
installConfig, err := oc.WithoutNamespace().Run("get").Args("cm", "cluster-config-v1", "-n", "kube-system", "-o=jsonpath={.data.install-config}").Output()
Expect(err).NotTo(HaveOccurred(), "Failed to get install-config")

switch platform {
case configv1.AWSPlatformType:
return strings.Contains(installConfig, "subnets:")
case configv1.AzurePlatformType:
return strings.Contains(installConfig, "virtualNetwork:")
default:
return false
}
}
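
For reference, a standalone sketch of the heuristic above (the install-config fragments are hypothetical; the field names follow the OpenShift installer's install-config schema, where BYO-network installs record platform.aws.subnets on AWS and platform.azure.virtualNetwork on Azure):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical install-config for a customer-VPC (BYO subnets) AWS cluster.
	customerVPC := `
platform:
  aws:
    region: us-east-2
    subnets:
    - subnet-0123456789abcdef0
`
	// An installer-managed VPC has no subnets stanza at all.
	installerVPC := `
platform:
  aws:
    region: us-east-2
`
	fmt.Println(strings.Contains(customerVPC, "subnets:"))  // true  -> customer VPC
	fmt.Println(strings.Contains(installerVPC, "subnets:")) // false -> installer-managed
}
```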
pkg/framework/machinesets.go (30 changes: 23 additions & 7 deletions)
@@ -77,7 +77,7 @@ func BuildPerArchMachineSetParamsList(ctx context.Context, client runtimeclient.
var params MachineSetParams

for _, worker := range workers {
- if arch, err = getArchitectureFromMachineSetNodes(ctx, client, worker); err != nil {
+ if arch, err = GetArchitectureFromMachineSetNodes(ctx, client, worker); err != nil {
klog.Warningf("unable to get the architecture for the machine set %s: %v", worker.Name, err)
continue
}
@@ -180,7 +180,7 @@ func CreateMachineSet(c runtimeclient.Client, params MachineSetParams) (*machine
}

// BuildAlternativeMachineSetParams creates a list of MachineSetParams based on the given machineSetParams with modified instance types.
- func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType) ([]MachineSetParams, error) {
+ func BuildAlternativeMachineSetParams(machineSetParams MachineSetParams, platform configv1.PlatformType, arch string) ([]MachineSetParams, error) {
baseMachineSetParams := machineSetParams
baseProviderSpec := baseMachineSetParams.ProviderSpec.DeepCopy()

@@ -189,7 +189,15 @@
switch platform {
case configv1.AWSPlatformType:
// Using cheapest compute optimized instances that meet openshift minimum requirements (4 vCPU, 8GiB RAM)
- alternativeInstanceTypes := []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"}
+ var alternativeInstanceTypes []string
+
+ switch arch {
+ case "arm64":
+ 	alternativeInstanceTypes = []string{"m6g.large", "t4g.nano", "t4g.micro", "m6gd.xlarge"}
+ default:
+ 	alternativeInstanceTypes = []string{"c5.xlarge", "c5a.xlarge", "m5.xlarge"}
+ }

for _, instanceType := range alternativeInstanceTypes {
updatedProviderSpec, err := updateProviderSpecAWSInstanceType(baseProviderSpec, instanceType)
if err != nil {
@@ -200,7 +208,15 @@
output = append(output, baseMachineSetParams)
}
case configv1.AzurePlatformType:
- alternativeVMSizes := []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"}
+ var alternativeVMSizes []string
+
+ switch arch {
+ case "arm64":
+ 	alternativeVMSizes = []string{"Standard_D2ps_v5", "Standard_D3ps_v5", "Standard_D4ps_v5"}
+ default:
+ 	alternativeVMSizes = []string{"Standard_F4s_v2", "Standard_D4as_v5", "Standard_D4as_v4"}
+ }

for _, VMSize := range alternativeVMSizes {
updatedProviderSpec, err := updateProviderSpecAzureVMSize(baseProviderSpec, VMSize)
if err != nil {
@@ -338,13 +354,13 @@ func GetWorkerMachineSets(ctx context.Context, client runtimeclient.Client) ([]*
return result, nil
}

- // getArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines.
- func getArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) {
+ // GetArchitectureFromMachineSetNodes returns the architecture of the nodes controlled by the given machineSet's machines.
+ func GetArchitectureFromMachineSetNodes(ctx context.Context, client runtimeclient.Client, machineSet *machinev1.MachineSet) (string, error) {
nodes, err := GetNodesFromMachineSet(ctx, client, machineSet)
if err != nil || len(nodes) == 0 {
klog.Warningf("error getting the machineSet's nodes or no nodes associated with %s. Using the capacity annotation", machineSet.Name)

- for _, kv := range strings.Split(machineSet.Labels[labelsKey], ",") {
+ for _, kv := range strings.Split(machineSet.Annotations[labelsKey], ",") {
if strings.Contains(kv, "kubernetes.io/arch") {
return strings.Split(kv, "=")[1], nil
}
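
The Labels-to-Annotations fix above matters because the scale-from-zero capacity hints live in MachineSet annotations, not labels, so the old lookup always came up empty. A standalone sketch of the fallback parsing (the annotation key here is an assumption following the cluster-autoscaler convention; the framework's labelsKey constant may differ):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Assumed annotation key; the framework's labelsKey constant may differ.
	labelsKey := "capacity.cluster-autoscaler.kubernetes.io/labels"
	annotations := map[string]string{
		labelsKey: "kubernetes.io/arch=arm64,topology.kubernetes.io/zone=us-east-2a",
	}

	// Mirror of the fallback: scan the comma-separated k=v list for the arch label.
	for _, kv := range strings.Split(annotations[labelsKey], ",") {
		if strings.Contains(kv, "kubernetes.io/arch") {
			fmt.Println(strings.Split(kv, "=")[1]) // arm64
		}
	}
}
```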
pkg/infra/spot.go (20 changes: 16 additions & 4 deletions)
@@ -44,7 +44,7 @@ var _ = Describe("Running on Spot", framework.LabelMAPI, framework.LabelDisrupti
var client runtimeclient.Client
var machineSet *machinev1.MachineSet
var platform configv1.PlatformType

var arch string
var delObjects map[string]runtimeclient.Object

var gatherer *gatherer.StateGatherer
@@ -97,11 +97,22 @@
default:
Skip(fmt.Sprintf("Platform %s does not support Spot, skipping.", platform))
}
oc, _ := framework.NewCLI()
if framework.IsCustomerVPC(oc) {
// The termination simulator hits a network error on customer-VPC clusters and cannot mark the node as terminating; skip for now.
Skip("Skip this test on customer VPC clusters.")
}

By("Creating a Spot backed MachineSet", func() {
machineSetReady := false
machineSetParams := framework.BuildMachineSetParams(ctx, client, machinesCount)
- machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform)
+
+ workers, err := framework.GetWorkerMachineSets(ctx, client)
+ Expect(err).ToNot(HaveOccurred(), "listing Worker MachineSets should not error.")
+
+ arch, err = framework.GetArchitectureFromMachineSetNodes(ctx, client, workers[0])
+ Expect(err).NotTo(HaveOccurred(), "unable to get the architecture for the machine set")
+ machineSetParamsList, err := framework.BuildAlternativeMachineSetParams(machineSetParams, platform, arch)
Expect(err).ToNot(HaveOccurred(), "Should be able to build list of MachineSet parameters")
for i, machineSetParams := range machineSetParamsList {
if i >= spotMachineSetMaxProvisioningRetryCount {
@@ -376,8 +387,9 @@ func getMetadataMockDeployment(platform configv1.PlatformType) *appsv1.Deploymen
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
- Name:  "metadata-mock",
- Image: "golang:1.14",
+ Name: "metadata-mock",
+ // This is a golang:1.22 image mirrored at https://quay.io/repository/openshifttest/golang, so that disconnected clusters can access it.
+ Image: "quay.io/openshifttest/golang@sha256:8f1c43387f0a107535906c7ee918a9d46079cc7be5e80a18424e8558d8afc702",
Command: []string{"/usr/local/go/bin/go"},
Args: []string{
"run",
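
For context, the container above serves a mock cloud metadata endpoint that the spot termination handler is pointed at during the test. A hypothetical sketch of what such a mock serves (the AWS spot metadata path is real, but the port and response here are assumptions, not the actual mock's behavior):

```go
package main

import (
	"log"
	"net/http"
)

func main() {
	// Responding 200 with a timestamp on this path makes an AWS spot
	// termination handler believe the instance was marked for termination.
	http.HandleFunc("/latest/meta-data/spot/termination-time", func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte("2025-02-14T00:00:00Z"))
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```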
pkg/infra/webhooks.go (24 changes: 24 additions & 0 deletions)
@@ -12,6 +12,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/klog"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

configv1 "github.com/openshift/api/config/v1"
@@ -112,6 +113,21 @@ var _ = Describe("Webhooks", framework.LabelMAPI, framework.LabelDisruptive, fun
if err != nil {
return err
}

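// If the machine entered the Failed phase, log its failureReason and
// failureMessage so the assertion below fails with actionable output.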
failed := framework.FilterMachines([]*machinev1beta1.Machine{m}, framework.MachinePhaseFailed)
if len(failed) > 0 {
reason := "failureReason not present in Machine.status"
if m.Status.ErrorReason != nil {
reason = string(*m.Status.ErrorReason)
}
message := "failureMessage not present in Machine.status"
if m.Status.ErrorMessage != nil {
message = *m.Status.ErrorMessage
}
klog.Errorf("Failed machine: %s, Reason: %s, Message: %s", m.Name, reason, message)
}
Expect(len(failed)).To(Equal(0), "zero machines should be in a Failed phase")

running := framework.FilterRunningMachines([]*machinev1beta1.Machine{m})
if len(running) == 0 {
return fmt.Errorf("machine not yet running")
@@ -252,6 +268,9 @@ func minimalAzureProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1.Pr
OSDisk: machinev1beta1.OSDisk{
DiskSizeGB: fullProviderSpec.OSDisk.DiskSizeGB,
},
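// Keep the BYO-network fields in the minimal spec; on a customer-VPC cluster
// the defaulted values would point at installer-named networks that don't exist.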
Vnet: fullProviderSpec.Vnet,
Subnet: fullProviderSpec.Subnet,
NetworkResourceGroup: fullProviderSpec.NetworkResourceGroup,
},
},
}, nil
@@ -270,6 +289,11 @@ func minimalGCPProviderSpec(ps *machinev1beta1.ProviderSpec) (*machinev1beta1.Pr
Region: fullProviderSpec.Region,
Zone: fullProviderSpec.Zone,
ServiceAccounts: fullProviderSpec.ServiceAccounts,
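// Likewise preserve the primary network interface (network, subnetwork,
// project) instead of relying on defaults that assume installer-created networks.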
NetworkInterfaces: []*machinev1beta1.GCPNetworkInterface{{
Network: fullProviderSpec.NetworkInterfaces[0].Network,
Subnetwork: fullProviderSpec.NetworkInterfaces[0].Subnetwork,
ProjectID: fullProviderSpec.NetworkInterfaces[0].ProjectID,
}},
},
},
}, nil