From 030322ab17187d2441ba938ddab05920c6931dbf Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Tue, 5 Nov 2024 16:59:23 +0100 Subject: [PATCH 1/3] Implement MS remediating conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stefan Büringer buringerst@vmware.com --- api/v1beta1/machineset_types.go | 29 ++- .../api/v1beta1/v1beta2_condition_consts.go | 2 +- .../controllers/machine/machine_controller.go | 2 - .../machineset/machineset_controller.go | 188 ++++++++++++++---- .../machineset_controller_status.go | 85 ++++++++ .../machineset_controller_status_test.go | 137 +++++++++++++ .../machineset/machineset_controller_test.go | 90 ++++++++- .../machineset/machineset_preflight.go | 10 +- .../templates/clusterclass-quick-start.yaml | 16 ++ .../controllers/inmemorymachine_controller.go | 28 +-- 10 files changed, 518 insertions(+), 69 deletions(-) diff --git a/api/v1beta1/machineset_types.go b/api/v1beta1/machineset_types.go index 8b9fc282d9ee..03c8edd2aa09 100644 --- a/api/v1beta1/machineset_types.go +++ b/api/v1beta1/machineset_types.go @@ -153,10 +153,37 @@ const ( MachineSetMachinesUpToDateInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason ) -// Conditions that will be used for the MachineSet object in v1Beta2 API version. +// MachineSet's Remediating condition and corresponding reasons that will be used in v1Beta2 API version. const ( // MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any. MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition + + // MachineSetRemediatingV1Beta2Reason surfaces when the MachineSet has at least one machine with HealthCheckSucceeded set to false + // and with the OwnerRemediated condition set to false. + MachineSetRemediatingV1Beta2Reason = RemediatingV1Beta2Reason + + // MachineSetNotRemediatingV1Beta2Reason surfaces when the MachineSet does not have any machine with HealthCheckSucceeded set to false + // and with the OwnerRemediated condition set to false. + MachineSetNotRemediatingV1Beta2Reason = NotRemediatingV1Beta2Reason + + // MachineSetRemediatingInternalErrorV1Beta2Reason surfaces unexpected failures when computing the Remediating condition. + MachineSetRemediatingInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason +) + +// Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on MachineSet controlled machines +// being remediated in v1Beta2 API version. +const ( + // MachineSetMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a MachineSet machine can't be started. + MachineSetMachineCannotBeRemediatedV1Beta2Reason = "CannotBeRemediated" + + // MachineSetMachineRemediationDeferredV1Beta2Reason surfaces when remediation of a MachineSet machine must be deferred. + MachineSetMachineRemediationDeferredV1Beta2Reason = "RemediationDeferred" + + // MachineSetMachineRemediationMachineDeletedV1Beta2Reason surfaces when remediation of a MachineSet machine + // has been completed by deleting the unhealthy machine. + // Note: After an unhealthy machine is deleted, a new one is created by the MachineSet as part of the + // regular reconcile loop that ensures the correct number of replicas exist. + MachineSetMachineRemediationMachineDeletedV1Beta2Reason = "MachineDeleted" ) // MachineSet's Deleting condition and corresponding reasons that will be used in v1Beta2 API version. 
diff --git a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go index d3049763ae92..722e5ec6e6c5 100644 --- a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go +++ b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go @@ -191,7 +191,7 @@ const ( // Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on KubeadmControlPlane controlled machines // being remediated in v1Beta2 API version. const ( - // KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediation a control plane machine. + // KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediating a control plane machine. KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason = clusterv1.InternalErrorV1Beta2Reason // KubeadmControlPlaneMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a control plane machine can't be started. diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index a462490d956f..afe1371480f1 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -309,8 +309,6 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust clusterv1.BootstrapReadyCondition, clusterv1.InfrastructureReadyCondition, clusterv1.DrainingSucceededCondition, - clusterv1.MachineHealthCheckSucceededCondition, - clusterv1.MachineOwnerRemediatedCondition, }}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{ clusterv1.MachineAvailableV1Beta2Condition, diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index eadc30a22753..acfa32d8e937 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -54,6 +54,7 @@ import ( "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" utilconversion "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/finalizers" "sigs.k8s.io/cluster-api/util/labels/format" @@ -1195,19 +1196,75 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( cluster := s.cluster ms := s.machineSet - filteredMachines := s.machines + machines := s.machines owner := s.owningMachineDeployment log := ctrl.LoggerFrom(ctx) + // Remove OwnerRemediated condition from Machines that have HealthCheckSucceeded condition true + // and OwnerRemediated condition false + errList := []error{} + for _, m := range machines { + if !m.DeletionTimestamp.IsZero() { + continue + } + + shouldCleanup := conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) + shouldCleanupV1Beta2 := v1beta2conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededV1Beta2Condition) && v1beta2conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + + if !(shouldCleanup || shouldCleanupV1Beta2) { + continue + } + + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + errList = append(errList, err) + continue + } + + if shouldCleanup { + conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition) + } + + if shouldCleanupV1Beta2 { + 
v1beta2conditions.Delete(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + } + + if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineOwnerRemediatedCondition, + }}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{ + clusterv1.MachineOwnerRemediatedV1Beta2Condition, + }}); err != nil { + errList = append(errList, err) + } + } + if len(errList) > 0 { + return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to remove OwnerRemediated condition from healthy Machines") + } + + machinesToRemediate := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList() + + // If there are no machines to remediate return early. + if len(machinesToRemediate) == 0 { + return ctrl.Result{}, nil + } + // Calculate how many in flight machines we should remediate. // By default, we allow all machines to be remediated at the same time. - maxInFlight := len(filteredMachines) + maxInFlight := len(machinesToRemediate) // If the MachineSet is part of a MachineDeployment, only allow remediations if // it's the desired revision. if isDeploymentChild(ms) { if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { // MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed. + if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineCannotBeRemediatedV1Beta2Reason, + Message: "Machine won't be remediated because it is pending removal due to rollout", + }, nil); err != nil { + return ctrl.Result{}, err + } return ctrl.Result{}, nil } @@ -1224,31 +1281,31 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( } } - // List all unhealthy machines. - machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines)) - for _, m := range filteredMachines { - // filteredMachines contains machines in deleting status to calculate correct status. - // skip remediation for those in deleting status. + // Update maxInFlight based on remediations that are in flight. for _, m := range machines { if !m.DeletionTimestamp.IsZero() { + // TODO: Check for Status: False and Reason: MachineSetMachineRemediationMachineDeletedV1Beta2Reason + // instead when starting to use v1beta2 conditions for control flow. if conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) { - // Machine has been remediated by this controller and still in flight. + // Remediation for this Machine has been triggered by this controller but it is still in flight, + // i.e. it still goes through the deletion workflow and exists in etcd. maxInFlight-- } - continue - } - if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) { - machinesToRemediate = append(machinesToRemediate, m) } } - // If there are no machines to remediate return early. - if len(machinesToRemediate) == 0 { - return ctrl.Result{}, nil - } // Check if we can remediate any machines. if maxInFlight <= 0 { // No tokens available to remediate machines.
log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate)) + if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight), + }, nil); err != nil { + return ctrl.Result{}, err + } return ctrl.Result{}, nil } @@ -1263,11 +1320,22 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( if len(machinesToRemediate) > maxInFlight { log.V(5).Info("Remediation strategy is set, limiting in flight operations", "machinesToBeRemediated", len(machinesToRemediate)) // We have more machines to remediate than tokens available. - machinesToRemediate = machinesToRemediate[:maxInFlight] + allMachinesToRemediate := machinesToRemediate + machinesToRemediate = allMachinesToRemediate[:maxInFlight] + machinesToDeferRemediation := allMachinesToRemediate[maxInFlight:] + + if err := patchMachineConditions(ctx, r.Client, machinesToDeferRemediation, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight), + }, nil); err != nil { + return ctrl.Result{}, err + } } // Run preflight checks. - preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation") + preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine remediation") if err != nil { // If err is not nil use that as the preflightCheckErrMessage preflightCheckErrMessage = err.Error() @@ -1277,41 +1345,47 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( if preflightChecksFailed { // PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with // WaitingForRemediationReason reason. 
- var errs []error - for _, m := range machinesToRemediate { - patchHelper, err := patch.NewHelper(m, r.Client) - if err != nil { - errs = append(errs, err) - continue - } - conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage) - if err := patchHelper.Patch(ctx, m); err != nil { - errs = append(errs, err) - } - } - - if len(errs) > 0 { - return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines") + if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: preflightCheckErrMessage, + }, &clusterv1.Condition{ + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + Reason: clusterv1.WaitingForRemediationReason, + Severity: clusterv1.ConditionSeverityWarning, + Message: preflightCheckErrMessage, + }); err != nil { + return ctrl.Result{}, err } return preflightChecksResult, nil } - // PreflightChecks passed, so it is safe to remediate unhealthy machines. - // Remediate unhealthy machines by deleting them. + // PreflightChecks passed, so it is safe to remediate unhealthy machines by deleting them. + + // Note: We intentionally patch the Machines before we delete them to make this code reentrant. + // If we delete the Machine first, the Machine would be filtered out on the next reconcile because + // it has a deletionTimestamp, so it would never get the condition. + // Instead, if we set the condition but the deletion does not go through, on the next reconcile either the + // condition will be fixed/updated or the Machine deletion will be retried.
+ if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, + }, &clusterv1.Condition{ + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionTrue, + }); err != nil { + return ctrl.Result{}, err + } var errs []error for _, m := range machinesToRemediate { log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m)) - patch := client.MergeFrom(m.DeepCopy()) if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) - continue - } - conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition) - if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) { - errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m))) } } - if len(errs) > 0 { return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines") } @@ -1319,6 +1393,36 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( return ctrl.Result{}, nil } +func patchMachineConditions(ctx context.Context, c client.Client, machines []*clusterv1.Machine, v1beta2Condition metav1.Condition, condition *clusterv1.Condition) error { + var errs []error + for _, m := range machines { + patchHelper, err := patch.NewHelper(m, c) + if err != nil { + errs = append(errs, err) + continue + } + + if condition != nil { + conditions.Set(m, condition) + } + v1beta2conditions.Set(m, v1beta2Condition) + + if err := patchHelper.Patch(ctx, m, + patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineOwnerRemediatedCondition, + }}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{ + clusterv1.MachineOwnerRemediatedV1Beta2Condition, + }}); err != nil { + errs = append(errs, err) + } + } + if len(errs) > 0 { + return errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch Machines") + } + + return nil +} + func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, owner *clusterv1.MachineDeployment, ref *corev1.ObjectReference) (objectNotFound bool, err error) { if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) { return false, nil diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go index 910f47896eb0..948391e5877f 100644 --- a/internal/controllers/machineset/machineset_controller_status.go +++ b/internal/controllers/machineset/machineset_controller_status.go @@ -28,6 +28,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/collections" v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" clog "sigs.k8s.io/cluster-api/util/log" ) @@ -55,6 +56,11 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) { // MachinesUpToDate condition: aggregate the Machine's UpToDate condition. setMachinesUpToDateCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + machines := collections.FromMachines(s.machines...) 
+ machinesToBeRemediated := machines.Filter(collections.IsUnhealthyAndOwnerRemediated) + unhealthyMachines := machines.Filter(collections.IsUnhealthy) + setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded) + setDeletingCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) } @@ -275,6 +281,53 @@ func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.Mac v1beta2conditions.Set(machineSet, *upToDateCondition) } +func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool) { + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetRemediatingInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + if len(machinesToBeRemediated) == 0 { + message := aggregateUnhealthyMachines(unhealthyMachines) + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotRemediatingV1Beta2Reason, + Message: message, + }) + return + } + + remediatingCondition, err := v1beta2conditions.NewAggregateCondition( + machinesToBeRemediated.UnsortedList(), clusterv1.MachineOwnerRemediatedV1Beta2Condition, + v1beta2conditions.TargetConditionType(clusterv1.MachineSetRemediatingV1Beta2Condition), + ) + if err != nil { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetRemediatingInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + + log := ctrl.LoggerFrom(ctx) + log.Error(err, fmt.Sprintf("Failed to aggregate Machine's %s conditions", clusterv1.MachineOwnerRemediatedV1Beta2Condition)) + return + } + + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: remediatingCondition.Type, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetRemediatingV1Beta2Reason, + Message: remediatingCondition.Message, + }) +} + func setDeletingCondition(_ context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { // If we got unexpected errors in listing the machines (this should never happen), surface them. 
if !getAndAdoptMachinesForMachineSetSucceeded { @@ -369,3 +422,35 @@ func aggregateStaleMachines(machines []*clusterv1.Machine) string { return message } + +func aggregateUnhealthyMachines(machines collections.Machines) string { + if len(machines) == 0 { + return "" + } + + machineNames := []string{} + for _, machine := range machines { + machineNames = append(machineNames, machine.GetName()) + } + + if len(machineNames) == 0 { + return "" + } + + message := "Machine" + if len(machineNames) > 1 { + message += "s" + } + + sort.Strings(machineNames) + message += " " + clog.ListToString(machineNames, func(s string) string { return s }, 3) + + if len(machineNames) == 1 { + message += " is " + } else { + message += " are " + } + message += "not healthy (not to be remediated by MachineSet)" + + return message +} diff --git a/internal/controllers/machineset/machineset_controller_status_test.go b/internal/controllers/machineset/machineset_controller_status_test.go index e157563fd4ba..5ae3277fe2c6 100644 --- a/internal/controllers/machineset/machineset_controller_status_test.go +++ b/internal/controllers/machineset/machineset_controller_status_test.go @@ -26,6 +26,7 @@ import ( "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/collections" v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" ) @@ -753,6 +754,113 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { } } +func Test_setRemediatingCondition(t *testing.T) { + healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue} + healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse} + ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse} + ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"} + + tests := []struct { + name string + machineSet *clusterv1.MachineSet + machines []*clusterv1.Machine + getAndAdoptMachinesForMachineSetSucceeded bool + expectCondition metav1.Condition + }{ + { + name: "get machines failed", + machineSet: &clusterv1.MachineSet{}, + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetRemediatingInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }, + }, + { + name: "Without unhealthy machines", + machineSet: &clusterv1.MachineSet{}, + machines: []*clusterv1.Machine{ + fakeMachine("m1"), + fakeMachine("m2"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotRemediatingV1Beta2Reason, + }, + }, + { + name: "With machines to be remediated by MS", + machineSet: &clusterv1.MachineSet{}, + machines: []*clusterv1.Machine{ + fakeMachine("m1", withConditions(healthCheckSucceeded)), // Healthy machine + fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation + fakeMachine("m3", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedV1Beta2)), + }, + 
getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetRemediatingV1Beta2Reason, + Message: "Remediation in progress from Machine m3", + }, + }, + { + name: "With one unhealthy machine not to be remediated by MS", + machineSet: &clusterv1.MachineSet{}, + machines: []*clusterv1.Machine{ + fakeMachine("m1", withConditions(healthCheckSucceeded)), // Healthy machine + fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation + fakeMachine("m3", withConditions(healthCheckSucceeded)), // Healthy machine + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotRemediatingV1Beta2Reason, + Message: "Machine m2 is not healthy (not to be remediated by MachineSet)", + }, + }, + { + name: "With two unhealthy machines not to be remediated by MS", + machineSet: &clusterv1.MachineSet{}, + machines: []*clusterv1.Machine{ + fakeMachine("m1", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation + fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation + fakeMachine("m3", withConditions(healthCheckSucceeded)), // Healthy machine + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetRemediatingV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotRemediatingV1Beta2Reason, + Message: "Machines m1, m2 are not healthy (not to be remediated by MachineSet)", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + var machinesToBeRemediated, unHealthyMachines collections.Machines + if tt.getAndAdoptMachinesForMachineSetSucceeded { + machines := collections.FromMachines(tt.machines...)
+ machinesToBeRemediated = machines.Filter(collections.IsUnhealthyAndOwnerRemediated) + unHealthyMachines = machines.Filter(collections.IsUnhealthy) + } + setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded) + + condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetRemediatingV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(*condition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + func Test_setDeletingCondition(t *testing.T) { tests := []struct { name string @@ -847,3 +955,32 @@ func newStaleDeletingMachine(name string) *clusterv1.Machine { m.DeletionTimestamp = ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)}) return m } + +type fakeMachinesOption func(m *clusterv1.Machine) + +func fakeMachine(name string, options ...fakeMachinesOption) *clusterv1.Machine { + p := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + } + for _, opt := range options { + opt(p) + } + return p +} + +func withV1Beta2Condition(c metav1.Condition) fakeMachinesOption { + return func(m *clusterv1.Machine) { + if m.Status.V1Beta2 == nil { + m.Status.V1Beta2 = &clusterv1.MachineV1Beta2Status{} + } + v1beta2conditions.Set(m, c) + } +} + +func withConditions(c ...clusterv1.Condition) fakeMachinesOption { + return func(m *clusterv1.Machine) { + m.Status.Conditions = append(m.Status.Conditions, c...) + } +} diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index 1a62fba1f914..0db1fa218592 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -1500,6 +1501,8 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "unhealthy-machine", Namespace: "default", + // Blocking deletion so we can confirm conditions were updated as expected. + Finalizers: []string{"block-deletion"}, }, Status: clusterv1.MachineStatus{ Conditions: []clusterv1.Condition{ @@ -1519,7 +1522,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} - fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneStable, unhealthyMachine, healthyMachine).Build() + fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneStable, unhealthyMachine, healthyMachine).WithStatusSubresource(&clusterv1.Machine{}).Build() r := &Reconciler{ Client: fakeClient, } @@ -1533,13 +1536,26 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) - // Verify the unhealthy machine is deleted. + + // Verify the unhealthy machine is deleted (deletionTimestamp must be set). 
m := &clusterv1.Machine{} - err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m) - g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeFalse()) + conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, + }, v1beta2conditions.IgnoreLastTransitionTime(true))) + // Verify the healthy machine is not deleted. m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).Should(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) + g.Expect(conditions.Has(m, clusterv1.MachineOwnerRemediatedCondition)).To(BeFalse()) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) }) t.Run("should update the unhealthy machine MachineOwnerRemediated condition if preflight checks did not pass", func(t *testing.T) { @@ -1602,6 +1618,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { condition := clusterv1.MachineOwnerRemediatedCondition m := &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) g.Expect(conditions.Has(m, condition)). To(BeTrue(), "Machine should have the %s condition set", condition) machineOwnerRemediatedCondition := conditions.Get(m, condition) @@ -1609,12 +1626,23 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) g.Expect(machineOwnerRemediatedCondition.Reason). To(Equal(clusterv1.WaitingForRemediationReason), "%s condition should have reason %s", condition, clusterv1.WaitingForRemediationReason) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: "Machine remediation on hold because GenericControlPlane default/cp1 is upgrading (\"ControlPlaneIsStable\" preflight failed). " + + "The operation will continue after the preflight check(s) pass", + }, v1beta2conditions.IgnoreLastTransitionTime(true))) // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) g.Expect(conditions.Has(m, condition)). 
To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) }) t.Run("should only try to remediate MachineOwnerRemediated if MachineSet is current", func(t *testing.T) { @@ -1672,6 +1700,8 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "unhealthy-machine", Namespace: "default", + // Blocking deletion so we can confirm conditions were updated as expected. + Finalizers: []string{"block-deletion"}, }, Status: clusterv1.MachineStatus{ Conditions: []clusterv1.Condition{ @@ -1725,12 +1755,22 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(machineOwnerRemediatedCondition.Status). To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) g.Expect(unhealthyMachine.DeletionTimestamp).Should(BeZero()) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineCannotBeRemediatedV1Beta2Reason, + Message: "Machine won't be remediated because it is pending removal due to rollout", + }, v1beta2conditions.IgnoreLastTransitionTime(true))) // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) // Test with the current MachineSet. s = &scope{ @@ -1744,14 +1784,24 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine has been deleted. - err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m) - g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeFalse()) + conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) + c = v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, + }, v1beta2conditions.IgnoreLastTransitionTime(true))) // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) g.Expect(conditions.Has(m, condition)). 
To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) }) t.Run("should only try to remediate up to MaxInFlight unhealthy", func(t *testing.T) { @@ -1872,6 +1922,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { machineOwnerRemediatedCondition := conditions.Get(m, condition) g.Expect(machineOwnerRemediatedCondition.Status). To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: "Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is 3)", + }, v1beta2conditions.IgnoreLastTransitionTime(true))) } else { // Machines after maxInFlight, should be deleted. g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted", i) @@ -1883,6 +1941,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) // // Second pass. @@ -1929,6 +1988,15 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { machineOwnerRemediatedCondition := conditions.Get(m, condition) g.Expect(machineOwnerRemediatedCondition.Status). To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason, + Message: "Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is 3)", + }, v1beta2conditions.IgnoreLastTransitionTime(true))) + g.Expect(m.DeletionTimestamp).To(BeZero()) } else if i < total-maxInFlight { // Machines before the maxInFlight should have a deletion timestamp g.Expect(err).ToNot(HaveOccurred()) @@ -1937,6 +2005,13 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { machineOwnerRemediatedCondition := conditions.Get(m, condition) g.Expect(machineOwnerRemediatedCondition.Status). 
To(Equal(corev1.ConditionTrue), "%s condition status should be true", condition) + c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition) + g.Expect(c).ToNot(BeNil()) + g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, + }, v1beta2conditions.IgnoreLastTransitionTime(true))) g.Expect(m.DeletionTimestamp).ToNot(BeZero()) if cleanFinalizer { @@ -1955,6 +2030,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) // Perform another pass with the same exact configuration. // This is testing that, given that we have Machines that are being deleted and are in flight, @@ -1976,6 +2052,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) // Call again to verify that the remaining unhealthy machines are deleted, // at this point all unhealthy machines should be deleted given the max in flight @@ -2000,6 +2077,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) + g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse()) }) } diff --git a/internal/controllers/machineset/machineset_preflight.go b/internal/controllers/machineset/machineset_preflight.go index ae959329e2f8..0b3bb07bf63f 100644 --- a/internal/controllers/machineset/machineset_preflight.go +++ b/internal/controllers/machineset/machineset_preflight.go @@ -136,7 +136,7 @@ func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1. for _, v := range preflightCheckErrs { preflightCheckErrStrings = append(preflightCheckErrStrings, *v) } - msg := fmt.Sprintf("Performing %q on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; ")) + msg := fmt.Sprintf("%s on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; ")) log.Info(msg) return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, msg, nil } @@ -149,19 +149,19 @@ func (r *Reconciler) controlPlaneStablePreflightCheck(controlPlane *unstructured // Check that the control plane is not provisioning. 
isProvisioning, err := contract.ControlPlane().IsProvisioning(controlPlane) if err != nil { - return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if ControlPlane %s is provisioning", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, cpKlogRef) + return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if %s %s is provisioning", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, controlPlane.GetKind(), cpKlogRef) } if isProvisioning { - return ptr.To(fmt.Sprintf("ControlPlane %s is provisioning (%q preflight failed)", cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil + return ptr.To(fmt.Sprintf("%s %s is provisioning (%q preflight failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil } // Check that the control plane is not upgrading. isUpgrading, err := contract.ControlPlane().IsUpgrading(controlPlane) if err != nil { - return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if the ControlPlane %s is upgrading", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, cpKlogRef) + return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if the %s %s is upgrading", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, controlPlane.GetKind(), cpKlogRef) } if isUpgrading { - return ptr.To(fmt.Sprintf("ControlPlane %s is upgrading (%q preflight failed)", cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil + return ptr.To(fmt.Sprintf("%s %s is upgrading (%q preflight failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil } return nil, nil diff --git a/test/infrastructure/docker/templates/clusterclass-quick-start.yaml b/test/infrastructure/docker/templates/clusterclass-quick-start.yaml index e6d31972820b..b5413035dbab 100644 --- a/test/infrastructure/docker/templates/clusterclass-quick-start.yaml +++ b/test/infrastructure/docker/templates/clusterclass-quick-start.yaml @@ -13,6 +13,14 @@ spec: kind: DockerMachineTemplate apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 name: quick-start-control-plane + machineHealthCheck: + unhealthyConditions: + - type: Ready + status: Unknown + timeout: 300s + - type: Ready + status: "False" + timeout: 300s infrastructure: ref: apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 @@ -32,6 +40,14 @@ spec: apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 kind: DockerMachineTemplate name: quick-start-default-worker-machinetemplate + machineHealthCheck: + unhealthyConditions: + - type: Ready + status: Unknown + timeout: 300s + - type: Ready + status: "False" + timeout: 300s machinePools: - class: default-worker template: diff --git a/test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller.go b/test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller.go index 51cbca048dc3..e737b9d0d1ca 100644 --- a/test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller.go +++ b/test/infrastructure/inmemory/internal/controllers/inmemorymachine_controller.go @@ -332,24 +332,28 @@ func (r *InMemoryMachineReconciler) reconcileNormalNode(ctx context.Context, clu Status: corev1.NodeStatus{ Conditions: []corev1.NodeCondition{ { - Type: corev1.NodeReady, - Status: corev1.ConditionTrue, - Reason: "KubeletReady", + LastTransitionTime: metav1.Now(), + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + Reason: "KubeletReady", }, { - Type: 
corev1.NodeMemoryPressure, - Status: corev1.ConditionFalse, - Reason: "KubeletHasSufficientMemory", + LastTransitionTime: metav1.Now(), + Type: corev1.NodeMemoryPressure, + Status: corev1.ConditionFalse, + Reason: "KubeletHasSufficientMemory", }, { - Type: corev1.NodeDiskPressure, - Status: corev1.ConditionFalse, - Reason: "KubeletHasNoDiskPressure", + LastTransitionTime: metav1.Now(), + Type: corev1.NodeDiskPressure, + Status: corev1.ConditionFalse, + Reason: "KubeletHasNoDiskPressure", }, { - Type: corev1.NodePIDPressure, - Status: corev1.ConditionFalse, - Reason: "KubeletHasSufficientPID", + LastTransitionTime: metav1.Now(), + Type: corev1.NodePIDPressure, + Status: corev1.ConditionFalse, + Reason: "KubeletHasSufficientPID", }, }, }, From 039c6c17590f95bfe614adbc7bd65f1eef30da6f Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Thu, 7 Nov 2024 15:49:31 +0100 Subject: [PATCH 2/3] Fix review findings --- .../machineset/machineset_controller_test.go | 202 +++++++++++++++++- 1 file changed, 195 insertions(+), 7 deletions(-) diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index 0db1fa218592..4338afac63e1 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -1510,6 +1510,25 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse, }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionFalse, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason, + Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation", + }, + }, }, }, } @@ -1518,6 +1537,34 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Name: "healthy-machine", Namespace: "default", }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionTrue, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. 
+ Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckSucceededV1Beta2Reason, + }, + }, + }, + }, } machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} @@ -1590,6 +1637,25 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse, }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionFalse, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason, + Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation", + }, + }, }, }, } @@ -1598,6 +1664,34 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Name: "healthy-machine", Namespace: "default", }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionTrue, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckSucceededV1Beta2Reason, + }, + }, + }, + }, } machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} @@ -1636,7 +1730,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { "The operation will continue after the preflight check(s) pass", }, v1beta2conditions.IgnoreLastTransitionTime(true))) - // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify the healthy machine does not have the MachineOwnerRemediated condition. 
m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) @@ -1709,6 +1803,25 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse, }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionFalse, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason, + Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation", + }, + }, }, }, } @@ -1717,6 +1830,34 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Name: "healthy-machine", Namespace: "default", }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionTrue, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckSucceededV1Beta2Reason, + }, + }, + }, + }, } machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} @@ -1764,7 +1905,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Message: "Machine won't be remediated because it is pending removal due to rollout", }, v1beta2conditions.IgnoreLastTransitionTime(true))) - // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify the healthy machine does not have the MachineOwnerRemediated condition. m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) @@ -1795,7 +1936,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, }, v1beta2conditions.IgnoreLastTransitionTime(true))) - // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify (again) the healthy machine does not have the MachineOwnerRemediated condition. 
m = &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue()) @@ -1872,6 +2013,25 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse, }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionFalse, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason, + Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation", + }, + }, }, }, }) @@ -1882,6 +2042,34 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Name: "healthy-machine", Namespace: "default", }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + { + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: corev1.ConditionTrue, + }, + }, + V1Beta2: &clusterv1.MachineV1Beta2Status{ + Conditions: []metav1.Condition{ + { + // This condition should be cleaned up because HealthCheckSucceeded is true. + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, + }, + { + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckSucceededV1Beta2Reason, + }, + }, + }, + }, } fakeClient := fake.NewClientBuilder().WithObjects(cluster, machineDeployment, healthyMachine). @@ -1936,7 +2124,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { } } - // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify the healthy machine does not have the MachineOwnerRemediated condition. m := &clusterv1.Machine{} g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). @@ -2026,7 +2214,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { } validateSecondPass(false) - // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify (again) the healthy machine does not have the MachineOwnerRemediated condition. g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) @@ -2048,7 +2236,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // Validate and remove finalizers for in flight machines. validateSecondPass(true) - // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + // Verify (again) the healthy machine does not have the MachineOwnerRemediated condition. g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) g.Expect(conditions.Has(m, condition)). 
 			To(BeFalse(), "Machine should not have the %s condition set", condition)
@@ -2073,7 +2261,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 			g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted: %v", i)
 		}

-		// Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition.
+		// Verify (again) the healthy machine does not have the MachineOwnerRemediated condition.
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
 		g.Expect(conditions.Has(m, condition)).
 			To(BeFalse(), "Machine should not have the %s condition set", condition)

From 4b20c92b2ce465cd82bba7db3f3e222c19c52433 Mon Sep 17 00:00:00 2001
From: Stefan Bueringer
Date: Fri, 8 Nov 2024 10:17:31 +0100
Subject: [PATCH 3/3] Fix review findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Stefan Büringer buringerst@vmware.com
---
 .../kubeadm/internal/controllers/status.go    |  5 +---
 .../cluster/cluster_controller_status.go      |  5 +---
 .../machinedeployment_status.go               |  5 +---
 .../machineset/machineset_controller.go       |  7 +++++-
 .../machineset_controller_status.go           |  5 +---
 .../machineset/machineset_controller_test.go  | 24 +++++++++++--------
 6 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go
index 2327d19f2cc5..8353b2fbdf72 100644
--- a/controlplane/kubeadm/internal/controllers/status.go
+++ b/controlplane/kubeadm/internal/controllers/status.go
@@ -480,10 +480,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
 		return ""
 	}

-	machineNames := []string{}
-	for _, machine := range machines {
-		machineNames = append(machineNames, machine.GetName())
-	}
+	machineNames := machines.Names()

 	if len(machineNames) == 0 {
 		return ""
diff --git a/internal/controllers/cluster/cluster_controller_status.go b/internal/controllers/cluster/cluster_controller_status.go
index 645cf1a1d4c7..67e6efcfbf46 100644
--- a/internal/controllers/cluster/cluster_controller_status.go
+++ b/internal/controllers/cluster/cluster_controller_status.go
@@ -1009,10 +1009,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
 		return ""
 	}

-	machineNames := []string{}
-	for _, machine := range machines {
-		machineNames = append(machineNames, machine.GetName())
-	}
+	machineNames := machines.Names()

 	if len(machineNames) == 0 {
 		return ""
diff --git a/internal/controllers/machinedeployment/machinedeployment_status.go b/internal/controllers/machinedeployment/machinedeployment_status.go
index 24cee4926993..c614bdd0a193 100644
--- a/internal/controllers/machinedeployment/machinedeployment_status.go
+++ b/internal/controllers/machinedeployment/machinedeployment_status.go
@@ -487,10 +487,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
 		return ""
 	}

-	machineNames := []string{}
-	for _, machine := range machines {
-		machineNames = append(machineNames, machine.GetName())
-	}
+	machineNames := machines.Names()

 	if len(machineNames) == 0 {
 		return ""
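The aggregateUnhealthyMachines helpers in this patch (including the machineset_controller_status.go variant further below) now take the machine names from collections.Machines.Names() instead of building the slice in an explicit loop. As a rough, self-contained sketch of the kind of summary such a helper can assemble from that slice — the message wording, sorting step, and truncation threshold below are assumptions for illustration, not the upstream implementation:

// A minimal standalone sketch of name aggregation once the collection loop is
// replaced by machines.Names(). Wording and the truncation threshold are invented.
package main

import (
	"fmt"
	"sort"
	"strings"
)

// aggregateUnhealthyMachineNames builds a short, human-readable summary from a list
// of machine names (the slice machines.Names() would return in the real code).
func aggregateUnhealthyMachineNames(machineNames []string) string {
	if len(machineNames) == 0 {
		return ""
	}

	// Sort for a stable message, then cap how many names are spelled out.
	sort.Strings(machineNames)
	const maxNames = 3
	shown := machineNames
	more := ""
	if len(machineNames) > maxNames {
		shown = machineNames[:maxNames]
		more = fmt.Sprintf(", ... (%d more)", len(machineNames)-maxNames)
	}

	subject := "Machine"
	if len(machineNames) > 1 {
		subject = "Machines"
	}
	return fmt.Sprintf("%s %s%s not healthy", subject, strings.Join(shown, ", "), more)
}

func main() {
	fmt.Println(aggregateUnhealthyMachineNames([]string{"worker-c", "worker-a", "worker-b", "worker-d"}))
	// Prints (with the illustrative format above): Machines worker-a, worker-b, worker-c, ... (1 more) not healthy
}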
diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go
index acfa32d8e937..0c20682441f3 100644
--- a/internal/controllers/machineset/machineset_controller.go
+++ b/internal/controllers/machineset/machineset_controller.go
@@ -19,6 +19,7 @@ package machineset
 import (
 	"context"
 	"fmt"
+	"math"
 	"sort"
 	"strings"
 	"time"
@@ -1241,6 +1242,8 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
 		return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to remove OwnerRemediated condition from healhty Machines")
 	}

+	// Calculate the Machines to be remediated.
+	// Note: Machines already deleting are not included, as there is no need to trigger remediation for them again.
 	machinesToRemediate := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList()

 	// If there are no machines to remediate return early.
@@ -1250,7 +1253,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (

 	// Calculate how many in flight machines we should remediate.
 	// By default, we allow all machines to be remediated at the same time.
-	maxInFlight := len(machinesToRemediate)
+	maxInFlight := math.MaxInt

 	// If the MachineSet is part of a MachineDeployment, only allow remediations if
 	// it's the desired revision.
@@ -1282,6 +1285,8 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
 	}

 	// Update maxInFlight based on remediations that are in flight.
+	// A Machine has a remediation in flight when the Machine's OwnerRemediated condition
+	// reports that remediation has been completed and the Machine has been deleted.
 	for _, m := range machines {
 		if !m.DeletionTimestamp.IsZero() {
 			// TODO: Check for Status: False and Reason: MachineSetMachineRemediationMachineDeletedV1Beta2Reason
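Switching the default from len(machinesToRemediate) to math.MaxInt makes the "no limit unless something caps it" intent explicit: the surrounding code, only partly visible in these hunks, then lowers the effective budget, for example by subtracting remediations that are already in flight. A rough, self-contained sketch of that kind of capping arithmetic; configuredMaxInFlight and inFlight are invented stand-ins, not the controller's actual variables or data sources:

// A self-contained sketch of maxInFlight bookkeeping around the change above.
// This is illustrative only and not the upstream implementation.
package main

import (
	"fmt"
	"math"
)

// remediationBudget returns how many additional Machines may be remediated now.
// configuredMaxInFlight < 0 means "no limit configured".
func remediationBudget(unhealthy, inFlight, configuredMaxInFlight int) int {
	// By default allow all Machines to be remediated at the same time.
	maxInFlight := math.MaxInt
	if configuredMaxInFlight >= 0 {
		maxInFlight = configuredMaxInFlight
	}

	// Subtract remediations that are already in flight (Machines being deleted).
	maxInFlight -= inFlight
	if maxInFlight < 0 {
		maxInFlight = 0
	}

	// Never remediate more Machines than are actually unhealthy.
	if unhealthy < maxInFlight {
		return unhealthy
	}
	return maxInFlight
}

func main() {
	fmt.Println(remediationBudget(5, 0, -1)) // no limit configured: prints 5
	fmt.Println(remediationBudget(5, 2, 3))  // limit 3, 2 already in flight: prints 1
}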
diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go
index 948391e5877f..ac59d79b70ed 100644
--- a/internal/controllers/machineset/machineset_controller_status.go
+++ b/internal/controllers/machineset/machineset_controller_status.go
@@ -428,10 +428,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
 		return ""
 	}

-	machineNames := []string{}
-	for _, machine := range machines {
-		machineNames = append(machineNames, machine.GetName())
-	}
+	machineNames := machines.Names()

 	if len(machineNames) == 0 {
 		return ""
diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go
index 4338afac63e1..0ccd20d9025c 100644
--- a/internal/controllers/machineset/machineset_controller_test.go
+++ b/internal/controllers/machineset/machineset_controller_test.go
@@ -1588,7 +1588,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 		m := &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeFalse())
-		conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition)
+		g.Expect(conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition)).To(BeTrue())
 		c := v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)
 		g.Expect(c).ToNot(BeNil())
 		g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{
@@ -1597,7 +1597,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 			Status: metav1.ConditionTrue,
 			Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
 		}, v1beta2conditions.IgnoreLastTransitionTime(true)))
-		// Verify the healthy machine is not deleted.
+		// Verify the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		m = &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).Should(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
@@ -1730,7 +1730,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 				"The operation will continue after the preflight check(s) pass",
 			}, v1beta2conditions.IgnoreLastTransitionTime(true)))

-		// Verify the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		m = &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
@@ -1905,7 +1905,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 				Message: "Machine won't be remediated because it is pending removal due to rollout",
 			}, v1beta2conditions.IgnoreLastTransitionTime(true)))

-		// Verify the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		m = &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
@@ -1927,7 +1927,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 		// Verify the unhealthy machine has been deleted.
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeFalse())
-		conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition)
+		g.Expect(conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition)).To(BeTrue())
 		c = v1beta2conditions.Get(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)
 		g.Expect(c).ToNot(BeNil())
 		g.Expect(*c).To(v1beta2conditions.MatchCondition(metav1.Condition{
@@ -1936,7 +1936,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 			Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
 		}, v1beta2conditions.IgnoreLastTransitionTime(true)))

-		// Verify (again) the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify (again) the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		m = &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
 		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
@@ -2124,9 +2124,10 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 			}
 		}

-		// Verify the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		m := &clusterv1.Machine{}
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
+		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
 		g.Expect(conditions.Has(m, condition)).
 			To(BeFalse(), "Machine should not have the %s condition set", condition)
 		g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse())
@@ -2214,8 +2215,9 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 		}
 		validateSecondPass(false)

-		// Verify (again) the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify (again) the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
+		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
 		g.Expect(conditions.Has(m, condition)).
 			To(BeFalse(), "Machine should not have the %s condition set", condition)
 		g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse())
@@ -2236,8 +2238,9 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 		// Validate and remove finalizers for in flight machines.
 		validateSecondPass(true)

-		// Verify (again) the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify (again) the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
+		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
 		g.Expect(conditions.Has(m, condition)).
 			To(BeFalse(), "Machine should not have the %s condition set", condition)
 		g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse())
@@ -2261,8 +2264,9 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
 			g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted: %v", i)
 		}

-		// Verify (again) the healthy machine does not have the MachineOwnerRemediated condition.
+		// Verify (again) the healthy machine is not deleted and does not have the OwnerRemediated condition.
 		g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
+		g.Expect(m.DeletionTimestamp.IsZero()).To(BeTrue())
 		g.Expect(conditions.Has(m, condition)).
 			To(BeFalse(), "Machine should not have the %s condition set", condition)
 		g.Expect(v1beta2conditions.Has(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)).To(BeFalse())
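Among the test fixes above, replacing the bare conditions.IsTrue(m, ...) call with g.Expect(conditions.IsTrue(m, ...)).To(BeTrue()) is worth calling out: in Go a boolean-returning call used as a statement simply discards its result, so the original line compiled but asserted nothing. A minimal standalone illustration of the pitfall; the names here are invented for the example and are not the cluster-api test code:

// A self-contained illustration of the no-op assertion fixed above.
package main

import "fmt"

// isHealthy stands in for a boolean helper such as conditions.IsTrue.
func isHealthy() bool { return false }

func main() {
	// No-op: the boolean result is discarded, so nothing is ever checked.
	isHealthy()

	// Equivalent in spirit to g.Expect(isHealthy()).To(BeTrue()): the result is
	// actually inspected and a failure is reported.
	if !isHealthy() {
		fmt.Println("expected isHealthy() to be true")
	}
}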