Skip to content

Commit 86b0fe6

Browse files
committed
Extend MS ScalingUp and Remediationg conditions to include preflight
check errors Signed-off-by: Stefan Büringer [email protected]
1 parent 039c6c1 commit 86b0fe6

5 files changed

+103
-39
lines changed

internal/controllers/machineset/machineset_controller.go

+4
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,8 @@ type scope struct {
269269
infrastructureObjectNotFound bool
270270
getAndAdoptMachinesForMachineSetSucceeded bool
271271
owningMachineDeployment *clusterv1.MachineDeployment
272+
remediationPreflightCheckErrMessage string
273+
scaleUpPreflightCheckErrMessage string
272274
}
273275

274276
type machineSetReconcileFunc func(ctx context.Context, s *scope) (ctrl.Result, error)
@@ -601,6 +603,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e
601603
// If the error is not nil use that as the message for the condition.
602604
preflightCheckErrMessage = err.Error()
603605
}
606+
s.scaleUpPreflightCheckErrMessage = preflightCheckErrMessage
604607
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
605608
return result, err
606609
}
@@ -1345,6 +1348,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
13451348
if preflightChecksFailed {
13461349
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
13471350
// WaitingForRemediationReason reason.
1351+
s.remediationPreflightCheckErrMessage = preflightCheckErrMessage
13481352
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
13491353
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
13501354
Status: metav1.ConditionFalse,

internal/controllers/machineset/machineset_controller_status.go

+19-10
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package machineset
1919
import (
2020
"context"
2121
"fmt"
22+
"slices"
2223
"sort"
2324
"strings"
2425
"time"
@@ -47,7 +48,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
4748
// Conditions
4849

4950
// Update the ScalingUp and ScalingDown condition.
50-
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded)
51+
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded, s.scaleUpPreflightCheckErrMessage)
5152
setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)
5253

5354
// MachinesReady condition: aggregate the Machine's Ready condition.
@@ -59,7 +60,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
5960
machines := collections.FromMachines(s.machines...)
6061
machinesToBeRemediated := machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
6162
unhealthyMachines := machines.Filter(collections.IsUnhealthy)
62-
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded)
63+
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded, s.remediationPreflightCheckErrMessage)
6364

6465
setDeletingCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)
6566
}
@@ -92,7 +93,7 @@ func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*cluste
9293
ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas)
9394
}
9495

95-
func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool) {
96+
func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool, scaleUpPreflightCheckErrMessage string) {
9697
// If we got unexpected errors in listing the machines (this should never happen), surface them.
9798
if !getAndAdoptMachinesForMachineSetSucceeded {
9899
v1beta2conditions.Set(ms, metav1.Condition{
@@ -125,7 +126,7 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines
125126
if currentReplicas >= desiredReplicas {
126127
var message string
127128
if missingReferencesMessage != "" {
128-
message = fmt.Sprintf("Scaling up would be blocked %s", missingReferencesMessage)
129+
message = fmt.Sprintf("Scaling up would be blocked because %s", missingReferencesMessage)
129130
}
130131
v1beta2conditions.Set(ms, metav1.Condition{
131132
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
@@ -138,8 +139,11 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines
138139

139140
// Scaling up.
140141
message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
141-
if missingReferencesMessage != "" {
142-
message += fmt.Sprintf(" is blocked %s", missingReferencesMessage)
142+
if missingReferencesMessage != "" || scaleUpPreflightCheckErrMessage != "" {
143+
blockMessages := slices.DeleteFunc([]string{missingReferencesMessage, scaleUpPreflightCheckErrMessage}, func(s string) bool {
144+
return s == ""
145+
})
146+
message += fmt.Sprintf(" is blocked because %s", strings.Join(blockMessages, " and "))
143147
}
144148
v1beta2conditions.Set(ms, metav1.Condition{
145149
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
@@ -281,7 +285,7 @@ func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.Mac
281285
v1beta2conditions.Set(machineSet, *upToDateCondition)
282286
}
283287

284-
func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool) {
288+
func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool, remediationPreflightCheckErrMessage string) {
285289
if !getAndAdoptMachinesForMachineSetSucceeded {
286290
v1beta2conditions.Set(machineSet, metav1.Condition{
287291
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
@@ -320,11 +324,16 @@ func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineS
320324
return
321325
}
322326

327+
msg := remediatingCondition.Message
328+
if remediationPreflightCheckErrMessage != "" {
329+
msg = fmt.Sprintf("Remediation is blocked because %s; %s", remediationPreflightCheckErrMessage, remediatingCondition.Message)
330+
}
331+
323332
v1beta2conditions.Set(machineSet, metav1.Condition{
324333
Type: remediatingCondition.Type,
325334
Status: metav1.ConditionTrue,
326335
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
327-
Message: remediatingCondition.Message,
336+
Message: msg,
328337
})
329338
}
330339

@@ -383,10 +392,10 @@ func calculateMissingReferencesMessage(ms *clusterv1.MachineSet, bootstrapTempla
383392
}
384393

385394
if len(missingObjects) == 1 {
386-
return fmt.Sprintf("because %s does not exist", missingObjects[0])
395+
return fmt.Sprintf("%s does not exist", missingObjects[0])
387396
}
388397

389-
return fmt.Sprintf("because %s do not exist", strings.Join(missingObjects, " and "))
398+
return fmt.Sprintf("%s do not exist", strings.Join(missingObjects, " and "))
390399
}
391400

392401
func aggregateStaleMachines(machines []*clusterv1.Machine) string {

internal/controllers/machineset/machineset_controller_status_test.go

+48-4
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ func Test_setScalingUpCondition(t *testing.T) {
194194
bootstrapObjectNotFound bool
195195
infrastructureObjectNotFound bool
196196
getAndAdoptMachinesForMachineSetSucceeded bool
197+
scaleUpPreflightCheckErrMessage string
197198
expectCondition metav1.Condition
198199
}{
199200
{
@@ -299,6 +300,27 @@ func Test_setScalingUpCondition(t *testing.T) {
299300
Message: "Scaling up from 0 to 3 replicas is blocked because DockerMachineTemplate does not exist",
300301
},
301302
},
303+
{
304+
name: "scaling up and blocked by bootstrap and infrastructure object and preflight checks",
305+
ms: scalingUpMachineSetWith3Replicas,
306+
bootstrapObjectNotFound: true,
307+
infrastructureObjectNotFound: true,
308+
getAndAdoptMachinesForMachineSetSucceeded: true,
309+
// This preflight check error can happen when a MachineSet is scaling up while the control plane
310+
// already has a newer Kubernetes version.
311+
scaleUpPreflightCheckErrMessage: "MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
312+
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
313+
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
314+
expectCondition: metav1.Condition{
315+
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
316+
Status: metav1.ConditionTrue,
317+
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
318+
Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate and DockerMachineTemplate " +
319+
"do not exist and MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
320+
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
321+
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
322+
},
323+
},
302324
{
303325
name: "deleting",
304326
ms: deletingMachineSetWith3Replicas,
@@ -317,7 +339,7 @@ func Test_setScalingUpCondition(t *testing.T) {
317339
t.Run(tt.name, func(t *testing.T) {
318340
g := NewWithT(t)
319341

320-
setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded)
342+
setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.scaleUpPreflightCheckErrMessage)
321343

322344
condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingUpV1Beta2Condition)
323345
g.Expect(condition).ToNot(BeNil())
@@ -758,13 +780,15 @@ func Test_setRemediatingCondition(t *testing.T) {
758780
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
759781
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
760782
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
761-
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
783+
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason}
784+
ownerRemediatedWaitingForRemediationV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason}
762785

763786
tests := []struct {
764787
name string
765788
machineSet *clusterv1.MachineSet
766789
machines []*clusterv1.Machine
767790
getAndAdoptMachinesForMachineSetSucceeded bool
791+
remediationPreflightCheckErrMessage string
768792
expectCondition metav1.Condition
769793
}{
770794
{
@@ -806,7 +830,27 @@ func Test_setRemediatingCondition(t *testing.T) {
806830
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
807831
Status: metav1.ConditionTrue,
808832
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
809-
Message: "Remediation in progress from Machine m3",
833+
Message: " from Machine m3", // FIXME: We should change something :)
834+
},
835+
},
836+
{
837+
name: "With machines to be remediated by MS and preflight check error",
838+
machineSet: &clusterv1.MachineSet{},
839+
machines: []*clusterv1.Machine{
840+
fakeMachine("m1", withConditions(healthCheckSucceeded)), // Healthy machine
841+
fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation
842+
fakeMachine("m3", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedV1Beta2)),
843+
fakeMachine("m4", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedWaitingForRemediationV1Beta2)),
844+
},
845+
getAndAdoptMachinesForMachineSetSucceeded: true,
846+
// This preflight check error can happen when a Machine becomes unhealthy while the control plane is upgrading.
847+
remediationPreflightCheckErrMessage: "KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed)",
848+
expectCondition: metav1.Condition{
849+
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
850+
Status: metav1.ConditionTrue,
851+
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
852+
Message: "Remediation is blocked because KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed);" +
853+
" from Machines m3, m4", // FIXME: We should change something :)
810854
},
811855
},
812856
{
@@ -852,7 +896,7 @@ func Test_setRemediatingCondition(t *testing.T) {
852896
machinesToBeRemediated = machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
853897
unHealthyMachines = machines.Filter(collections.IsUnhealthy)
854898
}
855-
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded)
899+
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.remediationPreflightCheckErrMessage)
856900

857901
condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetRemediatingV1Beta2Condition)
858902
g.Expect(condition).ToNot(BeNil())

internal/controllers/machineset/machineset_preflight.go

+7-8
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,8 @@ func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1.
136136
for _, v := range preflightCheckErrs {
137137
preflightCheckErrStrings = append(preflightCheckErrStrings, *v)
138138
}
139-
msg := fmt.Sprintf("%s on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; "))
140-
log.Info(msg)
141-
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, msg, nil
139+
log.Info(fmt.Sprintf("%s on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; ")))
140+
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, strings.Join(preflightCheckErrStrings, "; "), nil
142141
}
143142
return ctrl.Result{}, "", nil
144143
}
@@ -152,7 +151,7 @@ func (r *Reconciler) controlPlaneStablePreflightCheck(controlPlane *unstructured
152151
return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if %s %s is provisioning", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, controlPlane.GetKind(), cpKlogRef)
153152
}
154153
if isProvisioning {
155-
return ptr.To(fmt.Sprintf("%s %s is provisioning (%q preflight failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil
154+
return ptr.To(fmt.Sprintf("%s %s is provisioning (%q preflight check failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil
156155
}
157156

158157
// Check that the control plane is not upgrading.
@@ -161,7 +160,7 @@ func (r *Reconciler) controlPlaneStablePreflightCheck(controlPlane *unstructured
161160
return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if the %s %s is upgrading", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, controlPlane.GetKind(), cpKlogRef)
162161
}
163162
if isUpgrading {
164-
return ptr.To(fmt.Sprintf("%s %s is upgrading (%q preflight failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil
163+
return ptr.To(fmt.Sprintf("%s %s is upgrading (%q preflight check failed)", controlPlane.GetKind(), cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil
165164
}
166165

167166
return nil, nil
@@ -173,15 +172,15 @@ func (r *Reconciler) kubernetesVersionPreflightCheck(cpSemver, msSemver semver.V
173172
// => MS minor version cannot be outside of the supported skew.
174173
// Kubernetes skew policy: https://kubernetes.io/releases/version-skew-policy/#kubelet
175174
if msSemver.Minor > cpSemver.Minor {
176-
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is higher than ControlPlane version (%q preflight failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubernetesVersionSkew))
175+
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is higher than ControlPlane version (%q preflight check failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubernetesVersionSkew))
177176
}
178177
minorSkew := uint64(3)
179178
// For Control Planes running Kubernetes < v1.28, the version skew policy for kubelets is two.
180179
if cpSemver.LT(minVerKubernetesKubeletVersionSkewThree) {
181180
minorSkew = 2
182181
}
183182
if msSemver.Minor < cpSemver.Minor-minorSkew {
184-
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is more than %d minor versions older than the ControlPlane version (%q preflight failed)", msSemver.String(), cpSemver.String(), minorSkew, clusterv1.MachineSetPreflightCheckKubernetesVersionSkew))
183+
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is more than %d minor versions older than the ControlPlane version (%q preflight check failed)", msSemver.String(), cpSemver.String(), minorSkew, clusterv1.MachineSetPreflightCheckKubernetesVersionSkew))
185184
}
186185

187186
return nil
@@ -205,7 +204,7 @@ func (r *Reconciler) kubeadmVersionPreflightCheck(cpSemver, msSemver semver.Vers
205204
groupVersion.Group == bootstrapv1.GroupVersion.Group
206205
if kubeadmBootstrapProviderUsed {
207206
if cpSemver.Minor != msSemver.Minor {
208-
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to kubeadm version skew policy as kubeadm only supports joining with the same major+minor version as the control plane (%q preflight failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubeadmVersionSkew)), nil
207+
return ptr.To(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to kubeadm version skew policy as kubeadm only supports joining with the same major+minor version as the control plane (%q preflight check failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubeadmVersionSkew)), nil
209208
}
210209
}
211210
return nil, nil

0 commit comments

Comments
 (0)