
Commit 650a357

Merge pull request #3108 from benmoss/machineset-remediations
🌱 MHC marks machines for remediation with conditions
2 parents a9fc88e + 2c77beb

6 files changed: +426 −548 lines

api/v1alpha3/condition_consts.go

Lines changed: 28 additions & 1 deletion
@@ -29,7 +29,7 @@ const (
 // Conditions and condition Reasons for the Machine object

 // MachineSummaryConditionsCount defines the total number of conditions on the Machine object.
-const MachineSummaryConditionsCount = 2
+const MachineSummaryConditionsCount = 4

 const (
 	// BootstrapReadyCondition reports a summary of current status of the bootstrap object defined for this machine.
@@ -56,3 +56,30 @@ const (
 	// NOTE: This reason is used only as a fallback when the infrastructure object is not reporting its own ready condition.
 	WaitingForInfrastructureFallbackReason = "WaitingForInfrastructure"
 )
+
+const (
+	// MachineHealthCheckSuccededCondition is set on machines that have passed a healthcheck by the MachineHealthCheck controller.
+	// In the event that the health check fails it will be set to False.
+	MachineHealthCheckSuccededCondition ConditionType = "HealthCheckSucceeded"
+
+	// MachineHasFailure is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
+	MachineHasFailure = "MachineHasFailure"
+
+	// NodeNotFound is the reason used when a machine's node has previously been observed but is now gone.
+	NodeNotFound = "NodeNotFound"
+
+	// NodeStartupTimeout is the reason used when a machine's node does not appear within the specified timeout.
+	NodeStartupTimeout = "NodeStartupTimeout"
+
+	// UnhealthyNodeCondition is the reason used when a machine's node has one of the MachineHealthCheck's unhealthy conditions.
+	UnhealthyNodeCondition = "UnhealthyNode"
+)
+
+const (
+	// MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the MachineHealthCheck controller.
+	// MachineOwnerRemediatedCondition is set to False after a health check fails, but should be changed to True by the owning controller after remediation succeeds.
+	MachineOwnerRemediatedCondition ConditionType = "OwnerRemediated"
+
+	// WaitingForRemediation is the reason used when a machine fails a health check and remediation is needed.
+	WaitingForRemediation = "WaitingForRemediation"
+)
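Note: to make the intended handshake concrete, the following is a minimal, hypothetical sketch (not part of this commit) of how an owning controller could consume the new condition. It assumes the condition helpers from sigs.k8s.io/cluster-api/util/conditions and a *clusterv1.Machine the owner has already fetched; the function names needsRemediation and markRemediated are illustrative only.

package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// needsRemediation reports whether the MachineHealthCheck controller has
// flagged this machine: OwnerRemediated is set to False when a health check fails.
func needsRemediation(m *clusterv1.Machine) bool {
	return conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)
}

// markRemediated flips the condition back to True, which (per the comment in
// the diff above) is the owning controller's job once remediation succeeds.
func markRemediated(m *clusterv1.Machine) {
	conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
}

What "remediation" means is left to the owner; given the source branch name (machineset-remediations), a MachineSet controller could, for example, replace the flagged machine before reporting success.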

controllers/machinehealthcheck_controller.go

Lines changed: 40 additions & 24 deletions
@@ -35,6 +35,7 @@ import (
 	"sigs.k8s.io/cluster-api/controllers/remote"
 	"sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/annotations"
+	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/predicates"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -134,6 +135,8 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
 			m.Spec.ClusterName, m.Name, m.Namespace)
 	}

+	logger = r.Log.WithValues("cluster", cluster.Name)
+
 	// Return early if the object or Cluster is paused.
 	if annotations.IsPaused(cluster, m) {
 		logger.Info("Reconciliation is paused for this object")
@@ -160,7 +163,7 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
 	}
 	m.Labels[clusterv1.ClusterLabelName] = m.Spec.ClusterName

-	result, err := r.reconcile(ctx, cluster, m)
+	result, err := r.reconcile(ctx, logger, cluster, m)
 	if err != nil {
 		logger.Error(err, "Failed to reconcile MachineHealthCheck")
 		r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err)
@@ -172,7 +175,7 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
 	return result, nil
 }

-func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
+func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
 	// Ensure the MachineHealthCheck is owned by the Cluster it belongs to
 	m.OwnerReferences = util.EnsureOwnerRef(m.OwnerReferences, metav1.OwnerReference{
 		APIVersion: clusterv1.GroupVersion.String(),
@@ -181,42 +184,36 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
 		UID:        cluster.UID,
 	})

-	logger := r.Log.WithValues("machinehealthcheck", m.Name, "namespace", m.Namespace)
-	logger = logger.WithValues("cluster", cluster.Name)
-
 	// Create client for target cluster
 	clusterClient, err := remote.NewClusterClient(ctx, r.Client, util.ObjectKey(cluster), r.scheme)
 	if err != nil {
-		logger.Error(err, "Error building target cluster client")
-		return ctrl.Result{}, err
+		return ctrl.Result{}, errors.Wrapf(err, "Error building target cluster client")
 	}

 	if err := r.watchClusterNodes(ctx, cluster); err != nil {
-		logger.Error(err, "Error watching nodes on target cluster")
-		return ctrl.Result{}, err
+		return ctrl.Result{}, errors.Wrapf(err, "Error watching nodes on target cluster")
 	}

 	// fetch all targets
 	logger.V(3).Info("Finding targets")
-	targets, err := r.getTargetsFromMHC(clusterClient, cluster, m)
+	targets, err := r.getTargetsFromMHC(clusterClient, m)
 	if err != nil {
-		logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
-		return ctrl.Result{}, err
+		return ctrl.Result{}, errors.Wrapf(err, "Failed to fetch targets from MachineHealthCheck")
 	}
 	totalTargets := len(targets)
 	m.Status.ExpectedMachines = int32(totalTargets)

 	// health check all targets and reconcile mhc status
-	currentHealthy, needRemediationTargets, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
-	m.Status.CurrentHealthy = int32(currentHealthy)
+	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
+	m.Status.CurrentHealthy = int32(len(healthy))

 	// check MHC current health against MaxUnhealthy
 	if !isAllowedRemediation(m) {
 		logger.V(3).Info(
 			"Short-circuiting remediation",
 			"total target", totalTargets,
 			"max unhealthy", m.Spec.MaxUnhealthy,
-			"unhealthy targets", totalTargets-currentHealthy,
+			"unhealthy targets", len(unhealthy),
 		)

 		r.recorder.Eventf(
@@ -225,31 +222,50 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
 			EventRemediationRestricted,
 			"Remediation restricted due to exceeded number of unhealthy machines (total: %v, unhealthy: %v, maxUnhealthy: %v)",
 			totalTargets,
-			totalTargets-currentHealthy,
+			m.Status.CurrentHealthy,
 			m.Spec.MaxUnhealthy,
 		)
+		for _, t := range append(healthy, unhealthy...) {
+			if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
+				return ctrl.Result{}, errors.Wrapf(err, "Failed to patch machine status for machine %q", t.Machine.Name)
+			}
+		}
 		return reconcile.Result{Requeue: true}, nil
 	}
 	logger.V(3).Info(
 		"Remediations are allowed",
 		"total target", totalTargets,
 		"max unhealthy", m.Spec.MaxUnhealthy,
-		"unhealthy targets", totalTargets-currentHealthy,
+		"unhealthy targets", len(unhealthy),
 	)

-	// remediate
+	// mark for remediation
 	errList := []error{}
-	for _, t := range needRemediationTargets {
+	for _, t := range unhealthy {
 		logger.V(3).Info("Target meets unhealthy criteria, triggers remediation", "target", t.string())
-		if err := t.remediate(ctx, logger, r.Client, r.recorder); err != nil {
-			logger.Error(err, "Error remediating target", "target", t.string())
-			errList = append(errList, err)
+
+		conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediation, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed")
+		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
+			return ctrl.Result{}, errors.Wrapf(err, "Failed to patch unhealthy machine status for machine %q", t.Machine.Name)
+		}
+		r.recorder.Eventf(
+			t.Machine,
+			corev1.EventTypeNormal,
+			EventMachineMarkedUnhealthy,
+			"Machine %v has been marked as unhealthy",
+			t.string(),
+		)
+	}
+	for _, t := range healthy {
+		logger.V(3).Info("patching machine", "machine", t.Machine.GetName())
+		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
+			return ctrl.Result{}, errors.Wrapf(err, "Failed to patch healthy machine status for machine %q", t.Machine.Name)
 		}
 	}

-	// handle remediation errors
+	// handle update errors
 	if len(errList) > 0 {
-		logger.V(3).Info("Error(s) remediating request, requeueing")
+		logger.V(3).Info("Error(s) marking machine, requeueing")
 		return reconcile.Result{}, kerrors.NewAggregate(errList)
 	}
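Note: the marking flow above relies on each target carrying a pre-built patch.Helper so that condition changes are written back as a minimal patch. Below is a condensed, self-contained sketch of that pattern; it is illustrative only (markUnhealthy is not a function in this diff) and assumes a live controller-runtime client plus a machine already fetched from the API server.

package example

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// markUnhealthy condenses the mark-and-patch pattern from the loop above:
// snapshot the machine, set OwnerRemediated to False with Warning severity,
// then write only the delta back to the API server.
func markUnhealthy(ctx context.Context, c client.Client, machine *clusterv1.Machine) error {
	helper, err := patch.NewHelper(machine, c) // snapshots the object's current state
	if err != nil {
		return err
	}
	conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition,
		clusterv1.WaitingForRemediation, clusterv1.ConditionSeverityWarning,
		"MachineHealthCheck failed")
	return helper.Patch(ctx, machine) // issues a patch containing just the changes since the snapshot
}

Patching the delta rather than updating the whole object keeps the MHC controller from clobbering fields that the machine's owning controller may be writing concurrently.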
255271
