@@ -35,6 +35,7 @@ import (
35
35
"sigs.k8s.io/cluster-api/controllers/remote"
36
36
"sigs.k8s.io/cluster-api/util"
37
37
"sigs.k8s.io/cluster-api/util/annotations"
38
+ "sigs.k8s.io/cluster-api/util/conditions"
38
39
"sigs.k8s.io/cluster-api/util/patch"
39
40
"sigs.k8s.io/cluster-api/util/predicates"
40
41
ctrl "sigs.k8s.io/controller-runtime"
@@ -134,6 +135,8 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
134
135
m .Spec .ClusterName , m .Name , m .Namespace )
135
136
}
136
137
138
+ logger = r .Log .WithValues ("cluster" , cluster .Name )
139
+
137
140
// Return early if the object or Cluster is paused.
138
141
if annotations .IsPaused (cluster , m ) {
139
142
logger .Info ("Reconciliation is paused for this object" )
@@ -160,7 +163,7 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
160
163
}
161
164
m .Labels [clusterv1 .ClusterLabelName ] = m .Spec .ClusterName
162
165
163
- result , err := r .reconcile (ctx , cluster , m )
166
+ result , err := r .reconcile (ctx , logger , cluster , m )
164
167
if err != nil {
165
168
logger .Error (err , "Failed to reconcile MachineHealthCheck" )
166
169
r .recorder .Eventf (m , corev1 .EventTypeWarning , "ReconcileError" , "%v" , err )
@@ -172,7 +175,7 @@ func (r *MachineHealthCheckReconciler) Reconcile(req ctrl.Request) (_ ctrl.Resul
172
175
return result , nil
173
176
}
174
177
175
- func (r * MachineHealthCheckReconciler ) reconcile (ctx context.Context , cluster * clusterv1.Cluster , m * clusterv1.MachineHealthCheck ) (ctrl.Result , error ) {
178
+ func (r * MachineHealthCheckReconciler ) reconcile (ctx context.Context , logger logr. Logger , cluster * clusterv1.Cluster , m * clusterv1.MachineHealthCheck ) (ctrl.Result , error ) {
176
179
// Ensure the MachineHealthCheck is owned by the Cluster it belongs to
177
180
m .OwnerReferences = util .EnsureOwnerRef (m .OwnerReferences , metav1.OwnerReference {
178
181
APIVersion : clusterv1 .GroupVersion .String (),
@@ -181,42 +184,36 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
181
184
UID : cluster .UID ,
182
185
})
183
186
184
- logger := r .Log .WithValues ("machinehealthcheck" , m .Name , "namespace" , m .Namespace )
185
- logger = logger .WithValues ("cluster" , cluster .Name )
186
-
187
187
// Create client for target cluster
188
188
clusterClient , err := remote .NewClusterClient (ctx , r .Client , util .ObjectKey (cluster ), r .scheme )
189
189
if err != nil {
190
- logger .Error (err , "Error building target cluster client" )
191
- return ctrl.Result {}, err
190
+ return ctrl.Result {}, errors .Wrapf (err , "Error building target cluster client" )
192
191
}
193
192
194
193
if err := r .watchClusterNodes (ctx , cluster ); err != nil {
195
- logger .Error (err , "Error watching nodes on target cluster" )
196
- return ctrl.Result {}, err
194
+ return ctrl.Result {}, errors .Wrapf (err , "Error watching nodes on target cluster" )
197
195
}
198
196
199
197
// fetch all targets
200
198
logger .V (3 ).Info ("Finding targets" )
201
- targets , err := r .getTargetsFromMHC (clusterClient , cluster , m )
199
+ targets , err := r .getTargetsFromMHC (clusterClient , m )
202
200
if err != nil {
203
- logger .Error (err , "Failed to fetch targets from MachineHealthCheck" )
204
- return ctrl.Result {}, err
201
+ return ctrl.Result {}, errors .Wrapf (err , "Failed to fetch targets from MachineHealthCheck" )
205
202
}
206
203
totalTargets := len (targets )
207
204
m .Status .ExpectedMachines = int32 (totalTargets )
208
205
209
206
// health check all targets and reconcile mhc status
210
- currentHealthy , needRemediationTargets , nextCheckTimes := r .healthCheckTargets (targets , logger , m .Spec .NodeStartupTimeout .Duration )
211
- m .Status .CurrentHealthy = int32 (currentHealthy )
207
+ healthy , unhealthy , nextCheckTimes := r .healthCheckTargets (targets , logger , m .Spec .NodeStartupTimeout .Duration )
208
+ m .Status .CurrentHealthy = int32 (len ( healthy ) )
212
209
213
210
// check MHC current health against MaxUnhealthy
214
211
if ! isAllowedRemediation (m ) {
215
212
logger .V (3 ).Info (
216
213
"Short-circuiting remediation" ,
217
214
"total target" , totalTargets ,
218
215
"max unhealthy" , m .Spec .MaxUnhealthy ,
219
- "unhealthy targets" , totalTargets - currentHealthy ,
216
+ "unhealthy targets" , len ( unhealthy ) ,
220
217
)
221
218
222
219
r .recorder .Eventf (
@@ -225,31 +222,50 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
225
222
EventRemediationRestricted ,
226
223
"Remediation restricted due to exceeded number of unhealthy machines (total: %v, unhealthy: %v, maxUnhealthy: %v)" ,
227
224
totalTargets ,
228
- totalTargets - currentHealthy ,
225
+ m . Status . CurrentHealthy ,
229
226
m .Spec .MaxUnhealthy ,
230
227
)
228
+ for _ , t := range append (healthy , unhealthy ... ) {
229
+ if err := t .patchHelper .Patch (ctx , t .Machine ); err != nil {
230
+ return ctrl.Result {}, errors .Wrapf (err , "Failed to patch machine status for machine %q" , t .Machine .Name )
231
+ }
232
+ }
231
233
return reconcile.Result {Requeue : true }, nil
232
234
}
233
235
logger .V (3 ).Info (
234
236
"Remediations are allowed" ,
235
237
"total target" , totalTargets ,
236
238
"max unhealthy" , m .Spec .MaxUnhealthy ,
237
- "unhealthy targets" , totalTargets - currentHealthy ,
239
+ "unhealthy targets" , len ( unhealthy ) ,
238
240
)
239
241
240
- // remediate
242
+ // mark for remediation
241
243
errList := []error {}
242
- for _ , t := range needRemediationTargets {
244
+ for _ , t := range unhealthy {
243
245
logger .V (3 ).Info ("Target meets unhealthy criteria, triggers remediation" , "target" , t .string ())
244
- if err := t .remediate (ctx , logger , r .Client , r .recorder ); err != nil {
245
- logger .Error (err , "Error remediating target" , "target" , t .string ())
246
- errList = append (errList , err )
246
+
247
+ conditions .MarkFalse (t .Machine , clusterv1 .MachineOwnerRemediatedCondition , clusterv1 .WaitingForRemediation , clusterv1 .ConditionSeverityWarning , "MachineHealthCheck failed" )
248
+ if err := t .patchHelper .Patch (ctx , t .Machine ); err != nil {
249
+ return ctrl.Result {}, errors .Wrapf (err , "Failed to patch unhealthy machine status for machine %q" , t .Machine .Name )
250
+ }
251
+ r .recorder .Eventf (
252
+ t .Machine ,
253
+ corev1 .EventTypeNormal ,
254
+ EventMachineMarkedUnhealthy ,
255
+ "Machine %v has been marked as unhealthy" ,
256
+ t .string (),
257
+ )
258
+ }
259
+ for _ , t := range healthy {
260
+ logger .V (3 ).Info ("patching machine" , "machine" , t .Machine .GetName ())
261
+ if err := t .patchHelper .Patch (ctx , t .Machine ); err != nil {
262
+ return ctrl.Result {}, errors .Wrapf (err , "Failed to patch healthy machine status for machine %q" , t .Machine .Name )
247
263
}
248
264
}
249
265
250
- // handle remediation errors
266
+ // handle update errors
251
267
if len (errList ) > 0 {
252
- logger .V (3 ).Info ("Error(s) remediating request , requeueing" )
268
+ logger .V (3 ).Info ("Error(s) marking machine , requeueing" )
253
269
return reconcile.Result {}, kerrors .NewAggregate (errList )
254
270
}
255
271
0 commit comments