Skip to content

Commit ff912c8

Browse files
author
Naadir Jeewa
committed
machine controller: improve EC2 state handling
Signed-off-by: Naadir Jeewa <[email protected]>
1 parent 938643c commit ff912c8

File tree

3 files changed

+138
-30
lines changed

3 files changed

+138
-30
lines changed

controllers/awsmachine_controller.go

+43-13
Original file line numberDiff line numberDiff line change
@@ -189,23 +189,23 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
189189
// and AWSMachine
190190
// 3. Issue a delete
191191
// 4. Scale controller deployment to 1
192-
machineScope.V(2).Info("Unable to locate instance by ID or tags")
192+
machineScope.V(2).Info("Unable to locate EC2 instance by ID or tags")
193193
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "NoInstanceFound", "Unable to find matching EC2 instance")
194194
machineScope.AWSMachine.Finalizers = util.Filter(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer)
195195
return reconcile.Result{}, nil
196196
}
197197

198-
machineScope.V(3).Info("Instance found matching deleted AWSMachine", "instanceID", instance.ID)
198+
machineScope.V(3).Info("EC2 instance found matching deleted AWSMachine", "instance-id", instance.ID)
199199

200200
// Check the instance state. If it's already shutting down or terminated,
201201
// do nothing. Otherwise attempt to delete it.
202202
// This decision is based on the ec2-instance-lifecycle graph at
203203
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
204204
switch instance.State {
205205
case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated:
206-
machineScope.Info("Instance is shutting down or already terminated", "instanceID", instance.ID)
206+
machineScope.Info("EC2 instance is shutting down or already terminated", "instance-id", instance.ID)
207207
default:
208-
machineScope.Info("Terminating instance", "instanceID", instance.ID)
208+
machineScope.Info("Terminating EC2 instance", "instance-id", instance.ID)
209209
if err := ec2Service.TerminateInstanceAndWait(instance.ID); err != nil {
210210
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedTerminate", "Failed to terminate instance %q: %v", instance.ID, err)
211211
return reconcile.Result{}, errors.Wrap(err, "failed to terminate instance")
@@ -231,6 +231,7 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
231231
}
232232
}
233233

234+
machineScope.Info("EC2 instance successfully terminated", "instance-id", instance.ID)
234235
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, "SuccessfulTerminate", "Terminated instance %q", instance.ID)
235236
}
236237

@@ -276,6 +277,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
276277

277278
// If the AWSMachine doesn't have our finalizer, add it.
278279
if !util.Contains(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer) {
280+
machineScope.V(1).Info("Adding Cluster API Provider AWS finalizer")
279281
machineScope.AWSMachine.Finalizers = append(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer)
280282
}
281283

@@ -300,6 +302,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
300302

301303
// Set an error message if we couldn't find the instance.
302304
if instance == nil {
305+
machineScope.Info("EC2 instance cannot be found")
303306
machineScope.SetErrorReason(capierrors.UpdateMachineError)
304307
machineScope.SetErrorMessage(errors.New("EC2 instance cannot be found"))
305308
return reconcile.Result{}, nil
@@ -308,26 +311,42 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
308311
// TODO(ncdc): move this validation logic into a validating webhook
309312
if errs := r.validateUpdate(&machineScope.AWSMachine.Spec, instance); len(errs) > 0 {
310313
agg := kerrors.NewAggregate(errs)
314+
machineScope.Info("Invalid update", "failedUpdates", agg.Error())
311315
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InvalidUpdate", "Invalid update: %s", agg.Error())
312316
return reconcile.Result{}, nil
313317
}
314318

315319
// Make sure Spec.ProviderID is always set.
316320
machineScope.SetProviderID(fmt.Sprintf("aws:////%s", instance.ID))
317321

318-
// Proceed to reconcile the AWSMachine state.
322+
// See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
323+
324+
existingInstanceState := machineScope.GetInstanceState()
319325
machineScope.SetInstanceState(instance.State)
320326

321-
// TODO(vincepri): Remove this annotation when clusterctl is no longer relevant.
322-
machineScope.SetAnnotation("cluster-api-provider-aws", "true")
327+
// Proceed to reconcile the AWSMachine state.
328+
if existingInstanceState == nil || *existingInstanceState != instance.State {
329+
machineScope.Info("EC2 instance state changed", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
330+
}
323331

324332
switch instance.State {
333+
case infrav1.InstanceStatePending, infrav1.InstanceStateStopping, infrav1.InstanceStateStopped:
334+
machineScope.SetNotReady()
325335
case infrav1.InstanceStateRunning:
326-
machineScope.Info("Machine instance is running", "instance-id", *machineScope.GetInstanceID())
327336
machineScope.SetReady()
328-
case infrav1.InstanceStatePending:
329-
machineScope.Info("Machine instance is pending", "instance-id", *machineScope.GetInstanceID())
337+
case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated:
338+
machineScope.SetNotReady()
339+
machineScope.Info("Unexpected EC2 instance termination", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
340+
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnexpectedTermination", "Unexpected EC2 instance termination")
330341
default:
342+
machineScope.SetNotReady()
343+
machineScope.Info("EC2 instance state is undefined", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
344+
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnhandledState", "EC2 instance state is undefined")
345+
machineScope.SetErrorReason(capierrors.UpdateMachineError)
346+
machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is undefined", instance.State))
347+
}
348+
349+
if instance.State == infrav1.InstanceStateTerminated {
331350
machineScope.SetErrorReason(capierrors.UpdateMachineError)
332351
machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is unexpected", instance.State))
333352
}
@@ -336,6 +355,9 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
336355
return reconcile.Result{}, errors.Errorf("failed to reconcile LB attachment: %+v", err)
337356
}
338357

358+
// TODO(vincepri): Remove this annotation when clusterctl is no longer relevant.
359+
machineScope.SetAnnotation("cluster-api-provider-aws", "true")
360+
339361
existingSecurityGroups, err := ec2svc.GetInstanceSecurityGroups(*machineScope.GetInstanceID())
340362
if err != nil {
341363
return reconcile.Result{}, err
@@ -363,10 +385,11 @@ func (r *AWSMachineReconciler) getOrCreate(scope *scope.MachineScope, ec2svc ser
363385
}
364386

365387
if instance == nil {
388+
scope.Info("Creating EC2 instance")
366389
// Create a new AWSMachine instance if we couldn't find a running instance.
367390
instance, err = ec2svc.CreateInstance(scope)
368391
if err != nil {
369-
return nil, errors.Wrapf(err, "failed to create AWSMachine instance")
392+
return nil, errors.Wrapf(err, "failed to create EC2 instance")
370393
}
371394
}
372395

@@ -390,14 +413,21 @@ func (r *AWSMachineReconciler) reconcileLBAttachment(machineScope *scope.Machine
390413
// validateUpdate checks that no immutable fields have been updated and
391414
// returns a slice of errors representing attempts to change immutable state.
392415
func (r *AWSMachineReconciler) validateUpdate(spec *infrav1.AWSMachineSpec, i *infrav1.Instance) (errs []error) {
416+
417+
// EC2 instance attributes start disapearing during shutdown and termination, so do not
418+
// perform more checks
419+
if i.State == infrav1.InstanceStateTerminated || i.State == infrav1.InstanceStateShuttingDown {
420+
return nil
421+
}
422+
393423
// Instance Type
394424
if spec.InstanceType != i.Type {
395-
errs = append(errs, errors.Errorf("instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType))
425+
errs = append(errs, errors.Errorf("EC2 instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType))
396426
}
397427

398428
// IAM Profile
399429
if spec.IAMInstanceProfile != i.IAMProfile {
400-
errs = append(errs, errors.Errorf("instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile))
430+
errs = append(errs, errors.Errorf("EC2 instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile))
401431
}
402432

403433
// SSH Key Name (also account for default)

controllers/awsmachine_controller_test.go

+90-17
Original file line numberDiff line numberDiff line change
@@ -219,26 +219,35 @@ var _ = Describe("AWSMachineReconciler", func() {
219219
klog.SetOutput(buf)
220220
})
221221

222-
It("should set instance to running", func() {
223-
instance.State = infrav1.InstanceStateRunning
224-
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
225-
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
226-
Expect(buf.String()).To(ContainSubstring(("Machine instance is running")))
227-
})
228-
229222
It("should set instance to pending", func() {
230223
instance.State = infrav1.InstanceStatePending
231224
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
232225
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStatePending)))
233-
Expect(buf.String()).To(ContainSubstring(("Machine instance is pending")))
226+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
227+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
228+
})
229+
230+
It("should set instance to running", func() {
231+
instance.State = infrav1.InstanceStateRunning
232+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
233+
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
234+
Expect(ms.AWSMachine.Status.Ready).To(Equal(true))
235+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
234236
})
235237
})
238+
})
236239

237-
It("should set error message when instance status unknown", func() {
238-
instance.State = infrav1.InstanceStateStopping
240+
Context("New EC2 instance state", func() {
241+
It("should error when the instance state is a new unseen one", func() {
242+
buf := new(bytes.Buffer)
243+
klog.SetOutput(buf)
244+
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).Return(nil, errors.New("stop here"))
245+
instance.State = "NewAWSMachineState"
239246
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
240-
Expect(ms.AWSMachine.Status.ErrorReason).To(PointTo(Equal(capierrors.UpdateMachineError)))
241-
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"stopping\" is unexpected")))
247+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
248+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state is undefined")))
249+
Expect(recorder.Events).To(Receive(ContainSubstring("InstanceUnhandledState")))
250+
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"NewAWSMachineState\" is undefined")))
242251
})
243252
})
244253

@@ -284,6 +293,71 @@ var _ = Describe("AWSMachineReconciler", func() {
284293
Expect(err).To(BeNil())
285294
})
286295
})
296+
297+
When("temporarily stopping then starting the AWSMachine", func() {
298+
var buf *bytes.Buffer
299+
BeforeEach(func() {
300+
buf = new(bytes.Buffer)
301+
klog.SetOutput(buf)
302+
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).
303+
Return(map[string][]string{"eid": {}}, nil).AnyTimes()
304+
ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes()
305+
})
306+
307+
It("should set instance to stopping and unready", func() {
308+
instance.State = infrav1.InstanceStateStopping
309+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
310+
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopping)))
311+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
312+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
313+
})
314+
315+
It("should then set instance to stopped and unready", func() {
316+
instance.State = infrav1.InstanceStateStopped
317+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
318+
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopped)))
319+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
320+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
321+
})
322+
323+
It("should then set instance to running and ready once it is restarted", func() {
324+
instance.State = infrav1.InstanceStateRunning
325+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
326+
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
327+
Expect(ms.AWSMachine.Status.Ready).To(Equal(true))
328+
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
329+
})
330+
})
331+
332+
When("deleting the AWSMachine outside of Kubernetes", func() {
333+
var buf *bytes.Buffer
334+
BeforeEach(func() {
335+
buf = new(bytes.Buffer)
336+
klog.SetOutput(buf)
337+
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).
338+
Return(map[string][]string{"eid": {}}, nil).AnyTimes()
339+
ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes()
340+
})
341+
342+
It("should warn if an instance is shutting-down", func() {
343+
instance.State = infrav1.InstanceStateShuttingDown
344+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
345+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
346+
Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination")))
347+
Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination")))
348+
})
349+
350+
It("should error when the instance is seen as terminated", func() {
351+
instance.State = infrav1.InstanceStateTerminated
352+
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
353+
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
354+
Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination")))
355+
Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination")))
356+
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"terminated\" is unexpected")))
357+
})
358+
359+
})
360+
287361
})
288362
})
289363

@@ -312,7 +386,7 @@ var _ = Describe("AWSMachineReconciler", func() {
312386

313387
_, err := reconciler.reconcileDelete(ms, cs)
314388
Expect(err).To(BeNil())
315-
Expect(buf.String()).To(ContainSubstring("Unable to locate instance by ID or tags"))
389+
Expect(buf.String()).To(ContainSubstring("Unable to locate EC2 instance by ID or tags"))
316390
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
317391
Expect(recorder.Events).To(Receive(ContainSubstring("NoInstanceFound")))
318392
})
@@ -327,7 +401,7 @@ var _ = Describe("AWSMachineReconciler", func() {
327401

328402
_, err := reconciler.reconcileDelete(ms, cs)
329403
Expect(err).To(BeNil())
330-
Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated"))
404+
Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
331405
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
332406
})
333407

@@ -341,7 +415,7 @@ var _ = Describe("AWSMachineReconciler", func() {
341415

342416
_, err := reconciler.reconcileDelete(ms, cs)
343417
Expect(err).To(BeNil())
344-
Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated"))
418+
Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
345419
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
346420
})
347421

@@ -361,7 +435,7 @@ var _ = Describe("AWSMachineReconciler", func() {
361435

362436
_, err := reconciler.reconcileDelete(ms, cs)
363437
Expect(errors.Cause(err)).To(MatchError(expected))
364-
Expect(buf.String()).To(ContainSubstring("Terminating instance"))
438+
Expect(buf.String()).To(ContainSubstring("Terminating EC2 instance"))
365439
Expect(recorder.Events).To(Receive(ContainSubstring("FailedTerminate")))
366440
})
367441

@@ -414,7 +488,6 @@ var _ = Describe("AWSMachineReconciler", func() {
414488
})
415489
})
416490
})
417-
418491
})
419492
})
420493

pkg/cloud/scope/machine.go

+5
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,11 @@ func (m *MachineScope) SetReady() {
154154
m.AWSMachine.Status.Ready = true
155155
}
156156

157+
// SetNotReady sets the AWSMachine Ready Status to false
158+
func (m *MachineScope) SetNotReady() {
159+
m.AWSMachine.Status.Ready = false
160+
}
161+
157162
// SetErrorMessage sets the AWSMachine status error message.
158163
func (m *MachineScope) SetErrorMessage(v error) {
159164
m.AWSMachine.Status.ErrorMessage = pointer.StringPtr(v.Error())

0 commit comments

Comments
 (0)