Skip to content

🐛 Improve EC2 state handling, and set error for manually deleted EC2 instances #1256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions controllers/awsmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,23 +189,23 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
// and AWSMachine
// 3. Issue a delete
// 4. Scale controller deployment to 1
machineScope.V(2).Info("Unable to locate instance by ID or tags")
machineScope.V(2).Info("Unable to locate EC2 instance by ID or tags")
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "NoInstanceFound", "Unable to find matching EC2 instance")
machineScope.AWSMachine.Finalizers = util.Filter(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer)
return reconcile.Result{}, nil
}

machineScope.V(3).Info("Instance found matching deleted AWSMachine", "instanceID", instance.ID)
machineScope.V(3).Info("EC2 instance found matching deleted AWSMachine", "instance-id", instance.ID)

// Check the instance state. If it's already shutting down or terminated,
// do nothing. Otherwise attempt to delete it.
// This decision is based on the ec2-instance-lifecycle graph at
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
switch instance.State {
case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated:
machineScope.Info("Instance is shutting down or already terminated", "instanceID", instance.ID)
machineScope.Info("EC2 instance is shutting down or already terminated", "instance-id", instance.ID)
default:
machineScope.Info("Terminating instance", "instanceID", instance.ID)
machineScope.Info("Terminating EC2 instance", "instance-id", instance.ID)
if err := ec2Service.TerminateInstanceAndWait(instance.ID); err != nil {
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedTerminate", "Failed to terminate instance %q: %v", instance.ID, err)
return reconcile.Result{}, errors.Wrap(err, "failed to terminate instance")
Expand All @@ -231,6 +231,7 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
}
}

machineScope.Info("EC2 instance successfully terminated", "instance-id", instance.ID)
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, "SuccessfulTerminate", "Terminated instance %q", instance.ID)
}

Expand Down Expand Up @@ -276,6 +277,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope

// If the AWSMachine doesn't have our finalizer, add it.
if !util.Contains(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer) {
machineScope.V(1).Info("Adding Cluster API Provider AWS finalizer")
machineScope.AWSMachine.Finalizers = append(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer)
}

Expand All @@ -300,6 +302,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope

// Set an error message if we couldn't find the instance.
if instance == nil {
machineScope.Info("EC2 instance cannot be found")
machineScope.SetErrorReason(capierrors.UpdateMachineError)
machineScope.SetErrorMessage(errors.New("EC2 instance cannot be found"))
return reconcile.Result{}, nil
Expand All @@ -308,26 +311,42 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
// TODO(ncdc): move this validation logic into a validating webhook
if errs := r.validateUpdate(&machineScope.AWSMachine.Spec, instance); len(errs) > 0 {
agg := kerrors.NewAggregate(errs)
machineScope.Info("Invalid update", "failedUpdates", agg.Error())
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InvalidUpdate", "Invalid update: %s", agg.Error())
return reconcile.Result{}, nil
}

// Make sure Spec.ProviderID is always set.
machineScope.SetProviderID(fmt.Sprintf("aws:////%s", instance.ID))

// Proceed to reconcile the AWSMachine state.
// See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html

existingInstanceState := machineScope.GetInstanceState()
machineScope.SetInstanceState(instance.State)

// TODO(vincepri): Remove this annotation when clusterctl is no longer relevant.
machineScope.SetAnnotation("cluster-api-provider-aws", "true")
// Proceed to reconcile the AWSMachine state.
if existingInstanceState == nil || *existingInstanceState != instance.State {
machineScope.Info("EC2 instance state changed", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
}

switch instance.State {
case infrav1.InstanceStatePending, infrav1.InstanceStateStopping, infrav1.InstanceStateStopped:
machineScope.SetNotReady()
case infrav1.InstanceStateRunning:
machineScope.Info("Machine instance is running", "instance-id", *machineScope.GetInstanceID())
machineScope.SetReady()
case infrav1.InstanceStatePending:
machineScope.Info("Machine instance is pending", "instance-id", *machineScope.GetInstanceID())
case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated:
machineScope.SetNotReady()
machineScope.Info("Unexpected EC2 instance termination", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnexpectedTermination", "Unexpected EC2 instance termination")
default:
machineScope.SetNotReady()
machineScope.Info("EC2 instance state is undefined", "state", instance.State, "instance-id", *machineScope.GetInstanceID())
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnhandledState", "EC2 instance state is undefined")
machineScope.SetErrorReason(capierrors.UpdateMachineError)
machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is undefined", instance.State))
}

if instance.State == infrav1.InstanceStateTerminated {
machineScope.SetErrorReason(capierrors.UpdateMachineError)
machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is unexpected", instance.State))
}
Expand All @@ -336,6 +355,9 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope
return reconcile.Result{}, errors.Errorf("failed to reconcile LB attachment: %+v", err)
}

// TODO(vincepri): Remove this annotation when clusterctl is no longer relevant.
machineScope.SetAnnotation("cluster-api-provider-aws", "true")

existingSecurityGroups, err := ec2svc.GetInstanceSecurityGroups(*machineScope.GetInstanceID())
if err != nil {
return reconcile.Result{}, err
Expand Down Expand Up @@ -363,10 +385,11 @@ func (r *AWSMachineReconciler) getOrCreate(scope *scope.MachineScope, ec2svc ser
}

if instance == nil {
scope.Info("Creating EC2 instance")
// Create a new AWSMachine instance if we couldn't find a running instance.
instance, err = ec2svc.CreateInstance(scope)
if err != nil {
return nil, errors.Wrapf(err, "failed to create AWSMachine instance")
return nil, errors.Wrapf(err, "failed to create EC2 instance")
}
}

Expand All @@ -390,14 +413,21 @@ func (r *AWSMachineReconciler) reconcileLBAttachment(machineScope *scope.Machine
// validateUpdate checks that no immutable fields have been updated and
// returns a slice of errors representing attempts to change immutable state.
func (r *AWSMachineReconciler) validateUpdate(spec *infrav1.AWSMachineSpec, i *infrav1.Instance) (errs []error) {

// EC2 instance attributes start disapearing during shutdown and termination, so do not
// perform more checks
if i.State == infrav1.InstanceStateTerminated || i.State == infrav1.InstanceStateShuttingDown {
return nil
}

// Instance Type
if spec.InstanceType != i.Type {
errs = append(errs, errors.Errorf("instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType))
errs = append(errs, errors.Errorf("EC2 instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType))
}

// IAM Profile
if spec.IAMInstanceProfile != i.IAMProfile {
errs = append(errs, errors.Errorf("instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile))
errs = append(errs, errors.Errorf("EC2 instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile))
}

// SSH Key Name (also account for default)
Expand Down
107 changes: 90 additions & 17 deletions controllers/awsmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,26 +219,35 @@ var _ = Describe("AWSMachineReconciler", func() {
klog.SetOutput(buf)
})

It("should set instance to running", func() {
instance.State = infrav1.InstanceStateRunning
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
Expect(buf.String()).To(ContainSubstring(("Machine instance is running")))
})

It("should set instance to pending", func() {
instance.State = infrav1.InstanceStatePending
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStatePending)))
Expect(buf.String()).To(ContainSubstring(("Machine instance is pending")))
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
})

It("should set instance to running", func() {
instance.State = infrav1.InstanceStateRunning
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
Expect(ms.AWSMachine.Status.Ready).To(Equal(true))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
})
})
})

It("should set error message when instance status unknown", func() {
instance.State = infrav1.InstanceStateStopping
Context("New EC2 instance state", func() {
It("should error when the instance state is a new unseen one", func() {
buf := new(bytes.Buffer)
klog.SetOutput(buf)
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).Return(nil, errors.New("stop here"))
instance.State = "NewAWSMachineState"
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.ErrorReason).To(PointTo(Equal(capierrors.UpdateMachineError)))
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"stopping\" is unexpected")))
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state is undefined")))
Expect(recorder.Events).To(Receive(ContainSubstring("InstanceUnhandledState")))
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"NewAWSMachineState\" is undefined")))
})
})

Expand Down Expand Up @@ -284,6 +293,71 @@ var _ = Describe("AWSMachineReconciler", func() {
Expect(err).To(BeNil())
})
})

When("temporarily stopping then starting the AWSMachine", func() {
var buf *bytes.Buffer
BeforeEach(func() {
buf = new(bytes.Buffer)
klog.SetOutput(buf)
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).
Return(map[string][]string{"eid": {}}, nil).AnyTimes()
ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes()
})

It("should set instance to stopping and unready", func() {
instance.State = infrav1.InstanceStateStopping
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopping)))
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
})

It("should then set instance to stopped and unready", func() {
instance.State = infrav1.InstanceStateStopped
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopped)))
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
})

It("should then set instance to running and ready once it is restarted", func() {
instance.State = infrav1.InstanceStateRunning
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
Expect(ms.AWSMachine.Status.Ready).To(Equal(true))
Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
})
})

When("deleting the AWSMachine outside of Kubernetes", func() {
var buf *bytes.Buffer
BeforeEach(func() {
buf = new(bytes.Buffer)
klog.SetOutput(buf)
ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).
Return(map[string][]string{"eid": {}}, nil).AnyTimes()
ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes()
})

It("should warn if an instance is shutting-down", func() {
instance.State = infrav1.InstanceStateShuttingDown
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination")))
Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination")))
})

It("should error when the instance is seen as terminated", func() {
instance.State = infrav1.InstanceStateTerminated
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs)
Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination")))
Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination")))
Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"terminated\" is unexpected")))
})

})

})
})

Expand Down Expand Up @@ -312,7 +386,7 @@ var _ = Describe("AWSMachineReconciler", func() {

_, err := reconciler.reconcileDelete(ms, cs)
Expect(err).To(BeNil())
Expect(buf.String()).To(ContainSubstring("Unable to locate instance by ID or tags"))
Expect(buf.String()).To(ContainSubstring("Unable to locate EC2 instance by ID or tags"))
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
Expect(recorder.Events).To(Receive(ContainSubstring("NoInstanceFound")))
})
Expand All @@ -327,7 +401,7 @@ var _ = Describe("AWSMachineReconciler", func() {

_, err := reconciler.reconcileDelete(ms, cs)
Expect(err).To(BeNil())
Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated"))
Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
})

Expand All @@ -341,7 +415,7 @@ var _ = Describe("AWSMachineReconciler", func() {

_, err := reconciler.reconcileDelete(ms, cs)
Expect(err).To(BeNil())
Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated"))
Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
})

Expand All @@ -361,7 +435,7 @@ var _ = Describe("AWSMachineReconciler", func() {

_, err := reconciler.reconcileDelete(ms, cs)
Expect(errors.Cause(err)).To(MatchError(expected))
Expect(buf.String()).To(ContainSubstring("Terminating instance"))
Expect(buf.String()).To(ContainSubstring("Terminating EC2 instance"))
Expect(recorder.Events).To(Receive(ContainSubstring("FailedTerminate")))
})

Expand Down Expand Up @@ -414,7 +488,6 @@ var _ = Describe("AWSMachineReconciler", func() {
})
})
})

})
})

Expand Down
5 changes: 5 additions & 0 deletions pkg/cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ func (m *MachineScope) SetReady() {
m.AWSMachine.Status.Ready = true
}

// SetNotReady sets the AWSMachine Ready Status to false
func (m *MachineScope) SetNotReady() {
m.AWSMachine.Status.Ready = false
}

// SetErrorMessage sets the AWSMachine status error message.
func (m *MachineScope) SetErrorMessage(v error) {
m.AWSMachine.Status.ErrorMessage = pointer.StringPtr(v.Error())
Expand Down