diff --git a/controllers/awsmachine_controller.go b/controllers/awsmachine_controller.go index c9d511feee..3516de45d2 100644 --- a/controllers/awsmachine_controller.go +++ b/controllers/awsmachine_controller.go @@ -189,13 +189,13 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope, // and AWSMachine // 3. Issue a delete // 4. Scale controller deployment to 1 - machineScope.V(2).Info("Unable to locate instance by ID or tags") + machineScope.V(2).Info("Unable to locate EC2 instance by ID or tags") r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "NoInstanceFound", "Unable to find matching EC2 instance") machineScope.AWSMachine.Finalizers = util.Filter(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer) return reconcile.Result{}, nil } - machineScope.V(3).Info("Instance found matching deleted AWSMachine", "instanceID", instance.ID) + machineScope.V(3).Info("EC2 instance found matching deleted AWSMachine", "instance-id", instance.ID) // Check the instance state. If it's already shutting down or terminated, // do nothing. Otherwise attempt to delete it. @@ -203,9 +203,9 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope, // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html switch instance.State { case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated: - machineScope.Info("Instance is shutting down or already terminated", "instanceID", instance.ID) + machineScope.Info("EC2 instance is shutting down or already terminated", "instance-id", instance.ID) default: - machineScope.Info("Terminating instance", "instanceID", instance.ID) + machineScope.Info("Terminating EC2 instance", "instance-id", instance.ID) if err := ec2Service.TerminateInstanceAndWait(instance.ID); err != nil { r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedTerminate", "Failed to terminate instance %q: %v", instance.ID, err) return reconcile.Result{}, errors.Wrap(err, "failed to terminate instance") @@ -231,6 +231,7 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope, } } + machineScope.Info("EC2 instance successfully terminated", "instance-id", instance.ID) r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, "SuccessfulTerminate", "Terminated instance %q", instance.ID) } @@ -276,6 +277,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope // If the AWSMachine doesn't have our finalizer, add it. if !util.Contains(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer) { + machineScope.V(1).Info("Adding Cluster API Provider AWS finalizer") machineScope.AWSMachine.Finalizers = append(machineScope.AWSMachine.Finalizers, infrav1.MachineFinalizer) } @@ -300,6 +302,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope // Set an error message if we couldn't find the instance. if instance == nil { + machineScope.Info("EC2 instance cannot be found") machineScope.SetErrorReason(capierrors.UpdateMachineError) machineScope.SetErrorMessage(errors.New("EC2 instance cannot be found")) return reconcile.Result{}, nil @@ -308,6 +311,7 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope // TODO(ncdc): move this validation logic into a validating webhook if errs := r.validateUpdate(&machineScope.AWSMachine.Spec, instance); len(errs) > 0 { agg := kerrors.NewAggregate(errs) + machineScope.Info("Invalid update", "failedUpdates", agg.Error()) r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InvalidUpdate", "Invalid update: %s", agg.Error()) return reconcile.Result{}, nil } @@ -315,19 +319,34 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope // Make sure Spec.ProviderID is always set. machineScope.SetProviderID(fmt.Sprintf("aws:////%s", instance.ID)) - // Proceed to reconcile the AWSMachine state. + // See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html + + existingInstanceState := machineScope.GetInstanceState() machineScope.SetInstanceState(instance.State) - // TODO(vincepri): Remove this annotation when clusterctl is no longer relevant. - machineScope.SetAnnotation("cluster-api-provider-aws", "true") + // Proceed to reconcile the AWSMachine state. + if existingInstanceState == nil || *existingInstanceState != instance.State { + machineScope.Info("EC2 instance state changed", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) + } switch instance.State { + case infrav1.InstanceStatePending, infrav1.InstanceStateStopping, infrav1.InstanceStateStopped: + machineScope.SetNotReady() case infrav1.InstanceStateRunning: - machineScope.Info("Machine instance is running", "instance-id", *machineScope.GetInstanceID()) machineScope.SetReady() - case infrav1.InstanceStatePending: - machineScope.Info("Machine instance is pending", "instance-id", *machineScope.GetInstanceID()) + case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated: + machineScope.SetNotReady() + machineScope.Info("Unexpected EC2 instance termination", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnexpectedTermination", "Unexpected EC2 instance termination") default: + machineScope.SetNotReady() + machineScope.Info("EC2 instance state is undefined", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnhandledState", "EC2 instance state is undefined") + machineScope.SetErrorReason(capierrors.UpdateMachineError) + machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is undefined", instance.State)) + } + + if instance.State == infrav1.InstanceStateTerminated { machineScope.SetErrorReason(capierrors.UpdateMachineError) machineScope.SetErrorMessage(errors.Errorf("EC2 instance state %q is unexpected", instance.State)) } @@ -336,6 +355,9 @@ func (r *AWSMachineReconciler) reconcileNormal(ctx context.Context, machineScope return reconcile.Result{}, errors.Errorf("failed to reconcile LB attachment: %+v", err) } + // TODO(vincepri): Remove this annotation when clusterctl is no longer relevant. + machineScope.SetAnnotation("cluster-api-provider-aws", "true") + existingSecurityGroups, err := ec2svc.GetInstanceSecurityGroups(*machineScope.GetInstanceID()) if err != nil { return reconcile.Result{}, err @@ -363,10 +385,11 @@ func (r *AWSMachineReconciler) getOrCreate(scope *scope.MachineScope, ec2svc ser } if instance == nil { + scope.Info("Creating EC2 instance") // Create a new AWSMachine instance if we couldn't find a running instance. instance, err = ec2svc.CreateInstance(scope) if err != nil { - return nil, errors.Wrapf(err, "failed to create AWSMachine instance") + return nil, errors.Wrapf(err, "failed to create EC2 instance") } } @@ -390,14 +413,21 @@ func (r *AWSMachineReconciler) reconcileLBAttachment(machineScope *scope.Machine // validateUpdate checks that no immutable fields have been updated and // returns a slice of errors representing attempts to change immutable state. func (r *AWSMachineReconciler) validateUpdate(spec *infrav1.AWSMachineSpec, i *infrav1.Instance) (errs []error) { + + // EC2 instance attributes start disapearing during shutdown and termination, so do not + // perform more checks + if i.State == infrav1.InstanceStateTerminated || i.State == infrav1.InstanceStateShuttingDown { + return nil + } + // Instance Type if spec.InstanceType != i.Type { - errs = append(errs, errors.Errorf("instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType)) + errs = append(errs, errors.Errorf("EC2 instance type cannot be mutated from %q to %q", i.Type, spec.InstanceType)) } // IAM Profile if spec.IAMInstanceProfile != i.IAMProfile { - errs = append(errs, errors.Errorf("instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile)) + errs = append(errs, errors.Errorf("EC2 instance IAM profile cannot be mutated from %q to %q", i.IAMProfile, spec.IAMInstanceProfile)) } // SSH Key Name (also account for default) diff --git a/controllers/awsmachine_controller_test.go b/controllers/awsmachine_controller_test.go index ab274984e2..ca434727c6 100644 --- a/controllers/awsmachine_controller_test.go +++ b/controllers/awsmachine_controller_test.go @@ -219,26 +219,35 @@ var _ = Describe("AWSMachineReconciler", func() { klog.SetOutput(buf) }) - It("should set instance to running", func() { - instance.State = infrav1.InstanceStateRunning - _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) - Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning))) - Expect(buf.String()).To(ContainSubstring(("Machine instance is running"))) - }) - It("should set instance to pending", func() { instance.State = infrav1.InstanceStatePending _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStatePending))) - Expect(buf.String()).To(ContainSubstring(("Machine instance is pending"))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed"))) + }) + + It("should set instance to running", func() { + instance.State = infrav1.InstanceStateRunning + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(true)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed"))) }) }) + }) - It("should set error message when instance status unknown", func() { - instance.State = infrav1.InstanceStateStopping + Context("New EC2 instance state", func() { + It("should error when the instance state is a new unseen one", func() { + buf := new(bytes.Buffer) + klog.SetOutput(buf) + ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()).Return(nil, errors.New("stop here")) + instance.State = "NewAWSMachineState" _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) - Expect(ms.AWSMachine.Status.ErrorReason).To(PointTo(Equal(capierrors.UpdateMachineError))) - Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"stopping\" is unexpected"))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state is undefined"))) + Expect(recorder.Events).To(Receive(ContainSubstring("InstanceUnhandledState"))) + Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"NewAWSMachineState\" is undefined"))) }) }) @@ -284,6 +293,71 @@ var _ = Describe("AWSMachineReconciler", func() { Expect(err).To(BeNil()) }) }) + + When("temporarily stopping then starting the AWSMachine", func() { + var buf *bytes.Buffer + BeforeEach(func() { + buf = new(bytes.Buffer) + klog.SetOutput(buf) + ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()). + Return(map[string][]string{"eid": {}}, nil).AnyTimes() + ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes() + }) + + It("should set instance to stopping and unready", func() { + instance.State = infrav1.InstanceStateStopping + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopping))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed"))) + }) + + It("should then set instance to stopped and unready", func() { + instance.State = infrav1.InstanceStateStopped + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateStopped))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed"))) + }) + + It("should then set instance to running and ready once it is restarted", func() { + instance.State = infrav1.InstanceStateRunning + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning))) + Expect(ms.AWSMachine.Status.Ready).To(Equal(true)) + Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed"))) + }) + }) + + When("deleting the AWSMachine outside of Kubernetes", func() { + var buf *bytes.Buffer + BeforeEach(func() { + buf = new(bytes.Buffer) + klog.SetOutput(buf) + ec2Svc.EXPECT().GetInstanceSecurityGroups(gomock.Any()). + Return(map[string][]string{"eid": {}}, nil).AnyTimes() + ec2Svc.EXPECT().GetCoreSecurityGroups(gomock.Any()).Return([]string{}, nil).AnyTimes() + }) + + It("should warn if an instance is shutting-down", func() { + instance.State = infrav1.InstanceStateShuttingDown + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination"))) + Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination"))) + }) + + It("should error when the instance is seen as terminated", func() { + instance.State = infrav1.InstanceStateTerminated + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs) + Expect(ms.AWSMachine.Status.Ready).To(Equal(false)) + Expect(buf.String()).To(ContainSubstring(("Unexpected EC2 instance termination"))) + Expect(recorder.Events).To(Receive(ContainSubstring("UnexpectedTermination"))) + Expect(ms.AWSMachine.Status.ErrorMessage).To(PointTo(Equal("EC2 instance state \"terminated\" is unexpected"))) + }) + + }) + }) }) @@ -312,7 +386,7 @@ var _ = Describe("AWSMachineReconciler", func() { _, err := reconciler.reconcileDelete(ms, cs) Expect(err).To(BeNil()) - Expect(buf.String()).To(ContainSubstring("Unable to locate instance by ID or tags")) + Expect(buf.String()).To(ContainSubstring("Unable to locate EC2 instance by ID or tags")) Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents)) Expect(recorder.Events).To(Receive(ContainSubstring("NoInstanceFound"))) }) @@ -327,7 +401,7 @@ var _ = Describe("AWSMachineReconciler", func() { _, err := reconciler.reconcileDelete(ms, cs) Expect(err).To(BeNil()) - Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated")) + Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated")) Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents)) }) @@ -341,7 +415,7 @@ var _ = Describe("AWSMachineReconciler", func() { _, err := reconciler.reconcileDelete(ms, cs) Expect(err).To(BeNil()) - Expect(buf.String()).To(ContainSubstring("Instance is shutting down or already terminated")) + Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated")) Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents)) }) @@ -361,7 +435,7 @@ var _ = Describe("AWSMachineReconciler", func() { _, err := reconciler.reconcileDelete(ms, cs) Expect(errors.Cause(err)).To(MatchError(expected)) - Expect(buf.String()).To(ContainSubstring("Terminating instance")) + Expect(buf.String()).To(ContainSubstring("Terminating EC2 instance")) Expect(recorder.Events).To(Receive(ContainSubstring("FailedTerminate"))) }) @@ -414,7 +488,6 @@ var _ = Describe("AWSMachineReconciler", func() { }) }) }) - }) }) diff --git a/pkg/cloud/scope/machine.go b/pkg/cloud/scope/machine.go index b3f4b08301..2efb994d58 100644 --- a/pkg/cloud/scope/machine.go +++ b/pkg/cloud/scope/machine.go @@ -154,6 +154,11 @@ func (m *MachineScope) SetReady() { m.AWSMachine.Status.Ready = true } +// SetNotReady sets the AWSMachine Ready Status to false +func (m *MachineScope) SetNotReady() { + m.AWSMachine.Status.Ready = false +} + // SetErrorMessage sets the AWSMachine status error message. func (m *MachineScope) SetErrorMessage(v error) { m.AWSMachine.Status.ErrorMessage = pointer.StringPtr(v.Error())