Skip to content

Commit ae84f36

Browse files
Merge pull request #83 from racheljpg/addTerminatedState
OCPBUGS-15255: Add terminated as a handled state so terminated instances don't get stuck
2 parents 318ae2f + b089132 commit ae84f36

File tree

3 files changed

+97
-36
lines changed

3 files changed

+97
-36
lines changed

pkg/actuators/machine/reconciler.go

+54-19
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,16 @@ func (r *Reconciler) create() error {
4242

4343
if instances, err := r.getMachineInstances(); err == nil && len(instances) > 0 {
4444
klog.Infof("%s: found existing instance %s for machine", r.machine.Name, aws.StringValue(instances[0].InstanceId))
45+
46+
if r.checkIfInstanceTerminated(instances[0]) {
47+
// The instance exists, but is in a terminated state.
48+
// This means the instance was terminated prior to MAPI realizing it existed.
49+
// We should fail the machine in this scenario else we end up in a forever loop.
50+
return machinecontroller.InvalidMachineConfiguration("Instance %s is in a terminated state", aws.StringValue(instances[0].InstanceId))
51+
}
52+
4553
// If we got here, then Exists failed to find the instance, and we were asked to create a new instance.
46-
// The instance already exists, so requeue and start the reconcile again, Exists should pass now.
54+
// The instance already exists, and isn't terminated, so requeue and start the reconcile again, Exists should pass now.
4755
// Don't bother updating the status, Update will configure everything on the next reconcile.
4856
return fmt.Errorf("%s: Possible eventual-consistency discrepancy; returning an error to requeue", r.machine.Name)
4957
}
@@ -122,7 +130,7 @@ func (r *Reconciler) create() error {
122130
func (r *Reconciler) delete() error {
123131
klog.Infof("%s: deleting machine", r.machine.Name)
124132

125-
// Get all instances not terminated.
133+
// Get all instances (including terminated, so that we can handle a terminated state)
126134
existingInstances, err := r.getMachineInstances()
127135
if err != nil {
128136
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
@@ -141,23 +149,29 @@ func (r *Reconciler) delete() error {
141149
return nil
142150
}
143151

144-
if err = r.removeFromLoadBalancers(existingInstances); err != nil {
145-
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
146-
Name: r.machine.Name,
147-
Namespace: r.machine.Namespace,
148-
Reason: "failed to remove instance from load balancers",
149-
})
150-
return fmt.Errorf("failed to remove instance from load balancers: %w", err)
151-
}
152+
isTerminated := r.checkIfInstanceTerminated(existingInstances[0])
153+
var terminatingInstances []*ec2.InstanceStateChange
154+
155+
if !isTerminated {
156+
if err := r.removeFromLoadBalancers(existingInstances); err != nil {
157+
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
158+
Name: r.machine.Name,
159+
Namespace: r.machine.Namespace,
160+
Reason: "failed to remove instance from load balancers",
161+
})
162+
return fmt.Errorf("failed to remove instance from load balancers: %w", err)
163+
}
164+
165+
terminatingInstances, err = terminateInstances(r.awsClient, existingInstances)
166+
if err != nil {
167+
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
168+
Name: r.machine.Name,
169+
Namespace: r.machine.Namespace,
170+
Reason: "failed to delete instances",
171+
})
172+
return fmt.Errorf("failed to delete instaces: %w", err)
173+
}
152174

153-
terminatingInstances, err := terminateInstances(r.awsClient, existingInstances)
154-
if err != nil {
155-
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
156-
Name: r.machine.Name,
157-
Namespace: r.machine.Namespace,
158-
Reason: "failed to delete instances",
159-
})
160-
return fmt.Errorf("failed to delete instaces: %w", err)
161175
}
162176

163177
if r.machine.Annotations == nil {
@@ -168,6 +182,8 @@ func (r *Reconciler) delete() error {
168182
if terminatingInstances[0] != nil && terminatingInstances[0].CurrentState != nil && terminatingInstances[0].CurrentState.Name != nil {
169183
r.machine.Annotations[machinecontroller.MachineInstanceStateAnnotationName] = aws.StringValue(terminatingInstances[0].CurrentState.Name)
170184
}
185+
} else if isTerminated {
186+
r.machine.Annotations[machinecontroller.MachineInstanceStateAnnotationName] = ec2.InstanceStateNameTerminated
171187
}
172188

173189
klog.Infof("Deleted machine %v", r.machine.Name)
@@ -183,7 +199,7 @@ func (r *Reconciler) update() error {
183199
return fmt.Errorf("%v: failed validating machine provider spec: %v", r.machine.GetName(), err)
184200
}
185201

186-
// Get all instances not terminated.
202+
// Get all instances
187203
existingInstances, err := r.getMachineInstances()
188204
if err != nil {
189205
metrics.RegisterFailedInstanceUpdate(&metrics.MachineLabels{
@@ -297,6 +313,13 @@ func (r *Reconciler) exists() (bool, error) {
297313
return false, nil
298314
}
299315

316+
if r.checkIfInstanceTerminated(existingInstances[0]) {
317+
// The instance exists, but is in a terminated state.
318+
// For the purposes of exists, this machine should not be considered to exist.
319+
// If the machine is already provisioned, it will go failed.
320+
return false, nil
321+
}
322+
300323
return existingInstances[0] != nil, err
301324
}
302325

@@ -472,6 +495,18 @@ func (r *Reconciler) requeueIfInstancePending(instance *ec2.Instance) error {
472495
return nil
473496
}
474497

498+
// Check if an instance is terminated so that we can handle it appropriately
499+
func (r *Reconciler) checkIfInstanceTerminated(instance *ec2.Instance) bool {
500+
501+
if instance != nil && instance.State != nil &&
502+
aws.StringValue(instance.State.Name) == ec2.InstanceStateNameTerminated {
503+
klog.Infof("%s: Instance state terminated", r.machine.Name)
504+
return true
505+
}
506+
507+
return false
508+
}
509+
475510
func (r *Reconciler) getMachineInstances() ([]*ec2.Instance, error) {
476511
// If there is a non-empty instance ID, search using that, otherwise
477512
// fallback to filtering based on tags.

pkg/actuators/machine/reconciler_test.go

+39-15
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,25 @@ func TestExists(t *testing.T) {
636636
return mockAWSClient
637637
},
638638
},
639+
{
640+
name: "Successfully find terminated instance",
641+
machine: func() *machinev1beta1.Machine {
642+
machine, err := stubMachine()
643+
if err != nil {
644+
t.Fatalf("unable to build stub machine: %v", err)
645+
}
646+
647+
return machine
648+
},
649+
existsResult: false,
650+
expectedError: nil,
651+
awsClient: func(ctrl *gomock.Controller) awsclient.Client {
652+
mockCtrl := gomock.NewController(t)
653+
mockAWSClient := mockaws.NewMockClient(mockCtrl)
654+
mockAWSClient.EXPECT().DescribeInstances(gomock.Any()).Return(stubDescribeInstancesOutput("test-ami", "test-id", ec2.InstanceStateNameTerminated, "1.1.1.1"), nil).AnyTimes()
655+
return mockAWSClient
656+
},
657+
},
639658
}
640659

641660
for _, tc := range testCases {
@@ -900,6 +919,24 @@ func TestDelete(t *testing.T) {
900919
return mockAWSClient
901920
},
902921
},
922+
{
923+
name: "Does not try to delete an instance if it's already in a terminated state",
924+
machine: func() *machinev1beta1.Machine {
925+
machine, err := stubMachine()
926+
if err != nil {
927+
t.Fatalf("unable to build stub machine: %v", err)
928+
}
929+
930+
return machine
931+
},
932+
expectedError: nil,
933+
awsClient: func(ctrl *gomock.Controller) awsclient.Client {
934+
mockCtrl := gomock.NewController(t)
935+
mockAWSClient := mockaws.NewMockClient(mockCtrl)
936+
mockAWSClient.EXPECT().DescribeInstances(gomock.Any()).Return(stubDescribeInstancesOutput("test-ami", "test-id", ec2.InstanceStateNameTerminated, "1.1.1.1"), nil).Times(1)
937+
return mockAWSClient
938+
},
939+
},
903940
}
904941

905942
for _, tc := range testCases {
@@ -1005,29 +1042,16 @@ func TestGetMachineInstances(t *testing.T) {
10051042
awsClientFunc: func(ctrl *gomock.Controller) awsclient.Client {
10061043
mockAWSClient := mockaws.NewMockClient(ctrl)
10071044

1008-
first := mockAWSClient.EXPECT().DescribeInstances(&ec2.DescribeInstancesInput{
1045+
mockAWSClient.EXPECT().DescribeInstances(&ec2.DescribeInstancesInput{
10091046
InstanceIds: aws.StringSlice([]string{instanceID}),
10101047
}).Return(
10111048
stubDescribeInstancesOutput(imageID, instanceID, ec2.InstanceStateNameTerminated, "192.168.0.10"),
10121049
nil,
10131050
).Times(1)
10141051

1015-
mockAWSClient.EXPECT().DescribeInstances(&ec2.DescribeInstancesInput{
1016-
Filters: []*ec2.Filter{
1017-
{
1018-
Name: awsTagFilter("Name"),
1019-
Values: aws.StringSlice([]string{machine.Name}),
1020-
},
1021-
1022-
clusterFilter(clusterID),
1023-
},
1024-
}).Return(
1025-
stubDescribeInstancesOutput(imageID, instanceID, ec2.InstanceStateNameTerminated, "192.168.0.10"),
1026-
nil,
1027-
).Times(1).After(first)
1028-
10291052
return mockAWSClient
10301053
},
1054+
exists: true,
10311055
},
10321056
}
10331057

pkg/actuators/machine/utils.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ import (
3939
const upstreamMachineClusterIDLabel = "sigs.k8s.io/cluster-api-cluster"
4040

4141
// existingInstanceStates returns the list of states an EC2 instance can be in
42-
// while being considered "existing", i.e. mostly anything but "Terminated".
42+
// while being considered "existing"
43+
// terminated is also handled so that an instance can't get stuck in terminated
4344
func existingInstanceStates() []*string {
4445
return []*string{
4546
aws.String(ec2.InstanceStateNameRunning),
4647
aws.String(ec2.InstanceStateNamePending),
4748
aws.String(ec2.InstanceStateNameStopped),
4849
aws.String(ec2.InstanceStateNameStopping),
4950
aws.String(ec2.InstanceStateNameShuttingDown),
51+
aws.String(ec2.InstanceStateNameTerminated),
5052
}
5153
}
5254

@@ -68,7 +70,7 @@ func getStoppedInstances(machine *machinev1beta1.Machine, client awsclient.Clien
6870
return getInstances(machine, client, stoppedInstanceStateFilter)
6971
}
7072

71-
// getExistingInstances returns all instances not terminated
73+
// getExistingInstances returns all instances
7274
func getExistingInstances(machine *machinev1beta1.Machine, client awsclient.Client) ([]*ec2.Instance, error) {
7375
return getInstances(machine, client, existingInstanceStates())
7476
}

0 commit comments

Comments
 (0)