@@ -42,8 +42,16 @@ func (r *Reconciler) create() error {
42
42
43
43
if instances , err := r .getMachineInstances (); err == nil && len (instances ) > 0 {
44
44
klog .Infof ("%s: found existing instance %s for machine" , r .machine .Name , aws .StringValue (instances [0 ].InstanceId ))
45
+
46
+ if r .checkIfInstanceTerminated (instances [0 ]) {
47
+ // The instance exists, but is in a terminated state.
48
+ // This means the instance was terminated prior to MAPI realizing it existed.
49
+ // We should fail the machine in this scenario; otherwise we would end up looping forever.
50
+ return machinecontroller .InvalidMachineConfiguration ("Instance %s is in a terminated state" , aws .StringValue (instances [0 ].InstanceId ))
51
+ }
52
+
45
53
// If we got here, then Exists failed to find the instance, and we were asked to create a new instance.
46
- // The instance already exists, so requeue and start the reconcile again, Exists should pass now.
54
+ // The instance already exists, and isn't terminated, so requeue and start the reconcile again, Exists should pass now.
47
55
// Don't bother updating the status, Update will configure everything on the next reconcile.
48
56
return fmt .Errorf ("%s: Possible eventual-consistency discrepancy; returning an error to requeue" , r .machine .Name )
49
57
}
@@ -122,7 +130,7 @@ func (r *Reconciler) create() error {
122
130
func (r * Reconciler ) delete () error {
123
131
klog .Infof ("%s: deleting machine" , r .machine .Name )
124
132
125
- // Get all instances not terminated.
133
+ // Get all instances (including terminated, so that we can handle a terminated state)
126
134
existingInstances , err := r .getMachineInstances ()
127
135
if err != nil {
128
136
metrics .RegisterFailedInstanceDelete (& metrics.MachineLabels {
@@ -141,23 +149,29 @@ func (r *Reconciler) delete() error {
141
149
return nil
142
150
}
143
151
144
- if err = r .removeFromLoadBalancers (existingInstances ); err != nil {
145
- metrics .RegisterFailedInstanceDelete (& metrics.MachineLabels {
146
- Name : r .machine .Name ,
147
- Namespace : r .machine .Namespace ,
148
- Reason : "failed to remove instance from load balancers" ,
149
- })
150
- return fmt .Errorf ("failed to remove instance from load balancers: %w" , err )
151
- }
152
+ isTerminated := r .checkIfInstanceTerminated (existingInstances [0 ])
153
+ var terminatingInstances []* ec2.InstanceStateChange
154
+
155
+ if ! isTerminated {
156
+ if err := r .removeFromLoadBalancers (existingInstances ); err != nil {
157
+ metrics .RegisterFailedInstanceDelete (& metrics.MachineLabels {
158
+ Name : r .machine .Name ,
159
+ Namespace : r .machine .Namespace ,
160
+ Reason : "failed to remove instance from load balancers" ,
161
+ })
162
+ return fmt .Errorf ("failed to remove instance from load balancers: %w" , err )
163
+ }
164
+
165
+ terminatingInstances , err = terminateInstances (r .awsClient , existingInstances )
166
+ if err != nil {
167
+ metrics .RegisterFailedInstanceDelete (& metrics.MachineLabels {
168
+ Name : r .machine .Name ,
169
+ Namespace : r .machine .Namespace ,
170
+ Reason : "failed to delete instances" ,
171
+ })
172
+ return fmt .Errorf ("failed to delete instaces: %w" , err )
173
+ }
152
174
153
- terminatingInstances , err := terminateInstances (r .awsClient , existingInstances )
154
- if err != nil {
155
- metrics .RegisterFailedInstanceDelete (& metrics.MachineLabels {
156
- Name : r .machine .Name ,
157
- Namespace : r .machine .Namespace ,
158
- Reason : "failed to delete instances" ,
159
- })
160
- return fmt .Errorf ("failed to delete instaces: %w" , err )
161
175
}
162
176
163
177
if r .machine .Annotations == nil {
@@ -168,6 +182,8 @@ func (r *Reconciler) delete() error {
168
182
if terminatingInstances [0 ] != nil && terminatingInstances [0 ].CurrentState != nil && terminatingInstances [0 ].CurrentState .Name != nil {
169
183
r .machine .Annotations [machinecontroller .MachineInstanceStateAnnotationName ] = aws .StringValue (terminatingInstances [0 ].CurrentState .Name )
170
184
}
185
+ } else if isTerminated {
186
+ r .machine .Annotations [machinecontroller .MachineInstanceStateAnnotationName ] = ec2 .InstanceStateNameTerminated
171
187
}
172
188
173
189
klog .Infof ("Deleted machine %v" , r .machine .Name )
@@ -183,7 +199,7 @@ func (r *Reconciler) update() error {
183
199
return fmt .Errorf ("%v: failed validating machine provider spec: %v" , r .machine .GetName (), err )
184
200
}
185
201
186
- // Get all instances not terminated.
202
+ // Get all instances
187
203
existingInstances , err := r .getMachineInstances ()
188
204
if err != nil {
189
205
metrics .RegisterFailedInstanceUpdate (& metrics.MachineLabels {
@@ -297,6 +313,13 @@ func (r *Reconciler) exists() (bool, error) {
297
313
return false , nil
298
314
}
299
315
316
+ if r .checkIfInstanceTerminated (existingInstances [0 ]) {
317
+ // The instance exists, but is in a terminated state.
318
+ // For the purposes of exists, this machine should not be considered to exist.
319
+ // If the machine is already provisioned, it will go failed.
320
+ return false , nil
321
+ }
322
+
300
323
return existingInstances [0 ] != nil , err
301
324
}
302
325
@@ -472,6 +495,18 @@ func (r *Reconciler) requeueIfInstancePending(instance *ec2.Instance) error {
472
495
return nil
473
496
}
474
497
498
+ // Check if an instance is terminated so that we can handle it appropriately
499
+ func (r * Reconciler ) checkIfInstanceTerminated (instance * ec2.Instance ) bool {
500
+
501
+ if instance != nil && instance .State != nil &&
502
+ aws .StringValue (instance .State .Name ) == ec2 .InstanceStateNameTerminated {
503
+ klog .Infof ("%s: Instance state terminated" , r .machine .Name )
504
+ return true
505
+ }
506
+
507
+ return false
508
+ }
509
+
475
510
func (r * Reconciler ) getMachineInstances () ([]* ec2.Instance , error ) {
476
511
// If there is a non-empty instance ID, search using that, otherwise
477
512
// fallback to filtering based on tags.
0 commit comments