Skip to content

Commit 83a836d

Browse files
Lu-Davidpete911Lu-David
authored
Imds asg lifecycle support (#1067)
* add imds asg target lifecycle hook * update comment on target lifecycle 404 response * drafted test * added imds support * cleaned up tests * cleaned comments and unit-test * updated aemm mock version * eks test update AEMM version --------- Co-authored-by: pete911 <[email protected]> Co-authored-by: Lu-David <[email protected]>
1 parent bc75e65 commit 83a836d

File tree

15 files changed

+601
-3
lines changed

15 files changed

+601
-3
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ When using the EC2 Console or EC2 API to terminate the instance, a state-change
8383
| Spot Instance Termination Notifications (ITN) |||
8484
| Scheduled Events |||
8585
| Instance Rebalance Recommendation |||
86+
| ASG Termination Lifecycle Hooks |||
8687
| AZ Rebalance Recommendation |||
87-
| ASG Termination Lifecycle Hooks |||
8888
| Instance State Change Events |||
8989

9090
### Kubernetes Compatibility

cmd/node-termination-handler.go

+6
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package main
1616
import (
1717
"context"
1818
"fmt"
19+
"github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle"
1920
"os"
2021
"os/signal"
2122
"strings"
@@ -53,6 +54,7 @@ import (
5354
const (
5455
scheduledMaintenance = "Scheduled Maintenance"
5556
spotITN = "Spot ITN"
57+
asgLifecycle = "ASG Lifecycle"
5658
rebalanceRecommendation = "Rebalance Recommendation"
5759
sqsEvents = "SQS Event"
5860
timeFormat = "2006/01/02 15:04:05"
@@ -188,6 +190,10 @@ func main() {
188190
imdsSpotMonitor := spotitn.NewSpotInterruptionMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
189191
monitoringFns[spotITN] = imdsSpotMonitor
190192
}
193+
if nthConfig.EnableASGLifecycleDraining {
194+
asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
195+
monitoringFns[asgLifecycle] = asgLifecycleMonitor
196+
}
191197
if nthConfig.EnableScheduledEventDraining {
192198
imdsScheduledEventMonitor := scheduledevent.NewScheduledEventMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
193199
monitoringFns[scheduledMaintenance] = imdsScheduledEventMonitor

config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ spec:
143143
{{- end }}
144144
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
145145
value: {{ .Values.enableSpotInterruptionDraining | quote }}
146+
- name: ENABLE_ASG_LIFECYCLE_DRAINING
147+
value: {{ .Values.enableASGLifecycleDraining | quote }}
146148
- name: ENABLE_SCHEDULED_EVENT_DRAINING
147149
value: {{ .Values.enableScheduledEventDraining | quote }}
148150
- name: ENABLE_REBALANCE_MONITORING

config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ spec:
143143
{{- end }}
144144
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
145145
value: {{ .Values.enableSpotInterruptionDraining | quote }}
146+
- name: ENABLE_ASG_LIFECYCLE_DRAINING
147+
value: {{ .Values.enableASGLifecycleDraining | quote }}
146148
- name: ENABLE_SCHEDULED_EVENT_DRAINING
147149
value: {{ .Values.enableScheduledEventDraining | quote }}
148150
- name: ENABLE_REBALANCE_MONITORING

config/helm/aws-node-termination-handler/values.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,9 @@ metadataTries: 3
270270
# enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received. Only used in IMDS mode.
271271
enableSpotInterruptionDraining: true
272272

273+
# enableASGLifecycleDraining If false, do not drain nodes when ASG target lifecycle state Terminated is received. Only used in IMDS mode.
274+
enableASGLifecycleDraining: true
275+
273276
# enableScheduledEventDraining If false, do not drain nodes before the maintenance window starts for an EC2 instance scheduled event. Only used in IMDS mode.
274277
enableScheduledEventDraining: true
275278

pkg/config/config.go

+4
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ const (
5353
enableScheduledEventDrainingDefault = true
5454
enableSpotInterruptionDrainingConfigKey = "ENABLE_SPOT_INTERRUPTION_DRAINING"
5555
enableSpotInterruptionDrainingDefault = true
56+
enableASGLifecycleDrainingConfigKey = "ENABLE_ASG_LIFECYCLE_DRAINING"
57+
enableASGLifecycleDrainingDefault = true
5658
enableSQSTerminationDrainingConfigKey = "ENABLE_SQS_TERMINATION_DRAINING"
5759
enableSQSTerminationDrainingDefault = false
5860
enableRebalanceMonitoringConfigKey = "ENABLE_REBALANCE_MONITORING"
@@ -132,6 +134,7 @@ type Config struct {
132134
WebhookProxy string
133135
EnableScheduledEventDraining bool
134136
EnableSpotInterruptionDraining bool
137+
EnableASGLifecycleDraining bool
135138
EnableSQSTerminationDraining bool
136139
EnableRebalanceMonitoring bool
137140
EnableRebalanceDraining bool
@@ -195,6 +198,7 @@ func ParseCliArgs() (config Config, err error) {
195198
flag.StringVar(&config.WebhookTemplateFile, "webhook-template-file", getEnv(webhookTemplateFileConfigKey, ""), "If specified, replaces the default webhook message template with content from template file.")
196199
flag.BoolVar(&config.EnableScheduledEventDraining, "enable-scheduled-event-draining", getBoolEnv(enableScheduledEventDrainingConfigKey, enableScheduledEventDrainingDefault), "If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event")
197200
flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received")
201+
flag.BoolVar(&config.EnableASGLifecycleDraining, "enable-asg-lifecycle-draining", getBoolEnv(enableASGLifecycleDrainingConfigKey, enableASGLifecycleDrainingDefault), "If true, drain nodes when the ASG target lifecyle state is Terminated is received")
198202
flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received")
199203
flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".")
200204
flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received")

pkg/config/config_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) {
3636
t.Setenv("DRY_RUN", "true")
3737
t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "true")
3838
t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "false")
39+
t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "false")
3940
t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false")
4041
t.Setenv("ENABLE_REBALANCE_MONITORING", "true")
4142
t.Setenv("ENABLE_REBALANCE_DRAINING", "true")
@@ -62,6 +63,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) {
6263
h.Equals(t, true, nthConfig.DryRun)
6364
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
6465
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
66+
h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
6567
h.Equals(t, false, nthConfig.EnableSQSTerminationDraining)
6668
h.Equals(t, true, nthConfig.EnableRebalanceMonitoring)
6769
h.Equals(t, true, nthConfig.EnableRebalanceDraining)
@@ -98,6 +100,7 @@ func TestParseCliArgsSuccess(t *testing.T) {
98100
"--dry-run=true",
99101
"--enable-scheduled-event-draining=true",
100102
"--enable-spot-interruption-draining=false",
103+
"--enable-asg-lifecycle-draining=false",
101104
"--enable-sqs-termination-draining=false",
102105
"--enable-rebalance-monitoring=true",
103106
"--enable-rebalance-draining=true",
@@ -124,6 +127,7 @@ func TestParseCliArgsSuccess(t *testing.T) {
124127
h.Equals(t, true, nthConfig.DryRun)
125128
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
126129
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
130+
h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
127131
h.Equals(t, false, nthConfig.EnableSQSTerminationDraining)
128132
h.Equals(t, true, nthConfig.EnableRebalanceMonitoring)
129133
h.Equals(t, true, nthConfig.EnableRebalanceDraining)
@@ -155,6 +159,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
155159
t.Setenv("DRY_RUN", "false")
156160
t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "false")
157161
t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "true")
162+
t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "true")
158163
t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false")
159164
t.Setenv("ENABLE_REBALANCE_MONITORING", "true")
160165
t.Setenv("ENABLE_REBALANCE_DRAINING", "true")
@@ -178,6 +183,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
178183
"--dry-run=true",
179184
"--enable-scheduled-event-draining=true",
180185
"--enable-spot-interruption-draining=false",
186+
"--enable-asg-lifecycle-draining=false",
181187
"--enable-sqs-termination-draining=true",
182188
"--enable-rebalance-monitoring=false",
183189
"--enable-rebalance-draining=false",
@@ -205,6 +211,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
205211
h.Equals(t, true, nthConfig.DryRun)
206212
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
207213
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
214+
h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
208215
h.Equals(t, true, nthConfig.EnableSQSTerminationDraining)
209216
h.Equals(t, false, nthConfig.EnableRebalanceMonitoring)
210217
h.Equals(t, false, nthConfig.EnableRebalanceDraining)

pkg/ec2metadata/ec2metadata.go

+24
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ import (
3030
const (
3131
// SpotInstanceActionPath is the context path to spot/instance-action within IMDS
3232
SpotInstanceActionPath = "/latest/meta-data/spot/instance-action"
33+
// ASGTargetLifecycleStatePath path to autoscaling target lifecycle state within IMDS
34+
ASGTargetLifecycleStatePath = "/latest/meta-data/autoscaling/target-lifecycle-state"
3335
// ScheduledEventPath is the context path to events/maintenance/scheduled within IMDS
3436
ScheduledEventPath = "/latest/meta-data/events/maintenance/scheduled"
3537
// RebalanceRecommendationPath is the context path to events/recommendations/rebalance within IMDS
@@ -193,6 +195,28 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco
193195
return rebalanceRec, nil
194196
}
195197

198+
// GetASGTargetLifecycleState retrieves ASG target lifecycle state from imds. State can be empty string
199+
// if the lifecycle hook is not present on the ASG
200+
func (e *Service) GetASGTargetLifecycleState() (state string, err error) {
201+
resp, err := e.Request(ASGTargetLifecycleStatePath)
202+
// 404s should not happen, but there can be a case if the instance is brand new and the field is not populated yet
203+
if resp != nil && resp.StatusCode == 404 {
204+
return "", nil
205+
} else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) {
206+
return "", fmt.Errorf("Metadata request received http status code: %d", resp.StatusCode)
207+
}
208+
if err != nil {
209+
return "", fmt.Errorf("Unable to parse metadata response: %w", err)
210+
}
211+
defer resp.Body.Close()
212+
213+
body, err := io.ReadAll(resp.Body)
214+
if err != nil {
215+
return "", fmt.Errorf("Unable to parse http response. Status code: %d. %w", resp.StatusCode, err)
216+
}
217+
return string(body), nil
218+
}
219+
196220
// GetMetadataInfo generic function for retrieving ec2 metadata
197221
func (e *Service) GetMetadataInfo(path string) (info string, err error) {
198222
metadataInfo := ""

pkg/ec2metadata/ec2metadata_test.go

+85
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,91 @@ func TestGetRebalanceRecommendationEventRequestFailure(t *testing.T) {
504504
h.Assert(t, err != nil, "error expected because no server should be running")
505505
}
506506

507+
func TestGetASGTargetLifecycleStateSuccess(t *testing.T) {
508+
requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
509+
510+
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
511+
rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
512+
if req.URL.String() == "/latest/api/token" {
513+
rw.WriteHeader(200)
514+
_, err := rw.Write([]byte(`token`))
515+
h.Ok(t, err)
516+
return
517+
}
518+
h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
519+
h.Equals(t, req.URL.String(), requestPath)
520+
_, err := rw.Write([]byte("InService"))
521+
h.Ok(t, err)
522+
}))
523+
defer server.Close()
524+
525+
expectedState := "InService"
526+
527+
// Use URL from our local test server
528+
imds := ec2metadata.New(server.URL, 1)
529+
530+
state, err := imds.GetASGTargetLifecycleState()
531+
h.Ok(t, err)
532+
h.Equals(t, expectedState, state)
533+
}
534+
535+
func TestGetASGTargetLifecycleState404Success(t *testing.T) {
536+
requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
537+
538+
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
539+
rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
540+
if req.URL.String() == "/latest/api/token" {
541+
rw.WriteHeader(200)
542+
_, err := rw.Write([]byte(`token`))
543+
h.Ok(t, err)
544+
return
545+
}
546+
h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
547+
h.Equals(t, req.URL.String(), requestPath)
548+
rw.WriteHeader(404)
549+
}))
550+
defer server.Close()
551+
552+
// Use URL from our local test server
553+
imds := ec2metadata.New(server.URL, 1)
554+
555+
state, err := imds.GetASGTargetLifecycleState()
556+
h.Ok(t, err)
557+
h.Assert(t, state == "", "ASG target lifecycle state should be empty")
558+
}
559+
560+
func TestGetASGTargetLifecycleState500Failure(t *testing.T) {
561+
requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
562+
563+
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
564+
rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
565+
if req.URL.String() == "/latest/api/token" {
566+
rw.WriteHeader(200)
567+
_, err := rw.Write([]byte(`token`))
568+
h.Ok(t, err)
569+
return
570+
}
571+
h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
572+
h.Equals(t, req.URL.String(), requestPath)
573+
rw.WriteHeader(500)
574+
}))
575+
defer server.Close()
576+
577+
// Use URL from our local test server
578+
imds := ec2metadata.New(server.URL, 1)
579+
580+
_, err := imds.GetASGTargetLifecycleState()
581+
h.Assert(t, err != nil, "error expected on non-200 or non-404 status code")
582+
}
583+
584+
func TestGetASGTargetLifecycleStateRequestFailure(t *testing.T) {
585+
// Use URL from our local test server
586+
imds := ec2metadata.New("/some-path-that-will-error", 1)
587+
588+
_, err := imds.GetASGTargetLifecycleState()
589+
h.Assert(t, err != nil, "error expected because no server should be running")
590+
}
591+
507592
func TestGetMetadataServiceRequest404(t *testing.T) {
508593
var requestPath string = "/latest/meta-data/instance-type"
509594

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License"). You may
4+
// not use this file except in compliance with the License. A copy of the
5+
// License is located at
6+
//
7+
// http://aws.amazon.com/apache2.0/
8+
//
9+
// or in the "license" file accompanying this file. This file is distributed
10+
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
// express or implied. See the License for the specific language governing
12+
// permissions and limitations under the License.
13+
14+
package asglifecycle
15+
16+
import (
17+
"crypto/sha256"
18+
"fmt"
19+
"time"
20+
21+
"github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
22+
"github.com/aws/aws-node-termination-handler/pkg/monitor"
23+
"github.com/aws/aws-node-termination-handler/pkg/node"
24+
)
25+
26+
// ASGLifecycleMonitorKind is a const to define this monitor kind
27+
const ASGLifecycleMonitorKind = "ASG_LIFECYCLE_MONITOR"
28+
29+
// ASGLifecycleMonitor is a struct definition which facilitates monitoring of ASG target lifecycle state from IMDS
30+
type ASGLifecycleMonitor struct {
31+
IMDS *ec2metadata.Service
32+
InterruptionChan chan<- monitor.InterruptionEvent
33+
CancelChan chan<- monitor.InterruptionEvent
34+
NodeName string
35+
}
36+
37+
// NewASGLifecycleMonitor creates an instance of a ASG lifecycle IMDS monitor
38+
func NewASGLifecycleMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) ASGLifecycleMonitor {
39+
return ASGLifecycleMonitor{
40+
IMDS: imds,
41+
InterruptionChan: interruptionChan,
42+
CancelChan: cancelChan,
43+
NodeName: nodeName,
44+
}
45+
}
46+
47+
// Monitor continuously monitors metadata for ASG target lifecycle state and sends interruption events to the passed in channel
48+
func (m ASGLifecycleMonitor) Monitor() error {
49+
interruptionEvent, err := m.checkForASGTargetLifecycleStateNotice()
50+
if err != nil {
51+
return err
52+
}
53+
if interruptionEvent != nil && interruptionEvent.Kind == monitor.ASGLifecycleKind {
54+
m.InterruptionChan <- *interruptionEvent
55+
}
56+
return nil
57+
}
58+
59+
// Kind denotes the kind of monitor
60+
func (m ASGLifecycleMonitor) Kind() string {
61+
return ASGLifecycleMonitorKind
62+
}
63+
64+
// checkForASGTargetLifecycleStateNotice Checks EC2 instance metadata for a asg lifecycle termination notice
65+
func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) {
66+
state, err := m.IMDS.GetASGTargetLifecycleState()
67+
if err != nil {
68+
return nil, fmt.Errorf("There was a problem checking for ASG target lifecycle state: %w", err)
69+
}
70+
if state != "Terminated" {
71+
// if the state is not "Terminated", we can skip. State can also be empty (no hook configured).
72+
return nil, nil
73+
}
74+
75+
nodeName := m.NodeName
76+
// there is no time in the response, we just set time to the latest check
77+
interruptionTime := time.Now()
78+
79+
// There's no EventID returned, so we'll create it using a hash to prevent duplicates.
80+
hash := sha256.New()
81+
if _, err = hash.Write([]byte(fmt.Sprintf("%s:%s", state, interruptionTime))); err != nil {
82+
return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err)
83+
}
84+
85+
return &monitor.InterruptionEvent{
86+
EventID: fmt.Sprintf("target-lifecycle-state-terminated-%x", hash.Sum(nil)),
87+
Kind: monitor.ASGLifecycleKind,
88+
Monitor: ASGLifecycleMonitorKind,
89+
StartTime: interruptionTime,
90+
NodeName: nodeName,
91+
Description: "AST target lifecycle state received. Instance will be terminated\n",
92+
PreDrainTask: setInterruptionTaint,
93+
}, nil
94+
}
95+
96+
func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
97+
err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID)
98+
if err != nil {
99+
return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID, err)
100+
}
101+
102+
return nil
103+
}

0 commit comments

Comments
 (0)