Skip to content

Commit 96cbbf1

Browse files
sliaptsouYauheni Sliaptsou
and
Yauheni Sliaptsou
authored
Implement Liveness probe (#389)
Co-authored-by: Yauheni Sliaptsou <[email protected]>
1 parent 2d1d8bc commit 96cbbf1

File tree

9 files changed

+144
-0
lines changed

9 files changed

+144
-0
lines changed

Diff for: cmd/node-termination-handler.go

+6
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,12 @@ func main() {
9393
log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
9494
}
9595

96+
err = observability.InitProbes(nthConfig.EnableProbes, nthConfig.ProbesPort, nthConfig.ProbesEndpoint)
97+
if err != nil {
98+
nthConfig.Print()
99+
log.Fatal().Err(err).Msg("Unable to instantiate probes service,")
100+
}
101+
96102
imds := ec2metadata.New(nthConfig.MetadataURL, nthConfig.MetadataTries)
97103

98104
interruptionEventStore := interruptioneventstore.New(nthConfig)

Diff for: config/helm/aws-node-termination-handler/README.md

+3
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ Parameter | Description | Default
7474
`logLevel` | Sets the log level (INFO, DEBUG, or ERROR) | `INFO`
7575
`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false`
7676
`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092`
77+
`enableProbesServer` | If true, start an http server exposing `/healthz` endpoint for probes. | `false`
78+
`probesServerPort` | Replaces the default HTTP port for exposing the probes endpoint. | `8080`
79+
`probesServerEndpoint` | Replaces the default endpoint path used for exposing probes. | `/healthz`
7780
`podMonitor.create` | if `true`, create a PodMonitor | `false`
7881
`podMonitor.interval` | Prometheus scrape interval | `30s`
7982
`podMonitor.sampleLimit` | Number of scraped samples accepted | `5000`

Diff for: config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

+13
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,12 @@ spec:
166166
value: {{ .Values.enablePrometheusServer | quote }}
167167
- name: PROMETHEUS_SERVER_PORT
168168
value: {{ .Values.prometheusServerPort | quote }}
169+
- name: ENABLE_PROBES_SERVER
170+
value: {{ .Values.enableProbesServer | quote }}
171+
- name: PROBES_SERVER_PORT
172+
value: {{ .Values.probesServerPort | quote }}
173+
- name: PROBES_SERVER_ENDPOINT
174+
value: {{ .Values.probesServerEndpoint | quote }}
169175
resources:
170176
{{- toYaml .Values.resources | nindent 12 }}
171177
{{- if .Values.enablePrometheusServer }}
@@ -175,6 +181,13 @@ spec:
175181
name: http-metrics
176182
protocol: TCP
177183
{{- end }}
184+
{{- if .Values.enableProbesServer }}
185+
ports:
186+
- containerPort: {{ .Values.probesServerPort }}
187+
hostPort: {{ .Values.probesServerPort }}
188+
name: liveness-probe
189+
protocol: TCP
190+
{{- end }}
178191
nodeSelector:
179192
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux
180193
{{- with .Values.nodeSelector }}

Diff for: config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

+13
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ spec:
140140
value: {{ .Values.enablePrometheusServer | quote }}
141141
- name: PROMETHEUS_SERVER_PORT
142142
value: {{ .Values.prometheusServerPort | quote }}
143+
- name: ENABLE_PROBES_SERVER
144+
value: {{ .Values.enableProbesServer | quote }}
145+
- name: PROBES_SERVER_PORT
146+
value: {{ .Values.probesServerPort | quote }}
147+
- name: PROBES_SERVER_ENDPOINT
148+
value: {{ .Values.probesServerEndpoint | quote }}
143149
resources:
144150
{{- toYaml .Values.resources | nindent 12 }}
145151
{{- if .Values.enablePrometheusServer }}
@@ -149,6 +155,13 @@ spec:
149155
name: http-metrics
150156
protocol: TCP
151157
{{- end }}
158+
{{- if .Values.enableProbesServer }}
159+
ports:
160+
- containerPort: {{ .Values.probesServerPort }}
161+
hostPort: {{ .Values.probesServerPort }}
162+
name: liveness-probe
163+
protocol: TCP
164+
{{- end }}
152165
nodeSelector:
153166
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows
154167
{{- with .Values.nodeSelector }}

Diff for: config/helm/aws-node-termination-handler/templates/deployment.yaml

+13
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ spec:
118118
value: {{ .Values.webhookProxy | quote }}
119119
- name: ENABLE_PROMETHEUS_SERVER
120120
value: {{ .Values.enablePrometheusServer | quote }}
121+
- name: ENABLE_PROBES_SERVER
122+
value: {{ .Values.enableProbesServer | quote }}
121123
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
122124
value: "false"
123125
- name: ENABLE_SCHEDULED_EVENT_DRAINING
@@ -130,6 +132,10 @@ spec:
130132
value: {{ .Values.queueURL | quote }}
131133
- name: PROMETHEUS_SERVER_PORT
132134
value: {{ .Values.prometheusServerPort | quote }}
135+
- name: PROBES_SERVER_PORT
136+
value: {{ .Values.probesServerPort | quote }}
137+
- name: PROBES_SERVER_ENDPOINT
138+
value: {{ .Values.probesServerEndpoint | quote }}
133139
- name: AWS_REGION
134140
value: {{ .Values.awsRegion | quote }}
135141
- name: AWS_ENDPOINT
@@ -155,6 +161,13 @@ spec:
155161
name: http-metrics
156162
protocol: TCP
157163
{{- end }}
164+
{{- if .Values.enableProbesServer }}
165+
ports:
166+
- containerPort: {{ .Values.probesServerPort }}
167+
hostPort: {{ .Values.probesServerPort }}
168+
name: liveness-probe
169+
protocol: TCP
170+
{{- end }}
158171
nodeSelector:
159172
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux
160173
{{- with .Values.nodeSelector }}

Diff for: config/helm/aws-node-termination-handler/values.yaml

+12
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ podLabels: {}
2525
linuxPodLabels: {}
2626
windowsPodLabels: {}
2727

28+
# liveness probe settings.
29+
probes:
30+
httpGet:
31+
path: /healthz
32+
port: 8080
33+
initialDelaySeconds: 5
34+
periodSeconds: 5
35+
2836
resources:
2937
requests:
3038
memory: "64Mi"
@@ -144,6 +152,10 @@ nodeSelectorTermsArch: ""
144152
enablePrometheusServer: false
145153
prometheusServerPort: 9092
146154

155+
enableProbesServer: false
156+
probesServerPort: 8080
157+
probesServerEndpoint: "/healthz"
158+
147159
tolerations:
148160
- operator: "Exists"
149161

Diff for: pkg/config/config.go

+13
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,13 @@ const (
7878
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
7979
prometheusPortDefault = 9092
8080
prometheusPortConfigKey = "PROMETHEUS_SERVER_PORT"
81+
// probes
82+
enableProbesDefault = false
83+
enableProbesConfigKey = "ENABLE_PROBES_SERVER"
84+
probesPortDefault = 8080
85+
probesPortConfigKey = "PROBES_SERVER_PORT"
86+
probesEndpointDefault = "/healthz"
87+
probesEndpointConfigKey = "PROBES_SERVER_ENDPOINT"
8188
region = ""
8289
awsRegionConfigKey = "AWS_REGION"
8390
awsEndpointConfigKey = "AWS_ENDPOINT"
@@ -115,6 +122,9 @@ type Config struct {
115122
UptimeFromFile string
116123
EnablePrometheus bool
117124
PrometheusPort int
125+
EnableProbes bool
126+
ProbesPort int
127+
ProbesEndpoint string
118128
AWSRegion string
119129
AWSEndpoint string
120130
QueueURL string
@@ -162,6 +172,9 @@ func ParseCliArgs() (config Config, err error) {
162172
flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")
163173
flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.")
164174
flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.")
175+
flag.BoolVar(&config.EnableProbes, "enable-probes-server", getBoolEnv(enableProbesConfigKey, enableProbesDefault), "If true, a http server is used for exposing probes in /healthz endpoint.")
176+
flag.IntVar(&config.ProbesPort, "probes-server-port", getIntEnv(probesPortConfigKey, probesPortDefault), "The port for running the probes http server.")
177+
flag.StringVar(&config.ProbesEndpoint, "probes-server-endpoint", getEnv(probesEndpointConfigKey, probesEndpointDefault), "If specified, use this endpoint to make liveness probe")
165178
flag.StringVar(&config.AWSRegion, "aws-region", getEnv(awsRegionConfigKey, ""), "If specified, use the AWS region for AWS API calls")
166179
flag.StringVar(&config.AWSEndpoint, "aws-endpoint", getEnv(awsEndpointConfigKey, ""), "[testing] If specified, use the AWS endpoint to make API calls")
167180
flag.StringVar(&config.QueueURL, "queue-url", getEnv(queueURLConfigKey, ""), "Listens for messages on the specified SQS queue URL")

Diff for: pkg/observability/probes.go

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package observability
2+
3+
import (
	"fmt"
	"net"
	"net/http"
	"strconv"
	"time"

	"github.com/rs/zerolog/log"
)
11+
12+
// InitProbes will initialize, register and expose, via http server, the probes.
13+
func InitProbes(enabled bool, port int, endpoint string) error {
14+
if !enabled {
15+
return nil
16+
}
17+
18+
http.HandleFunc(endpoint, livenessHandler)
19+
20+
probes := &http.Server{
21+
Addr: net.JoinHostPort("", strconv.Itoa(port)),
22+
ReadTimeout: 1 * time.Second,
23+
WriteTimeout: 1 * time.Second,
24+
}
25+
26+
// Starts HTTP server exposing the probes path
27+
go func() {
28+
log.Info().Msgf("Starting to serve handler %s, port %d", endpoint, port)
29+
if err := probes.ListenAndServe(); err != nil && err != http.ErrServerClosed {
30+
log.Err(err).Msg("Failed to listen and serve http server")
31+
}
32+
}()
33+
34+
return nil
35+
}
36+
37+
// livenessHandler answers a liveness probe with HTTP 200 and the JSON body
// {"health":"OK"}. The request itself is not inspected.
func livenessHandler(w http.ResponseWriter, r *http.Request) {
	// Set rather than Add so the response carries exactly one Content-Type
	// value even if the header was already populated before this handler ran.
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	// The status line has already been committed, so a failed body write
	// cannot be reported back to the client; the error is deliberately
	// discarded.
	_, _ = w.Write([]byte(`{"health":"OK"}`))
}

Diff for: pkg/observability/probes_test.go

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package observability
2+
3+
import (
4+
"net/http"
5+
"net/http/httptest"
6+
"testing"
7+
)
8+
9+
func TestLivenessHandler(t *testing.T) {
10+
req := httptest.NewRequest("GET", "/healthz", nil)
11+
rr := httptest.NewRecorder()
12+
handler := http.HandlerFunc(livenessHandler)
13+
14+
handler.ServeHTTP(rr, req)
15+
16+
if contentType := rr.Header().Get("Content-Type"); contentType != "application/json" {
17+
t.Errorf("handler returned wrong status content type: got %v want %v",
18+
contentType, "application/json")
19+
}
20+
21+
if status := rr.Code; status != http.StatusOK {
22+
t.Errorf("handler returned wrong status code: got %v want %v",
23+
status, http.StatusOK)
24+
}
25+
26+
if body := rr.Body.String(); body != `{"health":"OK"}` {
27+
t.Errorf("handler returned wrong body: got %v want %v",
28+
body, http.StatusText(http.StatusOK))
29+
}
30+
}

0 commit comments

Comments
 (0)