@@ -46,26 +46,83 @@ spec:
         - containerPort: 8000
           name: http
           protocol: TCP
+        lifecycle:
+          preStop:
+            # vLLM stops accepting connections when it receives SIGTERM, so we sleep here
+            # to give upstream gateways a chance to take us out of rotation. The time we
+            # wait depends on how long it takes for all upstreams to completely remove us
+            # from rotation. Older or simpler load balancers might take upwards of 30s,
+            # but we expect our deployment to run behind a modern gateway like Envoy,
+            # which is designed to probe for readiness aggressively.
+            sleep:
+              # Upstream gateway health probes should be set to a low period, such as 5s;
+              # the tighter we can make that bound, the faster we release accelerators
+              # during controlled shutdowns.
+              seconds: 7
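
The preStop sleep only helps if the pod's termination grace period covers both the
sleep and the time needed to drain in-flight requests; otherwise the kubelet sends
SIGKILL with requests still in flight. A minimal sketch of that relationship, where
the 60s figure is an assumption and not part of this change:

    # Hypothetical pod spec fragment: grace period > preStop sleep (7s) + worst-case
    # time to drain the longest in-flight request.
    spec:
      terminationGracePeriodSeconds: 60
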
         livenessProbe:
-          failureThreshold: 240
           httpGet:
             path: /health
             port: http
             scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can probe it more aggressively. Liveness
+          # check endpoints should always be suitable for aggressive probing.
+          periodSeconds: 1
           successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant. However, a liveness-triggered restart requires the very
+          # large core model to be reloaded, so we should bias towards ensuring the
+          # server is definitely unhealthy rather than restarting immediately. Use 5
+          # consecutive failures as evidence of a serious problem.
+          failureThreshold: 5
           timeoutSeconds: 1
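
With periodSeconds: 1 and failureThreshold: 5, a hung server is restarted roughly
five to ten seconds after it stops answering, depending on whether each probe waits
out its 1s timeout. The /health endpoint can be exercised directly to sanity-check
the probe configuration; the pod name below is a placeholder:

    # Port-forward to a running vLLM pod and hit the same endpoint the kubelet probes.
    kubectl port-forward pod/<vllm-pod> 8000:8000 &
    curl -i http://localhost:8000/health   # expect HTTP 200 once the engine is up
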
         readinessProbe:
-          failureThreshold: 600
           httpGet:
             path: /health
             port: http
             scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can probe it more aggressively. Readiness
+          # check endpoints should always be suitable for aggressive probing, but may be
+          # slightly more expensive than liveness probes.
+          periodSeconds: 1
           successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant.
+          failureThreshold: 1
           timeoutSeconds: 1
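
With failureThreshold: 1 and periodSeconds: 1, a single failed probe marks the pod
unready and the endpoints controller withdraws it from the Service within a second
or two, which is what lets the upstream gateway stop sending traffic quickly. One
way to watch both transitions during a failure drill (the labels are assumptions
about how the Deployment and Service are named):

    # Watch the pod's ready condition and the EndpointSlice membership together.
    kubectl get pods -l app=vllm -w
    kubectl get endpointslices -l kubernetes.io/service-name=vllm -w
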
+        # We set a startup probe so that we don't begin directing traffic to this instance
+        # until the model is loaded.
+        startupProbe:
+          # The failure threshold is the point at which we believe startup will not happen
+          # at all, and is set to the maximum possible time we believe loading a model
+          # will take. In our default configuration we are downloading a model from
+          # HuggingFace, which may take a long time, and then the model must load into
+          # the accelerator. We choose 10 minutes as a reasonable maximum startup time
+          # before giving up and attempting to restart the pod.
+          #
+          # IMPORTANT: If the core model takes more than 10 minutes to load, pods will
+          # crash loop.
+          failureThreshold: 600
+          # Set the initial delay low so that if the base model changes to something
+          # smaller or an optimization is deployed, we don't wait unnecessarily.
+          initialDelaySeconds: 2
+          # A startup probe stops running once it succeeds, so we can probe even a
+          # moderately complex startup aggressively - this is a very important workload.
+          periodSeconds: 1
+          exec:
+            # Verify that our core model is loaded before we consider startup successful.
+            command:
+            - /bin/bash
+            - -c
+            - |
+              set -eu
+              models=$( curl -sf http://localhost:8000/v1/models )
+              if ! echo "${models}" | grep -q "$1"; then
+                echo "model not found"
+                exit 1
+              fi
+              echo "ok"
+            - ' '
+            - '"id":"meta-llama/Llama-2-7b-hf"'
         resources:
           limits:
             nvidia.com/gpu: 1