@@ -46,26 +46,93 @@ spec:
        - containerPort: 8000
          name: http
          protocol: TCP
+        lifecycle:
+          preStop:
+            # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep
+            # to give upstream gateways a chance to take us out of rotation. The time we wait
+            # depends on how long it takes for all upstreams to completely remove us from
+            # rotation. Older or simpler load balancers might take upwards of 30s, but we
+            # expect our deployment to run behind a modern gateway like Envoy, which is
+            # designed to probe for readiness aggressively.
+            sleep:
+              # Upstream gateway health probes should be configured with a short period, such
+              # as 5s, and the tighter we can make that bound, the faster we release
+              # accelerators during controlled shutdowns. However, we should expect variance,
+              # as load balancers may have internal delays, and we don't want to drop requests
+              # in normal operation, so we usually aim to set this value to the p99 propagation
+              # latency from readiness failing to the load balancer taking the backend out of
+              # rotation, not the average.
+              #
+              # This value is generally stable but must often be experimentally determined for
+              # a given load balancer and health check period. We set the value here to
+              # the highest value we observe on a supported load balancer, and we recommend
+              # tuning this value down and verifying no requests are dropped.
+              #
+              # If this value is updated, be sure to update terminationGracePeriodSeconds.
+              #
+              seconds: 30
+            #
+            # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions
+            # replace with this exec action.
+            # exec:
+            #   command:
+            #   - /usr/bin/sleep
+            #   - "30"
        livenessProbe:
-          failureThreshold: 240
          httpGet:
            path: /health
            port: http
            scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can probe it more aggressively. Liveness
+          # check endpoints should always be suitable for aggressive probing.
+          periodSeconds: 1
          successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant. However, a liveness-triggered restart requires the very
+          # large core model to be reloaded, so we should bias towards ensuring the
+          # server is definitely unhealthy rather than restarting immediately. Use 5 failed
+          # attempts as evidence of a serious problem.
+          failureThreshold: 5
          timeoutSeconds: 1
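+          # Illustrative effect of the liveness settings above (simple arithmetic): with
+          # periodSeconds 1 and failureThreshold 5, a hung server is detected after roughly
+          # 5 consecutive seconds of failed probes and then restarted, paying the model
+          # reload cost only when the failure is persistent.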
        readinessProbe:
-          failureThreshold: 600
          httpGet:
            path: /health
            port: http
            scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can probe it more aggressively. Readiness
+          # check endpoints should always be suitable for aggressive probing, but may be
+          # slightly more expensive than liveness probes.
+          periodSeconds: 1
          successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant, so a single failed probe takes the pod out of rotation.
+          failureThreshold: 1
          timeoutSeconds: 1
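+          # Illustrative effect of the readiness settings above (simple arithmetic): with
+          # periodSeconds 1 and failureThreshold 1, a failing pod is marked not-ready within
+          # about a second, after which endpoints update and the gateway stops sending it
+          # new requests.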
+        # We set a startup probe so that we don't begin directing traffic to this instance,
+        # or checking its liveness, until the model is loaded.
+        startupProbe:
+          # The failure threshold marks the point at which we believe startup will not happen
+          # at all, and is set to the maximum possible time we believe loading a model will
+          # take. In our default configuration we are downloading a model from HuggingFace,
+          # which may take a long time, and then the model must load into the accelerator.
+          # We choose 10 minutes as a reasonable maximum startup time before giving up and
+          # attempting to restart the pod.
+          #
+          # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+          # loop forever. Be sure to set this appropriately.
+          failureThreshold: 600
+          # Set the initial delay low so that if the base model changes to something smaller
+          # or an optimization is deployed, we don't wait unnecessarily.
+          initialDelaySeconds: 2
+          # Because a startup probe stops running once it succeeds, we can probe aggressively
+          # even for a moderately complex startup - this is a very important workload.
+          periodSeconds: 1
+          httpGet:
+            # vLLM does not start the OpenAI server (and hence make /health available)
+            # until models are loaded. This may not be true for all model servers.
+            path: /health
+            port: http
+            scheme: HTTP
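+          # Illustrative startup budget (assumed model size and bandwidth, adjust for your
+          # environment): failureThreshold 600 x periodSeconds 1s gives a ~10 minute budget.
+          # Pulling a ~15 GB model from HuggingFace at ~50 MB/s takes ~5 minutes, leaving
+          # about 5 minutes to load weights onto the accelerator; larger models or slower
+          # links need a higher failureThreshold.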
+
        resources:
          limits:
            nvidia.com/gpu: 1
@@ -92,8 +159,71 @@ spec:
        - name: config-volume
          mountPath: /config
      restartPolicy: Always
-      schedulerName: default-scheduler
-      terminationGracePeriodSeconds: 30
+
+      # vLLM allows VLLM_PORT to be specified as an environment variable, but a user might
+      # create a 'vllm' service in their namespace. That auto-injects VLLM_PORT in
+      # Docker-compatible form as `tcp://<IP>:<PORT>` instead of the numeric value vLLM
+      # accepts, causing CrashLoopBackOff. Turn service environment injection off by default.
+      enableServiceLinks: false
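+      # For illustration (the cluster IP below is hypothetical): with service links left
+      # enabled and a Service named 'vllm' exposing port 8000, the kubelet would inject
+      # Docker-link style variables such as
+      #   VLLM_PORT=tcp://10.96.0.12:8000
+      #   VLLM_PORT_8000_TCP=tcp://10.96.0.12:8000
+      #   VLLM_SERVICE_PORT=8000
+      # and vLLM would fail to parse VLLM_PORT as a number.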
+
+      # Generally, the termination grace period needs to last longer than the slowest request
+      # we expect to serve plus any extra time spent waiting for load balancers to take the
+      # model server out of rotation.
+      #
+      # An easy starting point is the p99 or max request latency measured for your workload,
+      # although LLM request latencies vary significantly if clients send longer inputs or
+      # trigger longer outputs. Since steady state p99 will be higher than the latency
+      # to drain a server, you may wish to slightly lower this value, either experimentally or
+      # via the calculation below.
+      #
+      # For most models you can derive an upper bound for the maximum drain latency as
+      # follows:
+      #
+      # 1. Identify the maximum context length the model was trained on, or the maximum
+      #    allowed length of output tokens configured on vLLM (llama2-7b was trained to
+      #    4k context length, while llama3-8b was trained to 128k).
+      # 2. Output tokens are more compute intensive to generate than prompt tokens, and the
+      #    accelerator will have a maximum concurrency (batch size) - the time per output
+      #    token at maximum batch size, with no prompt tokens being processed, is the slowest
+      #    rate at which an output token can be generated (for this model it would be about
+      #    100ms TPOT at a max batch size around 50).
+      # 3. Calculate the worst case request duration if a request starts immediately
+      #    before the server stops accepting new connections - generally when it receives
+      #    SIGTERM (for this model that is about 4096 / 10 ~ 40s).
+      # 4. Any requests still processing prompt tokens will delay when those output tokens
+      #    start, and prompt token generation is roughly 6x faster than compute-bound
+      #    output token generation, so add 20% to the time from above (40s + 16s ~ 55s).
+      #
+      # Thus we think it will take us at worst about 55s to complete the longest possible
+      # request the model is likely to receive at maximum concurrency (highest latency)
+      # once requests stop being sent.
+      #
+      # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+      #       new requests which require continuous prompt token computation.
+      # NOTE: The max timeout for backend connections from gateway to model servers should
+      #       be configured based on steady state p99 latency, not drain p99 latency.
+      #
+      # 5. Add the time the pod takes in its preStop hook so that the load balancers have
+      #    stopped sending us new requests (55s + 30s ~ 85s).
+      #
+      # Because the termination grace period controls when the kubelet forcibly terminates a
+      # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+      # in keeping the value roughly proportional to the time to finish serving. There is also
+      # value in adding a bit of extra time to deal with unexpectedly long workloads.
+      #
+      # 6. Add a 50% safety buffer to this time since the operational impact should be low
+      #    (85s * 1.5 ~ 130s).
+      #
+      # One additional source of drain latency is that some workloads may run close to
+      # saturation and have queued requests on each server. Since traffic in excess of the
+      # max sustainable QPS will result in timeouts as the queues grow, we assume that failure
+      # to drain in time due to excess queues at the time of shutdown is an expected failure
+      # mode of server overload. If your workload occasionally experiences high queue depths
+      # due to periodic traffic, consider increasing the safety margin above to account for
+      # time to drain queued requests.
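+      # A compact recap of the calculation above (quantities as described in steps 1-6):
+      #
+      #   drain_time   ~= worst_case_decode_time + prompt_token_overhead   (steps 1-4)
+      #   grace_period ~= (drain_time + preStop_sleep) * 1.5               (steps 5 and 6)
+      #
+      # If the model, accelerator, or preStop sleep changes, recompute this value.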
+      terminationGracePeriodSeconds: 130
+
      volumes:
      - name: data
        emptyDir: {}