@@ -46,26 +46,102 @@ spec:
         - containerPort: 8000
           name: http
           protocol: TCP
+        lifecycle:
+          preStop:
+            # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep
+            # to give upstream gateways a chance to take us out of rotation. The time we wait
+            # depends on how long it takes for all upstreams to completely remove us from
+            # rotation. Older or simpler load balancers might take upwards of 30s, but we expect
+            # our deployment to run behind a modern gateway like Envoy which is designed to
+            # probe for readiness aggressively.
+            sleep:
+              # Upstream gateway health probes should be set to a low period, such as 5s,
+              # and the tighter we can make that bound, the faster we release accelerators
+              # during controlled shutdowns. However, we should expect variance, as load
+              # balancers may have internal delays, and we don't want to drop requests during
+              # normal operation, so we generally aim to set this value to the p99 propagation
+              # latency from readiness change to the load balancer taking the backend out of
+              # rotation, not the average.
+              #
+              # This value is generally stable but must often be determined experimentally
+              # for a given load balancer and health check period. We set the value here to
+              # the highest value we observe on a supported load balancer, and we recommend
+              # tuning this value down and verifying no requests are dropped.
+              #
+              # If this value is updated, be sure to update terminationGracePeriodSeconds.
+              seconds: 25
         livenessProbe:
-          failureThreshold: 240
           httpGet:
             path: /health
             port: http
             scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can more aggressively probe it. Liveness
+          # check endpoints should always be suitable for aggressive probing.
+          periodSeconds: 1
           successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant. However, any liveness triggered restart requires the very
+          # large core model to be reloaded, and so we should bias towards ensuring the
+          # server is definitely unhealthy vs immediately restarting. Use 5 attempts as
+          # evidence of a serious problem.
+          failureThreshold: 5
           timeoutSeconds: 1
         readinessProbe:
-          failureThreshold: 600
           httpGet:
             path: /health
             port: http
             scheme: HTTP
-          initialDelaySeconds: 5
-          periodSeconds: 5
+          # vLLM's health check is simple, so we can more aggressively probe it. Readiness
+          # check endpoints should always be suitable for aggressive probing, but may be
+          # slightly more expensive than liveness probes.
+          periodSeconds: 1
           successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant.
+          failureThreshold: 1
           timeoutSeconds: 1
+        # We set a startup probe so that we don't begin directing traffic to this instance
+        # until the model is loaded.
+        startupProbe:
+          # The failure threshold is the point at which we believe startup will not happen
+          # at all, and is set to the maximum possible time we believe loading a model will
+          # take. In our default configuration we are downloading a model from HuggingFace,
+          # which may take a long time, and then the model must load into the accelerator.
+          # We choose 10 minutes as a reasonable maximum startup time before giving up and
+          # attempting to restart the pod.
+          #
+          # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+          # loop forever. Be sure to set this appropriately.
+          failureThreshold: 600
+          # Set the initial delay low so that if the base model changes to something smaller
+          # or an optimization is deployed, we don't wait unnecessarily.
+          initialDelaySeconds: 2
+          # Because a startup probe stops running once it succeeds, we can aggressively probe
+          # even a moderately complex startup check - this is a very important workload.
+          periodSeconds: 1
+          exec:
+            # Verify that our core model is loaded before we consider startup successful.
+            # /health starts returning true very early in vLLM startup, but we only want to
+            # consider ourselves started up once the model has been loaded.
+            #
+            # vLLM should implement a readiness check that is only true once the model
+            # can begin serving, and then this can be switched to an httpGet probe.
+            command:
+            - /bin/bash
+            - -c
+            - |
+              set -eu
+              if ! models="$( curl -q http://0.0.0.0:8000/v1/models )"; then
+                echo "server not responding"
+                exit 1
+              fi
+              if ! echo "${models}" | grep -q "$1"; then
+                echo "model not found"
+                exit 1
+              fi
+              echo "ok"
+            # The two trailing list items are passed to the script as $0 and $1; $1 is the
+            # substring we expect to find in the /v1/models response.
+            - ' '
+            - ' "id":"meta-llama/Llama-2-7b-hf"'
         resources:
           limits:
             nvidia.com/gpu: 1
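
The exec-based startup check above is a workaround: as its comment notes, once the server exposes a readiness endpoint that only reports success after the model has loaded, the probe can be switched to an httpGet check. A minimal sketch of that shape, assuming a hypothetical /ready path rather than any endpoint the image above is known to serve:

        startupProbe:
          httpGet:
            path: /ready      # hypothetical endpoint, see the note above
            port: http
            scheme: HTTP
          failureThreshold: 600
          periodSeconds: 1
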
@@ -92,8 +168,59 @@ spec:
         - name: config-volume
           mountPath: /config
       restartPolicy: Always
-      schedulerName: default-scheduler
-      terminationGracePeriodSeconds: 30
+
+      # Generally, the termination grace period needs to last longer than the slowest request
+      # we expect to serve, plus any extra time spent waiting for load balancers to take the
+      # model server out of rotation.
+      #
+      # An easy starting point is the p99 or max request latency measured for your workload,
+      # although LLM request latencies vary significantly if clients send longer inputs or
+      # trigger longer outputs. Since steady state p99 will be higher than the latency to
+      # drain a server, you may wish to lower this value slightly, either experimentally or
+      # via the calculation below.
+      #
+      # For most models you can derive an upper bound for the maximum drain latency as
+      # follows:
+      #
+      # 1. Identify the maximum context length the model was trained on, or the maximum
+      #    allowed length of output tokens configured on vLLM (llama2-7b was trained to
+      #    4k context length, while llama3.1-8b was trained to 128k).
+      # 2. Output tokens are more compute intensive to generate, and the accelerator has a
+      #    maximum concurrency (batch size) - the time per output token (TPOT) at maximum
+      #    batch size with no prompt tokens being processed is the slowest an output token
+      #    can be generated (for this model that is about 100ms TPOT at a max batch size
+      #    around 50).
+      # 3. Calculate the worst case request duration if a request starts immediately
+      #    before the server stops accepting new connections - generally when it receives
+      #    SIGTERM (for this model that is about 4096 / 10 ~ 40s).
+      # 4. Any requests still processing prompt tokens will delay when those output tokens
+      #    start; prompt token processing is roughly 6x faster than compute-bound output
+      #    token generation, so add 20% to the time from above (40s + 16s ~ 55s).
+      #
+      # Thus we think it will take us at worst about 55s to complete the longest possible
+      # request the model is likely to receive at maximum concurrency (highest latency)
+      # once requests stop being sent.
+      #
+      # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+      #       new requests which require continuous prompt token computation.
+      # NOTE: The max timeout for backend connections from gateway to model servers should
+      #       be configured based on steady state p99 latency, not drain p99 latency.
+      #
+      # 5. Add the time the pod spends in its preStop hook waiting for load balancers to
+      #    stop sending us new requests (55s + 25s ~ 80s).
+      #
+      # Because the termination grace period controls when the Kubelet forcibly terminates a
+      # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+      # in keeping the value roughly proportional to the time to finish serving. There is
+      # also value in adding a bit of extra time to deal with unexpectedly long workloads.
+      #
+      # 6. Add a 50% safety buffer to this time since the operational impact should be low
+      #    (80s * 1.5 ~ 120s).
+      terminationGracePeriodSeconds: 120
+
       volumes:
       - name: data
         emptyDir: {}
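
A note on the preStop hook added in the first hunk: the sleep lifecycle action is the simplest way to express the delay, but it is only available on clusters that support that action. A rough equivalent using an exec hook, sketched on the assumption that /bin/bash is present in the image (the startup probe above already relies on it):

        lifecycle:
          preStop:
            exec:
              command: ["/bin/bash", "-c", "sleep 25"]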