MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


+@pytest.fixture(scope="module", params=[True, False])
+def use_v1(request):
+    # Module-scoped variant of run_with_both_engines
+    #
+    # Use this fixture to run a test with both v0 and v1, and
+    # also to conditionalize the test logic e.g.
+    #
+    #   def test_metrics_exist(use_v1, server, client):
+    #       ...
+    #       expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
+    #       for metric in expected:
+    #           assert metric in response.text
+    #
+    # @skip_v1 wouldn't work here because this is a module-level
+    # fixture - per-function decorators would have no effect
+    yield request.param
+
+
@pytest.fixture(scope="module")
def default_server_args():
    return [
@@ -36,10 +54,12 @@ def default_server_args():
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
-def server(default_server_args, request):
+def server(use_v1, default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+    env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args,
+                            env_dict=env_dict) as remote_server:
        yield remote_server


@@ -84,7 +104,9 @@ async def client(server):

@pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer,
-                              client: openai.AsyncClient):
+                              client: openai.AsyncClient, use_v1: bool):
+    if use_v1:
+        pytest.skip("Skipping test on vllm V1")
    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
@@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
    "swap_space_bytes",
]

+EXPECTED_METRICS_V1 = [
+    "vllm:num_requests_running",
+    "vllm:num_requests_waiting",
+]
+

@pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer,
-                             client: openai.AsyncClient):
+                             client: openai.AsyncClient, use_v1: bool):
    # sending a request triggers the metrics to be logged.
    await client.completions.create(model=MODEL_NAME,
                                    prompt="Hello, my name is",
@@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS:
+    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
        assert metric in response.text


-def test_metrics_exist_run_batch():
+def test_metrics_exist_run_batch(use_v1: bool):
+    if use_v1:
+        pytest.skip("Skipping test on vllm V1")
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"