1
1
import asyncio
2
+ from contextlib import ExitStack
2
3
from typing import List , Tuple
3
4
4
5
import pytest
20
21
async def generate(engine: AsyncLLM, request_id: str,
                   max_tokens: int) -> Tuple[int, str]:
    """Drive a single request through the engine and count generated tokens.

    Submits a fixed prompt with greedy sampling (temperature 0) and iterates
    the engine's async output stream, accumulating how many token ids were
    produced in total.

    Args:
        engine: the AsyncLLM engine instance to generate with.
        request_id: identifier forwarded to the engine for this request.
        max_tokens: generation cap passed via SamplingParams.

    Returns:
        A ``(token_count, request_id)`` tuple so callers awaiting many of
        these concurrently can match counts back to their requests.
    """
    num_tokens = 0
    stream = engine.generate(
        request_id=request_id,
        prompt="Hello my name is Robert and",
        sampling_params=SamplingParams(max_tokens=max_tokens,
                                       temperature=0))
    async for out in stream:
        # NOTE(review): assumes each yielded item carries the newly produced
        # token ids for this step in outputs[0] — confirm delta vs cumulative.
        num_tokens += len(out.outputs[0].token_ids)

        # Yield to the event loop so other in-flight requests make progress.
        await asyncio.sleep(0.)

    return num_tokens, request_id
@@ -36,10 +38,11 @@ async def test_load(monkeypatch):
36
38
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
37
39
# so that in the future when we switch, we don't have to change all the
38
40
# tests.
39
- with monkeypatch .context () as m :
41
+ with monkeypatch .context () as m , ExitStack () as after :
40
42
m .setenv ("VLLM_USE_V1" , "1" )
41
43
42
44
engine = AsyncLLM .from_engine_args (ENGINE_ARGS )
45
+ after .callback (engine .shutdown )
43
46
44
47
NUM_REQUESTS = 10000
45
48
NUM_EXPECTED_TOKENS = 10
@@ -61,16 +64,16 @@ async def test_load(monkeypatch):
61
64
f"expected { NUM_EXPECTED_TOKENS } " )
62
65
63
66
assert not engine .output_processor .has_unfinished_requests ()
64
- engine .shutdown ()
65
67
66
68
67
69
@pytest .mark .asyncio
68
70
async def test_abort (monkeypatch ):
69
71
70
- with monkeypatch .context () as m :
72
+ with monkeypatch .context () as m , ExitStack () as after :
71
73
m .setenv ("VLLM_USE_V1" , "1" )
72
74
73
75
engine = AsyncLLM .from_engine_args (ENGINE_ARGS )
76
+ after .callback (engine .shutdown )
74
77
75
78
NUM_REQUESTS = 100
76
79
NUM_EXPECTED_TOKENS = 100
@@ -112,5 +115,3 @@ async def test_abort(monkeypatch):
112
115
num_generated_tokens , request_id = await task
113
116
assert num_generated_tokens == NUM_EXPECTED_TOKENS
114
117
assert not engine .output_processor .has_unfinished_requests ()
115
-
116
- engine .shutdown ()
0 commit comments