File tree: 3 files changed (+7, -2 lines)

.buildkite/scripts/hardware_ci
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
 vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
     && python3 -m pip install pytest pytest-asyncio tpu-info \
     && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_XLA_CACHE_PATH= \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
     && echo HARDWARE \
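
Context for the added export: clearing VLLM_XLA_CACHE_PATH presumably disables the persistent XLA compilation cache, so every graph is compiled fresh in CI and VLLM_XLA_CHECK_RECOMPILATION=1 can flag any unexpected recompilation at runtime. The snippet below is an illustrative sketch only, not part of the diff; it sets the same variables from Python before constructing an engine and assumes vLLM reads them at startup, just as the script's export lines do.

import os

# Illustrative sketch (not part of the diff): mirror the CI environment.
os.environ["VLLM_XLA_CACHE_PATH"] = ""            # empty value: presumably no on-disk XLA cache
os.environ["VLLM_USE_V1"] = "1"                   # select the V1 engine
os.environ["VLLM_XLA_CHECK_RECOMPILATION"] = "1"  # flag recompilation after warm-up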

Second file (structured-output test):
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
@@ -63,10 +64,13 @@ def test_structured_output(
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")

+    # Don't use eager execution on TPUs because we want to test for no
+    # recompilation at runtime
+    enforce_eager = bool(not current_platform.is_tpu())
     # Use a single LLM instance for several scenarios to
     # speed up the test suite.
     llm = LLM(model=model_name,
-              enforce_eager=True,
+              enforce_eager=enforce_eager,
               max_model_len=1024,
               guided_decoding_backend=guided_decoding_backend,
               tokenizer_mode=tokenizer_mode)
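
The toggle above, restated outside the test for clarity (illustrative sketch; assumes vllm and its current_platform helper are importable): on TPU the test now leaves enforce_eager=False so graphs are compiled and the CI's VLLM_XLA_CHECK_RECOMPILATION=1 check is meaningful, while other platforms keep eager mode for speed.

from vllm.platforms import current_platform

# Compile on TPU (so the recompilation check can trip), stay eager elsewhere.
enforce_eager = not current_platform.is_tpu()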

Third file (sampler test):
@@ -23,7 +23,7 @@ def test_sampler_different(model_name: str):
     different results.
     """
     llm = LLM(model_name,
-              enforce_eager=True,
+              enforce_eager=False,
               max_num_seqs=1,
               max_model_len=512,
               max_num_batched_tokens=512)