Commit 86c3369

[CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <[email protected]>
1 parent 2755c34 · commit 86c3369

8 files changed: +19 -13 lines changed


tests/lora/conftest.py

Lines changed: 12 additions & 0 deletions
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
         monkeypatch.setenv('VLLM_USE_V1', '0')
 
     yield
+
+
+@pytest.fixture
+def reset_default_device():
+    """
+    Some tests, such as `test_punica_ops.py`, explicitly set the
+    default device, which can affect subsequent tests. Adding this fixture
+    helps avoid this problem.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
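
For context, a minimal sketch of a test that opts into the new fixture directly; the test name, device string, and assertion are illustrative and not part of this commit:

    import torch

    def test_changes_default_device(reset_default_device):
        # `reset_default_device` (defined in tests/lora/conftest.py above) records
        # the default device before the test and restores it afterwards, so the
        # change below no longer leaks into later tests in the session.
        torch.set_default_device("meta")
        assert torch.empty(1).is_meta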

tests/lora/test_baichuan.py

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                        max_num_seqs=16,
                        max_loras=4,
                        max_lora_rank=64,
-                       tensor_parallel_size=1,
                        trust_remote_code=True,
                        fully_sharded_loras=fully_sharded)
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)

tests/lora/test_chatglm3_tp.py

Lines changed: 0 additions & 1 deletion
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=64,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 

tests/lora/test_layers.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@
 
 
 @pytest.fixture(autouse=True)
-def clean_cache():
+def clean_cache_reset_device(reset_default_device):
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
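
The renamed fixture chains onto the shared one: listing `reset_default_device` as a parameter makes its setup and teardown run around every test in the module, in addition to the cache cleanup this fixture already performs. A self-contained sketch of that chaining pattern, using illustrative names that are not from the commit:

    import pytest

    _CONFIG = {"device": "cpu"}  # stand-in for global state a test might mutate

    @pytest.fixture
    def restore_config():
        saved = dict(_CONFIG)
        yield
        _CONFIG.clear()
        _CONFIG.update(saved)

    @pytest.fixture(autouse=True)
    def per_test_cleanup(restore_config):
        # Depending on `restore_config` is enough: its teardown restores _CONFIG
        # after every test, mirroring how clean_cache_reset_device depends on
        # reset_default_device in the diff above.
        yield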

tests/lora/test_llama_tp.py

Lines changed: 0 additions & 1 deletion
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
         # also test odd max_num_seqs
         max_num_seqs=13,
         max_loras=4,
-        tensor_parallel_size=1,
         enable_chunked_prefill=True)
     generate_and_test(llm, sql_lora_files)
 

tests/lora/test_punica_ops.py

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,11 @@
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
 
 
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
+
+
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
         nslices: int, inputs_tensor: torch.Tensor,
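
The `reset_device` wrapper above exists only to make the shared conftest fixture autouse within this module. As a sketch of an equivalent pattern (not what this commit does), pytest's `usefixtures` marker applied at module level achieves the same thing:

    import pytest

    # Applies `reset_default_device` from tests/lora/conftest.py to every test
    # function in this module, mirroring the autouse wrapper added above.
    pytestmark = pytest.mark.usefixtures("reset_default_device")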

tests/lora/test_quant_model.py

Lines changed: 1 addition & 8 deletions
@@ -78,20 +78,14 @@ def format_prompt_tuples(prompt):
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-            tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+def test_quant_model_lora(tinyllama_lora_files, model):
 
     llm = vllm.LLM(
         model=model.model_path,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        tensor_parallel_size=tp_size,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        tensor_parallel_size=1,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,

tests/lora/test_transfomers_model.py

Lines changed: 0 additions & 1 deletion
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=16,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
