[CI/Build] Fix CI LoRA failure #16270

Merged · 3 commits · Apr 9, 2025
12 changes: 12 additions & 0 deletions tests/lora/conftest.py
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
        monkeypatch.setenv('VLLM_USE_V1', '0')

    yield


@pytest.fixture
def reset_default_device():
    """
    Some tests, such as `test_punica_ops.py`, explicitly set the
    default device, which can affect subsequent tests. Adding this fixture
    helps avoid this problem.
    """
    original_device = torch.get_default_device()
    yield
    torch.set_default_device(original_device)
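
For context, a minimal sketch of the failure mode this fixture guards against (illustrative only, not part of the diff; it assumes a PyTorch recent enough to provide torch.get_default_device, which the fixture itself requires): once a test calls torch.set_default_device, every tensor allocated by later tests lands on that device unless the previous default is saved and restored around the test.

# Illustrative sketch only; not from the PR.
import pytest
import torch


@pytest.fixture
def reset_default_device():
    # Same save/restore pattern as the fixture added to tests/lora/conftest.py.
    original_device = torch.get_default_device()
    yield
    torch.set_default_device(original_device)


def test_changes_default_device(reset_default_device):
    # Without the fixture, this setting would persist into every later test.
    torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")
    assert torch.empty(1).device.type == torch.get_default_device().type


def test_unaffected_afterwards():
    # Because the previous test restored the default, new tensors still land
    # on the original default device (CPU in a fresh test session).
    assert torch.empty(1).device.type == "cpu"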
1 change: 0 additions & 1 deletion tests/lora/test_baichuan.py
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=1,
                       trust_remote_code=True,
                       fully_sharded_loras=fully_sharded)
    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
1 change: 0 additions & 1 deletion tests/lora/test_chatglm3_tp.py
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=64,
                   tensor_parallel_size=1,
                   trust_remote_code=True,
                   enable_chunked_prefill=True)

2 changes: 1 addition & 1 deletion tests/lora/test_layers.py
@@ -65,7 +65,7 @@


@pytest.fixture(autouse=True)
def clean_cache():
def clean_cache_reset_device(reset_default_device):
    # Release any memory we might be holding on to. CI runs OOMs otherwise.
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)
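
The renamed fixture above folds the device restore into test_layers.py's existing autouse cleanup: by requesting reset_default_device as an argument, pytest sets it up before, and tears it down after, the cache-clearing code, so every test in the file gets both behaviours from a single fixture. A small self-contained sketch of that ordering, with made-up names and a plain dict standing in for torch's default device:

# Illustrative sketch with made-up names: a fixture that requests another
# fixture inherits that fixture's setup/teardown around its own.
import pytest

STATE = {"device": "cpu"}   # stand-in for torch's process-wide default device


@pytest.fixture
def restore_state():
    saved = dict(STATE)
    yield
    # Teardown of the requested fixture runs last (LIFO), after the
    # dependent fixture below has finished its own cleanup.
    STATE.clear()
    STATE.update(saved)


@pytest.fixture(autouse=True)
def clean_cache_restore_state(restore_state):
    cache = {"a": 1}        # stand-in for the Triton pointer-dict caches
    yield
    cache.clear()           # this cleanup runs first, then restore_state's


def test_mutates_state():
    STATE["device"] = "cuda"    # undone automatically after the test
    assert STATE["device"] == "cuda"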
1 change: 0 additions & 1 deletion tests/lora/test_llama_tp.py
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
                   # also test odd max_num_seqs
                   max_num_seqs=13,
                   max_loras=4,
                   tensor_parallel_size=1,
                   enable_chunked_prefill=True)
    generate_and_test(llm, sql_lora_files)

5 changes: 5 additions & 0 deletions tests/lora/test_punica_ops.py
@@ -13,6 +13,11 @@
from .utils import PunicaTensors, assert_close, generate_data_for_nslices


@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    pass


# Utility shrink and expand operations used as reference implementations.
def sgmv_shrink_for_nslices(
        nslices: int, inputs_tensor: torch.Tensor,
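
The pass-bodied fixture above is a common pytest idiom: a module-level autouse wrapper that does nothing itself but requests a fixture defined in conftest.py, which applies that fixture to every test in the file without touching individual test signatures. A stripped-down sketch of the idiom, with placeholder names rather than vLLM's:

# Stripped-down sketch of the pass-through autouse idiom (placeholder names).
import pytest


@pytest.fixture
def shared_guard():
    # Imagine this living in conftest.py, like the reset_default_device
    # fixture added by this PR.
    yield


@pytest.fixture(autouse=True)
def apply_guard(shared_guard):
    # No body needed: requesting shared_guard is the whole point.
    pass


def test_one():
    pass    # runs inside shared_guard's setup and teardown


def test_two():
    pass    # and so does every other test in this module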
9 changes: 1 addition & 8 deletions tests/lora/test_quant_model.py
@@ -78,20 +78,14 @@ def format_prompt_tuples(prompt):


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                          tp_size):
    if num_gpus_available < tp_size and \
        tp_size > 1 and current_platform.is_cuda_alike():
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
def test_quant_model_lora(tinyllama_lora_files, model):

    llm = vllm.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
        tensor_parallel_size=tp_size,
        gpu_memory_utilization=0.2, #avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.2, #avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
1 change: 0 additions & 1 deletion tests/lora/test_transfomers_model.py
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=16,
                   tensor_parallel_size=1,
                   trust_remote_code=True,
                   enable_chunked_prefill=True)
