@@ -47,12 +47,10 @@ def test_filter_subtensors():
47
47
48
48
@pytest .fixture (scope = "module" )
49
49
def llama_3p2_1b_files ():
50
- with TemporaryDirectory () as cache_dir :
51
- input_dir = snapshot_download ("meta-llama/Llama-3.2-1B-Instruct" ,
52
- cache_dir = cache_dir ,
53
- ignore_patterns = ["*.bin*" , "original/*" ])
50
+ input_dir = snapshot_download ("meta-llama/Llama-3.2-1B-Instruct" ,
51
+ ignore_patterns = ["*.bin*" , "original/*" ])
54
52
55
- yield input_dir
53
+ yield input_dir
56
54
57
55
58
56
def _run_writer (input_dir , output_dir , weights_patterns , ** kwargs ):
@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
64
62
65
63
# Copy metadata files to output directory
66
64
for file in os .listdir (input_dir ):
67
- if not any (
68
- file . endswith ( ext ) and not os . path . isdir ( file )
69
- for ext in weights_patterns ):
65
+ if os . path . isdir ( os . path . join ( input_dir , file )):
66
+ continue
67
+ if not any ( file . endswith ( ext ) for ext in weights_patterns ):
70
68
shutil .copy (f"{ input_dir } /{ file } " , output_dir )
71
69
72
70
@@ -81,14 +79,17 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
81
79
@pytest .mark .parametrize ("enable_lora" , [False , True ])
82
80
@pytest .mark .parametrize ("tp_size" , [1 , 2 ])
83
81
def test_sharded_state_loader (enable_lora , tp_size , num_gpus_available ,
84
- llama_3p2_1b_files ):
82
+ llama_3p2_1b_files ,
83
+ monkeypatch : pytest .MonkeyPatch ):
85
84
if num_gpus_available < tp_size :
86
85
pytest .skip (f"Not enough GPUs for tensor parallelism { tp_size } " )
87
86
88
87
weights_patterns = ("*.safetensors" , )
89
88
gpu_memory_utilization = 0.8
90
89
input_dir = llama_3p2_1b_files
91
90
ctx = mp .get_context ("spawn" )
91
+ # The interface changed in the v1 engine; running this test under v1 hangs.
92
+ monkeypatch .setenv ("VLLM_USE_V1" , "0" )
92
93
93
94
# Run in separate processes for memory & CUDA isolation
94
95
with TemporaryDirectory () as output_dir :
0 commit comments