@@ -209,14 +209,15 @@ def _run_test(
     # will hurt multiprocessing backend with fork method (the default method).

     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     dtype=dtype,
-                     max_model_len=8192,
-                     max_num_seqs=3,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
-                     }) as vllm_model:
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_model_len=19212,  # 3 max size images
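+            # = 3 images * 4 tiles * 1601 tokens per tile (6404 per image),
+            # matching the tile math in test_parse_and_validate_encoder_lens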
+            max_num_seqs=3,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            limit_mm_per_prompt={"image":
+                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -507,7 +508,7 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
             model,
             dtype=dtype,
             max_model_len=8192,
-            max_num_seqs=2,
+            max_num_seqs=4,
             tensor_parallel_size=1,
             limit_mm_per_prompt={"image":
                                  _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
@@ -552,6 +553,23 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                                             num_logprobs,
                                             images=images)

+        # Mixed batch with text and images that have different numbers of tiles
+        prompts = [
+            "<|begin_of_text|>Hello!",
+            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
+            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
+        ]
+        images = [
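+            # None: text-only prompt that contributes no encoder tokens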
+            None,
+            [stop_sign],
+            # smaller image must be 2nd for the repro
+            [stop_sign.resize((448, 448))],
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+

 class DummyModel:
     image_token_id = MLLAMA_IMAGE_TOKEN_ID
@@ -674,3 +692,26 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
                 f"full_text_row_masked_out_mask[{idx}] must be " \
                 f"'{must_be_masked}' "
             idx += 1
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [
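+    # each non-zero encoder len must equal sum(num_tiles) * 1601;
+    # zero-length (text-only) entries are dropped from the result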
+    ([6404], [[4]], [6404]),
+    ([0, 6404], [[4]], [6404]),
+    ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]),
+    ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]),
+])
+def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles,
+                                         expected) -> None:
+
+    dummy = DummyModel()
+    num_tokens_per_tile = 1601
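+    # presumably 1601 = 40 * 40 patches + 1 class token per 560x560 tile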
709
+ actual_encoder_seq_lens = MllamaForConditionalGeneration \
710
+ ._get_and_validate_encoder_lens (
711
+ dummy ,
712
+ encoder_seq_lens ,
713
+ num_tiles ,
714
+ num_tokens_per_tile ,
715
+ )
716
+ assert actual_encoder_seq_lens == expected , \
717
+ f"Expected { expected } but got { actual_encoder_seq_lens } "