
Commit e7c7c5e

Authored by ywang96, DarkLight1337, and Isotr0py
[V1][VLM] V1 support for selected single-image models. (#11632)
Signed-off-by: Roger Wang <[email protected]>
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
1 parent 8c3230d commit e7c7c5e
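
This commit enables the V1 engine for several single-image models (Aria, BLIP-2, Chameleon, Fuyu, and LLaVA-NeXT, per the docs table below). As a minimal usage sketch — not part of the commit, and assuming a vLLM build that includes it — one of the newly supported models can be run against the V1 engine, which at the time of this change was opted into via the `VLLM_USE_V1` environment variable:

```python
import os

# Opt into the V1 engine (assumed opt-in flag at the time of this commit).
os.environ["VLLM_USE_V1"] = "1"

from PIL import Image
from vllm import LLM

# Chameleon is one of the single-image models this commit enables on V1.
llm = LLM(model="facebook/chameleon-7b",
          max_model_len=4096,
          max_num_seqs=2)

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
outputs = llm.generate({
    "prompt": "What do you see in this image?<image>",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```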

File tree

19 files changed: +590 −636 lines


docs/source/models/supported_models.md

Lines changed: 5 additions & 5 deletions
```diff
@@ -570,28 +570,28 @@ See [this page](#generative-models) for more information on how to use generative models.
     - `rhymes-ai/Aria`
     -
     - ✅︎
-    -
+    - ✅︎
   * - `Blip2ForConditionalGeneration`
     - BLIP-2
     - T + I<sup>E</sup>
     - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
     -
     - ✅︎
-    -
+    - ✅︎
   * - `ChameleonForConditionalGeneration`
     - Chameleon
     - T + I
     - `facebook/chameleon-7b` etc.
     -
     - ✅︎
-    -
+    - ✅︎
   * - `FuyuForCausalLM`
     - Fuyu
     - T + I
     - `adept/fuyu-8b` etc.
     -
     - ✅︎
-    -
+    - ✅︎
   * - `ChatGLMModel`
     - GLM-4V
     - T + I
@@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generative models.
     - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
     - ✅︎
-    -
+    - ✅︎
   * - `LlavaNextVideoForConditionalGeneration`
     - LLaVA-NeXT-Video
     - T + V
```

examples/offline_inference_vision_language.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
     assert modality == "image"
     model_name = "rhymes-ai/Aria"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(model=model_name,
               tokenizer_mode="slow",
-              trust_remote_code=True,
               dtype="bfloat16",
+              max_model_len=4096,
+              max_num_seqs=2,
+              trust_remote_code=True,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b",
               max_model_len=4096,
+              max_num_seqs=2,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
@@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str):
     # 2.5
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
 
-    #2.6
+    # 2.6
     model_name = "openbmb/MiniCPM-V-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
@@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str):
 
     model_name = "mistral-community/pixtral-12b"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(
         model=model_name,
         max_model_len=8192,
+        max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
```
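
The recurring change in this file is a memory-limiting pattern: capping context length and batch concurrency so the startup memory-profiling pass fits on a single GPU. Below is a standalone sketch of the same knobs, with values copied from the diff; the hardware threshold is the diff's own "L40 (or equivalent)" note, not something verified here:

```python
from vllm import LLM

# A shorter context shrinks the KV cache, and fewer concurrent
# sequences shrink the peak footprint during startup profiling.
llm = LLM(
    model="rhymes-ai/Aria",
    tokenizer_mode="slow",
    dtype="bfloat16",
    max_model_len=4096,   # cap on context length
    max_num_seqs=2,       # cap on concurrent sequences
    trust_remote_code=True,
)
```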

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -140,10 +140,7 @@
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
         tokenizer_mode="slow",
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-        ),
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
@@ -179,6 +176,7 @@
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
+        max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
@@ -201,7 +199,6 @@
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[large_gpu_mark(min_gb=48)],
     ),
     "glm4": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
```

tests/multimodal/test_processing.py

Lines changed: 16 additions & 13 deletions
```diff
@@ -528,7 +528,7 @@ def _rand_audio(
 
 def _test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -583,9 +583,8 @@ def _test_processing_cache_correctness(
         partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
     }
     input_max_count = {
-        "image": 3,
-        "video": 3,
-        "audio": 3,
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
     }
 
     for batch_idx in range(num_batches):
@@ -624,20 +623,24 @@
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("llava-hf/llava-1.5-7b-hf", {"image"}),
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
-    ("mistral-community/pixtral-12b", {"image"}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
-    ("fixie-ai/ultravox-v0_3", {"audio"}),
+    ("rhymes-ai/Aria", {"image": True}),
+    ("Salesforce/blip2-opt-2.7b", {"image": False}),
+    ("facebook/chameleon-7b", {"image": True}),
+    ("adept/fuyu-8b", {"image": False}),
+    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
+    ("mistral-community/pixtral-12b", {"image": True}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
+    ("fixie-ai/ultravox-v0_3", {"audio": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -653,15 +656,15 @@ def test_processing_cache_correctness(
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image"}),
+    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_cache_correctness_phi3v(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
```
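
The `modalities` parameter changes from `set[str]` to `dict[str, bool]`, where the value records whether the model accepts multiple inputs of that modality. A small illustration of how the new `input_max_count` comprehension behaves (example values are mine; the True/False flags mirror the parametrize list above, e.g. BLIP-2 and Fuyu are single-image):

```python
# Multi-input modalities are capped at 3 items per prompt, single-input at 1.
modalities = {"image": True, "audio": False}

input_max_count = {
    modality: 3 if supports_multi else 1
    for modality, supports_multi in modalities.items()
}

assert input_max_count == {"image": 3, "audio": 1}
```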
