[V1] Enable multi-input by default #15799

Merged (24 commits) on Apr 12, 2025
4 changes: 3 additions & 1 deletion docs/source/models/supported_models.md
@@ -759,7 +759,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.

:::{important}
To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:

Offline inference:
@@ -777,6 +777,8 @@ Online serving:
```
vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
```

**This is no longer required if you are using vLLM V1.**

:::

:::{note}
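The offline-inference snippet referenced in the hunk above is collapsed in this view; a minimal sketch of the documented V0 pattern, with the model name and limit mirroring the online-serving command, might look like:

```python
from vllm import LLM

# Allow up to 4 images per text prompt (required in vLLM V0; per the added
# note above, no longer required in vLLM V1).
llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",
    limit_mm_per_prompt={"image": 4},
)
```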
24 changes: 24 additions & 0 deletions docs/source/serving/offline_inference.md
@@ -110,6 +110,30 @@ If you run out of CPU RAM, try the following options:
- (Multi-modal models only) you can set the size of the multi-modal input cache using the `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
- (CPU backend only) you can set the size of the KV cache using the `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).

#### Disable unused modalities

You can disable unused modalities (except for text) by setting their limits to zero.

For example, if your application only accepts image input, there is no need to allocate any memory for videos.

```python
from vllm import LLM

# Accept images but not videos
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"video": 0})
```

You can even run a multi-modal model for text-only inference:

```python
from vllm import LLM

# Don't accept images. Just text.
llm = LLM(model="google/gemma-3-27b-it",
limit_mm_per_prompt={"image": 0})
```

### Performance optimization and tuning

You can potentially improve the performance of vLLM by finetuning various options.
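With the image limit set to zero, the multi-modal model in the hunk above behaves like a plain text LLM; a minimal usage sketch building on that documented snippet (prompt text is illustrative, default sampling parameters assumed):

```python
from vllm import LLM

# Text-only use of a multi-modal model; per the docs above, no memory is
# reserved for the disabled image modality.
llm = LLM(model="google/gemma-3-27b-it",
          limit_mm_per_prompt={"image": 0})

outputs = llm.generate("Explain in one sentence why disabling unused modalities saves memory.")
for output in outputs:
    print(output.outputs[0].text)
```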
5 changes: 5 additions & 0 deletions examples/offline_inference/audio_language.py
@@ -196,6 +196,11 @@ def main(args):
req_data = model_example_map[model](question_per_audio_count[audio_count],
audio_count)

# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})

engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)

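The merge added above relies on Python's dict union operator, where keys from the right-hand operand win, so an example's own limits override the zeroed defaults; a standalone sketch with illustrative values:

```python
# Defaults disable every modality; per-example settings on the right take precedence.
default_limits = {"image": 0, "video": 0, "audio": 0}
example_limits = {"audio": 1}  # e.g. an audio-language example needing one clip

# dict(... or {}) mirrors the diff's handling of a possibly-unset limit.
merged = default_limits | dict(example_limits or {})
print(merged)  # {'image': 0, 'video': 0, 'audio': 1}
```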
5 changes: 5 additions & 0 deletions examples/offline_inference/encoder_decoder_multimodal.py
@@ -133,6 +133,11 @@ def main(args):

req_data = model_example_map[model]()

# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})

engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)

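Both example scripts then turn the `EngineArgs` dataclass into keyword arguments with `asdict` and override individual fields with another dict union before calling `LLM(**engine_args)`. A self-contained sketch of that pattern, using a hypothetical stand-in dataclass rather than vLLM's real `EngineArgs`:

```python
from dataclasses import asdict, dataclass


# Hypothetical stand-in for vLLM's EngineArgs, only to demonstrate the
# `asdict(...) | {...}` override pattern used in the examples.
@dataclass
class ToyEngineArgs:
    model: str
    seed: int = 0
    max_num_seqs: int = 256


args = ToyEngineArgs(model="some/seq2seq-model")
engine_kwargs = asdict(args) | {"seed": 42}  # the right-hand dict wins
print(engine_kwargs)  # {'model': 'some/seq2seq-model', 'seed': 42, 'max_num_seqs': 256}
```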
79 changes: 45 additions & 34 deletions examples/offline_inference/vision_language.py
@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096,
max_num_seqs=2,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model="facebook/chameleon-7b",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -129,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": 1},
)

prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
trust_remote_code=True,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 3 * 364
},
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 384
},
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -375,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -392,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -414,7 +414,7 @@ def run_llava_next_video(questions: list[str],
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -442,7 +442,7 @@ def run_llava_onevision(questions: list[str],
engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -465,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
stop_token_ids = [128009]

@@ -506,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
@@ -561,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -587,7 +587,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -611,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)


def run_llama4(questions: list[str], modality: str):
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -621,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
max_model_len=8192,
max_num_seqs=4,
tensor_parallel_size=8,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -657,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [
@@ -683,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
max_model_len=4096,
tensor_parallel_size=4,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -710,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma-3b-mix-224",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
@@ -726,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma2-3b-ft-docci-448",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
@@ -762,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -793,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
@@ -813,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=6144,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -834,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -859,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

if modality == "image":
@@ -894,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

if modality == "image":
@@ -925,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)

tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -1082,7 +1085,15 @@ def main(args):

req_data = model_example_map[model](questions, modality)

engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})

engine_args = asdict(req_data.engine_args) | {
"seed": args.seed,
"disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
}
llm = LLM(**engine_args)

# To maintain code compatibility in this script, we add LoRA here.
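Across this file, the per-model `disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache` argument is replaced by an explicit `limit_mm_per_prompt={"image": 1}`, and the preprocessor-cache flag is instead applied once in `main()` when building the final engine arguments. A sketch of the equivalent direct construction for one of the models above (values are illustrative; the script only disables the cache when the corresponding option is set):

```python
from vllm import LLM

# Roughly what the updated example assembles for LLaVA-1.5: one image per
# prompt, with the multi-modal preprocessor cache disabled at the engine level.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
    disable_mm_preprocessor_cache=True,
)
```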