
Commit eca7565

mgoin authored and yangw-dev committed

Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (vllm-project#16447)

Signed-off-by: mgoin <[email protected]>
Signed-off-by: Yang Wang <[email protected]>

1 parent 473b7f7 commit eca7565
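In practice the new check surfaces at engine construction. A hedged repro sketch, assuming the `LLM` entrypoint forwards these `EngineArgs` fields (both fields appear in this commit's diffs; the exact forwarding path is an assumption):

```python
from vllm import LLM

# llava-1.5's image encoder emits far more than 100 tokens per image
# (illustratively, 576 for a 24x24 CLIP patch grid), so with chunked MM
# input disabled this config is unschedulable. After this commit it
# fails fast with a ValueError instead of stalling at runtime.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    max_num_batched_tokens=100,     # per-step scheduler token budget
    disable_chunked_mm_input=True,  # forbid splitting an MM item across steps
)
```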

File tree

3 files changed: +18 −1 lines changed

  tests/v1/core/test_scheduler.py
  vllm/engine/arg_utils.py
  vllm/v1/core/encoder_cache_manager.py

tests/v1/core/test_scheduler.py

Lines changed: 9 additions & 0 deletions

@@ -322,6 +322,15 @@ def test_no_mm_input_chunking():
     assert len(output.finished_req_ids) == 0
     assert output.num_scheduled_tokens[requests[0].request_id] == 800

+    # Test that we fail if we disable chunked mm input and use too small
+    # of a max_num_batched_tokens for the mm input.
+    with pytest.raises(ValueError):
+        _ = create_scheduler(
+            model="llava-hf/llava-1.5-7b-hf",
+            max_num_batched_tokens=100,
+            disable_chunked_mm_input=True,
+        )
+

 @pytest.mark.parametrize("enable_prefix_caching", [True, False])
 def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
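For a tighter assertion, `pytest.raises` can also pin the error text against the message added in encoder_cache_manager.py. A hedged variant (`create_scheduler` is this test module's local helper, so this only runs inside that suite):

```python
import pytest

def test_no_mm_input_chunking_requires_budget():
    # match= uses re.search, so a stable prefix of the message is enough.
    with pytest.raises(ValueError, match="Chunked MM input disabled"):
        _ = create_scheduler(  # helper defined earlier in this test module
            model="llava-hf/llava-1.5-7b-hf",
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
```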

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion

@@ -1030,7 +1030,7 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]:
             action=StoreBoolean,
             default=EngineArgs.disable_chunked_mm_input,
             nargs="?",
-            const="False",
+            const="True",
             help="Disable multimodal input chunking attention for V1. "
             "If set to true and chunked prefill is enabled, we do not want to"
             " partially schedule a multimodal item. This ensures that if a "

vllm/v1/core/encoder_cache_manager.py

Lines changed: 8 additions & 0 deletions

@@ -133,6 +133,14 @@ def _compute_encoder_budget_multimodal(
     _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
                                     key=lambda item: item[1])

+    if (scheduler_config.disable_chunked_mm_input and max_tokens_per_mm_item
+            > scheduler_config.max_num_batched_tokens):
+        raise ValueError(
+            "Chunked MM input disabled but max_tokens_per_mm_item "
+            f"({max_tokens_per_mm_item}) is larger than max_num_batched_tokens"
+            f" ({scheduler_config.max_num_batched_tokens}). Please increase "
+            "max_num_batched_tokens.")
+
     encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
                                  max_tokens_per_mm_item)
     encoder_cache_size = max(scheduler_config.encoder_cache_size,
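The logic of the check, pulled out of its `scheduler_config` context: when chunking of MM inputs is disabled, a multimodal item must be placed in a single scheduling step, so its encoder token count must fit within the per-step batch budget; otherwise the scheduler could never place it and the request would starve. A standalone sketch with simplified signatures (the real code reads both values from `scheduler_config`):

```python
def check_mm_item_fits_budget(max_tokens_per_mm_item: int,
                              max_num_batched_tokens: int,
                              disable_chunked_mm_input: bool) -> None:
    """Fail fast at startup: an unchunkable MM item larger than the
    per-step token budget can never be scheduled."""
    if (disable_chunked_mm_input
            and max_tokens_per_mm_item > max_num_batched_tokens):
        raise ValueError(
            f"Chunked MM input disabled but max_tokens_per_mm_item "
            f"({max_tokens_per_mm_item}) is larger than "
            f"max_num_batched_tokens ({max_num_batched_tokens}). "
            "Please increase max_num_batched_tokens.")

# Illustrative numbers: a 576-token image item vs. a 100-token budget.
try:
    check_mm_item_fits_budget(576, 100, disable_chunked_mm_input=True)
except ValueError as e:
    print(e)
```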
