Commit a12faed

ywang96, DarkLight1337, mgoin, and JenZhao authored and committed
[V1] Scatter and gather placeholders in the model runner (vllm-project#16076)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: mgoin <[email protected]>
Signed-off-by: Roger Wang <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
Co-authored-by: mgoin <[email protected]>
Co-authored-by: Jennifer Zhao <[email protected]>
1 parent e5d5507 commit a12faed


41 files changed (+522 -1021 lines)

docs/source/contributing/model/multimodal.md (+8 -8)
@@ -860,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
 )
 ```
 
-To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
-with different `full` and `feature` attributes:
+To assign the vision embeddings to only the image tokens, instead of a string
+you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
 
 ```python
 hf_config = self.info.get_hf_config()
@@ -879,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows
 
-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 ```
 
@@ -914,9 +914,9 @@ def _get_prompt_updates(
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows
 
-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 
 return [
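For context on the API change above: `PromptUpdateDetails.select_token_id` replaces the explicit `full`/`features` pair by marking every occurrence of a given token ID as a position that receives multimodal embeddings. Below is a minimal sketch of the two forms side by side, assuming the Fuyu-style names from the diff (`_IMAGE_TOKEN_ID`, `_NEWLINE_TOKEN_ID`, `ncols`, `nrows`); the token ID values are illustrative stand-ins, not taken from the commit:

```python
from vllm.multimodal.processing import PromptUpdateDetails

# Illustrative stand-ins for the names used in the Fuyu example above.
_IMAGE_TOKEN_ID = 71011
_NEWLINE_TOKEN_ID = 71019
bos_token_id = 1
ncols, nrows = 3, 2

image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

# Old form: spell out the full replacement and the feature subsequence.
# details = PromptUpdateDetails(full=image_tokens + [bos_token_id],
#                               features=image_tokens)

# New form: only positions holding _IMAGE_TOKEN_ID are treated as
# embedding slots; newline and BOS tokens stay as regular text tokens.
details = PromptUpdateDetails.select_token_id(
    image_tokens + [bos_token_id],
    embed_token_id=_IMAGE_TOKEN_ID,
)
```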

docs/source/models/supported_models.md (-3)
@@ -1006,9 +1006,6 @@ See [this page](#generative-models) for more information on how to use generative models.
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 :::{important}
-To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
-`pip install git+https://github.com/huggingface/transformers`.
-
 Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
 You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
 :::

tests/models/decoder_only/vision_language/test_models.py (+2 -3)
@@ -330,9 +330,8 @@
         max_num_seqs=4,
         dtype="bfloat16",
         auto_cls=AutoModelForImageTextToText,
-        tensor_parallel_size=8,
-        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
-        marks=multi_gpu_marks(num_gpus=8),
+        tensor_parallel_size=4,
+        marks=multi_gpu_marks(num_gpus=4),
     ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],

tests/models/decoder_only/vision_language/test_pixtral.py (+8 -16)
@@ -200,22 +200,14 @@ def test_chat(
 
 
 @large_gpu_test(min_gb=48)
-@pytest.mark.parametrize(
-    "prompt,expected_ranges",
-    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
-        "offset": 11,
-        "length": 494
-    }]),
-     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
-         "offset": 11,
-         "length": 266
-     }, {
-         "offset": 277,
-         "length": 1056
-     }, {
-         "offset": 1333,
-         "length": 418
-     }])])
+@pytest.mark.parametrize("prompt,expected_ranges",
+                         [(_create_engine_inputs_hf(IMG_URLS[:1]),
+                           [PlaceholderRange(offset=11, length=494)]),
+                          (_create_engine_inputs_hf(IMG_URLS[1:4]), [
+                              PlaceholderRange(offset=11, length=266),
+                              PlaceholderRange(offset=277, length=1056),
+                              PlaceholderRange(offset=1333, length=418)
+                          ])])
 def test_multi_modal_placeholders(vllm_runner, prompt,
                                   expected_ranges: list[PlaceholderRange],
                                   monkeypatch) -> None:
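This hunk (and the matching ones in the processing tests below) swaps raw dicts for the `PlaceholderRange` dataclass from `vllm.multimodal.inputs`, as the import change in tests/v1/core/test_kv_cache_utils.py further down confirms. A minimal sketch of the equivalence, with values taken from the first parametrized case above:

```python
from vllm.multimodal.inputs import PlaceholderRange

# Dict form used before this commit vs. the dataclass form used after it.
old_style = {"offset": 11, "length": 494}
new_style = PlaceholderRange(offset=11, length=494)

# Fields are now accessed as attributes rather than dict keys.
assert new_style.offset == old_style["offset"]
assert new_style.length == old_style["length"]
```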

tests/models/multimodal/processing/test_llama4.py (+1 -11)
@@ -71,13 +71,11 @@ def test_processor_override(
     # image token offsets
     img_locs = processed_inputs["mm_placeholders"].get("image", [])
     assert len(img_locs) == num_imgs
-    assert [img_loc["offset"] for img_loc in img_locs] == \
+    assert [img_loc.offset for img_loc in img_locs] == \
         [i for i, v in enumerate(prompt_token_ids) \
             if v == config.boi_token_index]
 
     # patch sizes and masks
-    assert prompt_token_ids.count(config.image_token_index) \
-        == sum(img_patch.sum() for img_patch in mm_kwargs["embed_is_patch"])
     patch_token_id = vocab[hf_processor.img_patch_token]
     num_patches = processed_inputs["prompt_token_ids"].count(patch_token_id)
     mm_counts = {"image": num_imgs}
@@ -89,11 +87,3 @@ def test_processor_override(
         == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
     assert mm_kwargs["pixel_values"].shape[0] \
         == mm_kwargs["patches_per_image"].sum()
-
-    for embed_is_patch, aspect_ratio in zip(mm_kwargs["embed_is_patch"],
-                                            mm_kwargs["aspect_ratios"]):
-        assert embed_is_patch.shape[0] == \
-            len(tokenizer.encode(
-                hf_processor._prompt_split_image(
-                    aspect_ratio, num_patches_per_chunk),
-                add_special_tokens=False))

tests/models/multimodal/processing/test_llava_next.py (+2 -2)
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
         first_placeholder = image_placeholders[0]
 
         # NOTE: There is a BOS token
-        assert first_placeholder["offset"] == 1
-        assert first_placeholder["length"] == (
+        assert first_placeholder.offset == 1
+        assert first_placeholder.length == (
             len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
 
     except Exception as exc:

tests/models/multimodal/processing/test_llava_onevision.py (+2 -2)
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
 
         first_placeholder = image_placeholders[0]
 
-        assert first_placeholder["offset"] == 0
-        assert first_placeholder["length"] == len(
+        assert first_placeholder.offset == 0
+        assert first_placeholder.length == len(
             processed_inputs["prompt_token_ids"]) // num_imgs
     except Exception as exc:
         failed_size_excs.append((image_size, exc))

tests/multimodal/test_processing.py (+9)
@@ -785,6 +785,7 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=6,
             tokens=[32000, 32000],
+            is_embed=None,
         ),
     ],
     "pattern_4": [
@@ -793,6 +794,7 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=3,
             tokens=[32000],
+            is_embed=None,
         ),
     ],
 }
@@ -807,12 +809,14 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=1,
             tokens=[32000, 32000],
+            is_embed=None,
         ),
         PlaceholderFeaturesInfo(
             modality="pattern_1",
             item_idx=1,
             start_idx=5,
             tokens=[32000, 32000],
+            is_embed=None,
         ),
     ],
     "pattern_3": [
@@ -821,6 +825,7 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=7,
             tokens=[1550, 918, 1550],
+            is_embed=None,
         ),
     ],
     # No match for pattern_4 as it has lower priority than pattern_1
@@ -835,12 +840,14 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=1,
             tokens=[32000, 32000],
+            is_embed=None,
         ),
         PlaceholderFeaturesInfo(
             modality="pattern_1",
             item_idx=1,
             start_idx=3,
             tokens=[32000, 32000],
+            is_embed=None,
         ),
     ],
     "pattern_4": [
@@ -849,6 +856,7 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=5,
             tokens=[32000],
+            is_embed=None,
         ),
     ],
     "pattern_3": [
@@ -857,6 +865,7 @@ def test_find_update_tokens(
             item_idx=0,
             start_idx=6,
             tokens=[1550, 918, 1550],
+            is_embed=None,
         ),
     ],
 }
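These hunks add an explicit `is_embed=None` argument to every expected `PlaceholderFeaturesInfo`. Judging from the defaults written here, `None` appears to mean that every placeholder token receives an embedding. A short sketch of that reading, where `count_embeds` is a hypothetical helper and not part of vLLM:

```python
from typing import Optional

import torch


def count_embeds(length: int, is_embed: Optional[torch.Tensor]) -> int:
    """Hypothetical helper: count placeholder positions that get embeddings."""
    if is_embed is None:
        # Assumption: None is shorthand for "all positions are embeddings",
        # matching the is_embed=None defaults added in the tests above.
        return length
    return int(is_embed.sum().item())


assert count_embeds(4, None) == 4
assert count_embeds(4, torch.tensor([True, False, True, False])) == 2
```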

tests/v1/core/test_kv_cache_utils.py (+17 -29)
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
 # disable yapf here as it formats differently than isort such that both fail
@@ -158,13 +158,10 @@ def test_generate_block_hash_extra_keys():
     request = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(20)],
-        mm_positions=[{
-            "offset": 0,
-            "length": 5
-        }, {
-            "offset": 10,
-            "length": 5
-        }],
+        mm_positions=[
+            PlaceholderRange(offset=0, length=5),
+            PlaceholderRange(offset=10, length=5),
+        ],
         mm_hashes=["hash1", "hash2"],
     )
 
@@ -222,13 +219,10 @@ def test_hash_request_tokens(hash_fn):
     request = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
-            "offset": 0,
-            "length": 3
-        }, {
-            "offset": 3,
-            "length": 3
-        }],
+        mm_positions=[
+            PlaceholderRange(offset=0, length=3),
+            PlaceholderRange(offset=3, length=3),
+        ],
         mm_hashes=["hash1", "hash2"],
     )
 
@@ -253,25 +247,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
     request1 = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
-            "offset": 0,
-            "length": 3
-        }, {
-            "offset": 3,
-            "length": 3
-        }],
+        mm_positions=[
+            PlaceholderRange(offset=0, length=3),
+            PlaceholderRange(offset=3, length=3),
+        ],
         mm_hashes=["hash1", "hash2"],
     )
     request2 = make_request(
         request_id=1,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
-            "offset": 0,
-            "length": 3
-        }, {
-            "offset": 3,
-            "length": 3
-        }],
+        mm_positions=[
+            PlaceholderRange(offset=0, length=3),
+            PlaceholderRange(offset=3, length=3),
+        ],
         mm_hashes=["hash3", "hash2"],
     )
     block_size = 3