
Commit 89f8b0b

DarkLight1337 authored and LeiWang1999 committed
[0/N] Rename MultiModalInputs to MultiModalKwargs (vllm-project#10040)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: LeiWang1999 <[email protected]>
1 parent a75b615 commit 89f8b0b

32 files changed: +151 −121 lines
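
The rename is purely mechanical: every import and construction site swaps MultiModalInputs for MultiModalKwargs, and the dict-like behaviour is unchanged. A minimal usage sketch (hypothetical caller code, not part of this commit; assumes a vLLM build that includes this rename):

import torch
from vllm.multimodal.base import MultiModalKwargs  # previously: MultiModalInputs

# Construct mapper output under the new name; dict-style access is unchanged.
pixel_values = torch.rand(1, 3, 224, 224)
mm_data = MultiModalKwargs({"pixel_values": pixel_values})
assert "pixel_values" in mm_data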

docs/source/design/multimodal/multimodal_index.rst

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ Base Classes
 
 .. autodata:: vllm.multimodal.MultiModalDataDict
 
-.. autoclass:: vllm.multimodal.MultiModalInputs
+.. autoclass:: vllm.multimodal.MultiModalKwargs
     :members:
     :show-inheritance:
 

tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 from PIL.Image import Image
 
 from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import IMAGE_ASSETS
@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
     mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
     # Ensure that we get the appropriately shaped pixel_values
     # for images and image embeddings, respectively.
-    assert isinstance(mapped_img_data, MultiModalInputs)
+    assert isinstance(mapped_img_data, MultiModalKwargs)
     assert "pixel_values" in mapped_img_data
     assert mapped_img_data["pixel_values"].shape == expected_shape

tests/multimodal/test_base.py

Lines changed: 11 additions & 11 deletions
@@ -1,6 +1,6 @@
 import torch
 
-from vllm.multimodal.base import MultiModalInputs, NestedTensors
+from vllm.multimodal.base import MultiModalKwargs, NestedTensors
 
 
 def assert_nested_tensors_equal(expected: NestedTensors,
@@ -13,40 +13,40 @@ def assert_nested_tensors_equal(expected: NestedTensors,
         assert_nested_tensors_equal(expected_item, actual_item)
 
 
-def assert_multimodal_inputs_equal(expected: MultiModalInputs,
-                                   actual: MultiModalInputs):
+def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
+                                   actual: MultiModalKwargs):
     assert set(expected.keys()) == set(actual.keys())
     for key in expected:
         assert_nested_tensors_equal(expected[key], actual[key])
 
 
 def test_multimodal_input_batch_single_tensor():
     t = torch.rand([1, 2])
-    result = MultiModalInputs.batch([{"image": t}])
+    result = MultiModalKwargs.batch([{"image": t}])
     assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})
 
 
 def test_multimodal_input_batch_multiple_tensors():
     a = torch.rand([1, 1, 2])
     b = torch.rand([1, 1, 2])
     c = torch.rand([1, 1, 2])
-    result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
+    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
     assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})
 
 
 def test_multimodal_input_batch_multiple_heterogeneous_tensors():
     a = torch.rand([1, 2, 2])
     b = torch.rand([1, 3, 2])
     c = torch.rand([1, 4, 2])
-    result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
+    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
     assert_multimodal_inputs_equal(result, {"image": [a, b, c]})
 
 
 def test_multimodal_input_batch_nested_tensors():
     a = torch.rand([2, 3])
     b = torch.rand([2, 3])
     c = torch.rand([2, 3])
-    result = MultiModalInputs.batch([{
+    result = MultiModalKwargs.batch([{
         "image": [a]
     }, {
         "image": [b]
@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
     a = torch.rand([1, 2, 3])
     b = torch.rand([1, 2, 3])
     c = torch.rand([1, 2, 3])
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
     assert_multimodal_inputs_equal(
         result,
         {"image": [torch.stack([a, b]), c.unsqueeze(0)]})
@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
     b = torch.rand([1, 2, 3])
     c = torch.rand([1, 2, 3])
     d = torch.rand([1, 2, 3])
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
     assert_multimodal_inputs_equal(
         result,
         {"image": torch.stack([torch.stack([a, b]),
@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
     b = torch.rand([1, 3, 3])
     c = torch.rand([1, 4, 3])
 
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
     assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
 
-    result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}])
+    result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
     assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})

vllm/model_executor/models/chatglm.py

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
         raise
     pixel_values = raw_batch_data['images']
 
-    return MultiModalInputs({'pixel_values': pixel_values})
+    return MultiModalKwargs({'pixel_values': pixel_values})
 
 
 def merge_glm_vision_embeddings(

vllm/model_executor/models/fuyu.py

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges)
@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
     ])
 
     # image has been processed with prompt in input processor
-    return MultiModalInputs({"pixel_values": data})
+    return MultiModalKwargs({"pixel_values": data})
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)

vllm/model_executor/models/h2ovl.py

Lines changed: 5 additions & 5 deletions
@@ -16,7 +16,7 @@
                          token_inputs)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.utils import is_list_of
 
@@ -324,12 +324,12 @@ def input_mapper(
         data: object,
         *,
         max_dynamic_patch: Optional[int] = None,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
 
         # NOTE: Preprocessing for the image data is done in the
         # 'input_processor' function during actual inference.
         if isinstance(data, dict):
-            return MultiModalInputs(data)
+            return MultiModalKwargs(data)
 
         # The section below is only used with dummy data during
         # memory profiling.
@@ -347,7 +347,7 @@ def input_mapper(
             pixel_values = [image_pixel_values_mapper(img) for img in data]
 
         else:
-            return MultiModalInputs({"image_embeds": data})
+            return MultiModalKwargs({"image_embeds": data})
         model_config = ctx.model_config
         tokenizer = cached_get_tokenizer(
             model_config.tokenizer,
@@ -359,7 +359,7 @@ def input_mapper(
             return_tensors="pt",
         )[0]
 
-        return MultiModalInputs({
+        return MultiModalKwargs({
             "pixel_values": pixel_values,
             "image_token_id": image_token_id
         })
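
The per-model mappers touched in this commit all follow the same shape as the h2ovl mapper above: already-processed dicts and precomputed embeddings are wrapped directly, while raw images are converted to pixel_values first. A stripped-down sketch of that pattern (hypothetical toy mapper, not a file in this commit):

import torch
from vllm.multimodal.base import MultiModalKwargs

def toy_input_mapper(data: object) -> MultiModalKwargs:
    # Already-preprocessed keyword arguments: wrap them as-is.
    if isinstance(data, dict):
        return MultiModalKwargs(data)
    # Assumed convention for this sketch: a 2-D tensor means precomputed embeddings.
    if isinstance(data, torch.Tensor) and data.ndim == 2:
        return MultiModalKwargs({"image_embeds": data})
    # Otherwise treat the input as raw pixel data destined for the vision tower.
    return MultiModalKwargs({"pixel_values": torch.as_tensor(data)})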

vllm/model_executor/models/idefics3.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.processor import cached_get_processor
@@ -127,7 +127,7 @@ def input_mapper_for_idefics3(
         logger.error("Failed to process image (%s)", data)
         raise
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 def _resize_output_size(height: int,

vllm/model_executor/models/internvl.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
                                InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -346,7 +346,7 @@ def input_mapper(
             # we can't stack here because images may have different num_patches
             data = [image_pixel_values_mapper(img) for img in data]
         else:
-            return MultiModalInputs({"image_embeds": data})
+            return MultiModalKwargs({"image_embeds": data})
         model_config = ctx.model_config
         tokenizer = cached_get_tokenizer(
             model_config.tokenizer,
@@ -355,7 +355,7 @@ def input_mapper(
             add_special_tokens=False,
             return_tensors="pt")[0]
 
-        return MultiModalInputs({
+        return MultiModalKwargs({
             "pixel_values": data,
             "image_token_id": image_token_id
         })

vllm/model_executor/models/minicpmv.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@
 from vllm.model_executor.models.utils import LLMWrapper
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SequenceData
@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
         batch_data["slice_start_id"] = data[0]["slice_start_id"]
         batch_data["slice_end_id"] = data[0]["slice_end_id"]
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):

vllm/model_executor/models/mllama.py

Lines changed: 1 addition & 1 deletion
@@ -1162,7 +1162,7 @@ def sample(
 
     def _parse_and_validate_image_input(self, **kwargs: object):
         # tensor with the same shape will be batched together by
-        # MultiModalInputs.batch, so pixel_values here can be:
+        # MultiModalKwargs.batch, so pixel_values here can be:
         #   - List[List[torch.Tensor]]:
         #       with shape (num_tiles, 3, image_res, image_res)
         #   - List[torch.Tensor]:
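
The comment above is the consumer side of the batching rules exercised in tests/multimodal/test_base.py: depending on whether shapes matched, the model may receive a stacked tensor or nested lists for pixel_values. A hedged sketch of normalizing both forms into a flat list (illustrative only, not code from this commit):

from typing import List, Union

import torch

def flatten_pixel_values(
        pixel_values: Union[torch.Tensor, List]) -> List[torch.Tensor]:
    # A fully stacked batch: split it back into per-request tensors.
    if isinstance(pixel_values, torch.Tensor):
        return list(pixel_values.unbind(0))
    # Otherwise walk the (possibly nested) list produced when stacking failed.
    flat: List[torch.Tensor] = []
    for item in pixel_values:
        if isinstance(item, torch.Tensor):
            flat.append(item)
        else:
            flat.extend(item)
    return flat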

vllm/model_executor/models/molmo.py

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
     ctx: InputContext,
     data: object,
 ):
-    return MultiModalInputs(data)
+    return MultiModalKwargs(data)
 
 
 def dummy_data_for_molmo(ctx: InputContext, seq_len: int,

vllm/model_executor/models/pixtral.py

Lines changed: 5 additions & 5 deletions
@@ -30,7 +30,7 @@
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges)
 from vllm.sequence import IntermediateTensors, SequenceData
@@ -94,16 +94,16 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
 
 
 def input_mapper_for_pixtral(ctx: InputContext,
-                             data: object) -> MultiModalInputs:
-    """Maps the input data to its MultiModalInputs (if any).
+                             data: object) -> MultiModalKwargs:
+    """Maps the input data to its MultiModalKwargs (if any).
 
     Args:
         ctx: Context of the loaded model.
         data: data potentially containing image/image embeddings to be mapped
             to pixel_values in .forward() for a visual QWenLMHeadModel model.
 
     Returns:
-        MultiModalInputs containing the stacked normalized images tensor or
+        MultiModalKwargs containing the stacked normalized images tensor or
         image embeddings.
     """
     # Early exit if we have provided an image to a language only Qwen model
@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
                              dtype=torch.float16)
         images.append(image)
 
-    return MultiModalInputs({"images": images})
+    return MultiModalKwargs({"images": images})
 
 
 def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):

vllm/model_executor/models/qwen.py

Lines changed: 6 additions & 6 deletions
@@ -43,7 +43,7 @@
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import is_list_of
@@ -722,16 +722,16 @@ def input_processor_for_qwen(ctx: InputContext,
                         multi_modal_data=multi_modal_data)
 
 
-def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
-    """Maps the input data to its MultiModalInputs (if any).
+def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
+    """Maps the input data to its MultiModalKwargs (if any).
 
     Args:
         ctx: Context of the loaded model.
        data: data potentially containing image/image embeddings to be mapped
            to pixel_values in .forward() for a visual QWenLMHeadModel model.
 
     Returns:
-        MultiModalInputs containing the stacked normalized images tensor or
+        MultiModalKwargs containing the stacked normalized images tensor or
         image embeddings.
     """
     # Early exit if we have provided an image to a language only Qwen model
@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
         logger.warning(
             "Images were provided but this model has no visual config; "
             "multimodal inputs will not be forwarded to the model.")
-        return MultiModalInputs()
+        return MultiModalKwargs()
 
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
         data = [data]
     transformed_images = [transform(datum) for datum in data]
     pixel_values = torch.stack(transformed_images, dim=0)
-    return MultiModalInputs({"pixel_values": pixel_values})
+    return MultiModalKwargs({"pixel_values": pixel_values})
 
 
 def build_normalization_transform(image_size: int) -> transforms.Compose:

vllm/model_executor/models/qwen2_audio.py

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import consecutive_placeholder_ranges
 from vllm.sequence import IntermediateTensors, SequenceData
 
@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
 def input_mapper_for_qwen2_audio(
     ctx: InputContext,
     multi_modal_data: Union[np.ndarray, List[np.ndarray]],
-) -> MultiModalInputs:
+) -> MultiModalKwargs:
     """Input mapper for Qwen2-Audio."""
     if not isinstance(multi_modal_data, list):
         multi_modal_data = [multi_modal_data]
 
     if len(multi_modal_data) == 0:
-        return MultiModalInputs()
+        return MultiModalKwargs()
 
     processor = cached_get_processor(ctx.model_config.model)
     audio_feature_extractor = processor.feature_extractor
@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
         logger.error("Failed to process audio (%s)", multi_modal_data)
         raise
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
