[Misc] Rename MultiModalInputsV2 -> MultiModalInputs #12244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 1 commit, Jan 21, 2025
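
Every change in this diff is a mechanical rename of `MultiModalInputsV2` to `MultiModalInputs` across imports, type annotations, comments, and constructor calls; no behavior changes. A minimal, hedged sketch of downstream usage after the rename (the `handle` function is hypothetical; the import path and the `"type"` key are taken from the diff below):

```python
# Hypothetical usage sketch; only names that appear in this PR are assumed to exist.
from vllm.multimodal.inputs import MultiModalInputs  # formerly MultiModalInputsV2


def handle(inputs: MultiModalInputs) -> None:
    # MultiModalInputs is a TypedDict, so at runtime it is a plain dict;
    # the "type" key distinguishes it from token-only inputs (see vllm/inputs/data.py below).
    if inputs["type"] == "multimodal":
        ...
```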
2 changes: 1 addition & 1 deletion docs/source/api/multimodal/inputs.md
@@ -43,7 +43,7 @@
```

```{eval-rst}
- .. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+ .. autoclass:: vllm.multimodal.inputs.MultiModalInputs
:members:
:show-inheritance:
```
12 changes: 6 additions & 6 deletions vllm/inputs/data.py
@@ -9,7 +9,7 @@
if TYPE_CHECKING:
from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict)
- from vllm.multimodal.inputs import MultiModalInputsV2
+ from vllm.multimodal.inputs import MultiModalInputs


class TextPrompt(TypedDict):
@@ -207,7 +207,7 @@ def token_inputs(
return inputs


- DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"]
+ DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
"""
The inputs in :class:`~vllm.LLMEngine` before they are
passed to the model executor.
@@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict):

This specifies the required data for encoder-decoder models.
"""
- encoder: Union[TokenInputs, "MultiModalInputsV2"]
+ encoder: Union[TokenInputs, "MultiModalInputs"]
"""The inputs for the encoder portion."""

- decoder: Union[TokenInputs, "MultiModalInputsV2"]
+ decoder: Union[TokenInputs, "MultiModalInputs"]
"""The inputs for the decoder portion."""


- SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"]
+ SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
"""
A processed :class:`SingletonPrompt` which can be passed to
:class:`vllm.sequence.Sequence`.
@@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]:
return inputs.get("multi_modal_hashes", [])

if inputs["type"] == "multimodal":
- # only the case when we use MultiModalInputsV2
+ # only the case when we use MultiModalInputs
return inputs.get("mm_hashes", []) # type: ignore[return-value]

assert_never(inputs) # type: ignore[arg-type]
6 changes: 3 additions & 3 deletions vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
- from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
+ from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

@@ -247,7 +247,7 @@ def _process_multimodal(
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata.
@@ -271,7 +271,7 @@ async def _process_multimodal_async(
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
"""Async version of :meth:`_process_multimodal`."""
tokenizer_group = self.get_tokenizer_group()
tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request
4 changes: 2 additions & 2 deletions vllm/model_executor/models/blip2.py
@@ -15,7 +15,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -490,7 +490,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
4 changes: 2 additions & 2 deletions vllm/model_executor/models/chameleon.py
@@ -29,7 +29,7 @@
from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -159,7 +159,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
4 changes: 2 additions & 2 deletions vllm/model_executor/models/fuyu.py
@@ -31,7 +31,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems)
@@ -232,7 +232,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only |SPEAKER| (image) tokens should be considered as placeholders,
6 changes: 3 additions & 3 deletions vllm/model_executor/models/llava.py
@@ -24,7 +24,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
@@ -746,7 +746,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index

@@ -805,7 +805,7 @@ def get_replacement_mantis(item_idx: int):
for modality, placeholders in mm_placeholders.items()
}

- return MultiModalInputsV2(
+ return MultiModalInputs(
type="multimodal",
prompt=prompt,
prompt_token_ids=prompt_ids,
4 changes: 2 additions & 2 deletions vllm/model_executor/models/phi3v.py
@@ -31,7 +31,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
@@ -484,7 +484,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <|image|> tokens should be considered as placeholders,
4 changes: 2 additions & 2 deletions vllm/model_executor/models/qwen2_audio.py
@@ -37,7 +37,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
+ MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser)
@@ -245,7 +245,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <|AUDIO|> tokens should be considered as placeholders,
2 changes: 1 addition & 1 deletion vllm/multimodal/inputs.py
@@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
"""


- class MultiModalInputsV2(TypedDict):
+ class MultiModalInputs(TypedDict):
"""
Represents the outputs of
:class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
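
Because `MultiModalInputs` is a `TypedDict`, the rename is purely a static-typing and documentation change; at runtime instances are ordinary dicts, so nothing about their representation is affected. A minimal stand-in illustrating that point (the field list is abbreviated to the keys visible in this diff; the real class defines more fields that are truncated here):

```python
from typing import TypedDict


class MultiModalInputs(TypedDict, total=False):
    """Abbreviated stand-in for vllm.multimodal.inputs.MultiModalInputs."""
    type: str
    prompt: str
    prompt_token_ids: list[int]


# A TypedDict "constructor" call just builds a dict, so renaming the class
# changes type checking and docs only, never the runtime representation.
inputs = MultiModalInputs(type="multimodal", prompt="hello", prompt_token_ids=[1, 2])
assert inputs == {"type": "multimodal", "prompt": "hello", "prompt_token_ids": [1, 2]}
```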
10 changes: 5 additions & 5 deletions vllm/multimodal/processing.py
@@ -18,8 +18,8 @@

from .hasher import MultiModalHasher
from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
- MultiModalInputsV2, MultiModalKwargs,
- MultiModalKwargsItem, PlaceholderRange)
+ MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem,
+ PlaceholderRange)
from .parse import MultiModalDataItems, MultiModalDataParser

if TYPE_CHECKING:
@@ -609,7 +609,7 @@ def __call__(
prompt: str,
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
return self.apply(prompt, mm_data, hf_processor_mm_kwargs)

def _get_data_parser(self) -> MultiModalDataParser:
@@ -1067,7 +1067,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.

@@ -1169,7 +1169,7 @@ def apply(
for modality, placeholders in mm_placeholders.items()
}

- return MultiModalInputsV2(
+ return MultiModalInputs(
type="multimodal",
prompt=prompt,
prompt_token_ids=prompt_ids,
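
The `apply`/`__call__` signatures above are the main producers of the renamed type, and the model-specific processors earlier in the diff override `apply` to return it as well. A hedged sketch of driving a processor (how the `processor` instance is obtained is outside this diff, so it is assumed to exist already):

```python
from collections.abc import Mapping

from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs


def process_prompt(
    processor,  # assumed: a BaseMultiModalProcessor instance obtained elsewhere
    prompt: str,
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
    # Per the signature in this PR, __call__ forwards to apply() and returns
    # the renamed MultiModalInputs TypedDict.
    return processor(prompt, mm_data, hf_processor_mm_kwargs)
```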
4 changes: 2 additions & 2 deletions vllm/multimodal/profiling.py
@@ -11,7 +11,7 @@
from vllm.inputs import DummyData
from vllm.logger import init_logger

- from .inputs import MultiModalDataDict, MultiModalInputsV2
+ from .inputs import MultiModalDataDict, MultiModalInputs
from .processing import BaseMultiModalProcessor, BaseProcessingInfo

logger = init_logger(__name__)
@@ -131,7 +131,7 @@ def _get_dummy_mm_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
- ) -> MultiModalInputsV2:
+ ) -> MultiModalInputs:
factory = self.dummy_inputs
processor_inputs = factory.get_dummy_processor_inputs(
seq_len, mm_counts)