@@ -17,6 +17,7 @@
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LayerBlockType, cdiv, is_pin_memory_available)
@@ -629,19 +630,34 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
             for input_id in encoder_input_ids:
                 mm_inputs.append(req_state.mm_inputs[input_id])
                 req_input_ids.append((req_id, input_id))
-        batched_mm_inputs = MultiModalKwargs.batch(mm_inputs)
-        batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                       device=self.device)
-
-        # Run the encoder.
-        # `encoder_outputs` is either of the following:
-        # 1. A tensor of shape [num_images, feature_size, hidden_size]
-        # in case when feature_size is fixed across all images.
-        # 2. A list (length: num_images) of tensors, each of shape
-        # [feature_size, hidden_size] in case when the feature size is
-        # dynamic depending on input images.
-        encoder_outputs = self.model.get_multimodal_embeddings(
-            **batched_mm_inputs)
+
+        # Batch mm inputs as much as we can: if a request in the batch has
+        # multiple modalities or a different modality than the previous one,
+        # we process it separately to preserve item order.
+        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
+        # in the same batch while still being able to benefit from batching
+        # multimodal inputs. The proper solution should be reordering the
+        # encoder outputs.
+        grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
+
+        encoder_outputs = []
+        for grouped_mm_inputs in grouped_mm_inputs_list:
+            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
+            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
+                                                           device=self.device)
+
+            # Run the encoder.
+            # `curr_group_outputs` is either of the following:
+            # 1. A tensor of shape (num_items, feature_size, hidden_size)
+            # in case feature_size is fixed across all multimodal items.
+            # 2. A list or tuple (length: num_items) of tensors, each of shape
+            # (feature_size, hidden_size) in case the feature size is dynamic
+            # depending on the input multimodal items.
+            curr_group_outputs = self.model.get_multimodal_embeddings(
+                **batched_mm_inputs)
+
+            for output in curr_group_outputs:
+                encoder_outputs.append(output)
 
         # Cache the encoder outputs.
         for (req_id, input_id), output in zip(req_input_ids, encoder_outputs):
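The new comment describes the grouping contract without showing it: consecutive inputs that share a single modality are batched together, while any input carrying multiple modalities is isolated in its own group, so overall item order is preserved. A minimal sketch of that behavior, not the actual vLLM implementation (the `modalities` attribute and plain-list input here are assumptions for illustration):

```python
from itertools import groupby


def group_by_modality_sketch(mm_inputs):
    """Run-length group consecutive inputs by modality, keeping order.

    Inputs with exactly one modality share a group with equal
    neighbors; inputs with multiple (or no) modalities get a unique
    key, so each becomes a singleton group.
    """

    def group_key(mm_input):
        # `modalities` is assumed to be a set like {"image"} or
        # {"image", "audio"}; the real MultiModalKwargs API may differ.
        modalities = getattr(mm_input, "modalities", set())
        if len(modalities) == 1:
            return next(iter(modalities))
        return id(mm_input)  # unique key -> singleton group

    return [list(group) for _, group in groupby(mm_inputs, key=group_key)]
```

For example, a batch ordered `[image, image, audio, image]` would yield `[[image, image], [audio], [image]]`: each group goes through the encoder in one call, and appending the per-group outputs in order reproduces the original item order, which is what the `encoder_outputs.append(output)` loop relies on.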
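One subtlety the diff leans on: the `for output in curr_group_outputs` loop is shape-agnostic, because iterating a stacked `(num_items, feature_size, hidden_size)` tensor yields `num_items` slices of shape `(feature_size, hidden_size)`, the same per-item tensors that a list or tuple return form would yield. A toy check (the shapes are made up for illustration):

```python
import torch

# Case 1: fixed feature size -> one stacked tensor.
stacked = torch.randn(3, 16, 8)  # (num_items, feature_size, hidden_size)
# Case 2: dynamic feature size -> a list of per-item tensors.
ragged = [torch.randn(n, 8) for n in (16, 32, 24)]

for outputs in (stacked, ragged):
    per_item = [output for output in outputs]  # same loop as in the diff
    assert len(per_item) == 3
    assert all(t.dim() == 2 for t in per_item)  # each (feature_size, hidden)
```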