                                         ProcessingMixin, PromptReplacement)
 from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
+from vllm.utils import is_list_of
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -521,7 +522,7 @@ def sampler(self):
         return get_sampler()
 
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        # The image size may be different for Pixtral-HF
+        # Only the longest edge is equal to image_size for Pixtral-HF
         if self.config.vision_config.model_type == "pixtral":
             return data
 
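Note on the comment change in _validate_pixel_values: Pixtral-HF resizes images while keeping their aspect ratio, so only the longest edge ends up equal to image_size and the per-image height/width vary, which is why the fixed-shape check is skipped and the data is returned unchanged. A minimal sketch of that resize arithmetic, assuming a plain aspect-ratio-preserving resize (the helper below is hypothetical and illustrative only; the real processor applies its own rounding):

def longest_edge_resize(height: int, width: int, image_size: int = 1024):
    # Hypothetical helper: scale so the longest edge equals image_size,
    # keeping the aspect ratio.
    scale = image_size / max(height, width)
    return round(height * scale), round(width * scale)

# A 2000x1000 image becomes 1024x512, while 1000x2000 becomes 512x1024,
# so pixel-value shapes differ from image to image.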
@@ -550,10 +551,12 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
+            pixel_values = flatten_bn(pixel_values,
+                                      concat=is_list_of(pixel_values, list))
+
             return LlavaImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True)),
+                data=self._validate_pixel_values(pixel_values),
             )
 
         if image_embeds is not None:
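The substantive change in the hunk above is that the concat flag passed to flatten_bn is no longer hard-coded to True: it is derived from the structure of the batched input via is_list_of(pixel_values, list), and the flattening now happens before validation so _validate_pixel_values receives the already-flattened data. A rough sketch of that decision, using a hypothetical wants_concat helper in place of the real call (the shapes below are illustrative assumptions, not taken from the model):

import torch

def wants_concat(pixel_values) -> bool:
    # Assumed equivalent of is_list_of(pixel_values, list) for this sketch:
    # True only when the batched input is a list whose elements are themselves
    # lists, i.e. each prompt contributed individually sized image tensors.
    return (isinstance(pixel_values, list)
            and all(isinstance(v, list) for v in pixel_values))

# Fixed-size images batched into one (B, N, 3, H, W) tensor -> False
print(wants_concat(torch.rand(2, 1, 3, 336, 336)))
# One stacked (N, 3, H, W) tensor per prompt -> False
print(wants_concat([torch.rand(1, 3, 336, 336), torch.rand(2, 3, 336, 336)]))
# Pixtral-HF style: a list of variable-sized images per prompt -> True
print(wants_concat([[torch.rand(3, 512, 1024)], [torch.rand(3, 768, 256)]]))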