Skip to content

Commit 51a624b

Browse files
[Misc] Move some multimodal utils to modality-specific modules (#11494)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 6ad909f commit 51a624b

File tree

13 files changed: 84 additions and 77 deletions.

tests/models/decoder_only/vision_language/test_awq.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import pytest
44
import torch
55

6-
from vllm.multimodal.utils import rescale_image_size
6+
from vllm.multimodal.image import rescale_image_size
77

88
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
99
from ...utils import check_logprobs_close

tests/models/decoder_only/vision_language/test_h2ovl.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
# Import the functions to test
99
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
1010
image_to_pixel_values_wrapper)
11-
from vllm.multimodal.utils import rescale_image_size
11+
from vllm.multimodal.image import rescale_image_size
1212

1313
models = [
1414
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names

tests/models/decoder_only/vision_language/test_phi3v.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import pytest
66
from transformers import AutoTokenizer
77

8-
from vllm.multimodal.utils import rescale_image_size
8+
from vllm.multimodal.image import rescale_image_size
99
from vllm.platforms import current_platform
1010
from vllm.sequence import SampleLogprobs
1111

tests/models/decoder_only/vision_language/test_qwen2_vl.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@
66
from PIL import Image
77

88
from vllm.entrypoints.llm import LLM
9-
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
10-
sample_frames_from_video)
9+
from vllm.multimodal.image import rescale_image_size
10+
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
1111

1212
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
1313
PromptVideoInput, VllmRunner)

tests/models/decoder_only/vision_language/vlm_utils/builders.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,9 @@
55

66
import torch
77

8-
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
9-
resize_video, sample_frames_from_video)
8+
from vllm.multimodal.image import rescale_image_size
9+
from vllm.multimodal.video import (rescale_video_size, resize_video,
10+
sample_frames_from_video)
1011

1112
from .....conftest import _ImageAssets, _VideoAssets
1213
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,

tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
"""Custom input builders for edge-cases in different models."""
22
from typing import Callable
33

4-
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
5-
resize_video, sample_frames_from_video)
4+
from vllm.multimodal.image import rescale_image_size
5+
from vllm.multimodal.video import (rescale_video_size, resize_video,
6+
sample_frames_from_video)
67

78
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
89
from .builders import build_multi_image_inputs, build_single_image_inputs

tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
88
global_force_attn_backend_context_manager)
9-
from vllm.multimodal.utils import rescale_image_size
9+
from vllm.multimodal.image import rescale_image_size
1010
from vllm.sequence import SampleLogprobs
1111

1212
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,

tests/multimodal/test_mapper.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
from vllm.config import ModelConfig
88
from vllm.multimodal import MultiModalRegistry
9-
from vllm.multimodal.utils import rescale_image_size
9+
from vllm.multimodal.image import rescale_image_size
1010

1111

1212
@pytest.fixture

vllm/assets/video.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
from huggingface_hub import hf_hub_download
88
from PIL import Image
99

10-
from vllm.multimodal.utils import (sample_frames_from_video,
10+
from vllm.multimodal.video import (sample_frames_from_video,
1111
try_import_video_packages)
1212

1313
from .base import get_cache_dir

vllm/multimodal/audio.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
from typing import Any
2+
13
import numpy as np
24
import numpy.typing as npt
35

@@ -26,6 +28,16 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
2628
"There is no default maximum multimodal tokens")
2729

2830

31+
def try_import_audio_packages() -> tuple[Any, Any]:
    """Import the optional audio dependencies and hand them back.

    Returns:
        The ``librosa`` and ``soundfile`` modules, in that order.

    Raises:
        ImportError: If either package cannot be imported. The original
            import failure is chained as the cause.
    """
    try:
        import librosa as librosa_module
        import soundfile as soundfile_module
    except ImportError as import_failure:
        install_hint = "Please install vllm[audio] for audio support."
        raise ImportError(install_hint) from import_failure
    else:
        return librosa_module, soundfile_module
39+
40+
2941
def resample_audio(
3042
audio: npt.NDArray[np.floating],
3143
*,

vllm/multimodal/image.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,3 +84,15 @@ def _default_input_mapper(
8484

8585
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
8686
return 3000
87+
88+
89+
def rescale_image_size(image: Image.Image,
                       size_factor: float,
                       transpose: int = -1) -> Image.Image:
    """Rescale the dimensions of an image by a constant factor.

    Args:
        image: Image to rescale.
        size_factor: Positive multiplier applied to both width and height.
        transpose: When non-negative, the value is converted with
            ``Image.Transpose`` and applied after resizing. The default
            ``-1`` skips transposition.

    Returns:
        The resized (and optionally transposed) image.

    Raises:
        ValueError: If ``size_factor`` is not positive.
    """
    if size_factor <= 0:
        raise ValueError(
            f"size_factor must be positive, got {size_factor!r}")

    # Truncating to int can yield a zero-length side for thin images or
    # small factors, which ``Image.resize`` rejects; keep at least one pixel.
    new_width = max(1, int(image.width * size_factor))
    new_height = max(1, int(image.height * size_factor))
    image = image.resize((new_width, new_height))
    if transpose >= 0:
        image = image.transpose(Image.Transpose(transpose))
    return image

vllm/multimodal/utils.py

Lines changed: 3 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
import os
33
from functools import lru_cache
44
from io import BytesIO
5-
from typing import Any, List, Optional, Tuple, TypeVar, Union
5+
from typing import List, Optional, Tuple, TypeVar, Union
66

77
import numpy as np
88
import numpy.typing as npt
@@ -14,7 +14,9 @@
1414
from vllm.logger import init_logger
1515
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
1616

17+
from .audio import try_import_audio_packages
1718
from .inputs import MultiModalDataDict, PlaceholderRange
19+
from .video import try_import_video_packages
1820

1921
logger = init_logger(__name__)
2022

@@ -198,16 +200,6 @@ async def async_fetch_video(video_url: str,
198200
return video
199201

200202

201-
def try_import_audio_packages() -> Tuple[Any, Any]:
202-
try:
203-
import librosa
204-
import soundfile
205-
except ImportError as exc:
206-
raise ImportError(
207-
"Please install vllm[audio] for audio support.") from exc
208-
return librosa, soundfile
209-
210-
211203
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
212204
"""
213205
Load audio from a URL.
@@ -324,60 +316,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
324316
return _load_image_from_bytes(base64.b64decode(image))
325317

326318

327-
def rescale_image_size(image: Image.Image,
328-
size_factor: float,
329-
transpose: int = -1) -> Image.Image:
330-
"""Rescale the dimensions of an image by a constant factor."""
331-
new_width = int(image.width * size_factor)
332-
new_height = int(image.height * size_factor)
333-
image = image.resize((new_width, new_height))
334-
if transpose >= 0:
335-
image = image.transpose(Image.Transpose(transpose))
336-
return image
337-
338-
339-
def try_import_video_packages():
340-
try:
341-
import cv2
342-
import decord
343-
except ImportError as exc:
344-
raise ImportError(
345-
"Please install vllm[video] for video support.") from exc
346-
return cv2, decord
347-
348-
349-
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
350-
cv2, _ = try_import_video_packages()
351-
352-
num_frames, _, _, channels = frames.shape
353-
new_height, new_width = size
354-
resized_frames = np.empty((num_frames, new_height, new_width, channels),
355-
dtype=frames.dtype)
356-
for i, frame in enumerate(frames):
357-
resized_frame = cv2.resize(frame, (new_width, new_height))
358-
resized_frames[i] = resized_frame
359-
return resized_frames
360-
361-
362-
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
363-
_, height, width, _ = frames.shape
364-
new_height = int(height * size_factor)
365-
new_width = int(width * size_factor)
366-
367-
return resize_video(frames, (new_height, new_width))
368-
369-
370-
def sample_frames_from_video(frames: npt.NDArray,
371-
num_frames: int) -> npt.NDArray:
372-
total_frames = frames.shape[0]
373-
if num_frames == -1:
374-
return frames
375-
else:
376-
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
377-
sampled_frames = frames[frame_indices, ...]
378-
return sampled_frames
379-
380-
381319
def encode_video_base64(frames: npt.NDArray) -> str:
382320
base64_frames = []
383321
frames_list = [frames[i] for i in range(frames.shape[0])]

vllm/multimodal/video.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from typing import TYPE_CHECKING, Any, Dict, Optional
33

44
import numpy as np
5+
import numpy.typing as npt
56

67
from vllm.inputs.registry import InputContext
78
from vllm.logger import init_logger
@@ -75,3 +76,45 @@ def _default_input_mapper(
7576

7677
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
7778
return 4096
79+
80+
81+
def try_import_video_packages() -> tuple[Any, Any]:
    """Import the optional video dependencies and hand them back.

    Returns:
        The ``cv2`` and ``decord`` modules, in that order.

    Raises:
        ImportError: If either package cannot be imported. The original
            import failure is chained as the cause.
    """
    try:
        import cv2 as cv2_module
        import decord as decord_module
    except ImportError as import_failure:
        install_hint = "Please install vllm[video] for video support."
        raise ImportError(install_hint) from import_failure
    else:
        return cv2_module, decord_module
89+
90+
91+
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a video to one common target size.

    Args:
        frames: Array whose shape unpacks into exactly four axes. The first
            axis is iterated frame by frame and the last axis is carried
            over unchanged into the output.
        size: Target ``(height, width)`` for each output frame.

    Returns:
        A newly allocated array of shape
        ``(num_frames, height, width, channels)`` with the dtype of
        ``frames``.
    """
    cv2, _ = try_import_video_packages()

    frame_count, _, _, channel_count = frames.shape
    target_height, target_width = size
    output_shape = (frame_count, target_height, target_width, channel_count)
    output = np.empty(output_shape, dtype=frames.dtype)

    # cv2.resize is given (width, height), the reverse of ``size``.
    cv2_size = (target_width, target_height)
    for index, frame in enumerate(frames):
        output[index] = cv2.resize(frame, cv2_size)
    return output
102+
103+
104+
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale the height and width of every frame by a constant factor.

    Args:
        frames: Array whose shape unpacks into exactly four axes, with the
            second and third read as height and width.
        size_factor: Multiplier applied to both height and width. The
            products are truncated with ``int``.

    Returns:
        The result of ``resize_video`` for the scaled ``(height, width)``.
    """
    _, height, width, _ = frames.shape
    scaled_size = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, scaled_size)
110+
111+
112+
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """Pick evenly spaced frames along the first axis of a video array.

    Args:
        frames: Video array. Its first axis is treated as the frame axis.
        num_frames: Number of frames to keep, or ``-1`` to keep them all.

    Returns:
        ``frames`` itself when ``num_frames`` is ``-1``; otherwise a new
        array holding the frames at ``num_frames`` integer positions spread
        from the first frame to the last.
    """
    if num_frames != -1:
        last_index = frames.shape[0] - 1
        picks = np.linspace(0, last_index, num_frames, dtype=int)
        return frames[picks, ...]

    # Sentinel value: the caller asked for every frame, so nothing is copied.
    return frames

0 commit comments

Comments
 (0)