Skip to content

Commit b7e5324

Browse files
committed
Cleanup
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 62942e3 commit b7e5324

File tree

5 files changed: 52 additions (+52) and 50 deletions (-50)

vllm/model_executor/models/llava.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
123123
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
124124
return {"image": self.get_max_image_tokens()}
125125

126-
def apply_feature_select_strategy(
126+
def _apply_feature_select_strategy(
127127
self,
128128
strategy: str,
129129
encoder_num_image_tokens: int,
@@ -145,7 +145,7 @@ def get_num_image_tokens(
145145
hf_config = self.get_hf_config()
146146
vision_encoder_info = self.get_vision_encoder_info()
147147

148-
return self.apply_feature_select_strategy(
148+
return self._apply_feature_select_strategy(
149149
hf_config.vision_feature_select_strategy,
150150
vision_encoder_info.get_num_image_tokens(
151151
image_width=image_width,

vllm/model_executor/models/llava_next.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from abc import abstractmethod
12
from functools import cached_property
23
from typing import (Final, Iterable, List, Literal, Mapping, Optional,
34
Protocol, Set, Tuple, TypedDict, TypeVar, Union)
@@ -82,7 +83,7 @@ def get_num_image_tokens(
8283
hf_config = self.get_hf_config()
8384
vision_encoder_info = self.get_vision_encoder_info()
8485

85-
base_feature_size = self.apply_feature_select_strategy(
86+
base_feature_size = self._apply_feature_select_strategy(
8687
hf_config.vision_feature_select_strategy,
8788
vision_encoder_info.get_num_image_tokens(
8889
image_width=image_width,
@@ -99,7 +100,7 @@ def get_num_image_tokens(
99100
(
100101
unpadded_feature_size,
101102
newline_feature_size,
102-
) = self.get_num_unpadded_features(
103+
) = self._get_num_unpadded_features(
103104
original_height=image_height,
104105
original_width=image_width,
105106
npatches=vision_encoder_info.get_patch_grid_length(),
@@ -110,7 +111,7 @@ def get_num_image_tokens(
110111
return unpadded_feature_size + newline_feature_size + base_feature_size
111112

112113
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
113-
def get_num_unpadded_features(
114+
def _get_num_unpadded_features(
114115
self,
115116
*,
116117
original_height: int,
@@ -162,6 +163,19 @@ def get_image_size_with_most_features(self) -> ImageSize:
162163

163164
class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
164165

166+
# Copied from BaseMultiModalProcessor
167+
@abstractmethod
168+
def _get_mm_fields_config(
169+
self,
170+
hf_inputs: BatchFeature,
171+
hf_processor_mm_kwargs: Mapping[str, object],
172+
) -> Mapping[str, MultiModalFieldConfig]:
173+
raise NotImplementedError
174+
175+
176+
class LlavaNextMultiModalProcessor(
177+
BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
178+
165179
def _get_mm_fields_config(
166180
self,
167181
hf_inputs: BatchFeature,
@@ -174,11 +188,6 @@ def _get_mm_fields_config(
174188
)
175189

176190

177-
class LlavaNextMultiModalProcessor(
178-
BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
179-
pass
180-
181-
182191
@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
183192
info=LlavaNextProcessingInfo,
184193
dummy=LlavaDummyInputsBuilder)

vllm/model_executor/models/llava_next_video.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
6666
max_video_tokens = self.get_num_video_tokens(
6767
image_width=target_width,
6868
image_height=target_height,
69-
num_frames=self.get_max_num_frames(seq_len),
69+
num_frames=self.get_num_frames_with_most_features(seq_len),
7070
)
7171

7272
return {"video": max_video_tokens}
@@ -76,7 +76,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
7676
width = height = vision_encoder_info.get_image_size()
7777
return ImageSize(width=width, height=height)
7878

79-
def get_num_frame_tokens(
79+
def _get_num_frame_tokens(
8080
self,
8181
*,
8282
image_width: int,
@@ -98,14 +98,14 @@ def get_num_video_tokens(
9898
image_height: int,
9999
num_frames: int,
100100
) -> int:
101-
num_frame_tokens = self.get_num_frame_tokens(
101+
num_frame_tokens = self._get_num_frame_tokens(
102102
image_width=image_width,
103103
image_height=image_height,
104104
)
105105

106106
return num_frame_tokens * num_frames
107107

108-
def get_max_video_frames(self, max_tokens: int) -> int:
108+
def _get_max_video_frames(self, max_tokens: int) -> int:
109109
target_width, target_height = self.get_image_size_with_most_features()
110110

111111
num_frames = 0
@@ -125,11 +125,11 @@ def get_max_video_frames(self, max_tokens: int) -> int:
125125

126126
return num_frames
127127

128-
def get_max_num_frames(self, seq_len: int) -> int:
128+
def get_num_frames_with_most_features(self, seq_len: int) -> int:
129129
mm_config = self.ctx.get_mm_config()
130130
max_videos = mm_config.limit_per_prompt.get("video", 1)
131131

132-
max_total_frames = self.get_max_video_frames(seq_len)
132+
max_total_frames = self._get_max_video_frames(seq_len)
133133

134134
return max(max_total_frames // max(max_videos, 1), 1)
135135

@@ -146,15 +146,18 @@ def get_dummy_processor_inputs(
146146

147147
processor = self.info.get_hf_processor()
148148
video_token = processor.video_token
149+
149150
target_width, target_height = \
150151
self.info.get_image_size_with_most_features()
152+
target_num_frames = \
153+
self.info.get_num_frames_with_most_features(seq_len)
151154

152155
mm_data = {
153156
"video":
154157
self._get_dummy_videos(
155158
width=target_width,
156159
height=target_height,
157-
num_frames=self.info.get_max_num_frames(seq_len),
160+
num_frames=target_num_frames,
158161
num_videos=num_videos,
159162
)
160163
}

vllm/model_executor/models/llava_onevision.py

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
from vllm.multimodal import MULTIMODAL_REGISTRY
2020
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
2121
NestedTensors)
22-
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
23-
VideoEmbeddingItems, VideoProcessorItems)
22+
from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
23+
VideoProcessorItems)
2424
from vllm.multimodal.processing import PromptReplacement
2525
from vllm.multimodal.profiling import ProcessorInputs
2626
from vllm.sequence import IntermediateTensors
@@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
109109

110110
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
111111
# with additional logic afterwards taken from LlavaOnevisionProcessor
112-
def get_num_unpadded_features(
112+
def _get_num_unpadded_features(
113113
self,
114114
*,
115115
original_height: int,
@@ -145,23 +145,7 @@ def get_num_unpadded_features(
145145

146146
return (unpadded_features, newline_features)
147147

148-
def get_image_size_with_most_features(self) -> ImageSize:
149-
hf_config = self.get_hf_config()
150-
largest_feature_size, largest_feature_pinpoint = 0, None
151-
for (height, width) in hf_config.image_grid_pinpoints:
152-
feat_size = self.get_num_image_tokens(image_width=width,
153-
image_height=height)
154-
if feat_size > largest_feature_size:
155-
largest_feature_size = feat_size
156-
largest_feature_pinpoint = ImageSize(width=width,
157-
height=height)
158-
159-
if largest_feature_size == 0 or largest_feature_pinpoint is None:
160-
raise ValueError("Cannot have a largest feature size of 0!")
161-
162-
return largest_feature_pinpoint
163-
164-
def get_num_frame_tokens(
148+
def _get_num_frame_tokens(
165149
self,
166150
*,
167151
image_width: int,
@@ -183,14 +167,14 @@ def get_num_video_tokens(
183167
image_height: int,
184168
num_frames: int,
185169
) -> int:
186-
num_frame_tokens = self.get_num_frame_tokens(
170+
num_frame_tokens = self._get_num_frame_tokens(
187171
image_width=image_width,
188172
image_height=image_height,
189173
)
190174

191175
return num_frame_tokens * num_frames + 1 # Newline token
192176

193-
def get_max_video_frames(self, max_tokens: int) -> int:
177+
def _get_max_video_frames(self, max_tokens: int) -> int:
194178
target_width, target_height = self.get_image_size_with_most_features()
195179

196180
num_frames = 0
@@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:
210194

211195
return num_frames
212196

213-
def get_max_num_frames(self, seq_len: int) -> int:
197+
def get_num_frames_with_most_features(self, seq_len: int) -> int:
214198
mm_config = self.ctx.get_mm_config()
215199
max_images = mm_config.limit_per_prompt.get("image", 1)
216200
max_videos = mm_config.limit_per_prompt.get("video", 1)
217201

218202
max_image_tokens = self.get_max_image_tokens() * max_images
219-
max_total_frames = self.get_max_video_frames(seq_len -
220-
max_image_tokens)
203+
max_total_frames = self._get_max_video_frames(seq_len -
204+
max_image_tokens)
221205
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
222206
_MAX_FRAMES_PER_VIDEO)
223207

@@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
229213
return self.get_num_video_tokens(
230214
image_width=target_width,
231215
image_height=target_height,
232-
num_frames=self.get_max_num_frames(seq_len),
216+
num_frames=self.get_num_frames_with_most_features(seq_len),
233217
)
234218

235219

@@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
247231
processor = self.info.get_hf_processor()
248232
image_token = processor.image_token
249233
video_token = processor.video_token
234+
250235
target_width, target_height = \
251236
self.info.get_image_size_with_most_features()
237+
target_num_frames = \
238+
self.info.get_num_frames_with_most_features(seq_len)
252239

253240
mm_data = {
254241
"image":
@@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
259246
self._get_dummy_videos(
260247
width=target_width,
261248
height=target_height,
262-
num_frames=self.info.get_max_num_frames(seq_len),
249+
num_frames=target_num_frames,
263250
num_videos=num_videos,
264251
)
265252
}

vllm/model_executor/models/qwen2_vl.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -836,7 +836,7 @@ def get_max_image_tokens(self) -> int:
836836
image_height=target_height,
837837
)
838838

839-
def get_max_video_frames(self, max_tokens: int) -> int:
839+
def _get_max_video_frames(self, max_tokens: int) -> int:
840840
target_width, target_height = self.get_image_size_with_most_features()
841841

842842
num_frames = 0
@@ -856,14 +856,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:
856856

857857
return num_frames
858858

859-
def get_max_num_frames(self, seq_len: int) -> int:
859+
def get_num_frames_with_most_features(self, seq_len: int) -> int:
860860
mm_config = self.ctx.get_mm_config()
861861
max_images = mm_config.limit_per_prompt.get("image", 1)
862862
max_videos = mm_config.limit_per_prompt.get("video", 1)
863863

864864
max_image_tokens = self.get_max_image_tokens() * max_images
865-
max_total_frames = self.get_max_video_frames(seq_len -
866-
max_image_tokens)
865+
max_total_frames = self._get_max_video_frames(seq_len -
866+
max_image_tokens)
867867

868868
num_frames = max(max_total_frames // max(max_videos, 1), 1)
869869

@@ -879,7 +879,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
879879
return self.get_num_video_tokens(
880880
image_width=target_width,
881881
image_height=target_height,
882-
num_frames=self.get_max_num_frames(seq_len),
882+
num_frames=self.get_num_frames_with_most_features(seq_len),
883883
)
884884

885885

@@ -896,8 +896,11 @@ def get_dummy_processor_inputs(
896896
hf_processor = self.info.get_hf_processor()
897897
image_token: str = hf_processor.image_token
898898
video_token: str = hf_processor.video_token
899+
899900
target_width, target_height = \
900901
self.info.get_image_size_with_most_features()
902+
target_num_frames = \
903+
self.info.get_num_frames_with_most_features(seq_len)
901904

902905
mm_data = {
903906
"image":
@@ -908,7 +911,7 @@ def get_dummy_processor_inputs(
908911
self._get_dummy_videos(
909912
width=target_width,
910913
height=target_height,
911-
num_frames=self.info.get_max_num_frames(seq_len),
914+
num_frames=target_num_frames,
912915
num_videos=num_videos,
913916
)
914917
}

Comments: 0 commit comments