Skip to content

Commit 9bfbc82

Browse files
committed
Implement merged processor for llava-next
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 23c1b10 commit 9bfbc82

File tree

10 files changed (+483 additions, -354 deletions)

tests/multimodal/test_processing.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,7 @@ def _test_processing_cache_correctness(
631631
("facebook/chameleon-7b", {"image": False}),
632632
("adept/fuyu-8b", {"image": False}),
633633
("llava-hf/llava-1.5-7b-hf", {"image": True}),
634+
("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
634635
("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
635636
("mistral-community/pixtral-12b", {"image": True}),
636637
("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),

tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@
33
import torch
44

55
from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
6-
LlavaMultiModalProcessor,
7-
get_max_llava_image_tokens)
6+
LlavaMultiModalProcessor)
87
from vllm.model_executor.sampling_metadata import SamplingMetadata
98
from vllm.multimodal import MULTIMODAL_REGISTRY
109

1110

12-
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
1311
@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
1412
class MyLlava(LlavaForConditionalGeneration):
1513

vllm/model_executor/models/clip.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
resolve_visual_encoder_outputs)
2525
from vllm.sequence import SequenceData
2626

27+
from .vision import VisionEncoderInfo
28+
2729

2830
def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
2931
assert image_size % patch_size == 0
@@ -149,6 +151,29 @@ def input_processor_for_clip(
149151
multi_modal_placeholders={"image": ranges})
150152

151153

154+
class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
    """Expose CLIP vision-tower sizing information via ``VisionEncoderInfo``.

    Each method forwards to a module-level CLIP helper or reads a field of
    ``self.vision_config``.
    """

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the image feature size for this CLIP configuration.

        ``image_width`` and ``image_height`` are accepted for interface
        compatibility but are not consulted: the result is derived from
        ``self.vision_config`` alone.
        """
        config = self.vision_config
        return get_clip_image_feature_size(config)

    def get_max_image_tokens(self) -> int:
        """Return the maximum image token count for this CLIP configuration."""
        config = self.vision_config
        return get_max_clip_image_tokens(config)

    def get_num_patches(self) -> int:
        """Return the patch grid length computed from the configured sizes.

        Delegates to ``get_clip_patch_grid_length``, which asserts that
        ``image_size`` is divisible by ``patch_size``.
        """
        config = self.vision_config
        grid_kwargs = {
            "image_size": config.image_size,
            "patch_size": config.patch_size,
        }
        return get_clip_patch_grid_length(**grid_kwargs)

    def get_image_size(self) -> int:
        """Return ``image_size`` from the vision configuration."""
        config = self.vision_config
        return config.image_size
175+
176+
152177
# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
153178
class CLIPVisionEmbeddings(nn.Module):
154179

vllm/model_executor/models/fuyu.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _get_image_target_size(self) -> ImageSize:
7676
return ImageSize(width=target_size["width"],
7777
height=target_size["height"])
7878

79-
def _get_image_grid_size(
79+
def _get_image_feature_grid_size(
8080
self,
8181
*,
8282
image_width: int,
@@ -99,7 +99,7 @@ def _get_image_grid_size(
9999
def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
100100
target_width, target_height = self._get_image_target_size()
101101

102-
max_ncols, max_nrows = self._get_image_grid_size(
102+
max_ncols, max_nrows = self._get_image_feature_grid_size(
103103
image_width=target_width,
104104
image_height=target_height,
105105
)
@@ -172,7 +172,7 @@ def get_replacement_fuyu(item_idx: int):
172172
images = mm_items.get_items("image", ImageProcessorItems)
173173
image_size = images.get_image_size(item_idx)
174174

175-
ncols, nrows = self._get_image_grid_size(
175+
ncols, nrows = self._get_image_feature_grid_size(
176176
image_width=image_size.width,
177177
image_height=image_size.height,
178178
)

0 commit comments

Comments
 (0)