Commit 4da1f66
[VLM] Keep track of whether prompt replacements have been applied (#13215)
1 parent 556ef7f

File tree
10 files changed: +373 -329 lines

vllm/model_executor/models/glm4v.py (+8)

@@ -484,6 +484,14 @@ def get_dummy_processor_inputs(

 class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):

+    def _hf_processor_applies_repl(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> bool:
+        return False
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
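
Note: the new _hf_processor_applies_repl hook is a boolean capability flag. A processor that returns True declares that the HuggingFace processor has already expanded the multimodal placeholders in the prompt; GLM-4V returns False, so the caller still has to apply the replacements itself. A minimal, self-contained sketch of the dispatch pattern this enables (the function and token names below are illustrative stand-ins, not vLLM's actual base-class code):

    from typing import Callable, Mapping

    def expand_placeholders(
        prompt: str,
        applies_repl: Callable[..., bool],    # stand-in for _hf_processor_applies_repl
        manual_expand: Callable[[str], str],  # stand-in for the caller's own replacement pass
        mm_items: Mapping[str, object],
    ) -> str:
        if applies_repl(prompt_text=prompt, mm_items=mm_items,
                        hf_processor_mm_kwargs={}):
            # The HF processor output already contains the placeholder tokens.
            return prompt
        # Otherwise (the GLM-4V case above), expand them ourselves.
        return manual_expand(prompt)

    # Toy usage: a GLM-4V-style processor that never applies replacements itself.
    expanded = expand_placeholders(
        "<image> Describe this picture.",
        applies_repl=lambda **_: False,
        manual_expand=lambda p: p.replace("<image>", "<image_placeholder>"),
        mm_items={"image": [object()]},
    )
    assert expanded == "<image_placeholder> Describe this picture."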

vllm/model_executor/models/llava.py (+1, -2)

@@ -294,7 +294,7 @@ def _call_hf_processor(
         pixel_values = processed_outputs.get("pixel_values")
         if pixel_values is not None:
             # Before/after https://github.com/huggingface/transformers/pull/35122
-            if Version(TRANSFORMERS_VERSION) <= Version("4.48.2"):
+            if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"):
                 images = mm_data["images"]
                 assert isinstance(images, list)

@@ -819,7 +819,6 @@ def get_replacement_mantis(item_idx: int):
             prompt_ids,
             mm_item_counts,
         )
-
        self._validate_mm_placeholders(mm_placeholders, mm_item_counts)

        mm_placeholder_ranges = {
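
Note: the version gate above relies on packaging's Version comparison, so the bumped bound simply extends the legacy branch through the 4.48.3 patch release. A standalone check of the comparison semantics (not part of the diff):

    from packaging.version import Version

    # Release segments compare numerically, so every 4.48.x up to 4.48.3 keeps
    # the pre-PR-35122 code path, while 4.49.0 and its dev pre-releases do not.
    assert Version("4.48.2") <= Version("4.48.3")
    assert Version("4.48.3") <= Version("4.48.3")
    assert not (Version("4.49.0") <= Version("4.48.3"))
    assert not (Version("4.49.0.dev0") <= Version("4.48.3"))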

vllm/model_executor/models/llava_onevision.py (+45, -12)

@@ -299,36 +299,69 @@ def _call_hf_processor(
             mm_kwargs=mm_kwargs,
         )

+        # LLaVA-OneVision processor doesn't support multiple videos
+        # with different sizes when converting back to tensors
+        # So, we process each component separately
+        # NOTE: No prompt replacement is applied in this case
         processor = self.info.get_hf_processor()
+        image_token = processor.image_token
         video_token = processor.video_token

-        # LLaVA-OneVision processor doesn't support multiple videos
-        # with different sizes when converting back to tensors
-        text_image_outputs = super()._call_hf_processor(
+        text_outputs = super()._call_hf_processor(
             prompt=prompt,
-            mm_data=mm_data,
+            mm_data={},
             mm_kwargs=mm_kwargs,
         )

+        images = mm_data.pop("images", [])
+        assert isinstance(images, list)
+        if images:
+            processor_outputs = super()._call_hf_processor(
+                prompt=image_token * len(images),
+                mm_data={"images": images},
+                mm_kwargs=mm_kwargs,
+            )
+            image_outputs = {
+                k: v
+                for k, v in processor_outputs.items()
+                if k in ("pixel_values", "image_sizes")
+            }
+        else:
+            image_outputs = {}
+
         pixel_values_videos = []
         for video in videos:
-            item_processor_data = dict(prompt=video_token, videos=video)
-
             item_outputs = super()._call_hf_processor(
-                prompt=prompt,
-                mm_data=item_processor_data,
+                prompt=video_token,
+                mm_data={"videos": video},
                 mm_kwargs=mm_kwargs,
             )

-            pixel_values_videos.append(
-                item_outputs.pop("pixel_values_videos")[0])
+            pixel_values_videos.append(item_outputs["pixel_values_videos"][0])
+
+        video_outputs = {"pixel_values_videos": pixel_values_videos}

         combined_outputs = dict(
-            **text_image_outputs,
-            pixel_values_videos=pixel_values_videos,
+            text_outputs,
+            **image_outputs,
+            **video_outputs,
         )
         return BatchFeature(combined_outputs)

+    def _hf_processor_applies_repl(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> bool:
+        base_result = super()._hf_processor_applies_repl(
+            prompt_text=prompt_text,
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        return base_result and mm_items.get_count("video", strict=False) == 0
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
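
Note: because the videos are processed item by item with a bare video_token prompt, no prompt replacement happens on the HF side, which is why the _hf_processor_applies_repl override returns False whenever videos are present. The final merge then leans on plain dict construction semantics: the positional mapping (text_outputs) seeds the result and the keyword-unpacked image and video mappings are layered on top. A small standalone illustration with toy values (not the real processor outputs):

    # Toy stand-ins for the three per-modality outputs built above.
    text_outputs = {"input_ids": [[1, 2, 3]]}
    image_outputs = {"pixel_values": ["img0"], "image_sizes": [(336, 336)]}
    video_outputs = {"pixel_values_videos": ["vid0", "vid1"]}

    # dict(mapping, **kw1, **kw2) copies the positional mapping first and then
    # applies the keyword expansions; disjoint keys simply accumulate.
    combined_outputs = dict(
        text_outputs,
        **image_outputs,
        **video_outputs,
    )

    assert set(combined_outputs) == {
        "input_ids", "pixel_values", "image_sizes", "pixel_values_videos",
    }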

vllm/model_executor/models/minicpmo.py (+51, -39)

@@ -27,8 +27,8 @@
                     Tuple, TypedDict, Union)

 import torch
-import torch.types
 from torch import nn
+from transformers import BatchFeature
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.models.whisper.modeling_whisper import (
     ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder)

@@ -37,23 +37,21 @@
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import MultiModalFieldConfig
-from vllm.multimodal.parse import (ModalityData, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser,
-                                   VideoItem)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        PromptReplacement)
+from vllm.multimodal.parse import (AudioItem, DictEmbeddingItems, ModalityData,
+                                   ModalityDataItems, MultiModalDataItems,
+                                   MultiModalDataParser)
+from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors

 from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder,
-                       MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser,
-                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo)
+                       MiniCPMVMultiModalDataParser,
+                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo,
+                       _minicpmv_field_config)
 from .utils import AutoWeightsLoader, maybe_prefix

 CPU_DEVICE = torch.device("cpu")

-MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems
-

 class MiniCPMOAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]

@@ -103,28 +101,49 @@ class MiniCPMOAudioEmbeddingInputs(TypedDict):
                                     MiniCPMOAudioEmbeddingInputs]


-class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems):
+def _minicpmo_field_config(hf_inputs: Mapping[str, torch.Tensor]):
+    audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
+
+    return dict(
+        **_minicpmv_field_config(hf_inputs),
+        audio_features=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+        audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+        audio_num_slices=MultiModalFieldConfig.batched("audio"),
+        audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
+        audio_embeds=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+    )
+

-    def __init__(self, data: Dict) -> None:
-        super().__init__(data, "audio")
-        audio_embeds = self.data.get("audio_embeds", None)
-        if audio_embeds is None:
-            raise ValueError("Incorrect type of video_embeds",
-                             "Got type: None")
-        self.data["audio_embeds"] = audio_embeds
+class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):

-    def get(self, index: int) -> object:
-        return self.data["audio_embeds"][index]
+    def __init__(
+        self,
+        data: Mapping[str, torch.Tensor],
+        fields_config: Mapping[str, MultiModalFieldConfig],
+    ) -> None:
+        super().__init__(
+            data,
+            modality="image",
+            fields_config=fields_config,
+            required_fields={"audio_embeds"},
+        )


 class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):

     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
-            return MiniCPMOAudioEmbeddingItems(data)
+            return MiniCPMOAudioEmbeddingItems(
+                data,
+                fields_config=_minicpmo_field_config(data),
+            )
+
         return super()._parse_audio_data(data)


@@ -167,6 +186,10 @@ def get_max_audio_tokens_per_chunk(self) -> int:
     def get_max_audio_chunks_with_most_features(self) -> int:
         return 30

+    def get_max_audio_tokens(self) -> int:
+        return self.get_max_audio_tokens_per_chunk(
+        ) * self.get_max_audio_chunks_with_most_features()
+
     def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
         sampling_rate = self.get_default_audio_sampling_rate()
         # exclude <audio> </audio>

@@ -194,7 +217,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
         return num_frames


-class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder):
+class MiniCPMODummyInputsBuilder(
+        MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):

     def get_dummy_processor_inputs(
         self, seq_len: int, mm_counts: Mapping[str,

@@ -222,8 +246,7 @@ def get_dummy_processor_inputs(


 class MiniCPMOMultiModalProcessor(
-        MiniCPMVMultiModalProcessor,
-        BaseMultiModalProcessor[MiniCPMOProcessingInfo]):
+        MiniCPMVMultiModalProcessor[MiniCPMOProcessingInfo]):

     def _get_data_parser(self) -> MultiModalDataParser:
         return MiniCPMOMultiModalDataParser(

@@ -369,21 +392,10 @@ def get_replacement_minicpmv(item_idx: int, modality: str):

     def _get_mm_fields_config(
         self,
-        hf_inputs,
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
-
-        return dict(
-            **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
-            audio_features=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices),
-            audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices),
-            audio_num_slices=MultiModalFieldConfig.batched("audio"),
-            audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
-            audio_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices))
+        return _minicpmo_field_config(hf_inputs)


 class MultiModalProjector(nn.Module):

@@ -406,7 +418,7 @@ def forward(self, audio_features: torch.Tensor) -> torch.Tensor:

 class MiniCPMWhisperEncoderLayer(nn.Module):

-    def __init__(self, config: WhisperConfig, layer_idx: int = None):
+    def __init__(self, config: WhisperConfig, layer_idx: int):
         super().__init__()
         self.embed_dim = config.d_model
         self.self_attn = WHISPER_ATTENTION_CLASSES[
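
Note: the _minicpmo_field_config refactor above moves the audio field configuration to module level so that both the data parser (for user-supplied audio_embeds) and _get_mm_fields_config (for HF processor outputs) consume the same mapping. A simplified, self-contained sketch of that sharing pattern, using plain tuples as stand-ins for MultiModalFieldConfig (illustrative only, not vLLM's API):

    from typing import Mapping

    def _audio_field_config(hf_inputs: Mapping[str, object]) -> dict:
        # Single definition of the audio fields (stand-ins for
        # MultiModalFieldConfig.flat_from_sizes / .batched in the real code).
        num_slices = hf_inputs.get("audio_num_slices", [])
        return {
            "audio_features": ("flat_from_sizes", "audio", num_slices),
            "audio_feature_lens": ("flat_from_sizes", "audio", num_slices),
            "audio_num_slices": ("batched", "audio"),
            "audio_orders_in_mm_data": ("batched", "audio"),
            "audio_embeds": ("flat_from_sizes", "audio", num_slices),
        }

    # Both consumers see exactly the same field set, so the parser's embedding
    # handling and the processor's field config cannot drift apart.
    parser_view = _audio_field_config({"audio_embeds": [object()]})
    processor_view = _audio_field_config({"audio_num_slices": [1, 2]})
    assert parser_view.keys() == processor_view.keys()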
