                     Tuple, TypedDict, Union)

 import torch
-import torch.types
 from torch import nn
+from transformers import BatchFeature
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.models.whisper.modeling_whisper import (
     ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder)
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import MultiModalFieldConfig
-from vllm.multimodal.parse import (ModalityData, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser,
-                                   VideoItem)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        PromptReplacement)
+from vllm.multimodal.parse import (AudioItem, DictEmbeddingItems, ModalityData,
+                                   ModalityDataItems, MultiModalDataItems,
+                                   MultiModalDataParser)
+from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors

 from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder,
-                       MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser,
-                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo)
+                       MiniCPMVMultiModalDataParser,
+                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo,
+                       _minicpmv_field_config)
 from .utils import AutoWeightsLoader, maybe_prefix

 CPU_DEVICE = torch.device("cpu")

-MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems
-

 class MiniCPMOAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]
@@ -103,28 +101,49 @@ class MiniCPMOAudioEmbeddingInputs(TypedDict):
                             MiniCPMOAudioEmbeddingInputs]


-class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems):
+def _minicpmo_field_config(hf_inputs: Mapping[str, torch.Tensor]):
+    audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
+
+    return dict(
+        **_minicpmv_field_config(hf_inputs),
+        audio_features=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+        audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+        audio_num_slices=MultiModalFieldConfig.batched("audio"),
+        audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
+        audio_embeds=MultiModalFieldConfig.flat_from_sizes(
+            "audio", audio_num_slices),
+    )
+

-    def __init__(self, data: Dict) -> None:
-        super().__init__(data, "audio")
-        audio_embeds = self.data.get("audio_embeds", None)
-        if audio_embeds is None:
-            raise ValueError("Incorrect type of video_embeds",
-                             "Got type: None")
-        self.data["audio_embeds"] = audio_embeds
+class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):

-    def get(self, index: int) -> object:
-        return self.data["audio_embeds"][index]
+    def __init__(
+        self,
+        data: Mapping[str, torch.Tensor],
+        fields_config: Mapping[str, MultiModalFieldConfig],
+    ) -> None:
+        super().__init__(
+            data,
+            modality="image",
+            fields_config=fields_config,
+            required_fields={"audio_embeds"},
+        )


 class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):

     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
-            return MiniCPMOAudioEmbeddingItems(data)
+            return MiniCPMOAudioEmbeddingItems(
+                data,
+                fields_config=_minicpmo_field_config(data),
+            )
+
         return super()._parse_audio_data(data)


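For context on the field configs introduced above: MultiModalFieldConfig.batched("audio") marks a tensor as carrying one entry per audio item, while flat_from_sizes("audio", audio_num_slices) marks a tensor as a flat concatenation of slices from all audio items, where audio_num_slices[i] says how many rows belong to item i. Below is a minimal plain-PyTorch sketch of that grouping; the slice counts and feature shapes are made up for illustration and are not taken from this file.

import torch

# Two audio items, chunked into 3 and 2 slices respectively (made-up counts).
audio_num_slices = torch.tensor([3, 2])
# The "flat" layout: all 5 slices concatenated along dim 0; the trailing
# feature dimensions are arbitrary placeholders.
audio_features = torch.randn(int(audio_num_slices.sum()), 80, 100)

# flat_from_sizes-style recovery: split the flat tensor back into per-item groups.
per_item = torch.split(audio_features, audio_num_slices.tolist(), dim=0)
assert [chunk.shape[0] for chunk in per_item] == [3, 2]
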
@@ -167,6 +186,10 @@ def get_max_audio_tokens_per_chunk(self) -> int:
     def get_max_audio_chunks_with_most_features(self) -> int:
         return 30

+    def get_max_audio_tokens(self) -> int:
+        return self.get_max_audio_tokens_per_chunk(
+        ) * self.get_max_audio_chunks_with_most_features()
+
     def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
         sampling_rate = self.get_default_audio_sampling_rate()
         # exclude <audio> </audio>
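The new get_max_audio_tokens() helper simply multiplies the per-chunk token count by the maximum number of chunks assumed during profiling. A tiny worked example of that formula; the per-chunk value here is an assumed placeholder, and only the chunk count of 30 comes from this file.

tokens_per_chunk = 100  # assumed placeholder, not taken from this diff
max_chunks = 30         # get_max_audio_chunks_with_most_features()
max_audio_tokens = tokens_per_chunk * max_chunks  # same formula as get_max_audio_tokens()
assert max_audio_tokens == 3000
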
@@ -194,7 +217,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
         return num_frames


-class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder):
+class MiniCPMODummyInputsBuilder(
+        MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):

     def get_dummy_processor_inputs(
             self, seq_len: int, mm_counts: Mapping[str,
@@ -222,8 +246,7 @@ def get_dummy_processor_inputs(


 class MiniCPMOMultiModalProcessor(
-        MiniCPMVMultiModalProcessor,
-        BaseMultiModalProcessor[MiniCPMOProcessingInfo]):
+        MiniCPMVMultiModalProcessor[MiniCPMOProcessingInfo]):

     def _get_data_parser(self) -> MultiModalDataParser:
         return MiniCPMOMultiModalDataParser(
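The signature changes to MiniCPMODummyInputsBuilder and MiniCPMOMultiModalProcessor in the two hunks above follow one pattern: the MiniCPM-V base classes are now generic over the processing-info type, so the MiniCPM-O subclasses pick MiniCPMOProcessingInfo through a type parameter instead of also inheriting BaseMultiModalProcessor directly. A standalone sketch of that pattern, using illustrative stand-in names rather than the actual vLLM definitions:

from typing import Generic, TypeVar

_I = TypeVar("_I")


class OmniProcessingInfo:
    """Stand-in for MiniCPMOProcessingInfo."""


class BaseVisionLanguageProcessor(Generic[_I]):
    """Stand-in for the now-generic MiniCPM-V processor base."""

    def __init__(self, info: _I) -> None:
        self.info = info


class OmniProcessor(BaseVisionLanguageProcessor[OmniProcessingInfo]):
    """The subclass selects its info type via the type parameter alone."""
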
@@ -369,21 +392,10 @@ def get_replacement_minicpmv(item_idx: int, modality: str):

     def _get_mm_fields_config(
         self,
-        hf_inputs,
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
-
-        return dict(
-            **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
-            audio_features=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices),
-            audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices),
-            audio_num_slices=MultiModalFieldConfig.batched("audio"),
-            audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
-            audio_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "audio", audio_num_slices))
+        return _minicpmo_field_config(hf_inputs)


 class MultiModalProjector(nn.Module):
@@ -406,7 +418,7 @@ def forward(self, audio_features: torch.Tensor) -> torch.Tensor:

 class MiniCPMWhisperEncoderLayer(nn.Module):

-    def __init__(self, config: WhisperConfig, layer_idx: int = None):
+    def __init__(self, config: WhisperConfig, layer_idx: int):
         super().__init__()
         self.embed_dim = config.d_model
         self.self_attn = WHISPER_ATTENTION_CLASSES[