
Commit bc4b6c1

ywang96 authored and mzusman committed
[V1] Support audio language models on V1 (vllm-project#11733)
Signed-off-by: Roger Wang <[email protected]>
1 parent 15774d6 commit bc4b6c1

File tree: 3 files changed, +29 -12 lines changed

docs/source/models/supported_models.md

Lines changed: 2 additions & 2 deletions
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen2-Audio-7B-Instruct`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2VLForConditionalGeneration`
   - Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
@@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `fixie-ai/ultravox-v0_3`
   -
   - ✅︎
-  -
+  - ✅︎
 ```

 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.

vllm/model_executor/models/qwen2_audio.py

Lines changed: 6 additions & 3 deletions
@@ -335,13 +335,16 @@ def _process_audio_input(self,
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_output_lengths = audio_output_lengths.unsqueeze(1)
         audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens
-        ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
+            num_audios, max_audio_tokens).to(
+                audio_output_lengths.device) < audio_output_lengths
         masked_audio_features = audio_features[audio_features_mask].view(
             -1, embed_dim)

-        return masked_audio_features
+        # Split to tuple of embeddings for individual audio input.
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.flatten().tolist())

     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
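This change makes `_process_audio_input` return a tuple with one embedding tensor per audio clip instead of a single flat tensor, which is the shape the V1 multimodal path consumes. Below is a minimal, self-contained sketch of the mask-and-split step, using made-up shapes and lengths in place of the real Whisper encoder outputs:

import torch

# Hypothetical shapes standing in for the projector output.
num_audios, max_audio_tokens, embed_dim = 3, 5, 4
# Per-audio count of valid (non-padded) encoder tokens (made-up values).
audio_output_lengths = torch.tensor([5, 2, 4])

# Padded features: max_audio_tokens positions per audio clip.
audio_features = torch.randn(num_audios, max_audio_tokens, embed_dim)

# Broadcast compare: position index < per-audio length marks valid tokens.
lengths = audio_output_lengths.unsqueeze(1)            # (num_audios, 1)
mask = torch.arange(max_audio_tokens).expand(
    num_audios, max_audio_tokens) < lengths             # (num_audios, max_audio_tokens)

# Drop the padded positions, then split the flat result back into one
# tensor per audio, matching what V1 expects from _process_audio_input.
flat = audio_features[mask].view(-1, embed_dim)         # (sum(lengths), embed_dim)
per_audio = torch.split(flat, audio_output_lengths.flatten().tolist())

assert [t.shape[0] for t in per_audio] == audio_output_lengths.tolist()

Splitting by audio_output_lengths preserves the per-clip boundaries, so each element of the tuple can later be merged into its own placeholder span in the prompt.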

vllm/model_executor/models/ultravox.py

Lines changed: 21 additions & 7 deletions
@@ -1,6 +1,5 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
-
 import math
 from functools import cached_property
 from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
@@ -14,6 +13,7 @@
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder

+from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
@@ -35,8 +35,11 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)

+_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
+_AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25


@@ -64,7 +67,14 @@ def _get_hf_processor(
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> ProcessorMixin:
-        return self.ctx.get_hf_processor()
+        hf_processor = self.ctx.get_hf_processor()
+
+        # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
+        # placeholder that will cause confusion with the actual end of turn
+        # token, thus we override placeholder with a reserved special
+        # token.
+        hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
+        return hf_processor

     def _get_feature_extractor(
         self,
@@ -465,11 +475,15 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:

-            # TODO(ywang96): use merge_multimodal_embeddings after
-            # v0 is deprecated
-            merge_multimodal_embeddings_from_map(
-                inputs_embeds, multimodal_embeddings,
-                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            # TODO(ywang96): remove this block after v0 is deprecated.
+            if not envs.VLLM_USE_V1:
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, multimodal_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            else:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids, inputs_embeds, multimodal_embeddings,
+                    _AUDIO_PLACEHOLDER_TOKEN)
         return inputs_embeds

     def forward(self,
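On the V1 path, get_input_embeddings scatters the audio embeddings into inputs_embeds wherever the reserved placeholder token (_AUDIO_PLACEHOLDER_TOKEN = 128002) appears in the prompt, rather than going through the V0 attn_metadata index map. The sketch below illustrates that merge-by-token-id idea; it is a simplified stand-in with hypothetical names, not the actual merge_multimodal_embeddings helper imported from .utils:

import torch

def merge_by_placeholder(input_ids: torch.Tensor,
                         inputs_embeds: torch.Tensor,
                         multimodal_embeddings,
                         placeholder_token_id: int) -> torch.Tensor:
    # Positions where the prompt holds the reserved placeholder token.
    is_placeholder = input_ids == placeholder_token_id
    # Per-audio embeddings, concatenated in prompt order.
    flat = torch.cat(list(multimodal_embeddings), dim=0)
    assert int(is_placeholder.sum()) == flat.shape[0], \
        "placeholder count must match the number of audio embedding rows"
    out = inputs_embeds.clone()
    out[is_placeholder] = flat.to(inputs_embeds.dtype)
    return out

# Hypothetical usage: a 7-token prompt containing two clips of 2 and 1 tokens.
PLACEHOLDER = 128002
input_ids = torch.tensor([1, PLACEHOLDER, PLACEHOLDER, 5, PLACEHOLDER, 7, 2])
inputs_embeds = torch.zeros(7, 8)
audio_embeds = (torch.ones(2, 8), 2 * torch.ones(1, 8))
merged = merge_by_placeholder(input_ids, inputs_embeds, audio_embeds, PLACEHOLDER)

The processor override in _get_hf_processor is what makes this keying safe: with '<|eot_id|>' swapped for '<|reserved_special_token_0|>', the placeholder id used for the merge cannot collide with a genuine end-of-turn token in the prompt.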
