[Model] Future-proof Qwen2-Audio multi-modal processor (vllm-project#11776)

DarkLight1337 · mzusman · commit c455a282a2f6 · 2025-03-12T11:40:47.000+02:00
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
@@ -227,12 +227,14 @@ def get_replacement_qwen2_audio(item_idx: int):
         ]
 
     def _always_apply_prompt_replacements(self) -> bool:
-        # HF never applies prompt replacements, so we have to do it ourselves.
+        # Qwen2-Audio processor will start inserting placeholder tokens
+        # in an upcoming release:
+        # https://github.com/huggingface/transformers/pull/35534
         # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return True
+        return not hasattr(self._get_hf_processor(), "audio_token")
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)