
Commit 6691a77

congcongchen123 authored and shreyankg committed

[Model] New model support for Phi-4-multimodal-instruct (vllm-project#14119)

1 parent 56709de commit 6691a77

File tree

10 files changed: +7159 −3 lines changed

docs/source/models/supported_models.md

Lines changed: 8 additions & 1 deletion

@@ -410,7 +410,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
 - * `Phi3ForCausalLM`
   * Phi-4, Phi-3
-  * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
   * ✅︎
   * ✅︎
 - * `Phi3SmallForCausalLM`
@@ -856,6 +856,13 @@ See [this page](#generative-models) for more information on how to use generativ
   *
   * ✅︎
   * ✅︎
+- * `Phi4MMForCausalLM`
+  * Phi-4-multimodal
+  * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup>
+  * `microsoft/Phi-4-multimodal-instruct`, etc.
+  * ✅︎
+  *
+  *
 - * `PixtralForConditionalGeneration`
   * Pixtral
   * T + I<sup>+</sup>
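The new `Phi4MMForCausalLM` row advertises text+image and text+audio combinations. A minimal offline-inference sketch using vLLM's standard `LLM` API follows; the prompt wording, image file, and sampling settings are illustrative assumptions, and `<|endoftext10|>` is the image placeholder this commit wires up in `chat_utils.py`:

```python
# Sketch only: offline inference with the newly supported model.
# Prompt format, image file, and sampling settings are assumptions,
# not taken from this commit.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,  # required, as in tests/models/registry.py
)

image = Image.open("example.jpg")  # hypothetical local file
# <|endoftext10|> is the image placeholder added in chat_utils.py below.
prompt = "<|user|><|endoftext10|>Describe this image.<|end|><|assistant|>"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```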

requirements-common.txt

Lines changed: 1 addition & 0 deletions

@@ -37,3 +37,4 @@ depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
+scipy # Required for phi-4-multimodal-instruct

tests/models/registry.py

Lines changed: 2 additions & 0 deletions

@@ -272,6 +272,8 @@ def check_available_online(
         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
     "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
                                         trust_remote_code=True),
+    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
+                                         trust_remote_code=True),
     "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                        tokenizer_mode="mistral"),
     "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",

vllm/config.py

Lines changed: 2 additions & 2 deletions

@@ -2286,9 +2286,9 @@ def compute_hash(self) -> str:
         return hash_str

     def __post_init__(self):
-        # Setting the maximum rank to 256 should be able to satisfy the vast
+        # Setting the maximum rank to 512 should be able to satisfy the vast
         # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256)
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
         possible_lora_extra_vocab_size = (0, 256, 512)
         if self.max_lora_rank not in possible_max_ranks:
             raise ValueError(
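The widened `possible_max_ranks` tuple is what lets LoRA adapters with rank above 256 pass validation. A hedged sketch of how the new ceiling would be requested; `enable_lora` and `max_lora_rank` are existing vLLM engine arguments, and the adapter name and path are placeholders:

```python
# Sketch only: requesting a LoRA rank that this change newly allows.
# max_lora_rank must be a member of possible_max_ranks, which now
# includes 320 and 512; the adapter path below is a placeholder.
from vllm import LLM
from vllm.lora.request import LoRARequest

llm = LLM(
    model="microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,
    enable_lora=True,
    max_lora_rank=320,  # rejected by the old possible_max_ranks tuple
)

speech_lora = LoRARequest(
    lora_name="speech",
    lora_int_id=1,
    lora_path="/path/to/speech-lora",  # placeholder adapter directory
)
outputs = llm.generate("Hello, world.", lora_request=speech_lora)
```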

vllm/entrypoints/chat_utils.py

Lines changed: 4 additions & 0 deletions

@@ -395,6 +395,8 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
+            if model_type == "phi4mm":
+                return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
             if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
@@ -424,6 +426,8 @@ def _placeholder_str(self, modality: ModalityStr,
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|audio|>"
+            if model_type == "phi4mm":
+                return "<|endoftext11|>"  # 200011 (see vocab.json in hf model)
             if model_type == "qwen2_audio":
                 return (f"Audio {current_count}: "
                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
