
Commit f76cf1c

Merge branch 'main' of github.com:zRzRzRzRzRzRzR/vllm
2 parents: ca96cfa + 7f56cc6

35 files changed: +194 additions, -5 deletions

docs/source/contributing/model/multimodal.md

Lines changed: 11 additions & 0 deletions
Lines changed: 11 additions & 0 deletions

@@ -79,6 +79,17 @@ Further update the model as follows:
       return inputs_embeds
   ```
 
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
+
+  ```python
+  class YourModelForImage2Seq(nn.Module):
+      ...
+
+      def get_language_model(self) -> torch.nn.Module:
+          # Change `language_model` according to your implementation.
+          return self.language_model
+  ```
+
 - Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
 
   ```diff
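For context on why a stable getter helps (background, not part of this commit): different multimodal models name their text backbone differently (`language_model`, `model`, `transformer`, `llm`, as the per-model diffs below show), so downstream code can call the getter instead of guessing the attribute name. A minimal sketch, assuming only `torch` and a model that implements the getter:

```python
import torch


def freeze_text_backbone(model: torch.nn.Module) -> None:
    """Illustrative helper: freeze only the text backbone of a multimodal
    model via the stable accessor, regardless of how the attribute is
    named internally."""
    language_model = model.get_language_model()
    for param in language_model.parameters():
        param.requires_grad_(False)
```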

tests/test_sampling_params.py

Lines changed: 78 additions & 3 deletions
@@ -1,14 +1,89 @@
 # SPDX-License-Identifier: Apache-2.0
 """Tests for the SamplingParams class.
 """
+
+import pytest
+
 from vllm import SamplingParams
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+
+MODEL_NAME = "Qwen/Qwen1.5-7B"
 
 
 def test_max_tokens_none():
     """max_tokens=None should be allowed"""
     SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
 
 
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
+@pytest.fixture(scope="module")
+def model_config():
+    return ModelConfig(
+        MODEL_NAME,
+        task="auto",
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        revision=None,
+    )
+
+
+@pytest.fixture(scope="module")
+def default_max_tokens():
+    return 4096
+
+
+def test_sampling_params_from_request_with_no_guided_decoding_backend(
+        model_config, default_max_tokens):
+    # guided_decoding_backend is not present at request level
+    request = ChatCompletionRequest.model_validate({
+        'messages': [{
+            'role': 'user',
+            'content': 'Hello'
+        }],
+        'model':
+        MODEL_NAME,
+        'response_format': {
+            'type': 'json_object',
+        },
+    })
+
+    sampling_params = request.to_sampling_params(
+        default_max_tokens,
+        model_config.logits_processor_pattern,
+    )
+    # we do not expect any backend to be present and the default
+    # guided_decoding_backend at engine level will be used.
+    assert sampling_params.guided_decoding.backend is None
+
+
+@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
+                         [("xgrammar", "xgrammar"),
+                          ("lm-format-enforcer", "lm-format-enforcer"),
+                          ("outlines", "outlines")])
+def test_sampling_params_from_request_with_guided_decoding_backend(
+        request_level_guided_decoding_backend: str, expected: str,
+        model_config, default_max_tokens):
+
+    request = ChatCompletionRequest.model_validate({
+        'messages': [{
+            'role': 'user',
+            'content': 'Hello'
+        }],
+        'model':
+        MODEL_NAME,
+        'response_format': {
+            'type': 'json_object',
+        },
+        'guided_decoding_backend':
+        request_level_guided_decoding_backend,
+    })
+
+    sampling_params = request.to_sampling_params(
+        default_max_tokens,
+        model_config.logits_processor_pattern,
+    )
+    # backend correctly identified in resulting sampling_params
+    assert sampling_params.guided_decoding.backend == expected
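As a minimal sketch of what these tests assert (assuming `GuidedDecodingParams` is importable from `vllm.sampling_params` and accepts `json_object` and `backend` fields, which is not shown in this diff): a request-level backend simply surfaces on the resulting `SamplingParams`, and an unset backend stays `None` so the engine-level default can apply later.

```python
from vllm import SamplingParams
from vllm.sampling_params import GuidedDecodingParams  # assumed import path

# Request-level backend carried through unchanged.
params = SamplingParams(
    max_tokens=4096,
    guided_decoding=GuidedDecodingParams(json_object=True, backend="xgrammar"),
)
assert params.guided_decoding.backend == "xgrammar"

# No request-level backend: the field stays None for the engine to resolve.
params = SamplingParams(
    max_tokens=4096,
    guided_decoding=GuidedDecodingParams(json_object=True),
)
assert params.guided_decoding.backend is None
```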

vllm/entrypoints/openai/protocol.py

Lines changed: 0 additions & 2 deletions
@@ -476,8 +476,6 @@ def to_sampling_params(
             json_schema = self.response_format.json_schema
             assert json_schema is not None
             self.guided_json = json_schema.json_schema
-            if self.guided_decoding_backend is None:
-                self.guided_decoding_backend = "xgrammar"
 
         guided_decoding = GuidedDecodingParams.from_optional(
             json=self._get_guided_json_from_tool() or self.guided_json,
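The effect of this deletion: a request that sets `response_format` but no `guided_decoding_backend` is no longer pinned to `"xgrammar"` at the API layer; the backend stays `None` and the engine-level setting decides. A hedged sketch of that engine-level knob (assuming a `guided_decoding_backend` field on `EngineArgs`, which does not appear in this diff):

```python
from vllm.engine.arg_utils import EngineArgs

# Engine-level choice now wins whenever the request leaves the backend unset.
engine_args = EngineArgs(
    model="Qwen/Qwen1.5-7B",
    guided_decoding_backend="xgrammar",  # assumed field name, for illustration
)
```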

vllm/model_executor/models/aria.py

Lines changed: 3 additions & 0 deletions
@@ -605,6 +605,9 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_outputs, image_attn_mask)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/aya_vision.py

Lines changed: 3 additions & 0 deletions
@@ -424,6 +424,9 @@ def _parse_and_validate_image_input(
             num_patches=num_patches,
         )
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/blip2.py

Lines changed: 3 additions & 0 deletions
@@ -627,6 +627,9 @@ def _process_image_input(self,
 
         return self.language_projection(query_output)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/chameleon.py

Lines changed: 3 additions & 0 deletions
@@ -988,6 +988,9 @@ def _parse_and_validate_image_input(
             data=self._validate_pixel_values(pixel_values),
         )
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 3 additions & 0 deletions
@@ -604,6 +604,9 @@ def _process_image_input(
         return self._pixel_values_to_embedding(
             pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/florence2.py

Lines changed: 3 additions & 0 deletions
@@ -1050,6 +1050,9 @@ def _process_image_input(
         pixel_values = image_input["data"]
         return self._encode_image(pixel_values)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/fuyu.py

Lines changed: 3 additions & 0 deletions
@@ -341,6 +341,9 @@ def _process_image_input(
 
         return vision_embeddings_flat.split(patches_per_image, dim=0)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/gemma3_mm.py

Lines changed: 3 additions & 0 deletions
@@ -591,6 +591,9 @@ def _process_image_input(
             e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/glm4v.py

Lines changed: 3 additions & 0 deletions
@@ -596,6 +596,9 @@ def _process_image_input(
 
         return self.transformer.vision(pixel_values)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.transformer
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/idefics3.py

Lines changed: 3 additions & 0 deletions
@@ -710,6 +710,9 @@ def _process_image_input(
             e.flatten(0, 1) for e in image_features.split(num_patches.tolist())
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/interfaces.py

Lines changed: 12 additions & 0 deletions
@@ -56,6 +56,18 @@ def get_multimodal_embeddings(
         """
         ...
 
+    def get_language_model(self) -> torch.nn.Module:
+        """
+        Returns the underlying language model used for text generation.
+
+        This is typically the `torch.nn.Module` instance responsible for
+        processing the merged multimodal embeddings and producing hidden states
+
+        Returns:
+            torch.nn.Module: The core language model component.
+        """
+        ...
+
     # Only for models that support v0 chunked prefill
     # TODO(ywang96): Remove this overload once v0 is deprecated
     @overload
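A short caller-side sketch for the new interface method (assuming `SupportsMultiModal` is a runtime-checkable protocol, which this diff does not show): code can branch on the interface before asking a model for its text backbone.

```python
import torch

from vllm.model_executor.models.interfaces import SupportsMultiModal


def get_text_backbone(model: torch.nn.Module) -> torch.nn.Module:
    # Multimodal models expose their language model via the new getter;
    # text-only models are treated as their own backbone.
    if isinstance(model, SupportsMultiModal):
        return model.get_language_model()
    return model
```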

vllm/model_executor/models/internvl.py

Lines changed: 3 additions & 0 deletions
@@ -884,6 +884,9 @@ def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         else:
             self.visual_token_mask = None
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/llava.py

Lines changed: 3 additions & 0 deletions
@@ -674,6 +674,9 @@ def _process_image_input(
         image_embeds = torch.split(image_embeds, feature_sizes)
         return image_embeds
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/llava_next.py

Lines changed: 3 additions & 0 deletions
@@ -480,6 +480,9 @@ def _process_image_input(
             for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/llava_next_video.py

Lines changed: 3 additions & 0 deletions
@@ -421,6 +421,9 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
 
         return [e.flatten(0, 1) for e in embeds]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         video_input = self._parse_and_validate_video_input(**kwargs)

vllm/model_executor/models/llava_onevision.py

Lines changed: 3 additions & 0 deletions
@@ -852,6 +852,9 @@ def apply_pooling(self, image_features: torch.Tensor, stride: int = 2):
         image_feature = image_feature.view(batch_frames, -1, dim)
         return image_feature
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)

vllm/model_executor/models/minicpmv.py

Lines changed: 3 additions & 0 deletions
@@ -892,6 +892,9 @@ def _process_multimodal_inputs(self, modalities: dict):
 
         return multimodal_embeddings
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.llm
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)

vllm/model_executor/models/mistral3.py

Lines changed: 3 additions & 0 deletions
@@ -514,6 +514,9 @@ def _process_image_input(
             image_embeds = (image_embeds, )
         return image_embeds
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/mllama.py

Lines changed: 3 additions & 0 deletions
@@ -1325,6 +1325,9 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor,
         cross_attention_states = cross_attention_states_flat
         return cross_attention_states
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_cross_attention_states(
         self,
         image_inputs: MllamaImagePixelInputs,

vllm/model_executor/models/mllama4.py

Lines changed: 3 additions & 0 deletions
@@ -742,6 +742,9 @@ def _process_image_input(
             for img in vision_embeddings_flat.split(patches_per_image, dim=0)
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(self,
                                   **kwargs) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/molmo.py

Lines changed: 3 additions & 0 deletions
@@ -1488,6 +1488,9 @@ def _process_image_input(
             )
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/paligemma.py

Lines changed: 3 additions & 0 deletions
@@ -323,6 +323,9 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_features)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/phi3v.py

Lines changed: 3 additions & 0 deletions
@@ -674,6 +674,9 @@ def _process_image_input(
 
         return image_embeds
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/phi4mm.py

Lines changed: 3 additions & 0 deletions
@@ -1802,3 +1802,6 @@ def get_mm_mapping(self) -> MultiModelKeys:
             connector=["audio_projection_for_vision", "audio_projection"],
             tower_model=["vision_encoder", "embed_tokens_extend"],
         )
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model

vllm/model_executor/models/pixtral.py

Lines changed: 3 additions & 0 deletions
@@ -396,6 +396,9 @@ def _process_image_input(
         image_embeds = torch.split(image_embeds, feature_sizes)
         return image_embeds
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 3 additions & 0 deletions
@@ -967,6 +967,9 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
                                                    **kwargs)
         return modalities
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
 
vllm/model_executor/models/qwen2_audio.py

Lines changed: 3 additions & 0 deletions
@@ -355,6 +355,9 @@ def _process_audio_input(self,
         return torch.split(masked_audio_features,
                            audio_output_lengths.flatten().tolist())
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
