Commit af51d80

Revert "[V1] Scatter and gather placeholders in the model runner" (#16075)
1 parent f5722a5 commit af51d80

42 files changed (+943, -497 lines)

docs/source/contributing/model/multimodal.md (+8, -8)

@@ -860,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
 )
 ```
 
-To assign the vision embeddings to only the image tokens, instead of a string
-you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
+To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
+with different `full` and `feature` attributes:
 
 ```python
 hf_config = self.info.get_hf_config()
@@ -879,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows
 
-    return PromptUpdateDetails.select_token_id(
-        image_tokens + [bos_token_id],
-        embed_token_id=_IMAGE_TOKEN_ID,
+    return PromptUpdateDetails(
+        full=image_tokens + [bos_token_id],
+        features=image_tokens,
     )
 ```
 
@@ -914,9 +914,9 @@ def _get_prompt_updates(
            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                            [_NEWLINE_TOKEN_ID]) * nrows
 
-            return PromptUpdateDetails.select_token_id(
-                image_tokens + [bos_token_id],
-                embed_token_id=_IMAGE_TOKEN_ID,
+            return PromptUpdateDetails(
+                full=image_tokens + [bos_token_id],
+                features=image_tokens,
             )
 
         return [
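For reference, the reverted docs API above can be exercised in isolation. The following is a minimal sketch assembled from the snippet shown in the diff, not part of the commit; the token IDs and grid size are illustrative values only.

```python
from vllm.multimodal.processing import PromptUpdateDetails

# Illustrative placeholder IDs and feature-grid size, not taken from a real config.
_IMAGE_TOKEN_ID = 71011
_NEWLINE_TOKEN_ID = 71019
bos_token_id = 1
ncols, nrows = 4, 2

# One row of image tokens per feature row, each terminated by a newline token.
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

details = PromptUpdateDetails(
    full=image_tokens + [bos_token_id],  # tokens spliced into the prompt
    features=image_tokens,               # subset covered by the vision embeddings
)
```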

docs/source/models/supported_models.md (+3)

@@ -989,6 +989,9 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 :::{important}
+To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
+`pip install git+https://github.com/huggingface/transformers`.
+
 Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
 You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
 :::
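A small offline-inference sketch of the setting quoted above may help; it is not from the commit. The checkpoint name `google/gemma-3-4b-it` is an assumption, and `mm_processor_kwargs` mirrors the `--mm-processor-kwargs` CLI flag in the doc (pan-and-scan, V0 only).

```python
# Hypothetical usage after `pip install git+https://github.com/huggingface/transformers`.
from vllm import LLM

llm = LLM(
    model="google/gemma-3-4b-it",                    # assumed Gemma3 checkpoint
    mm_processor_kwargs={"do_pan_and_scan": True},   # same setting as the CLI flag above
)
```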

examples/offline_inference/audio_language.py (+1, -1)

@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        max_num_seqs=2,
+        max_num_seqs=5,
         limit_mm_per_prompt={"audio": audio_count},
     )

tests/models/decoder_only/audio_language/test_ultravox.py (+1, -4)

@@ -55,10 +55,7 @@ def server(request, audio_assets):
         for key, value in request.param.items()
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

tests/models/decoder_only/vision_language/test_models.py (+1, -1)

@@ -167,7 +167,7 @@
         "cherry_blossom": "<image>What is the season?", # noqa: E501
     }),
     multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
-    max_model_len=4096,
+    max_model_len=8192,
     max_num_seqs=2,
     auto_cls=AutoModelForImageTextToText,
     vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}

tests/models/decoder_only/vision_language/test_pixtral.py (+16, -10)

@@ -176,8 +176,6 @@ def test_chat(
         model,
         dtype=dtype,
         tokenizer_mode="mistral",
-        load_format="mistral",
-        config_format="mistral",
         max_model_len=max_model_len,
         limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
@@ -200,14 +198,22 @@ def test_chat(
 
 
 @large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("prompt,expected_ranges",
-                         [(_create_engine_inputs_hf(IMG_URLS[:1]),
-                           [PlaceholderRange(offset=11, length=494)]),
-                          (_create_engine_inputs_hf(IMG_URLS[1:4]), [
-                              PlaceholderRange(offset=11, length=266),
-                              PlaceholderRange(offset=277, length=1056),
-                              PlaceholderRange(offset=1333, length=418)
-                          ])])
+@pytest.mark.parametrize(
+    "prompt,expected_ranges",
+    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
+        "offset": 11,
+        "length": 494
+    }]),
+     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
+         "offset": 11,
+         "length": 266
+     }, {
+         "offset": 277,
+         "length": 1056
+     }, {
+         "offset": 1333,
+         "length": 418
+     }])])
 def test_multi_modal_placeholders(vllm_runner, prompt,
                                   expected_ranges: list[PlaceholderRange],
                                   monkeypatch) -> None:

tests/models/multimodal/processing/test_llava_next.py (+2, -2)

@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
             first_placeholder = image_placeholders[0]
 
             # NOTE: There is a BOS token
-            assert first_placeholder.offset == 1
-            assert first_placeholder.length == (
+            assert first_placeholder["offset"] == 1
+            assert first_placeholder["length"] == (
                 len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
 
         except Exception as exc:

tests/models/multimodal/processing/test_llava_onevision.py (+2, -2)

@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
 
             first_placeholder = image_placeholders[0]
 
-            assert first_placeholder.offset == 0
-            assert first_placeholder.length == len(
+            assert first_placeholder["offset"] == 0
+            assert first_placeholder["length"] == len(
                 processed_inputs["prompt_token_ids"]) // num_imgs
         except Exception as exc:
             failed_size_excs.append((image_size, exc))

tests/models/registry.py (+1, -3)

@@ -277,9 +277,7 @@ def check_available_online(
                                       trust_remote_code=True,
                                       hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
-                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
-                                      max_transformers_version="4.48", # noqa: E501
-                                      transformers_version_reason="HF model is not compatible."), # noqa: E501
+                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                          extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
                                          trust_remote_code=True),

tests/multimodal/test_processing.py (-9)

@@ -785,7 +785,6 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=6,
                 tokens=[32000, 32000],
-                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -794,7 +793,6 @@
                 item_idx=0,
                 start_idx=3,
                 tokens=[32000],
-                is_embed=None,
             ),
         ],
     }
@@ -809,14 +807,12 @@
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
-                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=5,
                 tokens=[32000, 32000],
-                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -825,7 +821,6 @@
                 item_idx=0,
                 start_idx=7,
                 tokens=[1550, 918, 1550],
-                is_embed=None,
             ),
         ],
         # No match for pattern_4 as it has lower priority than pattern_1
@@ -840,14 +835,12 @@
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
-                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=3,
                 tokens=[32000, 32000],
-                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -856,7 +849,6 @@
                 item_idx=0,
                 start_idx=5,
                 tokens=[32000],
-                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -865,7 +857,6 @@
                 item_idx=0,
                 start_idx=6,
                 tokens=[1550, 918, 1550],
-                is_embed=None,
             ),
         ],
     }

tests/v1/core/test_kv_cache_utils.py (+29, -17)

@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
 # disable yapf here as it formats differently than isort such that both fail
@@ -158,10 +158,13 @@ def test_generate_block_hash_extra_keys():
     request = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(20)],
-        mm_positions=[
-            PlaceholderRange(offset=0, length=5),
-            PlaceholderRange(offset=10, length=5),
-        ],
+        mm_positions=[{
+            "offset": 0,
+            "length": 5
+        }, {
+            "offset": 10,
+            "length": 5
+        }],
         mm_hashes=["hash1", "hash2"],
     )
 
@@ -219,10 +222,13 @@ def test_hash_request_tokens(hash_fn):
     request = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[
-            PlaceholderRange(offset=0, length=3),
-            PlaceholderRange(offset=3, length=3),
-        ],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
         mm_hashes=["hash1", "hash2"],
     )
 
@@ -247,19 +253,25 @@ def test_hash_tokens_different_mm_input(hash_fn):
     request1 = make_request(
         request_id=0,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[
-            PlaceholderRange(offset=0, length=3),
-            PlaceholderRange(offset=3, length=3),
-        ],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
         mm_hashes=["hash1", "hash2"],
     )
     request2 = make_request(
         request_id=1,
         prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[
-            PlaceholderRange(offset=0, length=3),
-            PlaceholderRange(offset=3, length=3),
-        ],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
         mm_hashes=["hash3", "hash2"],
     )
     block_size = 3
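Across the test changes above (pixtral, llava, kv_cache_utils), placeholders revert from `PlaceholderRange` objects with attribute access to plain mappings keyed by `"offset"` and `"length"`. A minimal runnable sketch of that dict-based form, mirroring the constructions and assertions in the diffs (values are illustrative):

```python
# Each multimodal placeholder is a plain dict after this revert.
mm_positions = [
    {"offset": 0, "length": 3},
    {"offset": 3, "length": 3},
]

# Consumption uses key lookups instead of attribute access.
first_placeholder = mm_positions[0]
assert first_placeholder["offset"] == 0
assert first_placeholder["length"] == 3
```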
