
Commit 19a139e

skyworkr1v update
Signed-off-by: jiacai.liu <[email protected]>
1 parent 61c7a1b commit 19a139e

File tree: 12 files changed, +1157 additions, -11 deletions

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 13 additions & 0 deletions
@@ -316,6 +316,19 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
     ),
+    "skywork_r1v": VLMTestInfo(
+        models=["Skywork/Skywork-R1V-38B"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>\nWhat is the season?",
+        }),
+        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
+        max_model_len=4096,
+        use_tokenizer_eos=True,
+        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
+    ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
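
For reference, a minimal standalone sketch (not part of the commit) of what the prompt_formatter above produces for one of the single-image prompts; the template string and prompt text are copied verbatim from the test entry:

# Reproduces the lambda from the "skywork_r1v" VLMTestInfo entry above.
prompt_formatter = lambda img_prompt: (
    f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n")

img_prompt = "<image>\nWhat's the content in the center of the image?"
print(prompt_formatter(img_prompt))
# <|begin▁of▁sentence|><|User|>
# <image>
# What's the content in the center of the image?<|Assistant|><think>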

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 57 additions & 0 deletions
@@ -376,6 +376,63 @@ def __call__(self, text: str, images: Union[Image, list[Image]],
     return hf_model
 
 
+def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
+
+    class SkyworkR1VProcessor:
+        """A simple processor for SkyworkR1V."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
+                                                     trust_remote_code=True)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Union[Image, list[Image]],
+                     **kwargs):
+            from vllm.model_executor.models.skyworkr1v import (
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_skyworkr1v)
+            images = [images] if isinstance(images, Image) else images
+            pixel_values = [
+                image_to_pixel_values_skyworkr1v(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ) for image in images
+            ]
+            num_patches_list = [
+                pixel_value.shape[0] for pixel_value in pixel_values
+            ]
+            pixel_values = torch.cat(pixel_values, dim=0)
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
+        "<IMG_CONTEXT>")
+    hf_model.model.img_context_token_id = img_context_token_id
+    hf_model.processor = SkyworkR1VProcessor(hf_model)
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.generate = types.MethodType(_internvl_generate,
+                                               hf_model.model)
+    return hf_model
+
+
 def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for InternVL."""
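
As a rough illustration of the placeholder expansion that SkyworkR1VProcessor.__call__ performs above (the token strings, per-patch token count, and patch counts below are hypothetical stand-ins; the real values come from the model's tokenizer and config):

# Hypothetical stand-ins for IMG_START / IMG_CONTEXT / IMG_END and the
# per-image patch counts; in the test they are read from the SkyworkR1V model.
IMG_START, IMG_CONTEXT, IMG_END = "<img>", "<IMG_CONTEXT>", "</img>"
num_image_token = 256        # assumed tokens per patch (model-dependent)
num_patches_list = [7, 13]   # assumed patch counts, one entry per image

text = "<image>\n<image>\nDescribe the two images in short."
for num_patches in num_patches_list:
    context_tokens = IMG_CONTEXT * num_image_token * num_patches
    image_tokens = IMG_START + context_tokens + IMG_END
    # Each <image> placeholder is replaced exactly once, in order.
    text = text.replace("<image>", image_tokens, 1)
# `text` now carries the expanded image token sequence for each image and is
# ready to be tokenized together with pixel_values.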

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
     "openai/whisper-large-v3",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
+    "Skywork/Skywork-R1V-38B",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

tests/models/registry.py

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ def check_available_online(
                                                          tokenizer="facebook/bart-base",
                                                          trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
+    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }

vllm/entrypoints/chat_utils.py

Lines changed: 1 addition & 1 deletion
@@ -423,7 +423,7 @@ def _placeholder_str(self, modality: ModalityStr,
             return self._cached_token_str(self._tokenizer,
                                           hf_config.image_token_index)
         if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
-                          "NVLM_D", "h2ovl_chat"):
+                          "skywork_chat", "NVLM_D", "h2ovl_chat"):
             return "<image>"
         if model_type == "mllama":
             return "<|image|>"

vllm/model_executor/models/registry.py

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
+    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
     "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }
