Skip to content

Commit ff0d3b2

Browse files
committed
skyworkr1v update
Signed-off-by: jiacai.liu <[email protected]>
1 parent 61c7a1b commit ff0d3b2

File tree

11 files changed

+1167
-7
lines changed

11 files changed

+1167
-7
lines changed

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,19 @@
316316
use_tokenizer_eos=True,
317317
patch_hf_runner=model_utils.internvl_patch_hf_runner,
318318
),
319+
"skywork_r1v": VLMTestInfo(
320+
models=["Skywork/Skywork-R1V-38B"],
321+
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
322+
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
323+
single_image_prompts=IMAGE_ASSETS.prompts({
324+
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
325+
"cherry_blossom": "<image>\nWhat is the season?",
326+
}),
327+
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
328+
max_model_len=4096,
329+
use_tokenizer_eos=True,
330+
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
331+
),
319332
"llava_next": VLMTestInfo(
320333
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
321334
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,61 @@ def __call__(self, text: str, images: Union[Image, list[Image]],
375375
hf_model.model)
376376
return hf_model
377377

378+
def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch an HfRunner in place for SkyworkR1V and return it.

    The runner receives a custom processor, its model is given the id of
    the ``<IMG_CONTEXT>`` token, an output-embeddings accessor that defers
    to the wrapped language model, and ``_internvl_generate`` bound as its
    ``generate`` method.
    """

    class SkyworkR1VProcessor:
        """Minimal text + image processor for SkyworkR1V."""

        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

            # Patching parameters are read from the model's own config.
            cfg = AutoConfig.from_pretrained(hf_runner.model_name,
                                             trust_remote_code=True)
            self.config = cfg
            self.vision_config = cfg.vision_config
            self.use_thumbnail = cfg.use_thumbnail
            self.min_num = cfg.min_dynamic_patch
            self.max_num = cfg.max_dynamic_patch
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Union[Image, list[Image]],
                     **kwargs):
            from vllm.model_executor.models.skyworkr1v import (
                IMG_CONTEXT, IMG_END, IMG_START,
                image_to_pixel_values_skyworkr1v)

            # Accept either a single image or a list of images.
            if isinstance(images, Image):
                images = [images]

            per_image_pixels = [
                image_to_pixel_values_skyworkr1v(
                    img,
                    input_size=self.image_size,
                    min_num=self.min_num,
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
                ) for img in images
            ]
            patch_counts = [pixels.shape[0] for pixels in per_image_pixels]
            pixel_values = torch.cat(per_image_pixels, dim=0)

            # Expand the '<image>' placeholders one at a time, in order,
            # using the leading dimension of each image's pixel tensor.
            for patch_count in patch_counts:
                context = IMG_CONTEXT * self.num_image_token * patch_count
                expanded = IMG_START + context + IMG_END
                text = text.replace('<image>', expanded, 1)

            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

    def _language_model_output_embeddings():
        # Look up hf_model.model at call time, as the original lambda did.
        return hf_model.model.language_model.get_output_embeddings()

    hf_model.model.img_context_token_id = (
        hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>"))
    hf_model.processor = SkyworkR1VProcessor(hf_model)
    hf_model.model.get_output_embeddings = _language_model_output_embeddings
    hf_model.model.generate = types.MethodType(_internvl_generate,
                                               hf_model.model)
    return hf_model
378433

379434
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
380435
"""Patches and returns an instance of the HfRunner to use for InternVL."""

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
274274
"openai/whisper-large-v3",
275275
"google/paligemma-3b-mix-224",
276276
"google/paligemma2-3b-ft-docci-448",
277+
"Skywork/Skywork-R1V-38B"
277278
])
278279
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
279280
@pytest.mark.parametrize("num_batches", [32])

tests/models/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ def check_available_online(
301301
tokenizer="facebook/bart-base",
302302
trust_remote_code=True), # noqa: E501
303303
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
304+
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
304305
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
305306
}
306307

vllm/entrypoints/chat_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ def _placeholder_str(self, modality: ModalityStr,
423423
return self._cached_token_str(self._tokenizer,
424424
hf_config.image_token_index)
425425
if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
426-
"NVLM_D", "h2ovl_chat"):
426+
"skywork_chat", "NVLM_D", "h2ovl_chat"):
427427
return "<image>"
428428
if model_type == "mllama":
429429
return "<|image|>"

vllm/model_executor/models/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@
189189
# [Encoder-decoder]
190190
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
191191
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
192+
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
192193
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
193194
}
194195

0 commit comments

Comments
 (0)