
Commit 19a139e

skyworkr1v update
Signed-off-by: jiacai.liu <[email protected]>
1 parent 61c7a1b commit 19a139e

File tree: 12 files changed, +1157 additions, -11 deletions

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 13 additions & 0 deletions
@@ -316,6 +316,19 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
     ),
+    "skywork_r1v": VLMTestInfo(
+        models=["Skywork/Skywork-R1V-38B"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>\nWhat is the season?",
+        }),
+        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
+        max_model_len=4096,
+        use_tokenizer_eos=True,
+        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
+    ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
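
For reference, a minimal standalone sketch (not part of the commit) of what the prompt_formatter above produces for one of the single-image prompts; the template string and prompt text are copied verbatim from the test entry:

# Reproduces the lambda from the "skywork_r1v" VLMTestInfo entry above.
prompt_formatter = lambda img_prompt: (
    f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n")

img_prompt = "<image>\nWhat's the content in the center of the image?"
print(prompt_formatter(img_prompt))
# <|begin▁of▁sentence|><|User|>
# <image>
# What's the content in the center of the image?<|Assistant|><think>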

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 57 additions & 0 deletions
@@ -376,6 +376,63 @@ def __call__(self, text: str, images: Union[Image, list[Image]],
     return hf_model
 
 
+def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
+
+    class SkyworkR1VProcessor:
+        """A simple processor for SkyworkR1V."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
+                                                     trust_remote_code=True)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Union[Image, list[Image]],
+                     **kwargs):
+            from vllm.model_executor.models.skyworkr1v import (
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_skyworkr1v)
+            images = [images] if isinstance(images, Image) else images
+            pixel_values = [
+                image_to_pixel_values_skyworkr1v(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ) for image in images
+            ]
+            num_patches_list = [
+                pixel_value.shape[0] for pixel_value in pixel_values
+            ]
+            pixel_values = torch.cat(pixel_values, dim=0)
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
+        "<IMG_CONTEXT>")
+    hf_model.model.img_context_token_id = img_context_token_id
+    hf_model.processor = SkyworkR1VProcessor(hf_model)
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.generate = types.MethodType(_internvl_generate,
+                                               hf_model.model)
+    return hf_model
+
+
 def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for InternVL."""
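
As a rough illustration of the placeholder expansion that SkyworkR1VProcessor.__call__ performs above (the token strings, per-patch token count, and patch counts below are hypothetical stand-ins; the real values come from the model's tokenizer and config):

# Hypothetical stand-ins for IMG_START / IMG_CONTEXT / IMG_END and the
# per-image patch counts; in the test they are read from the SkyworkR1V model.
IMG_START, IMG_CONTEXT, IMG_END = "<img>", "<IMG_CONTEXT>", "</img>"
num_image_token = 256        # assumed tokens per patch (model-dependent)
num_patches_list = [7, 13]   # assumed patch counts, one entry per image

text = "<image>\n<image>\nDescribe the two images in short."
for num_patches in num_patches_list:
    context_tokens = IMG_CONTEXT * num_image_token * num_patches
    image_tokens = IMG_START + context_tokens + IMG_END
    # Each <image> placeholder is replaced exactly once, in order.
    text = text.replace("<image>", image_tokens, 1)
# `text` now carries the expanded image token sequence for each image and is
# ready to be tokenized together with pixel_values.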

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
     "openai/whisper-large-v3",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
+    "Skywork/Skywork-R1V-38B",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

tests/models/registry.py

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ def check_available_online(
                                                          tokenizer="facebook/bart-base",
                                                          trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
+    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }

vllm/entrypoints/chat_utils.py

Lines changed: 1 addition & 1 deletion
@@ -423,7 +423,7 @@ def _placeholder_str(self, modality: ModalityStr,
             return self._cached_token_str(self._tokenizer,
                                           hf_config.image_token_index)
         if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
-                          "NVLM_D", "h2ovl_chat"):
+                          "skywork_chat", "NVLM_D", "h2ovl_chat"):
             return "<image>"
         if model_type == "mllama":
             return "<|image|>"

vllm/model_executor/models/registry.py

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
+    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
     "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }
