Commit c9d3ecf

[VLM] Merged multi-modal processor for Molmo (vllm-project#12966)
1 parent fdcf64d commit c9d3ecf

File tree

9 files changed: +750 -498 lines


docs/source/models/supported_models.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -793,7 +793,7 @@ See [this page](#generative-models) for more information on how to use generativ
 - * `MolmoForCausalLM`
   * Molmo
   * T + I
-  * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
+  * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.
   * ✅︎
   * ✅︎
   * ✅︎
```

tests/models/decoder_only/language/test_models.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -27,7 +27,7 @@
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     pytest.param(
-        "THUDM/chatglm3-6b",  # ChatGLM (text-only)
+        "THUDM/chatglm3-6b",  # chatglm (text-only)
     ),
     pytest.param(
         "meta-llama/Llama-3.2-1B-Instruct",  # llama
```

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -404,11 +404,10 @@
     "molmo": VLMTestInfo(
         models=["allenai/Molmo-7B-D-0924"],
         test_type=(VLMTestType.IMAGE),
-        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:",  # noqa: E501
+        prompt_formatter=identity,
         max_model_len=4096,
         max_num_seqs=2,
-        image_size_factors=[(),(1.0, 1.0, 1.0)],
-        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
+        patch_hf_runner=model_utils.molmo_patch_hf_runner,
         postprocess_inputs=model_utils.molmo_post_processor,
     ),
     # Tests for phi3v currently live in another file because of a bug in
```
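The switch from the inline "User: ... Assistant:" lambda to `identity` means the test harness now passes prompts through unformatted. For reference, a minimal sketch of such a pass-through formatter (the real `identity` helper is imported by the test file rather than defined in this diff):

```python
def identity(value: str) -> str:
    """Pass-through prompt formatter: the prompt is returned exactly as
    given, with no extra chat-style wrapping applied by the test harness."""
    return value
```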

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 21 additions & 77 deletions

```diff
@@ -6,7 +6,7 @@
 import re
 import types
 from pathlib import PosixPath
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union

 import torch
 from PIL.Image import Image
@@ -17,9 +17,7 @@
 from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

-from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
-                           PromptImageInput, PromptVideoInput, _ImageAssets)
-from ....utils import TokensTextLogprobs
+from .....conftest import HfRunner, ImageAsset, _ImageAssets
 from .types import RunnerOutput


@@ -522,74 +520,7 @@ def _generate(self, *args, **kwargs):
     return hf_model


-def _generate_greedy_logprobs_limit(
-        self,
-        prompts: List[str],
-        max_tokens: int,
-        num_logprobs: int,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        **kwargs: Any,
-) -> List[TokensTextLogprobs]:
-    all_inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
-
-    # Process in batches for inference.
-    if len(all_inputs):
-        input_ids_lst = []
-        images_lst = []
-        images_input_idx_lst = []
-        imges_masks_lst = []
-        for inputs in all_inputs:
-            input_ids_lst.append(inputs["input_ids"])
-            images_lst.append(inputs["images"])
-            images_input_idx_lst.append(inputs["image_input_idx"])
-            imges_masks_lst.append(inputs["image_masks"])
-        batch_inputs = {}
-        batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
-        batch_inputs['images'] = torch.cat(images_lst, dim=0)
-        batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
-                                                    dim=0)
-        batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
-
-    outputs = self.model.generate_from_batch(
-        batch=self.wrap_device(batch_inputs,
-                               device=self.model.device.type),
-        generation_config=GenerationConfig(
-            max_new_tokens=max_tokens,
-            stop_strings="<|endoftext|>",
-            do_sample=False,
-        ),
-        tokenizer=self.tokenizer,
-        output_hidden_states=True,
-        return_dict_in_generate=True,
-    )
-
-    all_logprobs: List[List[Dict[int, float]]] = []
-    all_output_ids: List[List[int]] = []
-    all_output_strs: List[str] = []
-
-    for index in range(len(all_inputs)):
-        (
-            seq_logprobs_lst,
-            output_len,
-        ) = self._hidden_states_to_logprobs(outputs.hidden_states,
-                                            num_logprobs)
-        all_logprobs.append(seq_logprobs_lst)
-        seq_ids = outputs.sequences[index]
-        output_ids = seq_ids[-output_len:]
-        all_output_ids.append(output_ids.tolist())
-        all_output_strs.append(self.tokenizer.decode(output_ids))
-    outputs = zip(all_output_ids, all_output_strs, all_logprobs)
-    return [(output_ids, output_str, output_logprobs)
-            for output_ids, output_str, output_logprobs in outputs]
-
-
-####### Molmo-specific HuggingFace runner patchers
-def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Molmo."""
     hf_processor = hf_model.processor

@@ -598,10 +529,23 @@ def _processor(*args, **kwargs):

     hf_model.processor = _processor

-    setattr(  # noqa: B010
-        hf_model,
-        "generate_greedy_logprobs_limit",
-        types.MethodType(_generate_greedy_logprobs_limit, hf_model),
-    )
+    def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
+        batch = {
+            k: kwargs.pop(k)
+            for k in ("input_ids", "images", "image_input_idx", "image_masks")
+            if k in kwargs
+        }
+
+        return self.generate_from_batch(
+            batch,
+            generation_config=GenerationConfig(
+                max_new_tokens=max_new_tokens,
+                stop_strings="<|endoftext|>",
+                do_sample=do_sample,
+            ),
+            **kwargs,
+        )
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)

     return hf_model
```
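The deleted `_generate_greedy_logprobs_limit` override is replaced by the much smaller `_generate` adapter above: rather than reimplementing the whole batched-logprobs pipeline, the patch rebinds `generate` on the underlying HF model so the runner's generic generation path drives Molmo's remote-code `generate_from_batch` API. A self-contained sketch of that rebinding pattern, with a hypothetical stand-in class in place of the real Molmo model:

```python
import types


class RemoteCodeModel:
    """Stand-in for Molmo's remote-code model, which exposes
    generate_from_batch() rather than the standard HF generate()."""

    def generate_from_batch(self, batch, generation_config=None, **kwargs):
        print("batch keys:", sorted(batch))
        print("generation config:", generation_config)
        return ["<generated tokens>"]


def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
    # Split the model-specific tensors out of the generic generate() kwargs;
    # everything else (tokenizer, output flags, ...) passes through unchanged.
    batch = {
        k: kwargs.pop(k)
        for k in ("input_ids", "images", "image_input_idx", "image_masks")
        if k in kwargs
    }
    config = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    return self.generate_from_batch(batch, generation_config=config, **kwargs)


model = RemoteCodeModel()
# Rebind generate() on this one instance: callers written against the
# standard generate() interface now transparently hit generate_from_batch().
model.generate = types.MethodType(_generate, model)
model.generate(input_ids=[[1, 2, 3]], max_new_tokens=8, do_sample=False)
```

In the actual patch the config is a `transformers.GenerationConfig` with `stop_strings="<|endoftext|>"`; a plain dict stands in here only to keep the sketch dependency-free.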

tests/models/multimodal/processing/test_common.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -168,6 +168,8 @@ def _test_processing_correctness(
         "mistral-community/pixtral-12b",
         "openbmb/MiniCPM-o-2_6",
         "openbmb/MiniCPM-V-2_6",
+        "allenai/Molmo-7B-D-0924",
+        "allenai/Molmo-7B-O-0924",
         "nvidia/NVLM-D-72B",
         "Qwen/Qwen-VL-Chat",
         "Qwen/Qwen2-VL-2B-Instruct",
```

tests/models/registry.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -256,6 +256,7 @@ def check_available_online(
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6",
                                 trust_remote_code=True),
     "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
+                                        extras={"olmo": "allenai/Molmo-7B-O-0924"},  # noqa: E501
                                         trust_remote_code=True),
     "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
                               trust_remote_code=True),
```
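The new `extras` entry registers a second Molmo checkpoint under the same architecture, so tests that iterate over the registry can cover both the D and O variants. A rough sketch of the pattern, assuming a simplified stand-in for `_HfExamplesInfo` (the real dataclass in `tests/models/registry.py` carries additional fields):

```python
from dataclasses import dataclass, field
from typing import Mapping


@dataclass(frozen=True)
class ExamplesInfo:
    """Simplified stand-in for _HfExamplesInfo: one default checkpoint per
    architecture, plus named alternate checkpoints under `extras`."""
    default: str
    extras: Mapping[str, str] = field(default_factory=dict)
    trust_remote_code: bool = False


molmo = ExamplesInfo(
    "allenai/Molmo-7B-D-0924",
    extras={"olmo": "allenai/Molmo-7B-O-0924"},
    trust_remote_code=True,
)

# Tests parameterized over an architecture can then visit every variant:
for model_id in (molmo.default, *molmo.extras.values()):
    print(model_id)
```

Under this scheme, adding coverage for a new checkpoint is a one-line registry change, which is exactly what this diff does for `allenai/Molmo-7B-O-0924`.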
