
Commit 49f6070

ywang96 authored and Isotr0py committed
[Bugfix] Fix max image feature size for Llava-one-vision (vllm-project#12104)
Signed-off-by: Roger Wang <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
1 parent f31ddb8 commit 49f6070

File tree

3 files changed, +129 -2 lines changed

tests/models/multimodal/processing/test_llava_next.py

Lines changed: 61 additions & 0 deletions
@@ -13,6 +13,67 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,

tests/models/multimodal/processing/test_llava_onevision.py

Lines changed: 62 additions & 0 deletions
@@ -13,6 +13,68 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,

vllm/model_executor/models/llava_onevision.py

Lines changed: 6 additions & 2 deletions
@@ -19,8 +19,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -145,6 +145,10 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
+    def get_image_size_with_most_features(self) -> ImageSize:
+        # NOTE: This hardcoded value is found via processor tests
+        return ImageSize(width=1153, height=944)
+
     def _get_num_frame_tokens(
         self,
         *,
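The NOTE in the new method says that ImageSize(width=1153, height=944) was found empirically via the processor tests. A minimal sketch of how such a value could be derived, assuming a processor built the same way as in the tests above; the function name and the exhaustive aspect-ratio sweep are illustrative, not part of this commit:

import itertools

from vllm.multimodal.parse import ImageSize


def find_image_size_with_most_features(processor, max_aspect_ratio=6.0):
    # Hypothetical helper: sweep candidate sizes (one representative per
    # unique aspect ratio, mirroring the tests above) and keep the size
    # that maximizes the reported image feature size.
    info = processor.info
    seen_aspect_ratios = set[float]()
    best_size, best_tokens = None, -1

    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        if not (1 <= aspect_ratio <= max_aspect_ratio):
            continue
        if aspect_ratio in seen_aspect_ratios:
            continue
        seen_aspect_ratios.add(aspect_ratio)

        num_tokens = info.get_num_image_tokens(image_width=w, image_height=h)
        if num_tokens > best_tokens:
            best_size, best_tokens = ImageSize(w, h), num_tokens

    return best_size, best_tokens

As in the tests, only one size per aspect ratio is checked, on the assumption that swapping width and height does not change the feature size.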

0 commit comments
