1 parent e69a3d6 commit a23fd78
vllm/model_executor/models/qwen2_vl.py
@@ -800,7 +800,11 @@ def _get_vision_info(
         preprocessed_size = ImageSize(width=image_width,
                                       height=image_height)
 
-        grid_t = max(num_frames // temporal_patch_size, 1)
+        # NOTE: Frames are padded to be divisible by `temporal_patch_size`
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
+        padded_num_frames = num_frames + num_frames % temporal_patch_size
+
+        grid_t = max(padded_num_frames // temporal_patch_size, 1)
         grid_h = preprocessed_size.height // patch_size
         grid_w = preprocessed_size.width // patch_size
 
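To make the effect of the padding concrete, here is a minimal standalone sketch (not vLLM code) comparing the old and new `grid_t` computations. It assumes `temporal_patch_size = 2`, the Qwen2-VL default:

# Minimal sketch, assuming temporal_patch_size = 2 (the Qwen2-VL default).
temporal_patch_size = 2

def grid_t_old(num_frames: int) -> int:
    # Pre-fix behavior: an odd frame count is truncated downward.
    return max(num_frames // temporal_patch_size, 1)

def grid_t_new(num_frames: int) -> int:
    # Post-fix behavior: pad the frame count up to a multiple of
    # `temporal_patch_size`, mirroring the HF image processor, which
    # repeats the last frame when the count is not divisible.
    # (Note: the `n + n % p` form rounds up only for p == 2; the
    # general round-up would be n + (-n) % p.)
    padded_num_frames = num_frames + num_frames % temporal_patch_size
    return max(padded_num_frames // temporal_patch_size, 1)

for num_frames in (1, 2, 3, 4, 5):
    print(num_frames, grid_t_old(num_frames), grid_t_new(num_frames))
# num_frames=3: old gives 1, new gives 2 -- matching the number of
# temporal patches the HF processor actually produces.

In short: for videos with an odd number of frames, the previous computation under-counted the temporal grid by one, which this commit corrects by padding the frame count the same way the Hugging Face image processor does.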