Skip to content

Commit aadb656

Browse files
[Misc] Clean up Kimi-VL (#16833)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 87e067d commit aadb656

File tree

3 files changed

+20
-44
lines changed

3 files changed

+20
-44
lines changed

examples/offline_inference/vision_language.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
376376

377377
engine_args = EngineArgs(
378378
model="moonshotai/Kimi-VL-A3B-Instruct",
379-
max_model_len=4096,
380-
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
381379
trust_remote_code=True,
380+
max_model_len=4096,
381+
limit_mm_per_prompt={"image": 1},
382382
)
383383

384384
return ModelRequestData(

examples/offline_inference/vision_language_multi_image.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
331331

332332
engine_args = EngineArgs(
333333
model=model_name,
334+
trust_remote_code=True,
334335
max_model_len=4096,
335336
max_num_seqs=4,
336-
tensor_parallel_size=1,
337337
limit_mm_per_prompt={"image": len(image_urls)},
338-
trust_remote_code=True,
339338
)
340339

341340
placeholders = [{"type": "image", "image": url} for url in image_urls]

vllm/model_executor/models/kimi_vl.py

Lines changed: 17 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,6 @@
5656
from vllm.config import VllmConfig
5757
from vllm.distributed import (get_tensor_model_parallel_rank,
5858
get_tensor_model_parallel_world_size)
59-
from vllm.logger import init_logger
6059
from vllm.model_executor.layers.fused_moe import FusedMoE
6160
from vllm.model_executor.layers.logits_processor import LogitsProcessor
6261
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@
7069
from vllm.model_executor.models.utils import merge_multimodal_embeddings
7170
from vllm.model_executor.sampling_metadata import SamplingMetadata
7271
from vllm.multimodal import MULTIMODAL_REGISTRY
73-
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
74-
NestedTensors)
72+
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
73+
MultiModalKwargs, NestedTensors)
7574
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
7675
MultiModalDataItems)
7776
from vllm.multimodal.processing import (BaseMultiModalProcessor,
7877
BaseProcessingInfo, PromptReplacement,
7978
PromptUpdate)
80-
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
79+
from vllm.multimodal.profiling import BaseDummyInputsBuilder
8180
from vllm.sequence import IntermediateTensors
8281
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
8382
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
8483

8584
from .utils import is_pp_missing_parameter, maybe_prefix
8685

87-
logger = init_logger(__name__)
88-
8986

9087
# For dummy input only
9188
@dataclass
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
143140
def get_hf_config(self):
144141
return self.ctx.get_hf_config(KimiVLConfig)
145142

143+
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
144+
return {"image": None}
145+
146146
def get_num_image_tokens(
147147
self,
148148
*,
@@ -180,58 +180,35 @@ def get_num_image_tokens(
180180
token_width = (width + pad_width) // (kernel_size[1] * patch_size)
181181
return int(token_height * token_width)
182182

183-
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
184-
# None means unlimited
185-
return {"image": None}
186-
187-
def get_mm_max_tokens_per_item(
188-
self,
189-
seq_len: int,
190-
mm_counts: Mapping[str, int],
191-
) -> Mapping[str, int]:
192-
return {
193-
"image":
194-
self.get_num_image_tokens(
195-
image_width=MaxImageTokenMeta.width,
196-
image_height=MaxImageTokenMeta.height,
197-
),
198-
}
199-
200183
@property
201184
def image_token_id(self) -> int:
202185
return self.get_hf_config().media_placeholder_token_id
203186

204187

205188
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
206189

207-
def __init__(self, info: KimiVLProcessingInfo) -> None:
208-
super().__init__(info)
190+
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
191+
num_images = mm_counts.get("image", 0)
192+
193+
processor = self.info.get_hf_processor()
194+
image_token = processor.image_token
209195

210-
self.image_token_id = self.info.image_token_id
211-
self.image_token = self.info.get_tokenizer().decode(
212-
self.image_token_id)
196+
return image_token * num_images
213197

214-
def get_dummy_processor_inputs(
198+
def get_dummy_mm_data(
215199
self,
216200
seq_len: int,
217201
mm_counts: Mapping[str, int],
218-
) -> ProcessorInputs:
202+
) -> MultiModalDataDict:
219203
num_images = mm_counts.get("image", 0)
220204

221-
width = MaxImageTokenMeta.width
222-
height = MaxImageTokenMeta.height
223-
mm_data = {
205+
return {
224206
"image":
225-
self._get_dummy_images(width=width,
226-
height=height,
207+
self._get_dummy_images(width=MaxImageTokenMeta.width,
208+
height=MaxImageTokenMeta.height,
227209
num_images=num_images)
228210
}
229211

230-
return ProcessorInputs(
231-
prompt_text=self.image_token * num_images,
232-
mm_data=mm_data,
233-
)
234-
235212

236213
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
237214

0 commit comments

Comments
 (0)