@@ -56,7 +56,6 @@
 from vllm.config import VllmConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 
 from .utils import is_pp_missing_parameter, maybe_prefix
 
-logger = init_logger(__name__)
-
 
 # For dummy input only
 @dataclass
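The import churn mirrors the refactor below: `ProcessorInputs` is no longer constructed by the dummy-inputs builder, `MultiModalDataDict` is the return type of the new `get_dummy_mm_data()` hook, and the `init_logger` import goes away together with the now-unused module-level `logger`. A minimal sketch of the dict shape the new hook returns, assuming PIL images are valid items for the `"image"` modality (the builder below creates them via `self._get_dummy_images()`):

from PIL import Image

# Conforms to MultiModalDataDict: modality name -> raw data item(s).
mm_data = {
    "image": [Image.new("RGB", (1024, 1024)) for _ in range(2)],
}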
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(KimiVLConfig)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_num_image_tokens(
         self,
         *,
@@ -180,58 +180,35 @@ def get_num_image_tokens(
         token_width = (width + pad_width) // (kernel_size[1] * patch_size)
         return int(token_height * token_width)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        # None means unlimited
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image":
-            self.get_num_image_tokens(
-                image_width=MaxImageTokenMeta.width,
-                image_height=MaxImageTokenMeta.height,
-            ),
-        }
-
     @property
     def image_token_id(self) -> int:
         return self.get_hf_config().media_placeholder_token_id
 
 
 class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
 
-    def __init__(self, info: KimiVLProcessingInfo) -> None:
-        super().__init__(info)
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
 
-        self.image_token_id = self.info.image_token_id
-        self.image_token = self.info.get_tokenizer().decode(
-            self.image_token_id)
+        return image_token * num_images
 
-    def get_dummy_processor_inputs(
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        width = MaxImageTokenMeta.width
-        height = MaxImageTokenMeta.height
-        mm_data = {
+        return {
             "image":
-            self._get_dummy_images(width=width,
-                                   height=height,
+            self._get_dummy_images(width=MaxImageTokenMeta.width,
+                                   height=MaxImageTokenMeta.height,
                                    num_images=num_images)
         }
 
-        return ProcessorInputs(
-            prompt_text=self.image_token * num_images,
-            mm_data=mm_data,
-        )
-
 
 class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
 
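The builder refactor replaces the single `get_dummy_processor_inputs()` override (and the `__init__` that cached a tokenizer-decoded image token) with two narrower hooks: `get_dummy_text()` builds the placeholder prompt from the HF processor's `image_token`, and `get_dummy_mm_data()` returns only the raw multimodal data. The dropped `get_mm_max_tokens_per_item()` override suggests per-item token budgeting is now derived by the framework from the dummy data rather than from a model-specific formula. A hedged guess at how the base `BaseDummyInputsBuilder` plausibly stitches the two hooks back together during profiling; the `ProcessorInputs` field names are illustrative and not verified against this vLLM revision:

from collections.abc import Mapping

# Import path taken from the pre-change import above.
from vllm.multimodal.profiling import ProcessorInputs

class _BaseBuilderSketch:
    # Assumes a subclass provides get_dummy_text() and get_dummy_mm_data(),
    # as KimiVLDummyInputsBuilder now does.
    def get_dummy_processor_inputs(self, seq_len: int,
                                   mm_counts: Mapping[str, int]):
        prompt_text = self.get_dummy_text(mm_counts)
        mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
        return ProcessorInputs(prompt_text=prompt_text, mm_data=mm_data)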