Commit b5f00e2

DarkLight1337 authored and tjtanaa committed
[VLM] Avoid unnecessary tokenization (vllm-project#12310)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 8a8edd5 · commit b5f00e2

File tree: 9 files changed (+71, −40 lines)
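Common thread across the model files below: prompt replacements were previously built as placeholder strings (e.g. "<image>" * n) that the multimodal processor had to tokenize again, and are now built as lists of token IDs resolved once through tokenizer.get_vocab(). A minimal sketch of the before/after shape (function names are illustrative, not from the diff):

# Before: build a placeholder string, which must be re-tokenized downstream.
def replacement_as_text(image_token: str, n: int) -> str:
    return image_token * n  # e.g. "<image><image>...<image>"

# After: resolve the token ID once and emit IDs directly.
def replacement_as_ids(vocab: dict[str, int], image_token: str, n: int) -> list[int]:
    image_token_id = vocab[image_token]  # plain dict lookup, no tokenizer call
    return [image_token_id] * n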

vllm/model_executor/models/blip2.py

Lines changed: 10 additions & 2 deletions
@@ -475,15 +475,23 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
+
+        image_token_id = vocab["image"]
         num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
                 target="</s>",
                 replacement=PromptReplacementDetails(
-                    full="<image>" * num_image_tokens + "</s>",
-                    features="<image>" * num_image_tokens,
+                    full=image_tokens + [bos_token_id],
+                    features=image_tokens,
                 ),
             )
         ]
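For reference, a standalone sketch of the lookup this hunk performs (the checkpoint name and the count of 32 query tokens are assumptions based on the stock BLIP-2 configuration, not taken from the diff):

from transformers import AutoTokenizer

# Assumed checkpoint; the diff indexes the vocab with the literal entry "image".
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
vocab = tokenizer.get_vocab()

image_token_id = vocab["image"]       # direct ID lookup, no encode() call
image_tokens = [image_token_id] * 32  # 32 = BLIP-2's default number of query tokens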

vllm/model_executor/models/chameleon.py

Lines changed: 14 additions & 9 deletions
@@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds sep token for chat mode
         tokenizer = self.info.get_tokenizer()
-        sep_token_id: int = \
-            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        sep_token_id = vocab[tokenizer.sep_token]  # type: ignore
 
         return prompt_tokens + [sep_token_id]
 
@@ -141,18 +142,22 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_tokens = processor.image_token * self.info.get_num_image_tokens()
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        image_start_id = vocab[processor.image_start_token]
+        image_token_id = vocab[processor.image_token]
+        image_end_id = vocab[processor.image_end_token]
+
+        num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
-                target="<image>",
+                target=[image_token_id],
                 replacement=PromptReplacementDetails(
-                    full="".join([
-                        processor.image_start_token,
-                        image_tokens,
-                        processor.image_end_token,
-                    ]),
+                    full=([image_start_id] + image_tokens + [image_end_id]),
                     features=image_tokens,
                 ),
             )
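Note that the target also moves from the string "<image>" to a token-ID list, so matching happens against the already-tokenized prompt instead of tokenizing the target on every match. A toy sketch of ID-level matching (IDs invented for illustration):

def find_target(prompt_ids: list[int], target: list[int]) -> int:
    """Index of the first occurrence of target in prompt_ids, or -1."""
    n = len(target)
    for i in range(len(prompt_ids) - n + 1):
        if prompt_ids[i:i + n] == target:
            return i
    return -1

image_token_id = 8711  # hypothetical ID for "<image>"
print(find_target([1, 4100, 8711, 892], [image_token_id]))  # -> 2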

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 4 additions & 2 deletions
@@ -249,8 +249,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self.info.get_hf_processor()
-        image_token_id: int = hf_processor.image_token_id
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token_id = hf_processor.image_token_id
+        assert isinstance(image_token_id, int)
 
         def get_replacement_deepseek_vl2(item_idx: int):
             images = mm_items.get_items(

vllm/model_executor/models/fuyu.py

Lines changed: 4 additions & 1 deletion
@@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds boa_token_id
         tokenizer = self.info.get_tokenizer()
-        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        boa_token_id = vocab["<0x04>"]
 
         return prompt_tokens + [boa_token_id]
 
@@ -202,6 +204,7 @@ def _get_prompt_replacements(
     ) -> list[PromptReplacement]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
+        assert isinstance(bos_token_id, int)
 
         tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id

vllm/model_executor/models/llava.py

Lines changed: 9 additions & 8 deletions
@@ -315,13 +315,14 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
-        processor = self.info.get_hf_processor()
-        image_token = processor.image_token
-        image_break_token = processor.image_break_token
-        image_end_token = processor.image_end_token
+        image_break_id = vocab[processor.image_break_token]
+        image_token_id = hf_config.image_token_index
+        image_end_id = vocab[processor.image_end_token]
 
         vision_config = hf_config.vision_config
         assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ def get_replacement(item_idx: int):
                 image_height=image_size.height,
             )
 
-            tokens = ([image_token] * ncols + [image_break_token]) * nrows
-            tokens[-1] = image_end_token
+            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
+            tokens[-1] = image_end_id
 
-            return "".join(tokens)
+            return tokens
 
         return [
             PromptReplacement(
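Worked example of the grid layout built in get_replacement: for ncols=3, nrows=2, image-token IDs are emitted row by row with a break ID after each row, and the final break is swapped for an end ID (IDs invented for illustration):

IMG, BREAK, END = 10, 11, 12  # hypothetical token IDs

ncols, nrows = 3, 2
tokens = ([IMG] * ncols + [BREAK]) * nrows
tokens[-1] = END
print(tokens)  # [10, 10, 10, 11, 10, 10, 10, 12]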

vllm/model_executor/models/qwen2_audio.py

Lines changed: 11 additions & 9 deletions
@@ -188,7 +188,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self.info.get_hf_processor()
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # Use getattr with default to be compatible with transformers<4.48
         audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
@@ -197,6 +199,10 @@ def _get_prompt_replacements(
         audio_eos_token = getattr(processor, "audio_eos_token",
                                   "<|audio_eos|>")
 
+        audio_token_id = vocab[audio_token]
+        audio_bos_id = vocab[audio_bos_token]
+        audio_eos_id = vocab[audio_eos_token]
+
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
@@ -208,22 +214,18 @@
             audio_output_lengths = audio_output_lens.tolist()
 
         def get_replacement_qwen2_audio(item_idx: int):
-            num_placeholders = audio_output_lengths[item_idx]
-            if num_placeholders == 0:
+            num_features = audio_output_lengths[item_idx]
+            if num_features == 0:
                 audios = mm_items.get_items("audio", AudioProcessorItems)
                 audio = audios.get(item_idx)
                 raise ValueError(
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")
 
-            audio_tokens = audio_token * num_placeholders
+            audio_tokens = [audio_token_id] * num_features
 
             return PromptReplacementDetails(
-                full="".join([
-                    audio_bos_token,
-                    audio_tokens,
-                    audio_eos_token,
-                ]),
+                full=[audio_bos_id] + audio_tokens + [audio_eos_id],
                 features=audio_tokens,
             )
 
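The full/features split in PromptReplacementDetails marks which of the spliced-in IDs actually carry audio features: full is everything inserted into the prompt, features the sub-span that receives the audio embeddings. A toy illustration with invented IDs:

audio_bos_id, audio_token_id, audio_eos_id = 100, 101, 102  # hypothetical IDs
num_features = 4

audio_tokens = [audio_token_id] * num_features
full = [audio_bos_id] + audio_tokens + [audio_eos_id]
# full     -> [100, 101, 101, 101, 101, 102]
# features -> [101, 101, 101, 101]; bos/eos frame the span but hold no features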
vllm/model_executor/models/qwen2_vl.py

Lines changed: 7 additions & 5 deletions
@@ -953,12 +953,14 @@ def _get_prompt_replacements(
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
         placeholder = {
-            "image": hf_processor.image_token,
-            "video": hf_processor.video_token,
+            "image": vocab[hf_processor.image_token],
+            "video": vocab[hf_processor.video_token],
         }
 
         merge_length = image_processor.merge_size**2
@@ -967,13 +969,13 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
             grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, torch.Tensor)
 
-            num_tokens = grid_thw.prod().item() // merge_length
-            return placeholder[modality] * num_tokens
+            num_tokens = int(grid_thw.prod()) // merge_length
+            return [placeholder[modality]] * num_tokens
 
         return [
             PromptReplacement(
                 modality=modality,
-                target=placeholder[modality],
+                target=[placeholder[modality]],
                 replacement=partial(get_replacement_qwen2vl,
                                     modality=modality),
             ) for modality in ("image", "video")
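The placeholder count follows from the patch grid: grid_thw holds (temporal, height, width) patch counts, and merge_size**2 spatial patches merge into one token. A quick check with assumed values (merge_size=2 matches the stock Qwen2-VL image processor; the grid itself is invented):

import torch

grid_thw = torch.tensor([1, 28, 28])  # assumed (t, h, w) patch grid
merge_length = 2 ** 2                 # merge_size**2 patches per token

num_tokens = int(grid_thw.prod()) // merge_length
print(num_tokens)                     # 784 // 4 = 196 placeholder tokens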

vllm/model_executor/models/ultravox.py

Lines changed: 7 additions & 3 deletions
@@ -205,16 +205,20 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        placeholder = hf_processor.audio_token_replacement  # type: ignore
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        replacement_id = vocab[
+            hf_processor.audio_token_replacement]  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
             audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
-            return placeholder * audio_token_len
+            return [replacement_id] * int(audio_token_len)  # type: ignore
 
         return [
             PromptReplacement(
                 modality="audio",
-                target="<|audio|>",
+                target='<|audio|>',
                 replacement=get_replacement_ultravox,
             )
         ]

vllm/transformers_utils/tokenizer.py

Lines changed: 5 additions & 1 deletion
@@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
     tokenizer_all_special_tokens_extended = (
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+    tokenizer_vocab = tokenizer.get_vocab()
     tokenizer_len = len(tokenizer)
 
-    max_token_id = max(tokenizer.get_vocab().values())
+    max_token_id = max(tokenizer_vocab.values())
     # Some tokenizers (e.g., QwenTokenizer) have special tokens that
     # are added and included in the implementation of the vocab_size
     # property, but not in get_vocab(); if there is an implementation
@@ -96,6 +97,9 @@ def all_special_tokens_extended(self):
         def max_token_id(self):
             return max_token_id
 
+        def get_vocab(self):
+            return tokenizer_vocab
+
         def __len__(self):
             return tokenizer_len
 
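get_cached_tokenizer snapshots expensive tokenizer properties once and serves them from a dynamically created subclass; this change adds the vocab dict to that snapshot, since Hugging Face tokenizers generally rebuild the dict on every get_vocab() call. A simplified standalone sketch of the same caching idea (not vLLM's actual class):

def cache_vocab(tokenizer):
    """Return the tokenizer with get_vocab() computed exactly once."""
    cached_vocab = tokenizer.get_vocab()  # built a single time here

    class CachedVocabTokenizer(tokenizer.__class__):
        def get_vocab(self):
            return cached_vocab  # served from the snapshot

    CachedVocabTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
    tokenizer.__class__ = CachedVocabTokenizer
    return tokenizer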