@@ -216,17 +216,18 @@ def get_encoder_dummy_data(
         # Encoder-decoder multimodal models only support v0
         if total_len > seq_len:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning(
+            logger.warning_once(
                 "The encoder sequence length used for profiling ("
-                "max_num_batched_tokens / max_num_seqs = %d) is too short "
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                "is too short "
                 "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
+                f"({total_len} tokens in total, out of which "
+                f"{total_placeholders_by_modality} are reserved for "
                 "multi-modal embeddings). This may cause certain "
                 "multi-modal inputs to fail during inference, even when "
                 "the input text is short. To avoid this, you should "
                 "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.", seq_len, total_len,
-                total_placeholders_by_modality)
+                "and/or reduce `mm_counts`.")

         processor = cast(EncDecMultiModalProcessor, self.processor)
         if processor.pad_dummy_encoder_prompt:
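
Context for the change from `logger.warning` to `logger.warning_once`: a once-only logger is expected to emit a given message a single time per process rather than on every call, and since this diff drops the %-style arguments, the message is now fully rendered with f-strings before it reaches the logger. Below is a minimal sketch of such a helper, assuming an `lru_cache`-based deduplication keyed on the rendered message string; the standalone `warning_once` function and module-level `logger` are illustrative assumptions, not vLLM's actual implementation.

```python
import logging
from functools import lru_cache

logger = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def warning_once(msg: str) -> None:
    # The first call with a given string logs it; later calls with the
    # same string hit the cache and emit nothing.
    logger.warning(msg)
```

In this sketch, keying the cache on the fully rendered string means that messages differing only in their interpolated values (e.g. a different `seq_len`) are treated as distinct warnings.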
@@ -251,17 +252,18 @@ def get_decoder_dummy_data(
         # V0 does not support chunked prefill.
         if total_len > seq_len and not envs.VLLM_USE_V1:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning(
+            logger.warning_once(
                 "The sequence length used for profiling ("
-                "max_num_batched_tokens / max_num_seqs = %d) is too short "
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                "is too short "
                 "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
+                f"({total_len} tokens in total, out of which "
+                f"{total_placeholders_by_modality} are reserved for "
                 "multi-modal embeddings). This may cause certain "
                 "multi-modal inputs to fail during inference, even when "
                 "the input text is short. To avoid this, you should "
                 "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.", seq_len, total_len,
-                total_placeholders_by_modality)
+                "and/or reduce `mm_counts`.")

         if total_len < seq_len:
             prompt_token_ids.extend([0] * (seq_len - total_len))
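
For comparison, the same deduplication can be sketched with only the standard `logging` module, using a `Filter` that drops records whose rendered message has already been seen. This is illustrative only and is not how vLLM wires up `warning_once`.

```python
import logging


class DedupFilter(logging.Filter):
    """Drop log records whose rendered message has been seen before."""

    def __init__(self) -> None:
        super().__init__()
        self._seen: set[str] = set()

    def filter(self, record: logging.LogRecord) -> bool:
        msg = record.getMessage()  # renders %-style args, if any
        if msg in self._seen:
            return False
        self._seen.add(msg)
        return True


demo_logger = logging.getLogger("profiling_demo")
demo_logger.addHandler(logging.StreamHandler())
demo_logger.addFilter(DedupFilter())

for _ in range(3):
    # Emitted only once despite three iterations.
    demo_logger.warning("Sequence length used for profiling is too short.")
```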