@@ -411,6 +411,10 @@ def update_from_output(
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
         outputs: List[EngineCoreOutput] = []
+
+        # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
+        # loop can be a performance bottleneck. We should do our best to avoid
+        # expensive operations inside the loop.
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -421,13 +425,15 @@ def update_from_output(
 
             cached_encoder_input_ids = (
                 self.encoder_cache_manager.get_cached_input_ids(request))
-            for input_id in list(cached_encoder_input_ids):
-                start_pos = request.mm_positions[input_id]["offset"]
-                num_tokens = request.mm_positions[input_id]["length"]
-                if start_pos + num_tokens <= request.num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    self.encoder_cache_manager.free(request, input_id)
+            # OPTIMIZATION: Avoid list(set) if the set is empty.
+            if cached_encoder_input_ids:
+                for input_id in list(cached_encoder_input_ids):
+                    start_pos = request.mm_positions[input_id]["offset"]
+                    num_tokens = request.mm_positions[input_id]["length"]
+                    if start_pos + num_tokens <= request.num_computed_tokens:
+                        # The encoder output is already processed and stored
+                        # in the decoder's KV cache.
+                        self.encoder_cache_manager.free(request, input_id)
 
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]
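For context on why the guard is worth it: this loop runs once per scheduler step over every running request, so even a cheap list() call on an empty set is paid roughly len(self.running) times per step. Below is a minimal stand-alone micro-benchmark sketch, not part of the PR, that contrasts the old and new shapes of the code in the common case where a request has no cached encoder inputs; the variable names mirror the diff, and absolute timings will vary by machine and Python version.

import timeit

# Common case assumed here: text-only request, so no cached encoder inputs.
cached_encoder_input_ids: set = set()

def without_guard() -> None:
    # Old shape: always materialize a list, even when the set is empty.
    for input_id in list(cached_encoder_input_ids):
        pass

def with_guard() -> None:
    # New shape: a truthiness check skips the list() allocation entirely
    # when there is nothing to free.
    if cached_encoder_input_ids:
        for input_id in list(cached_encoder_input_ids):
            pass

N = 1_000_000
print("list() every time    :", timeit.timeit(without_guard, number=N))
print("guarded by truthiness:", timeit.timeit(with_guard, number=N))

The per-call saving is tiny, but multiplied by up to ~1K running requests per step (as the NOTE in the diff points out), avoiding the allocation in the hot path is a reasonable trade for one extra branch.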