From aee5550911e5fe97e50ad78b621aa1c1d6d37104 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 26 Jan 2025 12:18:41 -0800
Subject: [PATCH 1/4] [V1][Minor] Minor optimizations for update_from_output

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/encoder_cache_manager.py |  6 +++---
 vllm/v1/core/scheduler.py             | 19 ++++++++++++-------
 vllm/v1/request.py                    |  8 +++++---
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 0cd8c806a3e..2a833109e8e 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Dict, List, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
 
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -35,8 +35,8 @@ def allocate(self, request: Request, input_id: int) -> None:
         self.cached[req_id].add(input_id)
         self.num_free_slots -= request.get_num_encoder_tokens(input_id)
 
-    def get_cached_input_ids(self, request: Request) -> Set[int]:
-        return self.cached.get(request.request_id, set())
+    def get_cached_input_ids(self, request: Request) -> Optional[Set[int]]:
+        return self.cached.get(request.request_id, None)
 
     def free(self, request: Request, input_id: int) -> None:
         req_id = request.request_id
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 8ded5e57871..15c184e121e 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -411,6 +411,10 @@ def update_from_output(
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
         outputs: List[EngineCoreOutput] = []
+
+        # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
+        # loop can be a performance bottleneck. We should do our best to avoid
+        # expensive operations inside the loop.
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -421,13 +425,14 @@ def update_from_output(
 
             cached_encoder_input_ids = (
                 self.encoder_cache_manager.get_cached_input_ids(request))
-            for input_id in list(cached_encoder_input_ids):
-                start_pos = request.mm_positions[input_id]["offset"]
-                num_tokens = request.mm_positions[input_id]["length"]
-                if start_pos + num_tokens <= request.num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    self.encoder_cache_manager.free(request, input_id)
+            if cached_encoder_input_ids:
+                for input_id in list(cached_encoder_input_ids):
+                    start_pos = request.mm_positions[input_id]["offset"]
+                    num_tokens = request.mm_positions[input_id]["length"]
+                    if start_pos + num_tokens <= request.num_computed_tokens:
+                        # The encoder output is already processed and stored
+                        # in the decoder's KV cache.
+                        self.encoder_cache_manager.free(request, input_id)
 
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 2cfcd8b63cc..cd71f62ea01 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -91,9 +91,11 @@ def append_output_token_ids(
         token_ids: Union[int, List[int]],
     ) -> None:
         if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        self._output_token_ids.extend(token_ids)
-        self._all_token_ids.extend(token_ids)
+            self._output_token_ids.append(token_ids)
+            self._all_token_ids.append(token_ids)
+        else:
+            self._output_token_ids.extend(token_ids)
+            self._all_token_ids.extend(token_ids)
 
     @property
     def num_tokens(self) -> int:
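A quick sanity check of the single-token fast path introduced above: list.append(x) skips the temporary one-element list that list.extend([x]) allocates on every decoded token. The following stand-alone micro-benchmark is an illustrative sketch only (it is not part of the patch, and absolute numbers vary by Python build):

    import timeit

    def via_extend() -> None:
        out: list = []
        for i in range(1000):
            # Allocates a throwaway one-element list on every iteration.
            out.extend([i])

    def via_append() -> None:
        out: list = []
        for i in range(1000):
            # Appends directly; no temporary list is created.
            out.append(i)

    print("extend([x]):", timeit.timeit(via_extend, number=1000))
    print("append(x):  ", timeit.timeit(via_append, number=1000))
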
From 26b34db9a59987c7dee892f8de6d4207ede80754 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 26 Jan 2025 13:42:53 -0800
Subject: [PATCH 2/4] Fix

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/encoder_cache_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 2a833109e8e..0cd8c806a3e 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple
 
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -35,8 +35,8 @@ def allocate(self, request: Request, input_id: int) -> None:
         self.cached[req_id].add(input_id)
         self.num_free_slots -= request.get_num_encoder_tokens(input_id)
 
-    def get_cached_input_ids(self, request: Request) -> Optional[Set[int]]:
-        return self.cached.get(request.request_id, None)
+    def get_cached_input_ids(self, request: Request) -> Set[int]:
+        return self.cached.get(request.request_id, set())
 
     def free(self, request: Request, input_id: int) -> None:
         req_id = request.request_id

From 3ee5bb2c70cd42e2b5dc7f6a0c7aa66b748b823a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 26 Jan 2025 13:46:52 -0800
Subject: [PATCH 3/4] Minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 15c184e121e..de7fb1a698d 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -425,6 +425,7 @@ def update_from_output(
 
             cached_encoder_input_ids = (
                 self.encoder_cache_manager.get_cached_input_ids(request))
+            # OPTIMIZATION: Avoid list(set) if the set is empty.
             if cached_encoder_input_ids:
                 for input_id in list(cached_encoder_input_ids):
                     start_pos = request.mm_positions[input_id]["offset"]
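The OPTIMIZATION comment above targets the common text-only case, where get_cached_input_ids returns an empty set: the truthiness check skips both the list(...) copy and the loop setup for every such request on every step. A minimal stand-alone sketch of the effect (illustrative only, not part of the patch):

    import timeit

    cached_ids: set = set()  # text-only request: no cached encoder inputs

    def without_guard() -> None:
        # Builds an (empty) list on every call just to iterate zero times.
        for _ in list(cached_ids):
            pass

    def with_guard() -> None:
        # An empty set is falsy, so the copy and the loop are skipped entirely.
        if cached_ids:
            for _ in list(cached_ids):
                pass

    print("no guard:  ", timeit.timeit(without_guard, number=1_000_000))
    print("with guard:", timeit.timeit(with_guard, number=1_000_000))
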
From 9b5f48dcee2b39dc47fc63950824225e6f30feac Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 26 Jan 2025 22:37:06 -0800
Subject: [PATCH 4/4] Revert

Signed-off-by: Woosuk Kwon
---
 vllm/v1/request.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index cd71f62ea01..2cfcd8b63cc 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -91,11 +91,9 @@ def append_output_token_ids(
         token_ids: Union[int, List[int]],
     ) -> None:
         if isinstance(token_ids, int):
-            self._output_token_ids.append(token_ids)
-            self._all_token_ids.append(token_ids)
-        else:
-            self._output_token_ids.extend(token_ids)
-            self._all_token_ids.extend(token_ids)
+            token_ids = [token_ids]
+        self._output_token_ids.extend(token_ids)
+        self._all_token_ids.extend(token_ids)
 
     @property
     def num_tokens(self) -> int:
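Net effect of the series: patch 2 undoes the Optional return type and patch 4 undoes the request.py fast path, so only the scheduler changes survive. A simplified sketch of the surviving logic, using plain dicts and sets as hypothetical stand-ins for EncoderCacheManager state and Request.mm_positions (the offset/length field names follow the diff above; this is not the real vLLM code):

    from typing import Dict, List, Set

    # Hypothetical stand-ins for EncoderCacheManager.cached and
    # Request.mm_positions; not the real vLLM classes.
    cached: Dict[str, Set[int]] = {"req-0": {0, 1}}
    mm_positions: List[Dict[str, int]] = [
        {"offset": 0, "length": 16},
        {"offset": 32, "length": 16},
    ]

    def free_finished_encoder_inputs(req_id: str, num_computed_tokens: int) -> None:
        cached_ids = cached.get(req_id, set())
        # OPTIMIZATION (patch 3): skip the list(set) copy when nothing is cached.
        if cached_ids:
            for input_id in list(cached_ids):
                start = mm_positions[input_id]["offset"]
                length = mm_positions[input_id]["length"]
                if start + length <= num_computed_tokens:
                    # Encoder output already consumed into the KV cache.
                    cached_ids.discard(input_id)

    free_finished_encoder_inputs("req-0", num_computed_tokens=20)
    print(cached)  # {'req-0': {1}}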