update based on review

heheda12345 · heheda12345 · commit 941f770ff2a9 · 2025-03-30T09:26:48.000-07:00
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
@@ -36,10 +36,10 @@ def run_one_case(block_is_cached, expect_length):
                     i: block_pool.blocks[i + 10]
                 }
 
-        computed_blocks = manager.get_longest_cached_prefix(block_hash_list)
+        computed_blocks = manager.find_longest_cache_hit(block_hash_list)
         assert len(computed_blocks) == expect_length
 
-        assert all(block == block_pool.get_null_block()
+        assert all(block == block_pool.null_block
                    for block in computed_blocks[:expect_length - 2])
         for i in range(2):
             if i < expect_length:
@@ -67,7 +67,7 @@ def run_one_case(block_is_cached, expect_length):
     ], 8)
 
 
-def test_sliding_window_remove_useless_blocks():
+def test_sliding_window_remove_skipped_blocks():
     sliding_window_spec = SlidingWindowSpec(
         block_size=2,
         num_kv_heads=1,
@@ -81,59 +81,58 @@ def test_sliding_window_remove_useless_blocks():
 
     manager = SlidingWindowManager(sliding_window_spec, block_pool)
 
-    null_block_id = block_pool.get_null_block().block_id
+    null_block_id = block_pool.null_block.block_id
 
     def id_to_block_table(ids):
         return [
             KVCacheBlock(id_)
-            if id_ != null_block_id else block_pool.get_null_block()
-            for id_ in ids
+            if id_ != null_block_id else block_pool.null_block for id_ in ids
         ]
 
     def assert_block_id(block_table, ids):
         for block, id_ in zip(block_table, ids):
             if id_ == null_block_id:
-                assert block == block_pool.get_null_block()
+                assert block == block_pool.null_block
             else:
                 assert block.block_id == id_
 
     original_block_ids = [
         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
     ]
     block_table = id_to_block_table(original_block_ids)
-    removed = manager.remove_useless_blocks(block_table, 0)
+    removed = manager.remove_skipped_blocks(block_table, 0)
     assert_block_id(removed, [])
     assert_block_id(block_table, original_block_ids)
 
     # 5 tokens are computed. Only token 0 is out of the sliding window. As
     # block 1000 also contains token 1 that is in the sliding window, block 1000
     # cannot be removed.
-    removed = manager.remove_useless_blocks(block_table, 5)
+    removed = manager.remove_skipped_blocks(block_table, 5)
     assert_block_id(removed, [])
     assert_block_id(block_table, original_block_ids)
 
     # 6 tokens are computed. Token 0 & 1 are out of the sliding window.
     # Block 1000 can be removed.
-    removed = manager.remove_useless_blocks(block_table, 6)
+    removed = manager.remove_skipped_blocks(block_table, 6)
     assert_block_id(removed, [original_block_ids[0]])
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 7 tokens are computed. Token 0-2 are out of the sliding window.
     # Cannot remove new block as the block 1001 is still used by token 3.
-    removed = manager.remove_useless_blocks(block_table, 7)
+    removed = manager.remove_skipped_blocks(block_table, 7)
     assert_block_id(removed, [])
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 8 tokens are computed. Token 0-3 are out of the sliding window.
     # Block 1001 can be removed and block 1000 is already removed.
-    removed = manager.remove_useless_blocks(block_table, 8)
+    removed = manager.remove_skipped_blocks(block_table, 8)
     assert_block_id(removed, [original_block_ids[1]])
     assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
 
     # 12 tokens are computed. Token 0-7 are out of the sliding window.
     # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
     # sequence, and is expected to be evicted earlier than 1002, so the order
     # of removed blocks should be [1003, 1002].
-    removed = manager.remove_useless_blocks(block_table, 12)
+    removed = manager.remove_skipped_blocks(block_table, 12)
     assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
     assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
@@ -54,7 +54,7 @@ def __init__(self, num_gpu_blocks: int, enable_caching: bool):
         # To represent a placeholder block with block_id=0.
         # The ref_cnt of null_block is not maintained, needs special care to
         # avoid freeing it.
-        self._null_block = self.free_block_queue.popleft()
+        self.null_block = self.free_block_queue.popleft()
 
     def get_cached_block(self,
                          block_hash: BlockHashType) -> Optional[KVCacheBlock]:
@@ -220,7 +220,7 @@ def touch(self, blocks: list[KVCacheBlock]) -> None:
         for block in blocks:
             # ref_cnt=0 means this block is in the free list (i.e. eviction
             # candidate), so remove it.
-            if block.ref_cnt == 0 and block != self._null_block:
+            if block.ref_cnt == 0 and block != self.null_block:
                 self.free_block_queue.remove(block)
             block.incr_ref()
 
@@ -235,7 +235,7 @@ def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         for block in ordered_blocks:
             block.decr_ref()
             # null_block should not be added to the free list.
-            if block.ref_cnt == 0 and block != self._null_block:
+            if block.ref_cnt == 0 and block != self.null_block:
                 self.free_block_queue.append(block)
 
     def reset_prefix_cache(self) -> bool:
@@ -279,11 +279,3 @@ def get_usage(self) -> float:
             The KV cache usage (between 0.0 and 1.0).
         """
         return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks)
-
-    def get_null_block(self) -> KVCacheBlock:
-        """Get the null block.
-
-        Returns:
-            The null block.
-        """
-        return self._null_block
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
@@ -127,13 +127,16 @@ def get_computed_blocks(
 
         self.prefix_cache_stats.requests += 1
         if request.sampling_params.prompt_logprobs is None:
-            computed_blocks = (self.specialized_manager.
-                               get_longest_cached_prefix(block_hashes))
-            num_computed_tokens = len(computed_blocks) * self.block_size
+            computed_blocks = (
+                self.specialized_manager.find_longest_cache_hit(block_hashes))
 
             self.prefix_cache_stats.queries += len(block_hashes)
             self.prefix_cache_stats.hits += len(computed_blocks)
 
+            # NOTE(woosuk): Since incomplete blocks are not eligible for
+            # sharing, `num_computed_tokens` is always a multiple of
+            # `block_size`.
+            num_computed_tokens = len(computed_blocks) * self.block_size
             return computed_blocks, num_computed_tokens
         else:
             # Skip cache hits for prompt logprobs
@@ -176,11 +179,13 @@ def allocate_slots(
 
         req_blocks = self.req_to_blocks[request.request_id]
 
-        # We can free blocks that are no longer needed even if we cannot
-        # schedule this request due to the limit of free blocks.
+        # Free the blocks that are skipped during the attention computation
+        # (e.g., tokens outside the sliding window).
+        # We can do this even if we cannot schedule this request due to
+        # insufficient free blocks.
         # Should call this function before allocating new blocks to reduce
         # the number of evicted blocks.
-        removed_blocks = self.specialized_manager.remove_useless_blocks(
+        removed_blocks = self.specialized_manager.remove_skipped_blocks(
             req_blocks, request.num_computed_tokens)
         self.block_pool.free_blocks(removed_blocks)
 
@@ -372,7 +377,7 @@ def _free_useless_blocks(self, req_blocks: list[KVCacheBlock],
         """
         # The first call always comes from `get_computed_blocks` which
         # passes `touched=False`.
-        removed_blocks = self.specialized_manager.remove_useless_blocks(
+        removed_blocks = self.specialized_manager.remove_skipped_blocks(
             req_blocks, num_computed_tokens)
         if touched:
             self.block_pool.free_blocks(removed_blocks)
diff --git a/vllm/v1/core/specialized_manager.py b/vllm/v1/core/specialized_manager.py
@@ -30,10 +30,10 @@ def __init__(
         self.block_pool = block_pool
 
     @abstractmethod
-    def get_longest_cached_prefix(
+    def find_longest_cache_hit(
             self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
         """
-        Get the longest cached prefix of the blocks. If no cached prefix is 
+        Get the longest cache hit prefix of the blocks. If no cache hit is 
         found, returns an empty list.
 
         Args:
@@ -48,7 +48,7 @@ def get_longest_cached_prefix(
         raise NotImplementedError
 
     @abstractmethod
-    def remove_useless_blocks(self, blocks: list[KVCacheBlock],
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
                               num_computed_tokens: int) -> list[KVCacheBlock]:
         """
         Remove the blocks that are no longer needed from. The removed blocks 
@@ -66,7 +66,7 @@ def remove_useless_blocks(self, blocks: list[KVCacheBlock],
 
 class FullAttentionManager(SpecializedManager):
 
-    def get_longest_cached_prefix(
+    def find_longest_cache_hit(
             self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
         computed_blocks: list[KVCacheBlock] = []
         for block_hash in block_hashes:
@@ -79,7 +79,7 @@ def get_longest_cached_prefix(
                 break
         return computed_blocks
 
-    def remove_useless_blocks(self, blocks: list[KVCacheBlock],
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
                               num_computed_tokens: int) -> list[KVCacheBlock]:
         # No need to remove blocks for full attention.
         return []
@@ -91,9 +91,9 @@ def __init__(self, kv_cache_spec: SlidingWindowSpec,
                  block_pool: BlockPool):
         super().__init__(kv_cache_spec, block_pool)
         self.sliding_window = kv_cache_spec.sliding_window
-        self._null_block = block_pool.get_null_block()
+        self._null_block = block_pool.null_block
 
-    def get_longest_cached_prefix(
+    def find_longest_cache_hit(
             self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
         # TODO: reduce i by num_block_sliding_window when cache miss, to
         # optimize the time complexity from O(len(block_hashes)) to
@@ -102,22 +102,23 @@ def get_longest_cached_prefix(
         # which is good for low cache hit rate scenarios.
         computed_blocks: list[KVCacheBlock] = [self._null_block
                                                ] * len(block_hashes)
-        num_computed_blocks = 0
+        num_contiguous_blocks = 0
 
         for i in range(len(block_hashes) - 1, -1, -1):
             if cached_block := self.block_pool.get_cached_block(
                     block_hashes[i]):
                 computed_blocks[i] = cached_block
-                num_computed_blocks += 1
-                if num_computed_blocks * self.block_size >= self.sliding_window:
-                    del computed_blocks[i + num_computed_blocks:]
+                num_contiguous_blocks += 1
+                if (num_contiguous_blocks * self.block_size
+                        >= self.sliding_window):
+                    del computed_blocks[i + num_contiguous_blocks:]
                     return computed_blocks
             else:
-                num_computed_blocks = 0
-        del computed_blocks[num_computed_blocks:]
+                num_contiguous_blocks = 0
+        del computed_blocks[num_contiguous_blocks:]
         return computed_blocks
 
-    def remove_useless_blocks(self, blocks: list[KVCacheBlock],
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
                               num_computed_tokens: int) -> list[KVCacheBlock]:
         # Remove the blocks that are no longer be in the sliding window.
         last_useful_token = num_computed_tokens - self.sliding_window
@@ -137,7 +138,7 @@ def remove_useless_blocks(self, blocks: list[KVCacheBlock],
 
 spec_manager_map: dict[type[KVCacheSpec], type[SpecializedManager]] = {
     FullAttentionSpec: FullAttentionManager,
-    SlidingWindowSpec: SlidingWindowManager
+    SlidingWindowSpec: SlidingWindowManager,
 }