Commit 2ca014e

simplify specialized manager interface
Signed-off-by: Chen Zhang <[email protected]>
1 parent 818fb83 commit 2ca014e

File tree

4 files changed: +101 -161 lines changed
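
At a glance, the simplified interface this commit converges on looks roughly like the sketch below. It is inferred from the call sites and tests in this diff, not copied from vllm/v1/core/specialized_manager.py (presumably the fourth changed file, whose diff is not shown in this excerpt), so treat the class name and exact signatures as approximations.

    # Sketch only: approximate shape of the simplified specialized-manager
    # interface, reconstructed from the callers below.
    from abc import ABC, abstractmethod

    from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock


    class SpecializedManagerSketch(ABC):

        @abstractmethod
        def get_longest_cached_prefix(
                self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
            """Return the blocks of the longest cached prefix. Blocks that
            fall outside the attention window (e.g. a sliding window) may be
            the null block; callers derive num_computed_tokens from len()."""

        @abstractmethod
        def remove_useless_blocks(
                self, block_table: list[KVCacheBlock],
                num_computed_tokens: int) -> list[KVCacheBlock]:
            """Swap blocks no longer reachable by the attention window for
            the null block and return the removed blocks so the caller can
            free them. The previous is_first_call flag is gone."""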

tests/v1/core/test_specialized_manager.py

Lines changed: 49 additions & 45 deletions
@@ -3,8 +3,7 @@
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
-                                         PrefixLengthRange)
+from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
 from vllm.v1.core.specialized_manager import SlidingWindowManager
 from vllm.v1.kv_cache_interface import SlidingWindowSpec
 
@@ -22,35 +21,50 @@ def test_sliding_window_possible_cached_prefix():
     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
     manager = SlidingWindowManager(sliding_window_spec, block_pool)
 
-    block_is_cached = [
+    def run_one_case(block_is_cached, expect_length):
+        block_hash_list = [
+            BlockHashType(i, ()) for i in range(len(block_is_cached))
+        ]
+
+        block_pool.cached_block_hash_to_block.clear()
+
+        # Mock the block pool with the cached blocks
+        for i, (block_hash,
+                is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
+            if is_cached:
+                block_pool.cached_block_hash_to_block[block_hash] = {
+                    i: block_pool.blocks[i + 10]
+                }
+
+        computed_blocks = manager.get_longest_cached_prefix(block_hash_list)
+        assert len(computed_blocks) == expect_length
+
+        assert all(block == block_pool.get_null_block()
+                   for block in computed_blocks[:expect_length - 2])
+        for i in range(2):
+            if i < expect_length:
+                block_index = expect_length - i - 1
+                assert computed_blocks[
+                    block_index].block_id == block_index + 10
+
+    run_one_case([False] * 10, 0)
+    run_one_case([True], 1)
+    run_one_case([True, False], 1)
+    run_one_case([True, True], 2)
+    run_one_case([True, True, False], 2)
+    run_one_case([True, True, True], 3)
+    run_one_case([True, True, True, False], 3)
+    run_one_case([
         True, True, False, True, False, False, True, True, False, True, True,
         True
-    ]
-    block_hash_list = [
-        BlockHashType(i, ()) for i in range(len(block_is_cached))
-    ]
-
-    # Mock the block pool with the cached blocks
-    for i, (block_hash,
-            is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
-        if is_cached:
-            block_pool.cached_block_hash_to_block[block_hash] = {
-                i: block_pool.blocks[i + 10]
-            }
-
-    ranges, computed_blocks = manager.get_possible_cached_prefix(
-        block_hash_list)
-    assert ranges == [
-        PrefixLengthRange(0, 4),
-        PrefixLengthRange(16, 16),
-        PrefixLengthRange(22, 24)
-    ]
-    expected_computed_blocks = [
-        block_pool.blocks[i +
-                          10] if is_cached else block_pool.get_null_block()
-        for i, is_cached in enumerate(block_is_cached)
-    ]
-    assert computed_blocks == expected_computed_blocks
+    ], 12)
+    run_one_case([
+        True, True, False, True, False, False, True, True, False, False, False
+    ], 8)
+    run_one_case([
+        True, True, False, True, False, False, True, True, False, False, False,
+        True
+    ], 8)
 
 
 def test_sliding_window_remove_useless_blocks():
@@ -87,49 +101,39 @@ def assert_block_id(block_table, ids):
         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
     ]
     block_table = id_to_block_table(original_block_ids)
-    removed = manager.remove_useless_blocks(block_table, 0, is_first_call=True)
+    removed = manager.remove_useless_blocks(block_table, 0)
     assert_block_id(removed, [])
     assert_block_id(block_table, original_block_ids)
 
     # 5 tokens are computed. Only token 0 is out of the sliding window. As
     # block 1000 also contains token 1 that is in the sliding window, block 1000
     # cannot be removed.
-    removed = manager.remove_useless_blocks(block_table,
-                                            5,
-                                            is_first_call=False)
+    removed = manager.remove_useless_blocks(block_table, 5)
     assert_block_id(removed, [])
     assert_block_id(block_table, original_block_ids)
 
     # 6 tokens are computed. Token 0 & 1 are out of the sliding window.
     # Block 1000 can be removed.
-    removed = manager.remove_useless_blocks(block_table,
-                                            6,
-                                            is_first_call=False)
+    removed = manager.remove_useless_blocks(block_table, 6)
     assert_block_id(removed, [original_block_ids[0]])
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 7 tokens are computed. Token 0-2 are out of the sliding window.
     # Cannot remove new block as the block 1001 is still used by token 3.
-    removed = manager.remove_useless_blocks(block_table,
-                                            7,
-                                            is_first_call=False)
+    removed = manager.remove_useless_blocks(block_table, 7)
     assert_block_id(removed, [])
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 8 tokens are computed. Token 0-3 are out of the sliding window.
     # Block 1001 can be removed and block 1000 is already removed.
-    removed = manager.remove_useless_blocks(block_table,
-                                            8,
-                                            is_first_call=False)
+    removed = manager.remove_useless_blocks(block_table, 8)
     assert_block_id(removed, [original_block_ids[1]])
     assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
 
     # 12 tokens are computed. Token 0-7 are out of the sliding window.
     # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
     # sequence, and is expected to be evicted earlier than 1002, so the order
     # of removed blocks should be [1003, 1002].
-    removed = manager.remove_useless_blocks(block_table,
-                                            12,
-                                            is_first_call=False)
+    removed = manager.remove_useless_blocks(block_table, 12)
     assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
     assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
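
The run_one_case expectations above follow a simple rule: with a sliding window that spans two blocks (which is what the assertions on the last two block ids suggest for this test's spec), a prefix of n blocks counts as a hit when its last two blocks are cached, and everything earlier can be served by the null block. The snippet below is an illustrative standalone sketch of that rule, not the SlidingWindowManager implementation, using the test's own patterns as a sanity check.

    # Illustrative only: reproduces the expected lengths in the test above,
    # assuming the sliding window covers the last 2 blocks of a prefix.
    def longest_cached_prefix_len(block_is_cached: list[bool],
                                  window_blocks: int = 2) -> int:
        best = 0
        for n in range(1, len(block_is_cached) + 1):
            # A prefix of n blocks is usable if every block still inside the
            # sliding window (the last `window_blocks` of the prefix) is cached.
            if all(block_is_cached[max(0, n - window_blocks):n]):
                best = n
        return best


    assert longest_cached_prefix_len([False] * 10) == 0
    assert longest_cached_prefix_len([True, True, True, False]) == 3
    assert longest_cached_prefix_len([
        True, True, False, True, False, False, True, True, False, True, True,
        True
    ]) == 12
    assert longest_cached_prefix_len([
        True, True, False, True, False, False, True, True, False, False, False
    ]) == 8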

vllm/v1/core/kv_cache_manager.py

Lines changed: 15 additions & 40 deletions
@@ -127,31 +127,9 @@ def get_computed_blocks(
 
         self.prefix_cache_stats.requests += 1
         if request.sampling_params.prompt_logprobs is None:
-            # Check for cache hits
-            # E.g., for a model with sliding window size 32 (2 * block_size)
-            # computed_blocks = [NULL, 5, 2, NULL, 9, 7, 3, NULL]
-            # We can have the first 3 blocks, 5 blocks, or 6 blocks as
-            # the cached prefix, so the prefix_length should be:
-            # prefix_length = [
-            #     PrefixLengthRange(3 * 16, 3 * 16),
-            #     PrefixLengthRange(6 * 16, 7 * 16)
-            # ]
-            prefix_length, computed_blocks = \
-                self.specialized_manager.get_possible_cached_prefix(
-                    block_hashes)
-            # E.g., num_computed_tokens = 7 * 16
-            num_computed_tokens = prefix_length[-1].end
-            # NOTE(woosuk): Since incomplete blocks are not eligible for
-            # sharing, `num_computed_tokens` should always be a multiple of
-            # `block_size`.
-            assert num_computed_tokens % self.block_size == 0
-            # E.g., computed_blocks = [NULL, 5, 2, NULL, 9, 7, 3]
-            computed_blocks = computed_blocks[:num_computed_tokens //
-                                              self.block_size]
-            # E.g., computed_blocks = [NULL, NULL, NULL, NULL, 9, 7, 3]
-            self._free_useless_blocks(computed_blocks,
-                                      num_computed_tokens,
-                                      touched=False)
+            computed_blocks = (self.specialized_manager.
+                               get_longest_cached_prefix(block_hashes))
+            num_computed_tokens = len(computed_blocks) * self.block_size
 
         self.prefix_cache_stats.queries += len(block_hashes)
         self.prefix_cache_stats.hits += len(computed_blocks)
@@ -196,20 +174,23 @@ def allocate_slots(
 
         new_computed_blocks = new_computed_blocks or []
 
+        req_blocks = self.req_to_blocks[request.request_id]
+
+        # We can free blocks that are no longer needed even if we cannot
+        # schedule this request due to the limit of free blocks.
+        # Should call this function before allocating new blocks to reduce
+        # the number of evicted blocks.
+        removed_blocks = self.specialized_manager.remove_useless_blocks(
+            req_blocks, request.num_computed_tokens)
+        self.block_pool.free_blocks(removed_blocks)
+
         # The number of computed tokens is the number of computed tokens plus
         # the new prefix caching hits
         num_computed_tokens = (request.num_computed_tokens +
                                len(new_computed_blocks) * self.block_size)
         num_required_blocks = cdiv(num_computed_tokens + num_tokens,
                                    self.block_size)
-        req_blocks = self.req_to_blocks[request.request_id]
-        # We can free blocks that are no longer needed even if we cannot
-        # schedule this request due to the limit of free blocks.
-        # Should call this function before allocating new blocks to reduce
-        # the number of evicted blocks.
-        self._free_useless_blocks(req_blocks,
-                                  request.num_computed_tokens,
-                                  touched=True)
+
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(new_computed_blocks))
 
@@ -231,12 +212,6 @@ def allocate_slots(
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
-        # Should call this function before allocating new blocks to reduce
-        # the number of evicted blocks.
-        self._free_useless_blocks(req_blocks,
-                                  request.num_computed_tokens,
-                                  touched=True)
-
         # Append the new computed blocks to the request blocks until now to
         # avoid the case where the new blocks cannot be allocated.
         req_blocks.extend(new_computed_blocks)
@@ -398,6 +373,6 @@ def _free_useless_blocks(self, req_blocks: list[KVCacheBlock],
         # The first call always comes from `get_computed_blocks` which
         # passes `touched=False`.
         removed_blocks = self.specialized_manager.remove_useless_blocks(
-            req_blocks, num_computed_tokens, is_first_call=not touched)
+            req_blocks, num_computed_tokens)
         if touched:
             self.block_pool.free_blocks(removed_blocks)
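
The assertion removed from get_computed_blocks (num_computed_tokens % block_size == 0) is no longer needed: get_longest_cached_prefix only ever returns whole blocks, so the token count is a multiple of the block size by construction. A toy illustration with made-up numbers, not vLLM objects:

    # Toy numbers only, to show the arithmetic of the new get_computed_blocks path.
    block_size = 16
    # Suppose the manager reports a 7-block cached prefix (some entries may be
    # the null block for a sliding-window layer).
    computed_blocks = ["blk"] * 7
    num_computed_tokens = len(computed_blocks) * block_size
    assert num_computed_tokens == 112              # 7 * 16
    assert num_computed_tokens % block_size == 0   # whole blocks by construction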

vllm/v1/core/kv_cache_utils.py

Lines changed: 0 additions & 8 deletions
@@ -686,11 +686,3 @@ def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
         kv_cache_config.num_blocks = min_num_blocks
 
     return kv_cache_configs
-
-
-class PrefixLengthRange(NamedTuple):
-    """
-    A closed interval [start, end] representing a range of valid prefix lengths.
-    """
-    start: int
-    end: int
