
Commit 7e5655a

comaniac authored and tjtanaa committed
[V1] Add uncache_blocks (vllm-project#12333)
1 parent 8dab4e9 commit 7e5655a

File tree

2 files changed: +61 -2 lines changed

tests/v1/core/test_prefix_caching.py

Lines changed: 30 additions & 0 deletions

@@ -626,3 +626,33 @@ def test_reset_prefix_cache():
     assert manager.reset_prefix_cache()
     assert not manager.cached_block_hash_to_block
     assert all([blk.block_hash is None for blk in manager.block_pool])
+
+
+def test_uncache_blocks():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+
+    req0 = make_request("0", list(range(30)))
+    blocks = manager.allocate_slots(req0, 30, [])
+    assert [b.block_id for b in blocks] == [0, 1]
+    assert len(manager.cached_block_hash_to_block) == 1
+
+    req0.num_computed_tokens = 30
+
+    # Simulate speculative tokens.
+    for _ in range(5):
+        req0.append_output_token_ids(8)
+    manager.append_slots(req0, 5)
+    assert len(manager.cached_block_hash_to_block) == 2
+
+    # After sampling, assuming only 1 token is accepted.
+    req0.num_computed_tokens = 31
+    num_uncached_blocks = manager.uncache_blocks(req0)
+    assert num_uncached_blocks == 1
+    assert len(manager.cached_block_hash_to_block) == 1
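For readers tracing the assertions, the cached-block counts (1, then 2, then 1) follow directly from the block arithmetic. This is a worked sketch, not part of the commit; the only value assumed is the test's block_size=16:

# Worked block arithmetic behind the test's assertions (block_size = 16).
block_size = 16

# Initial allocation of a 30-token prompt: 30 // 16 = 1 full block,
# so only block 0 is hashed and cached (block 1 is partially filled).
assert 30 // block_size == 1

# Five speculative tokens bring the total to 35: 35 // 16 = 2 full
# blocks, so block 1 also becomes full and is cached.
assert 35 // block_size == 2

# Only one speculative token is accepted (num_computed_tokens = 31):
# 31 // 16 = 1 full block, so block 1 is no longer full and
# uncache_blocks evicts its hash entry, returning 1.
assert 31 // block_size == 1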

vllm/v1/core/kv_cache_manager.py

Lines changed: 31 additions & 2 deletions
@@ -285,6 +285,29 @@ def free(self, request: Request) -> None:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
+    def uncache_blocks(self, request: Request) -> int:
+        """Uncache the blocks that are no longer full based on the
+        num_computed_tokens in the given request. This happens when
+        the blocks were full and cached due to speculative tokens, but the
+        speculative tokens are not accepted.
+
+        Args:
+            request: The request.
+
+        Returns:
+            The number of uncached blocks.
+        """
+        blocks = self.req_to_blocks[request.request_id]
+        num_computed_tokens = request.num_computed_tokens
+        num_full_blocks = num_computed_tokens // self.block_size
+        num_uncached_blocks = 0
+        for block in blocks[num_full_blocks:]:
+            # If the block is not cached, the following blocks are not cached.
+            if not self._maybe_evict_cached_block(block):
+                break
+            num_uncached_blocks += 1
+        return num_uncached_blocks
+
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
         flows to invalid prefix caching after the weights are updated,
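To place the new method in context, here is a minimal caller-side sketch of where uncache_blocks could fit in a speculative-decoding step. This is not part of the commit; rollback_rejected_tokens, num_draft, and num_accepted are hypothetical names, and request.num_computed_tokens is assumed writable, as the test above assumes:

def rollback_rejected_tokens(manager, request, num_draft: int,
                             num_accepted: int) -> int:
    # Hypothetical helper: roll the request back to its accepted
    # prefix, since rejected draft tokens no longer count as computed.
    request.num_computed_tokens -= num_draft - num_accepted
    # Evict cache entries for blocks that the rejected tokens had
    # filled; returns how many blocks were uncached.
    return manager.uncache_blocks(request)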
@@ -386,21 +409,24 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
 
             # If the block is cached, evict it.
             if self.enable_caching:
-                self._evict_cached_block(curr_block)
+                self._maybe_evict_cached_block(curr_block)
 
             curr_block.incr_ref()
             ret.append(curr_block)
             idx += 1
 
         return ret
 
-    def _evict_cached_block(self, block: KVCacheBlock) -> None:
+    def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
         """
         If a block is cached in `cached_block_hash_to_block`, we reset its hash
         metadata and evict it from the cache.
 
         Args:
             block: The block to evict.
+
+        Returns:
+            True if the block is evicted, False otherwise.
         """
         block_hash = block.block_hash
         if block_hash and block_hash in self.cached_block_hash_to_block:
@@ -410,6 +436,9 @@ def _evict_cached_block(self, block: KVCacheBlock) -> None:
             if len(self.cached_block_hash_to_block[block_hash]) == 0:
                 del self.cached_block_hash_to_block[block_hash]
 
+            return True
+        return False
+
     def _get_cached_block(self,
                           block_hash: BlockHashType) -> Optional[KVCacheBlock]:
         """Get a cached block by the block hash, or None if cache miss.
