Commit c7f9ed9
[Feature][v1]: cached_tokens in Chat Completion Response usage
Signed-off-by: chaunceyjiang <[email protected]>
Co-authored-by: simon-mo <[email protected]>
1 parent 355df4c · commit c7f9ed9

5 files changed: +18 −4 lines changed
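For context on what this change enables: with prefix caching on, the per-request cached-token count recorded here can be surfaced in the usage block of an OpenAI-compatible Chat Completion response. Below is a minimal sketch of reading it with the openai client, assuming a vLLM OpenAI-compatible server is already running locally and that it reports the count under usage.prompt_tokens_details.cached_tokens; the server URL, model name, and startup command are illustrative assumptions, not taken from this diff.

    from openai import OpenAI

    # Assumption: a vLLM OpenAI-compatible server started with prefix caching
    # enabled, e.g. `vllm serve <model> --enable-prefix-caching` (illustrative).
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    messages = [{"role": "user", "content": "Summarize prefix caching in one line."}]

    # Send the same prompt twice; the second request can hit the prefix cache.
    for attempt in range(2):
        resp = client.chat.completions.create(model="my-model", messages=messages)
        details = resp.usage.prompt_tokens_details  # may be None if not reported
        cached = details.cached_tokens if details else None
        print(f"attempt {attempt}: prompt_tokens={resp.usage.prompt_tokens}, "
              f"cached_tokens={cached}")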

tests/v1/core/test_scheduler_e2e.py

Lines changed: 10 additions & 1 deletion

@@ -19,11 +19,20 @@ def model() -> LLM:
                enable_prefix_caching=True,
                long_prefill_token_threshold=2,
                max_num_batched_tokens=6,
-               max_num_seqs=3)
+               max_num_seqs=3,
+               block_size=16)
 
 
 def test_concurrent_partial_prefill(model):
     outputs = model.generate([PROMPT] * 3)
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded(model):
+    # 17 tokens will make sure first 16 tokens are cached in a block
+    input_tokens = {"prompt_token_ids": [101] * 17}
+    _ = model.generate([input_tokens])
+    outputs = model.generate([input_tokens])
+    assert outputs[0].num_cached_tokens != 0
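Why 17 tokens: prefix caching reuses KV cache at block granularity, so with block_size=16 only fully filled blocks are eligible for reuse. A small worked sketch of that arithmetic, where the full-block rounding rule is an assumption drawn from the test's own comment rather than from this diff:

    # Only prompt tokens that fill a complete KV block can be served from the
    # prefix cache (assumption based on block_size=16 and the "first 16 tokens"
    # comment in the test above).
    prompt_len = 17
    block_size = 16
    cacheable = (prompt_len // block_size) * block_size
    assert cacheable == 16  # the 17th token does not fill a second block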

vllm/outputs.py

Lines changed: 3 additions & 0 deletions

@@ -103,6 +103,9 @@ class RequestOutput:
         encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                   None if decoder-only.
         num_cached_tokens: The number of tokens with prefix cache hit.
+                           Over the lifetime of the request, this number
+                           may be refreshed if the request is preempted
+                           and resumed.
         kv_transfer_params: The params for remote K/V transfer.
     """
vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 1 deletion

@@ -165,7 +165,7 @@ def get_computed_blocks(self,
             assert self.prefix_cache_stats is not None
             self.prefix_cache_stats.queries += request.num_tokens
             self.prefix_cache_stats.hits += num_computed_tokens
-            request.num_cached_tokens = num_computed_tokens
+
         return KVCacheBlocks(computed_blocks), num_computed_tokens
 
     def allocate_slots(

vllm/v1/core/sched/scheduler.py

Lines changed: 2 additions & 1 deletion

@@ -370,7 +370,8 @@ def schedule(self) -> SchedulerOutput:
 
             # Total computed tokens (allocated in prior step).
             num_computed_tokens = num_prealloc_computed_tokens
-
+            if request.num_cached_tokens < 0:
+                request.num_cached_tokens = num_computed_tokens
             encoder_inputs_to_schedule = None
             new_encoder_budget = encoder_budget
vllm/v1/request.py

Lines changed: 2 additions & 1 deletion

@@ -78,7 +78,8 @@ def __init__(
         self.all_token_ids = ConstantList(self._all_token_ids)
 
         # State
-        self.num_cached_tokens = 0
+        # The number of tokens with prefix cache hits.
+        self.num_cached_tokens = -1
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
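The -1 default works together with the < 0 guard added in scheduler.py above: -1 is a sentinel meaning "not recorded yet", which keeps a legitimate value of 0 (a prompt with no prefix hit) distinguishable from "never scheduled". A minimal standalone sketch of that pattern, using an illustrative class and method name rather than vLLM's actual ones:

    class CachedTokenStat:
        """Illustrative stand-in for the request-side bookkeeping."""

        def __init__(self) -> None:
            # -1 is a sentinel for "not recorded yet"; 0 would be a real
            # measurement of zero cache hits.
            self.num_cached_tokens = -1

        def record(self, num_computed_tokens: int) -> None:
            # Mirrors the scheduler's guard: only the first recording sticks.
            if self.num_cached_tokens < 0:
                self.num_cached_tokens = num_computed_tokens


    stat = CachedTokenStat()
    stat.record(16)
    stat.record(32)  # ignored: a value was already recorded
    assert stat.num_cached_tokens == 16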
