Commit c7f9ed9
[Feature][v1]: cached_tokens in Chat Completion Response usage
Signed-off-by: chaunceyjiang <[email protected]>
Co-authored-by: simon-mo <[email protected]>
1 parent 355df4c · commit c7f9ed9

5 files changed: +18 −4 lines changed
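For context on what this change enables: with prefix caching on, the per-request cached-token count recorded here can be surfaced in the usage block of an OpenAI-compatible Chat Completion response. Below is a minimal sketch of reading it with the openai client, assuming a vLLM OpenAI-compatible server is already running locally and that it reports the count under usage.prompt_tokens_details.cached_tokens; the server URL, model name, and startup command are illustrative assumptions, not taken from this diff.

    from openai import OpenAI

    # Assumption: a vLLM OpenAI-compatible server started with prefix caching
    # enabled, e.g. `vllm serve <model> --enable-prefix-caching` (illustrative).
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    messages = [{"role": "user", "content": "Summarize prefix caching in one line."}]

    # Send the same prompt twice; the second request can hit the prefix cache.
    for attempt in range(2):
        resp = client.chat.completions.create(model="my-model", messages=messages)
        details = resp.usage.prompt_tokens_details  # may be None if not reported
        cached = details.cached_tokens if details else None
        print(f"attempt {attempt}: prompt_tokens={resp.usage.prompt_tokens}, "
              f"cached_tokens={cached}")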

tests/v1/core/test_scheduler_e2e.py

Lines changed: 10 additions & 1 deletion

@@ -19,11 +19,20 @@ def model() -> LLM:
                enable_prefix_caching=True,
                long_prefill_token_threshold=2,
                max_num_batched_tokens=6,
-               max_num_seqs=3)
+               max_num_seqs=3,
+               block_size=16)
 
 
 def test_concurrent_partial_prefill(model):
     outputs = model.generate([PROMPT] * 3)
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded(model):
+    # 17 tokens will make sure first 16 tokens are cached in a block
+    input_tokens = {"prompt_token_ids": [101] * 17}
+    _ = model.generate([input_tokens])
+    outputs = model.generate([input_tokens])
+    assert outputs[0].num_cached_tokens != 0
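Why 17 tokens: prefix caching reuses KV cache at block granularity, so with block_size=16 only fully filled blocks are eligible for reuse. A small worked sketch of that arithmetic, where the full-block rounding rule is an assumption drawn from the test's own comment rather than from this diff:

    # Only prompt tokens that fill a complete KV block can be served from the
    # prefix cache (assumption based on block_size=16 and the "first 16 tokens"
    # comment in the test above).
    prompt_len = 17
    block_size = 16
    cacheable = (prompt_len // block_size) * block_size
    assert cacheable == 16  # the 17th token does not fill a second block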

vllm/outputs.py

Lines changed: 3 additions & 0 deletions

@@ -103,6 +103,9 @@ class RequestOutput:
         encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                   None if decoder-only.
         num_cached_tokens: The number of tokens with prefix cache hit.
+                           Over the lifetime of the request, this number
+                           may be refreshed if the request is preempted
+                           and resumed.
         kv_transfer_params: The params for remote K/V transfer.
     """
vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 1 deletion

@@ -165,7 +165,7 @@ def get_computed_blocks(self,
             assert self.prefix_cache_stats is not None
             self.prefix_cache_stats.queries += request.num_tokens
             self.prefix_cache_stats.hits += num_computed_tokens
-            request.num_cached_tokens = num_computed_tokens
+
         return KVCacheBlocks(computed_blocks), num_computed_tokens
 
     def allocate_slots(

vllm/v1/core/sched/scheduler.py

Lines changed: 2 additions & 1 deletion

@@ -370,7 +370,8 @@ def schedule(self) -> SchedulerOutput:
 
             # Total computed tokens (allocated in prior step).
             num_computed_tokens = num_prealloc_computed_tokens
-
+            if request.num_cached_tokens < 0:
+                request.num_cached_tokens = num_computed_tokens
             encoder_inputs_to_schedule = None
             new_encoder_budget = encoder_budget
vllm/v1/request.py

Lines changed: 2 additions & 1 deletion

@@ -78,7 +78,8 @@ def __init__(
         self.all_token_ids = ConstantList(self._all_token_ids)
 
         # State
-        self.num_cached_tokens = 0
+        # The number of tokens with prefix cache hits.
+        self.num_cached_tokens = -1
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
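The -1 default works together with the < 0 guard added in scheduler.py above: -1 is a sentinel meaning "not recorded yet", which keeps a legitimate value of 0 (a prompt with no prefix hit) distinguishable from "never scheduled". A minimal standalone sketch of that pattern, using an illustrative class and method name rather than vLLM's actual ones:

    class CachedTokenStat:
        """Illustrative stand-in for the request-side bookkeeping."""

        def __init__(self) -> None:
            # -1 is a sentinel for "not recorded yet"; 0 would be a real
            # measurement of zero cache hits.
            self.num_cached_tokens = -1

        def record(self, num_computed_tokens: int) -> None:
            # Mirrors the scheduler's guard: only the first recording sticks.
            if self.num_cached_tokens < 0:
                self.num_cached_tokens = num_computed_tokens


    stat = CachedTokenStat()
    stat.record(16)
    stat.record(32)  # ignored: a value was already recorded
    assert stat.num_cached_tokens == 16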
