rebase and move kv cache group validation

zixi-qi · zixi-qi · commit a12b247a8c2c · 2025-05-15T19:36:52.000-07:00
Signed-off-by: qizixi <qizixi@meta.com>
diff --git a/profile_client.sh b/profile_client.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Run benchmarks/benchmark_serving.py with the vllm backend on a random
+# dataset (2 prompts, max concurrency 2) with --profile, and tee
+# stdout+stderr into a timestamped log file in the current directory.
+export LLAMA_DIR=meta-llama/Llama-3.1-8B-Instruct
+
+python benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model "$LLAMA_DIR" \
+    --dataset-name random \
+    --random-input-len 2000 \
+    --random-output-len 150 \
+    --max-concurrency 2 \
+    --num-prompts 2 \
+    --profile \
+    2>&1 | tee "./base_client_$(date +%Y%m%d_%H%M%S).log"
diff --git a/server.sh b/server.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Launch a vLLM server for Llama-3.1-8B-Instruct with EAGLE speculative
+# decoding and tee its stdout+stderr into a timestamped log file.
+
+# Configuration of environment variables
+export VLLM_USE_MODELSCOPE=False
+export VLLM_TORCH_PROFILER_DIR=~/vllm_profile
+export LLAMA_DIR=meta-llama/Llama-3.1-8B-Instruct
+export VLLM_USE_V1=1
+# Eagle config
+spec_dec_config='{"method": "eagle", "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", "num_speculative_tokens": 3, "draft_tensor_parallel_size": 1, "max_model_len": 2048}'
+# Make sure the log directory exists; otherwise tee cannot open the log file.
+log_dir="/data/users/$USER/logs/server"
+mkdir -p "$log_dir"
+# Command to run the vllm server
+vllm serve "$LLAMA_DIR" --disable-log-requests \
+    -tp 1 \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens=8000 \
+    --speculative-config="$spec_dec_config" \
+    --num-lookahead-slots=3 \
+    --max-model-len=8192 \
+    --enable-prefix-caching \
+    --trust-remote-code \
+    2>&1 | tee "$log_dir/vllm_17b16e_vllm_serving$(date +%Y%m%d_%H%M%S).log"
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
@@ -14,6 +14,7 @@
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.sample.metadata import SamplingMetadata
 
 logger = init_logger(__name__)
@@ -352,6 +353,24 @@ def dummy_run(
                 hidden_states=self.hidden_states[:num_tokens],
             )
 
+    def validate_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
+        """Check that every eagle attention layer is in one KVCacheGroup.
+
+        All eagle layers currently share the same AttentionMetadata, which
+        requires them to live in a single group. May extend to multiple
+        AttentionMetadata in the future.
+        Raises AssertionError unless exactly one group is referenced.
+        """
+        # Map each layer name to the index of the group that lists it.
+        group_of_layer: dict[str, int] = {
+            layer_name: group_idx
+            for group_idx, group in enumerate(kv_cache_config.kv_cache_groups)
+            for layer_name in group.layer_names
+        }
+        eagle_groups = {group_of_layer[n] for n in self.attn_layer_names}
+        assert len(eagle_groups) == 1, (
+            "All eagle layers should belong to the same kv cache group")
+
 
 # NOTE(woosuk): Currently, the below code is not used and we always use argmax
 # to sample the draft tokens. We will use this after we find a way to manage
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -1890,12 +1890,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.initialize_attn_backend(kv_cache_config)
 
         kv_caches: dict[str, torch.Tensor] = {}
-        kv_cache_group_ids: dict[str, int] = {}
 
-        for id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
+        for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
             kv_cache_spec = kv_cache_group.kv_cache_spec
             for layer_name in kv_cache_group.layer_names:
-                kv_cache_group_ids[layer_name] = id
                 tensor_config = kv_cache_config.tensors[layer_name]
                 assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
                 num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
@@ -1922,12 +1920,9 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
 
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
-            assert len(
-                set([
-                    kv_cache_group_ids[layer_name]
-                    for layer_name in self.drafter.attn_layer_names
-                ])) == 1, "For multi-layer eagle draft model, "
-            "all layers should belong to the same kv cache group"
+            # validate all draft model layers belong to the same kv cache
+            # group
+            self.drafter.validate_kv_cache_group(kv_cache_config)
 
         bind_kv_cache(
             kv_caches,