[Bugfix] Fix cache block size calculation for CPU MLA (vllm-project#15848)

gau-nernst · nishith-fujitsu · commit 9ecb93da4959 · 2025-04-09T13:40:24.000+05:30
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
@@ -106,7 +106,7 @@ def get_cache_block_size(
         num_layers = model_config.get_num_layers(parallel_config)
 
         key_cache_block = block_size * num_heads * head_size
-        value_cache_block = key_cache_block
+        value_cache_block = key_cache_block if not model_config.use_mla else 0
         total = num_layers * (key_cache_block + value_cache_block)
         if cache_dtype == "auto":
             dtype = model_config.dtype