
Commit c845f81

Reshape_and_cache_flash kernel to be kv-cache layout aware.

Signed-off-by: shuw <[email protected]>

1 parent bb103b2 commit c845f81

File tree (5 files changed, +39 -20 lines)

  csrc/cache.h
  csrc/cache_kernels.cu
  csrc/torch_bindings.cpp
  tests/kernels/test_cache.py
  vllm/_custom_ops.py

csrc/cache.h

Lines changed: 3 additions & 2 deletions
@@ -29,7 +29,8 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                              torch::Tensor& value_cache,
                              torch::Tensor& slot_mapping,
                              const std::string& kv_cache_dtype,
-                             torch::Tensor& k_scale, torch::Tensor& v_scale);
+                             torch::Tensor& k_scale, torch::Tensor& v_scale,
+                             const bool is_NHD = true);
 
 void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                           torch::Tensor& kv_cache, torch::Tensor& slot_mapping,
@@ -45,4 +46,4 @@ void gather_cache(
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
+    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
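
The new is_NHD flag defaults to true, so existing callers keep the current layout; passing false selects the alternate layout documented in the cache_kernels.cu comments below. As an illustrative PyTorch sketch (not part of this commit, arbitrary shapes):

  import torch

  num_blocks, block_size, num_heads, head_size = 4, 16, 8, 64

  # NHD (is_NHD=True, the default): per-block token slots come before heads.
  key_cache_nhd = torch.empty(num_blocks, block_size, num_heads, head_size)

  # HND (is_NHD=False): heads come before the per-block token slots.
  key_cache_hnd = torch.empty(num_blocks, num_heads, block_size, head_size)

  # The two layouts are permutations of each other along dims 1 and 2.
  assert key_cache_nhd.permute(0, 2, 1, 3).shape == key_cache_hnd.shape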

csrc/cache_kernels.cu

Lines changed: 18 additions & 11 deletions
@@ -265,14 +265,14 @@ template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
     const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
-    cache_t* __restrict__ key_cache,    // [num_blocks, block_size, num_heads,
-                                        //  head_size]
-    cache_t* __restrict__ value_cache,  // [num_blocks, block_size, num_heads,
-                                        //  head_size]
+    cache_t* __restrict__ key_cache, cache_t* __restrict__ value_cache,
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int block_stride, const int key_stride, const int value_stride,
     const int num_heads, const int head_size, const int block_size,
-    const float* k_scale, const float* v_scale) {
+    const float* k_scale, const float* v_scale, const bool is_NHD) {
+  // For key/value_cache layout:
+  // - NHD: [num_blocks, block_size, num_heads, head_size]
+  // - HND: [num_blocks, num_heads, block_size, head_size]
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   // NOTE: slot_idx can be -1 if the token is padded
@@ -287,9 +287,12 @@ __global__ void reshape_and_cache_flash_kernel(
     const int64_t src_value_idx = token_idx * value_stride + i;
     const int head_idx = i / head_size;
     const int head_offset = i % head_size;
-    const int64_t tgt_key_value_idx = block_idx * block_stride +
-                                      block_offset * num_heads * head_size +
-                                      head_idx * head_size + head_offset;
+    const int64_t tgt_key_value_idx =
+        block_idx * block_stride +
+        (is_NHD ? block_offset * num_heads * head_size + head_idx * head_size +
+                      head_offset
+                : head_idx * block_size * head_size + block_offset * head_size +
+                      head_offset);
     scalar_t tgt_key = key[src_key_idx];
     scalar_t tgt_value = value[src_value_idx];
     if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
@@ -416,7 +419,7 @@ void reshape_and_cache_flash(
         value_cache,  // [num_blocks, block_size, num_heads, head_size]
     torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
     const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale) {
+    torch::Tensor& v_scale, const bool is_NHD) {
   // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
   // slot_mapping.size(0) because of padding for CUDA graphs.
   // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
@@ -427,10 +430,14 @@ void reshape_and_cache_flash(
   // before padding.
   // For compatibility with both cases, we use slot_mapping.size(0) as the
   // number of tokens.
-  int num_tokens = slot_mapping.size(0);
+  // For key/value_cache layout:
+  // - NHD: [num_blocks, block_size, num_heads, head_size]
+  // - HND: [num_blocks, num_heads, block_size, head_size]
+
   int num_heads = key.size(1);
+  int num_tokens = slot_mapping.size(0);
   int head_size = key.size(2);
-  int block_size = key_cache.size(1);
+  int block_size = is_NHD ? key_cache.size(1) : key_cache.size(2);
 
   int key_stride = key.stride(0);
   int value_stride = value.stride(0);
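
As a sanity check on the target-index arithmetic above, here is a minimal PyTorch reference (illustrative only, not part of this commit; reference_store and the shapes are placeholders) that mirrors the kernel's tgt_key_value_idx computation for both layouts:

  import torch

  def reference_store(key_cache: torch.Tensor, key: torch.Tensor,
                      block_idx: int, block_offset: int, is_NHD: bool) -> None:
      # Mirror the kernel's flat-index computation on a contiguous cache.
      if is_NHD:
          _, block_size, num_heads, head_size = key_cache.shape
      else:
          _, num_heads, block_size, head_size = key_cache.shape
      block_stride = key_cache.stride(0)
      flat = key_cache.view(-1)
      for head_idx in range(num_heads):
          for head_offset in range(head_size):
              if is_NHD:
                  tgt = (block_idx * block_stride +
                         block_offset * num_heads * head_size +
                         head_idx * head_size + head_offset)
              else:
                  tgt = (block_idx * block_stride +
                         head_idx * block_size * head_size +
                         block_offset * head_size + head_offset)
              flat[tgt] = key[head_idx, head_offset]

  # Writing one token through the flat index lands in the same slot as
  # plain advanced indexing for either layout.
  key = torch.randn(8, 64)
  nhd = torch.zeros(4, 16, 8, 64)
  hnd = torch.zeros(4, 8, 16, 64)
  reference_store(nhd, key, block_idx=2, block_offset=5, is_NHD=True)
  reference_store(hnd, key, block_idx=2, block_offset=5, is_NHD=False)
  assert torch.equal(nhd[2, 5], key)
  assert torch.equal(hnd[2, :, 5], key)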

csrc/torch_bindings.cpp

Lines changed: 2 additions & 1 deletion
@@ -570,7 +570,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       " Tensor! value_cache,"
       " Tensor slot_mapping,"
       " str kv_cache_dtype,"
-      " Tensor k_scale, Tensor v_scale) -> ()");
+      " Tensor k_scale, Tensor v_scale,"
+      " bool is_NHD) -> ()");
   cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
                  &reshape_and_cache_flash);
 

tests/kernels/test_cache.py

Lines changed: 14 additions & 5 deletions
@@ -16,6 +16,7 @@
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 120, 256]
 BLOCK_SIZES = [8, 16, 32]
+CACHE_LAYOUTS = ["NHD", "HND"]
 
 # Parameters for MLA tests.
 KV_LORA_RANKS = [512]
@@ -220,6 +221,7 @@ def test_reshape_and_cache(
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("kv_layout", CACHE_LAYOUTS)
 @torch.inference_mode()
 def test_reshape_and_cache_flash(
     kv_cache_factory_flashinfer,
@@ -232,6 +234,7 @@ def test_reshape_and_cache_flash(
     seed: int,
     device: str,
     kv_cache_dtype: str,
+    kv_layout: str,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
@@ -242,7 +245,7 @@ def test_reshape_and_cache_flash(
     slot_mapping = torch.tensor(slot_mapping_lst,
                                 dtype=torch.long,
                                 device=device)
-
+    is_NHD = kv_layout == "NHD"
     qkv = torch.randn(num_tokens,
                       3,
                       num_heads,
@@ -261,6 +264,7 @@ def test_reshape_and_cache_flash(
         kv_cache_dtype,
         dtype,
         device=device,
+        is_NHD=is_NHD,
     )
     key_cache, value_cache = key_caches[0].contiguous(
     ), value_caches[0].contiguous()
@@ -285,10 +289,11 @@ def test_reshape_and_cache_flash(
     # Call the reshape_and_cache kernel.
     opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
             (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
-             k_scale, v_scale),
+             k_scale, v_scale, is_NHD),
             cond=(head_size == HEAD_SIZES[0]))
     ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
-                                slot_mapping, kv_cache_dtype, k_scale, v_scale)
+                                slot_mapping, kv_cache_dtype, k_scale, v_scale,
+                                is_NHD)
 
     if kv_cache_dtype == "fp8":
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
@@ -310,8 +315,12 @@ def test_reshape_and_cache_flash(
     for i in range(num_tokens):
         block_idx = block_indicies_lst[i]
         block_offset = block_offsets_lst[i]
-        cloned_key_cache[block_idx, block_offset, :, :] = key[i]
-        cloned_value_cache[block_idx, block_offset, :, :] = value[i]
+        if is_NHD:
+            cloned_key_cache[block_idx, block_offset, :, :] = key[i]
+            cloned_value_cache[block_idx, block_offset, :, :] = value[i]
+        else:
+            cloned_key_cache[block_idx, :, block_offset, :] = key[i]
+            cloned_value_cache[block_idx, :, block_offset, :] = value[i]
 
     if kv_cache_dtype == "fp8":
         torch.testing.assert_close(result_key_cache,

vllm/_custom_ops.py

Lines changed: 2 additions & 1 deletion
@@ -1272,11 +1272,12 @@ def reshape_and_cache_flash(
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
+    is_NHD: bool = True,
 ) -> None:
     torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
                                                    value_cache, slot_mapping,
                                                    kv_cache_dtype, k_scale,
-                                                   v_scale)
+                                                   v_scale, is_NHD)
 
 
 def concat_and_cache_mla(
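
For completeness, a hedged usage sketch of the updated wrapper with an HND-layout cache; this assumes a CUDA device and a vLLM build that includes this commit, and all shapes and slot values are arbitrary:

  import torch
  from vllm import _custom_ops as ops

  num_tokens, num_heads, head_size = 3, 8, 64
  num_blocks, block_size = 4, 16

  key = torch.randn(num_tokens, num_heads, head_size,
                    device="cuda", dtype=torch.half)
  value = torch.randn_like(key)
  # HND layout: [num_blocks, num_heads, block_size, head_size]
  key_cache = torch.zeros(num_blocks, num_heads, block_size, head_size,
                          device="cuda", dtype=torch.half)
  value_cache = torch.zeros_like(key_cache)
  slot_mapping = torch.tensor([0, 1, 17], device="cuda", dtype=torch.long)
  k_scale = torch.ones((), device="cuda", dtype=torch.float32)
  v_scale = torch.ones((), device="cuda", dtype=torch.float32)

  ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping,
                              "auto", k_scale, v_scale, is_NHD=False)
  # Slot 17 maps to block 1, offset 1 of the HND-layout cache.
  assert torch.equal(key_cache[1, :, 1], key[2])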
