using namespace cute;
using namespace cutlass::fmha::kernel;

- template <bool v>
+ template <bool v>
struct IsPersistent {
  static const bool value = v;
};
@@ -54,31 +54,28 @@ struct MlaSm100 {

  // H K (D_latent D_rope) B
  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
+
  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
  using StrideO = StrideK;                            // H D B
  using StrideLSE = cute::tuple<_1, int>;             // H B

-   using TileScheduler = std::conditional_t<
-       PersistenceOption::value,
-       Sm100MlaPersistentTileScheduler,
-       Sm100MlaIndividualTileScheduler>;
+   using TileScheduler = std::conditional_t<PersistenceOption::value,
+                                            Sm100MlaPersistentTileScheduler,
+                                            Sm100MlaIndividualTileScheduler>;

  using FmhaKernel =
      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
-           TileShape, Element, ElementAcc, ElementOut, ElementAcc,
-           TileScheduler, /*kIsCpAsync=*/true>;
+           TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
+           /*kIsCpAsync=*/true>;

  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};

-
template <typename T>
- typename T::Fmha::Arguments args_from_options(at::Tensor const& out,
-                                               at::Tensor const& q_nope_and_q_pe,
-                                               at::Tensor const& kv_c_and_k_pe_cache,
-                                               at::Tensor const& seq_lens,
-                                               at::Tensor const& page_table) {
+ typename T::Fmha::Arguments args_from_options(
+     at::Tensor const& out, at::Tensor const& q_nope_and_q_pe,
+     at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
+     at::Tensor const& page_table) {
  cutlass::KernelHardwareInfo hw_info;
  hw_info.device_id = q_nope_and_q_pe.device().index();
  hw_info.sm_count =
@@ -92,8 +89,8 @@ typename T::Fmha::Arguments args_from_options(at::Tensor const& out,
  int max_seq_len = page_size * page_count_per_seq;
  using TileShapeH = typename T::TileShapeH;
  using TileShapeD = typename T::TileShapeD;
-   auto problem_shape = cute::make_tuple(
-       TileShapeH{}, max_seq_len, TileShapeD{}, batches);
+   auto problem_shape =
+       cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);

  auto [H, K, D, B] = problem_shape;
  auto [D_latent, D_rope] = D;
@@ -108,66 +105,55 @@ typename T::Fmha::Arguments args_from_options(at::Tensor const& out,
  using StrideO = typename T::StrideO;
  using StrideLSE = typename T::StrideLSE;

-   StrideQ stride_Q = cute::make_tuple(
-       static_cast<int64_t>(0 + D_latent + D_rope),
-       _1{},
-       static_cast<int64_t>(H * (0 + D_latent + D_rope)));
-   StrideK stride_C = cute::make_tuple(
-       static_cast<int64_t>(0 + D_latent + D_rope),
-       _1{},
-       static_cast<int64_t>(page_size * (D_latent + D_rope)));
+   StrideQ stride_Q =
+       cute::make_tuple(static_cast<int64_t>(0 + D_latent + D_rope), _1{},
+                        static_cast<int64_t>(H * (0 + D_latent + D_rope)));
+   StrideK stride_C =
+       cute::make_tuple(static_cast<int64_t>(0 + D_latent + D_rope), _1{},
+                        static_cast<int64_t>(page_size * (D_latent + D_rope)));
  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
  StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
-   StrideO stride_O = cute::make_tuple(
-       static_cast<int64_t>(0 + D_latent),
-       _1{},
-       static_cast<int64_t>(0 + H * D_latent));
+   StrideO stride_O =
+       cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{},
+                        static_cast<int64_t>(0 + H * D_latent));

  using Element = typename T::Element;
  using ElementOut = typename T::ElementOut;
  using ElementAcc = typename T::ElementAcc;
  auto Q_ptr = static_cast<Element*>(q_nope_and_q_pe.data_ptr());
  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
  typename T::Fmha::Arguments arguments{
-       problem_shape,
-       { scale,
-         Q_ptr, stride_Q,
-         Q_ptr + D_latent, stride_Q,
-         C_ptr, stride_C,
-         C_ptr + D_latent, stride_C,
-         static_cast<int*>(seq_lens.data_ptr()),
-         static_cast<int*>(page_table.data_ptr()), stride_PT,
-         page_count_total, page_size},
-       { static_cast<ElementOut*>(out.data_ptr()), stride_O,
-         // static_cast<ElementAcc*>(lse.data_ptr()), stride_LSE},
-         static_cast<ElementAcc*>(nullptr), stride_LSE},
-       hw_info,
-       -1,       // split_kv
-       nullptr,  // is_var_split_kv
+       problem_shape,
+       {scale, Q_ptr, stride_Q, Q_ptr + D_latent, stride_Q, C_ptr, stride_C,
+        C_ptr + D_latent, stride_C, static_cast<int*>(seq_lens.data_ptr()),
+        static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
+        page_size},
+       {static_cast<ElementOut*>(out.data_ptr()), stride_O,
+        static_cast<ElementAcc*>(nullptr), stride_LSE},
+       hw_info,
+       -1,       // split_kv
+       nullptr,  // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
-   // split_kv automatically based on batch size and sequence length to balance
+   // split_kv automatically based on batch size and sequence length to balance
  // workload across available SMs. Consider using var_split_kv for manual
  // control if needed.
  T::Fmha::set_split_kv(arguments);
  return arguments;
}

template <typename Element>
- void runMla(at::Tensor const& out,
-             at::Tensor const& q_nope_and_q_pe,
-             at::Tensor const& kv_c_and_k_pe_cache,
-             at::Tensor const& seq_lens,
-             at::Tensor const& page_table,
-             cudaStream_t stream) {
+ void runMla(at::Tensor const& out, at::Tensor const& q_nope_and_q_pe,
+             at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
+             at::Tensor const& page_table, cudaStream_t stream) {
  using MlaSm100Type = MlaSm100<Element>;
  typename MlaSm100Type::Fmha fmha;
-   auto arguments =
-       args_from_options<MlaSm100Type>(out, q_nope_and_q_pe, kv_c_and_k_pe_cache,
-                                       seq_lens, page_table);
+   auto arguments = args_from_options<MlaSm100Type>(
+       out, q_nope_and_q_pe, kv_c_and_k_pe_cache, seq_lens, page_table);
  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
-   auto const workspace_options =
-       torch::TensorOptions().dtype(torch::kUInt8).device(q_nope_and_q_pe.device());
+   auto const workspace_options = torch::TensorOptions()
+                                      .dtype(torch::kUInt8)
+                                      .device(q_nope_and_q_pe.device());
  auto workspace = torch::empty(workspace_size, workspace_options);

  CUTLASS_CHECK(fmha.can_implement(arguments));
@@ -182,20 +168,20 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out,
                               torch::Tensor const& kv_c_and_k_pe_cache,
                               torch::Tensor const& seq_lens,
                               torch::Tensor const& page_table) {
-   auto in_dtype = q_nope_and_q_pe.dtype();
-   at::cuda::CUDAGuard device_guard{(char)q_nope_and_q_pe.get_device()};
-   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(
-       q_nope_and_q_pe.get_device());
-   if (in_dtype == at::ScalarType::Half) {
-     runMla<cutlass::half_t>(
-         out, q_nope_and_q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, stream);
-   } else if (in_dtype == at::ScalarType::BFloat16) {
-     runMla<cutlass::bfloat16_t>(
-         out, q_nope_and_q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, stream);
-   } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-     runMla<cutlass::float_e4m3_t>(
-         out, q_nope_and_q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, stream);
-   } else {
-     TORCH_CHECK(false, "Unsupported input data type of MLA");
-   }
+   auto in_dtype = q_nope_and_q_pe.dtype();
+   at::cuda::CUDAGuard device_guard{(char)q_nope_and_q_pe.get_device()};
+   const cudaStream_t stream =
+       at::cuda::getCurrentCUDAStream(q_nope_and_q_pe.get_device());
+   if (in_dtype == at::ScalarType::Half) {
+     runMla<cutlass::half_t>(out, q_nope_and_q_pe, kv_c_and_k_pe_cache, seq_lens,
+                             page_table, stream);
+   } else if (in_dtype == at::ScalarType::BFloat16) {
+     runMla<cutlass::bfloat16_t>(out, q_nope_and_q_pe, kv_c_and_k_pe_cache,
+                                 seq_lens, page_table, stream);
+   } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
+     runMla<cutlass::float_e4m3_t>(out, q_nope_and_q_pe, kv_c_and_k_pe_cache,
+                                   seq_lens, page_table, stream);
+   } else {
+     TORCH_CHECK(false, "Unsupported input data type of MLA");
+   }
}
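
For reference, a minimal caller sketch (not part of this diff): tensor shapes follow the stride layouts built in args_from_options, while the head count, head dimensions, page geometry, and the helper name example_mla_decode_call are assumptions made only for illustration.

// Hypothetical usage sketch; H, D_latent, D_rope, and the paging constants
// below are assumed values, not taken from this diff.
#include <torch/torch.h>

// Entry point exposed by this file (signature as in the diff above).
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
                               torch::Tensor const& q_nope_and_q_pe,
                               torch::Tensor const& kv_c_and_k_pe_cache,
                               torch::Tensor const& seq_lens,
                               torch::Tensor const& page_table);

void example_mla_decode_call() {
  const int64_t B = 2, H = 128, D_latent = 512, D_rope = 64;  // assumed config
  const int64_t page_size = 128, pages_per_seq = 8;           // assumed paging
  auto opts =
      torch::TensorOptions().dtype(torch::kBFloat16).device(torch::kCUDA);

  // q_nope_and_q_pe: [B, H, D_latent + D_rope], matching stride_Q
  auto q = torch::randn({B, H, D_latent + D_rope}, opts);
  // kv_c_and_k_pe_cache: [num_pages, page_size, D_latent + D_rope], matching stride_C
  auto kv = torch::randn({B * pages_per_seq, page_size, D_latent + D_rope}, opts);
  // out: [B, H, D_latent], matching stride_O
  auto out = torch::empty({B, H, D_latent}, opts);
  // Per-sequence lengths and page table as int32, as expected by the
  // static_cast<int*> accesses in args_from_options.
  auto seq_lens = torch::full({B}, 512, opts.dtype(torch::kInt32));
  auto page_table = torch::arange(B * pages_per_seq, opts.dtype(torch::kInt32))
                        .reshape({B, pages_per_seq});

  cutlass_mla_decode_sm100a(out, q, kv, seq_lens, page_table);
}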