 */
 #ifndef FLASHINFER_MLA_FA2_CUH_
 #define FLASHINFER_MLA_FA2_CUH_
+#include <cooperative_groups.h>
+
 #include <cstdint>
 #include <sstream>
 
@@ -90,7 +92,6 @@ struct KernelTraits {
   static constexpr uint32_t UPCAST_STRIDE_KPE = HEAD_DIM_KPE / upcast_size<DTypeKV_>();
   static constexpr uint32_t UPCAST_STRIDE_FINAL_O = HEAD_DIM_CKV / upcast_size<DTypeO_>();
   static constexpr uint32_t UPCAST_STRIDE_P = CTA_TILE_KV / upcast_size<DTypeKV_>();
-  static constexpr uint32_t UPCAST_STRIDE_PARTIAL_O = HEAD_DIM_CKV / upcast_size<float>();
 
   using DTypeQ = DTypeQ_;
   using DTypeKV = DTypeKV_;
@@ -618,6 +619,52 @@ __device__ __forceinline__ void finalize_m_(typename KTraits::AttentionVariant v
   }
 }
 
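+// Second-stage merge: after the grid-wide sync, all CTAs cooperatively fold the
+// per-chunk partial outputs and LSEs back into the final output. Each blockIdx.y
+// owns a contiguous range of packed (query, head) rows; the partial entries that
+// belong to one output row are spaced `stride` apart in partial_o, and the inner
+// loop walks them with exactly that stride.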
+template <typename KTraits>
+__device__ void DevicePersistentMergeStates(typename KTraits::IdType* merge_packed_offset_start,
+                                            typename KTraits::IdType* merge_packed_offset_end,
+                                            typename KTraits::IdType* merge_indptr,
+                                            float* partial_o, float* partial_lse,
+                                            typename KTraits::DTypeO* final_o, float* final_lse,
+                                            const uint32_t o_stride_n, const uint32_t o_stride_h,
+                                            const uint32_t cluster_tile_q,
+                                            const uint_fastdiv& num_heads) {
+  constexpr uint32_t VEC_SIZE = 4;  // partial o has data type float
+  constexpr uint32_t NUM_THRS_PER_ROW = KTraits::HEAD_DIM_CKV / VEC_SIZE;
+  constexpr uint32_t ROWS_PER_ITERATION = (KTraits::NUM_THREADS) / NUM_THRS_PER_ROW;
+  const uint32_t cluster_id = blockIdx.y;
+  const uint32_t thread_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+  const uint32_t offset_start = merge_packed_offset_start[cluster_id];
+  const uint32_t offset_end = merge_packed_offset_end[cluster_id];
+  const uint32_t partial_offset_start = merge_indptr[cluster_id];
+  const uint32_t partial_offset_end = merge_indptr[cluster_id + 1];
+  const uint32_t stride = offset_end - offset_start;
+#pragma unroll 1
+  for (uint32_t local_packed_offset =
+           blockIdx.x * ROWS_PER_ITERATION + thread_id / NUM_THRS_PER_ROW;
+       local_packed_offset < stride; local_packed_offset += gridDim.x * ROWS_PER_ITERATION) {
+    uint32_t final_packed_offset = offset_start + local_packed_offset;
+    uint32_t q, r;
+    num_heads.divmod(final_packed_offset, q, r);
+    state_t<VEC_SIZE> st;
+#pragma unroll 2
+    for (uint32_t partial_packed_offset = partial_offset_start + local_packed_offset;
+         partial_packed_offset < partial_offset_end; partial_packed_offset += stride) {
+      vec_t<float, VEC_SIZE> o_partial;
+      float lse_partial;
+      o_partial.load(partial_o + partial_packed_offset * KTraits::HEAD_DIM_CKV +
+                     (thread_id % NUM_THRS_PER_ROW) * VEC_SIZE);
+      lse_partial = partial_lse[partial_packed_offset];
+      st.merge(o_partial, lse_partial, 1);
+    }
+    st.normalize();
+    st.o.cast_store(final_o +
+                    (q * o_stride_n + r * o_stride_h + (thread_id % NUM_THRS_PER_ROW) * VEC_SIZE));
+    if (final_lse) {
+      final_lse[q * num_heads + r] = st.get_lse();
+    }
+  }
+}
+
 template <typename KTraits>
 __device__ __forceinline__ void write_o(typename KTraits::SharedStorage* smem_storage,
                                         typename KTraits::DTypeO* final_o, float* final_lse,
@@ -631,12 +678,40 @@ __device__ __forceinline__ void write_o(typename KTraits::SharedStorage* smem_st
   constexpr uint32_t HEAD_DIM_CKV = KTraits::HEAD_DIM_CKV;
   constexpr uint32_t UPCAST_STRIDE_FINAL_O = KTraits::UPCAST_STRIDE_FINAL_O;
   const uint32_t lane_idx = threadIdx.x, warpgroup_idx = threadIdx.z, warp_idx_in_wg = threadIdx.y;
+  smem_t<KTraits::SWIZZLE_MODE_O> o_smem(smem_storage->o_smem);
+
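+  // Partial path: this work tile covers only a slice of the KV sequence, so the
+  // output fragments and per-row LSE are spilled to the partial_o / partial_lse
+  // workspace and combined later by DevicePersistentMergeStates.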
+  if (partial_o != nullptr) {
+    // write to partial_o
+#pragma unroll
+    for (uint32_t j = 0; j < 2; ++j) {
+      uint32_t q_idx = (packed_offset + warp_idx_in_wg * 16 + 8 * j + lane_idx / 4) / num_heads;
+      if (lane_idx % 4 == 0 && q_idx < q_len) {
+        partial_lse[(blockIdx.x * 4 + warp_idx_in_wg) * 16 + 8 * j + lane_idx / 4] =
+            math::ptx_log2(d[j]) + float(m[j]);
+      }
+    }
 
-  if (false) {
-    // TOOD(Zihao): write to partial
+#pragma unroll
+    for (uint32_t j = 0; j < 2; ++j) {
+      uint32_t q_idx = (packed_offset + warp_idx_in_wg * 16 + 8 * j + lane_idx / 4) / num_heads;
+#pragma unroll
+      for (uint32_t mma_d = 0; mma_d < NUM_MMA_D_CKV / 2; ++mma_d) {
+        if (q_idx < q_len) {
+          *reinterpret_cast<float2*>(
+              partial_o +
+              ((blockIdx.x * 4 + warp_idx_in_wg) * 16 + 8 * j + lane_idx / 4) * HEAD_DIM_CKV +
+              warpgroup_idx * (HEAD_DIM_CKV / 2) + mma_d * 16 + (lane_idx % 4) * 2) =
+              *reinterpret_cast<float2*>(&o_frag[mma_d][j * 2]);
+          *reinterpret_cast<float2*>(
+              partial_o +
+              ((blockIdx.x * 4 + warp_idx_in_wg) * 16 + 8 * j + lane_idx / 4) * HEAD_DIM_CKV +
+              warpgroup_idx * (HEAD_DIM_CKV / 2) + mma_d * 16 + 8 + (lane_idx % 4) * 2) =
+              *reinterpret_cast<float2*>(&o_frag[mma_d][4 + j * 2]);
+        }
+      }
+    }
   } else {
     // write to final_o
-    smem_t<KTraits::SWIZZLE_MODE_O> o_smem(smem_storage->o_smem);
 
     if (final_lse) {
 #pragma unroll
@@ -748,13 +823,14 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
   const uint32_t kpe_stride_n = params.kpe_stride_n;
   const uint32_t o_stride_n = params.o_stride_n;
   const uint32_t o_stride_h = params.o_stride_h;
-  const uint32_t cluster_tile_q = blockDim.x * KTraits::CTA_TILE_Q;
+  const uint32_t cluster_tile_q = gridDim.x * KTraits::CTA_TILE_Q;
 
 #pragma unroll 1
   for (IdType work_idx = work_indptr[blockIdx.y]; work_idx < work_indptr[blockIdx.y + 1];
        ++work_idx) {
     const uint32_t q_indptr = params.q_indptr[work_idx];
     const uint32_t kv_indptr = params.kv_indptr[work_idx];
+    const int32_t partial_indptr = params.partial_indptr[work_idx];
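+    // partial_indptr == -1 marks a work tile that owns its full KV range and can
+    // write final_o directly; otherwise it points into the partial workspace.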
     const uint32_t q_len = params.q_len[work_idx];
     const uint32_t kv_len = params.kv_len[work_idx];
     const uint32_t packed_qo_start = params.q_start[work_idx];
@@ -778,14 +854,14 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
             (CAUSAL ? min(kv_end, kv_len - q_len + (packed_qo_start + cluster_tile_q) / num_heads)
                     : kv_end),
             CTA_TILE_KV) -
-        1;
+        1 - (kv_start / CTA_TILE_KV);
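+    // kv_tile_idx and mask_tile_idx are now counted relative to kv_start, so the
+    // pipeline loops below simply run down to tile 0.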
 
     int mask_tile_idx =
-        (CAUSAL ? min(kv_end, kv_len - q_len + packed_qo_start / num_heads) : kv_end) / CTA_TILE_KV;
+        (CAUSAL ? min(kv_end, kv_len - q_len + packed_qo_start / num_heads) : kv_end) /
+            CTA_TILE_KV -
+        (kv_start / CTA_TILE_KV);
 
-    int start_tile_idx = kv_start / CTA_TILE_KV;  // ceil_div(kv_start, CTA_TILE_KV);
     uint32_t block_iter_base = kv_indptr * block_size + kv_start;
-
     // last kv tile
     __syncthreads();
     uint32_t kv_bound = kv_indptr + (kv_len + block_size - 1) / block_size;  // ceil_div
@@ -796,7 +872,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
     cp_async::commit_group();
 #pragma unroll
     for (int stage_idx = 1; stage_idx < NUM_STAGES; ++stage_idx) {
-      if (kv_tile_idx - stage_idx >= start_tile_idx) {
+      if (kv_tile_idx - stage_idx >= 0) {
         load_kv<KTraits>(&smem_storage, ckv, kpe, kv_indices, ckv_stride_n, ckv_stride_page,
                          kpe_stride_n, kpe_stride_page, kv_bound,
                          block_iter_base + (kv_tile_idx - stage_idx) * CTA_TILE_KV, block_size,
@@ -807,7 +883,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
 
     // loop with mask
 #pragma unroll 1
-    for (; kv_tile_idx >= mask_tile_idx && kv_tile_idx > start_tile_idx; --kv_tile_idx) {
+    for (; kv_tile_idx >= mask_tile_idx && kv_tile_idx > 0; --kv_tile_idx) {
       cp_async::wait_group<NUM_STAGES - 1>();
       __syncthreads();
 
@@ -825,7 +901,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
       // compute sfm * v
       compute_mla_pv<KTraits>(&smem_storage, kv_tile_idx % NUM_STAGES, s_frag, d, o_frag);
 
-      if (kv_tile_idx - NUM_STAGES >= start_tile_idx) {
+      if (kv_tile_idx - NUM_STAGES >= 0) {
         __syncthreads();
         load_kv<KTraits>(&smem_storage, ckv, kpe, kv_indices, ckv_stride_n, ckv_stride_page,
                          kpe_stride_n, kpe_stride_page, kv_bound,
@@ -837,7 +913,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
 
     // loop without mask
 #pragma unroll 1
-    for (; kv_tile_idx + 1 > start_tile_idx + NUM_STAGES; --kv_tile_idx) {
+    for (; kv_tile_idx + 1 > NUM_STAGES; --kv_tile_idx) {
       cp_async::wait_group<NUM_STAGES - 1>();
       __syncthreads();
 
@@ -862,7 +938,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
 
     // last tiles
 #pragma unroll
-    for (; kv_tile_idx >= start_tile_idx; --kv_tile_idx) {
+    for (; kv_tile_idx >= 0; --kv_tile_idx) {
       // compute mla qk
       compute_mla_qk<KTraits>(&smem_storage, kv_tile_idx % NUM_STAGES, s_frag);
 
@@ -884,11 +960,22 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchMLAPagedAttentionKe
 
     finalize_m_<KTraits>(variant, m);
 
-    write_o<KTraits>(&smem_storage, final_o + q_indptr * o_stride_n,
-                     final_lse ? final_lse + q_indptr * num_heads : nullptr, partial_o, partial_lse,
-                     o_frag, m, d, o_stride_n, o_stride_h, qo_upperbound, qo_packed_idx_base,
-                     num_heads);
+    write_o<KTraits>(
+        &smem_storage, final_o + q_indptr * o_stride_n,
+        final_lse ? final_lse + q_indptr * num_heads : nullptr,
+        (partial_indptr == -1) ? nullptr : partial_o + partial_indptr * KTraits::HEAD_DIM_CKV,
+        (partial_indptr == -1) ? nullptr : partial_lse + partial_indptr, o_frag, m, d, o_stride_n,
+        o_stride_h, qo_upperbound, qo_packed_idx_base, num_heads);
   }
+
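+  // Grid-wide barrier: every CTA must have finished writing its partial outputs
+  // before any CTA starts the merge pass below. This is why the kernel is launched
+  // with cudaLaunchCooperativeKernel.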
+  auto grid = cg::this_grid();
+  grid.sync();
+
+  // the second stage, merge partial outputs
+  DevicePersistentMergeStates<KTraits>(params.merge_packed_offset_start,
+                                       params.merge_packed_offset_end, params.merge_indptr,
+                                       partial_o, partial_lse, final_o, final_lse, o_stride_n,
+                                       o_stride_h, cluster_tile_q, num_heads);
 }
 
 #define DISPATCH_SMEM_CONFIG(smem_limit_per_sm, NUM_STAGES, CTA_TILE_KV, QK_SHARD, ...) \
@@ -948,7 +1035,8 @@ cudaError_t BatchMLAPagedAttention(Params params, uint32_t num_blks_x, uint32_t
 
     FLASHINFER_CUDA_CALL(
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-    FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
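+    // grid.sync() inside the kernel requires a cooperative launch: all CTAs of the
+    // grid must be resident on the device at the same time.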
+    FLASHINFER_CUDA_CALL(
+        cudaLaunchCooperativeKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
   });
 
   return cudaSuccess;