Commit fc03772

Authored Feb 4, 2025
perf: refactor fa2 prefill template (#776)
This PR refactors the FA2-based prefill template. It includes the following changes:
1. Use KernelTraits for all constexpr parameters and data types.
2. Use a SharedStorage class as a clean interface for shared-memory management (a sketch of the KernelTraits/SharedStorage pattern appears below).
3. Unlock `CTA_TILE_Q=32`.
We also tried `CTA_TILE_Q=8` (a half-mma optimization for GQA decoding with a low group ratio, <=8), but the performance improvement was marginal (<1%) and it complicated the codebase, so we did not include that feature in this PR.
1 parent 0ca046a · commit fc03772

9 files changed: +797 −798 lines
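For context, the KernelTraits/SharedStorage pattern referenced in items 1 and 2 typically looks like the sketch below. All names, members, and tile parameters here (FA2KernelTraitsSketch, SharedStorageSketch, the example sizes) are illustrative assumptions, not the actual definitions in include/flashinfer/attention/prefill.cuh.

// Illustrative sketch only; names and members are hypothetical,
// not the flashinfer definitions.
#include <cstdint>
#include <cuda_fp16.h>

template <uint32_t CTA_TILE_Q_, uint32_t CTA_TILE_KV_, uint32_t HEAD_DIM_,
          typename DTypeQ_, typename DTypeKV_, typename DTypeO_>
struct FA2KernelTraitsSketch {
  // 1. All compile-time constants and data types live in a single traits type.
  static constexpr uint32_t CTA_TILE_Q = CTA_TILE_Q_;
  static constexpr uint32_t CTA_TILE_KV = CTA_TILE_KV_;
  static constexpr uint32_t HEAD_DIM = HEAD_DIM_;
  using DTypeQ = DTypeQ_;
  using DTypeKV = DTypeKV_;
  using DTypeO = DTypeO_;
};

// 2. Shared-memory layout derived from the traits; the kernel asks this
//    struct for buffers instead of computing raw byte offsets inline.
template <typename KTraits>
struct SharedStorageSketch {
  alignas(16) typename KTraits::DTypeQ q_smem[KTraits::CTA_TILE_Q * KTraits::HEAD_DIM];
  alignas(16) typename KTraits::DTypeKV k_smem[KTraits::CTA_TILE_KV * KTraits::HEAD_DIM];
  alignas(16) typename KTraits::DTypeKV v_smem[KTraits::CTA_TILE_KV * KTraits::HEAD_DIM];
};

template <typename KTraits>
__global__ void fa2_prefill_sketch(/* params elided */) {
  extern __shared__ uint8_t smem_raw[];
  auto& smem = *reinterpret_cast<SharedStorageSketch<KTraits>*>(smem_raw);
  // ... load the Q tile into smem.q_smem, then loop over KV tiles through
  // smem.k_smem / smem.v_smem ...
  (void)smem;
}

// 3. The newly unlocked tile size becomes just another traits instantiation.
using TraitsQ32 = FA2KernelTraitsSketch<32, 64, 128, __half, __half, __half>;
static_assert(TraitsQ32::CTA_TILE_Q == 32, "CTA_TILE_Q=32 is now a valid choice");

The point of the pattern is that every kernel-level constant and dtype is carried by one traits type, and the shared-memory layout is computed from those traits in one place rather than through offsets scattered across the kernel.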
 

‎aot_build_utils/generate_batch_paged_prefill_inst.py

+1 −1

@@ -37,7 +37,7 @@ def get_cu_file_str(
     dtype_out,
     idtype,
 ):
-    cta_tile_q_choice = [128, 64, 16]
+    cta_tile_q_choice = [128, 64, 32, 16]
 
     def get_insts(attention_variant, dtype_out):
         return "\n".join(

‎aot_build_utils/generate_batch_ragged_prefill_inst.py

+1 −1

@@ -37,7 +37,7 @@ def get_cu_file_str(
     dtype_out,
     idtype,
 ):
-    cta_tile_q_choice = [128, 64, 16]
+    cta_tile_q_choice = [128, 64, 32, 16]
 
     def get_insts(attention_variant, dtype_out):
         return "\n".join(

‎csrc/batch_prefill_paged_kernel_inst.jinja

+1 −1

@@ -5,7 +5,7 @@ namespace flashinfer {
 
 constexpr auto use_custom_mask = {{ mask_mode }} == MaskMode::kCustom;
 
-{% for cta_tile_q in [16, 64, 128] %}
+{% for cta_tile_q in [16, 32, 64, 128] %}
 template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
     /*CTA_TILE_Q=*/{{cta_tile_q}}, {{head_dim_qk}}, {{head_dim_vo}}, {{pos_encoding_mode}}, {{use_fp16_qk_reduction}}, {{mask_mode}},
     {{ variant_name }}, PagedParams>(PagedParams params, {{ dtype_o }}* tmp_v, float* tmp_s, cudaStream_t stream);

‎csrc/batch_prefill_ragged_kernel_inst.jinja

+1 −1

@@ -5,7 +5,7 @@ namespace flashinfer {
 
 constexpr auto use_custom_mask = {{ mask_mode }} == MaskMode::kCustom;
 
-{% for cta_tile_q in [16, 64, 128] %}
+{% for cta_tile_q in [16, 32, 64, 128] %}
 template cudaError_t BatchPrefillWithRaggedKVCacheDispatched<
     /*CTA_TILE_Q=*/{{cta_tile_q}}, {{head_dim_qk}}, {{head_dim_vo}}, {{pos_encoding_mode}}, {{use_fp16_qk_reduction}}, {{mask_mode}},
     {{ variant_name }}, RaggedParams>(RaggedParams params, {{ dtype_o }}* tmp_v, float* tmp_s, cudaStream_t stream);
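For reference, each pass of that Jinja loop emits one explicit instantiation, so adding 32 to the list produces one extra instantiation per combination of the remaining placeholders. With hypothetical values for those placeholders (128-dim heads, no positional encoding, no fp16 QK reduction, causal mask, a variant named AttentionVariant, half output), the newly added line would render roughly as:

template cudaError_t BatchPrefillWithRaggedKVCacheDispatched<
    /*CTA_TILE_Q=*/32, 128, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
    AttentionVariant, RaggedParams>(RaggedParams params, half* tmp_v, float* tmp_s, cudaStream_t stream);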

‎include/flashinfer/attention/prefill.cuh

+751 −759
Large diffs are not rendered by default.

‎include/flashinfer/attention/scheduler.cuh

+2 −23

@@ -418,27 +418,6 @@ inline cudaError_t DecodePlan(void* float_buffer, size_t float_workspace_size_in
   return cudaSuccess;
 }
 
-inline uint32_t DetermineCtaTileQ(int64_t avg_packed_qo_len, uint32_t head_dim) {
-  if (avg_packed_qo_len > 64 && head_dim < 256) {
-    return 128;
-  } else {
-    auto compute_capacity = GetCudaComputeCapability();
-    if (compute_capacity.first >= 8) {
-      // Ampere or newer
-      if (avg_packed_qo_len > 16) {
-        // avg_packed_qo_len <= 64
-        return 64;
-      } else {
-        // avg_packed_qo_len <= 16
-        return 16;
-      }
-    } else {
-      // NOTE(Zihao): not enough shared memory on Turing for 1x4 warp layout
-      return 64;
-    }
-  }
-}
-
 template <typename IdType>
 inline auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h,
                                    uint32_t total_num_rows, uint32_t batch_size,
@@ -480,7 +459,7 @@ inline auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h,
     // the CUDA graph is created fixes the maximum number of tokens.
     const uint64_t max_seq_len = total_num_rows - batch_size + 1;
     uint64_t max_qo_len = uint64_t(max_seq_len) * gqa_group_size;
-    cta_tile_q = DetermineCtaTileQ(max_qo_len, head_dim);
+    cta_tile_q = FA2DetermineCtaTileQ(max_qo_len, head_dim);
 
     // Find an upper bound for the number of tiles, derived from the total
     // number of rows and the batch size. The sum of qo lengths rounded
@@ -493,7 +472,7 @@ inline auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h,
       sum_packed_qo_len += packed_qo_len_arr[i];
     }
     const int64_t avg_packed_qo_len = sum_packed_qo_len / batch_size;
-    cta_tile_q = DetermineCtaTileQ(avg_packed_qo_len, head_dim);
+    cta_tile_q = FA2DetermineCtaTileQ(avg_packed_qo_len, head_dim);
 
     total_num_tiles_q = 0;
     for (uint32_t i = 0; i < batch_size; ++i) {
‎include/flashinfer/mma.cuh

+2 −2

@@ -480,7 +480,7 @@ __device__ __forceinline__ void mma_sync_m16n16k16_row_col_f16f16f32(float* C, u
  * \brief Use mma instructions to compute rowsum.
  */
 template <typename DType>
-__device__ __forceinline__ void rowsum_f8f8f32(float* d, DType* s) {
+__device__ __forceinline__ void m16k32_rowsum_f8f8f32(float* d, DType* s) {
   static_assert(sizeof(DType) == 1, "DType must be 8bit floating data type");
   uint32_t* s_u32 = (uint32_t*)(s);
 #if defined(FLASHINFER_MMA_F8F8F32_M16N8K32_ENABLED)
@@ -519,7 +519,7 @@ __device__ __forceinline__ void rowsum_f8f8f32(float* d, DType* s) {
  * \brief Use mma instructions to compute rowsum.
  */
 template <typename DType>
-__device__ __forceinline__ void rowsum_f16f16f32(float* d, DType* s) {
+__device__ __forceinline__ void m16k16_rowsum_f16f16f32(float* d, DType* s) {
   static_assert(sizeof(DType) == 2, "DType must be 16bit floating data type");
   uint32_t* s_u32 = (uint32_t*)(s);
 #if defined(FLASHINFER_MMA_F16F16F32_M16N8K16_ENABLED)
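The new prefixes encode the MMA tile shape each rowsum covers: 8-bit inputs reduce k=32 elements per row (m16k32, matching M16N8K32), 16-bit inputs reduce k=16 (m16k16, matching M16N8K16), and both produce 16 per-row float sums. As a semantic reference only (a dense row-major tile instead of the warp-distributed register fragments the real mma path uses, and assuming accumulator semantics for d), the reduction amounts to:

// Semantic reference, not the PTX mma implementation: s is a dense 16 x K
// row-major tile, d holds 16 per-row float accumulators (assumed semantics).
template <typename DType, int K>
void rowsum_reference(float* d, const DType* s) {
  for (int row = 0; row < 16; ++row) {
    float acc = 0.f;
    for (int k = 0; k < K; ++k) {
      acc += static_cast<float>(s[row * K + k]);
    }
    d[row] += acc;
  }
}
// m16k32_rowsum_f8f8f32 corresponds to K = 32 (1-byte inputs),
// m16k16_rowsum_f16f16f32 to K = 16 (2-byte inputs).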

‎include/flashinfer/utils.cuh

+29 −0

@@ -108,6 +108,11 @@
       __VA_ARGS__                           \
       break;                                \
     }                                       \
+    case 32: {                              \
+      constexpr uint32_t CTA_TILE_Q = 32;   \
+      __VA_ARGS__                           \
+      break;                                \
+    }                                       \
     case 16: {                              \
       constexpr uint32_t CTA_TILE_Q = 16;   \
       __VA_ARGS__                           \
@@ -290,6 +295,30 @@ inline void DebugPrintCUDAArray(T* device_ptr, size_t size, std::string prefix =
   std::cout << std::endl;
 }
 
+inline uint32_t FA2DetermineCtaTileQ(int64_t avg_packed_qo_len, uint32_t head_dim) {
+  if (avg_packed_qo_len > 64 && head_dim < 256) {
+    return 128;
+  } else {
+    auto compute_capacity = GetCudaComputeCapability();
+    if (compute_capacity.first >= 8) {
+      // Ampere or newer
+      if (avg_packed_qo_len > 32) {
+        // avg_packed_qo_len <= 64
+        return 64;
+      } else if (avg_packed_qo_len > 16) {
+        // avg_packed_qo_len <= 32
+        return 32;
+      } else {
+        // avg_packed_qo_len <= 16
+        return 16;
+      }
+    } else {
+      // NOTE(Zihao): not enough shared memory on Turing for 1x4 warp layout
+      return 64;
+    }
+  }
+}
+
 /*!
  * \brief Return x - y if x > y, otherwise return 0.
  */
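Read on an Ampere-or-newer GPU with head_dim < 256, the heuristic now has four tiers: a packed query length above 64 selects CTA_TILE_Q=128, (32, 64] selects 64, (16, 32] selects the newly unlocked 32, and lengths of 16 or less select 16. The host-side sketch below mirrors that branch structure for illustration only; it hard-codes the compute-capability >= 8 path instead of calling GetCudaComputeCapability(), so it is not the library function itself.

#include <cstdint>
#include <cstdio>

// Mirror of the new tile-Q heuristic for the compute-capability >= 8 path.
// Illustration only; the real FA2DetermineCtaTileQ additionally falls back
// to 64 on pre-Ampere GPUs.
uint32_t fa2_tile_q_sm80_sketch(int64_t avg_packed_qo_len, uint32_t head_dim) {
  if (avg_packed_qo_len > 64 && head_dim < 256) return 128;
  if (avg_packed_qo_len > 32) return 64;
  if (avg_packed_qo_len > 16) return 32;  // newly reachable tier
  return 16;
}

int main() {
  const int64_t sample_lens[] = {8, 24, 48, 200};  // packed qo lengths
  for (int64_t len : sample_lens) {
    // prints 16, 32, 64, 128 for the four sample lengths (head_dim = 128)
    std::printf("avg_packed_qo_len=%lld -> CTA_TILE_Q=%u\n",
                static_cast<long long>(len), fa2_tile_q_sm80_sketch(len, /*head_dim=*/128));
  }
  return 0;
}

For example, GQA decoding with group size 8 and one query token per request packs to a length of 8 and keeps the 16 tile, while group size 8 with 4 query tokens packs to 32 and now lands on the 32 tile instead of 64.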

‎tests/test_jit_example.py

+9 −10

@@ -561,7 +561,6 @@ def test_batch_prefill_sm90_flash_sigmoid():
     sigmoid_bias = 0.25
 
     o = wrapper.run(q, k, v, logits_scale, sigmoid_bias)
-    print(o)
     wrapper_paged = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
         float_workspace_buffer, kv_layout="NHD", backend="fa3", jit_args=jit_args
     )
@@ -696,7 +695,7 @@ def test_sm90_debug_print_logits():
 
 template <int NUM_ROWS_PER_THREAD>
 __device__ auto GetAttentionUpdater() {
-  return OnlineSoftmax<NUM_ROWS_PER_THREAD, /*WITH_SCALE*/true>(sm_scale_log2);
+  return OnlineSoftmax<NUM_ROWS_PER_THREAD, /*WITH_SCALE*/false>(sm_scale_log2);
 }
 
 
@@ -753,12 +752,12 @@ def test_sm90_debug_print_logits():
 
 
 if __name__ == "__main__":
-    # test_single_decode_mask()
-    # test_flash_sigmoid()
-    # test_dump_logits()
-    # test_debug_print_logits()
-    # test_sm90_debug_print_logits()
-    # test_batch_decode_flash_sigmoid(False)
-    # test_batch_decode_flash_sigmoid(True)
-    # test_batch_prefill_flash_sigmoid()
+    test_single_decode_mask()
+    test_flash_sigmoid()
+    test_dump_logits()
+    test_debug_print_logits()
+    test_sm90_debug_print_logits()
+    test_batch_decode_flash_sigmoid(False)
+    test_batch_decode_flash_sigmoid(True)
+    test_batch_prefill_flash_sigmoid()
     test_batch_prefill_sm90_flash_sigmoid()
