bugfix: drop CTA_TILE_Q=32 (#785)

abcdabcd987 · web-flow · commit 83bab99d1469 · 2025-02-04T15:07:13.000-05:00
#776 added CTA_TILE_Q=32, but it produces incorrect results.
diff --git a/aot_build_utils/generate_batch_paged_prefill_inst.py b/aot_build_utils/generate_batch_paged_prefill_inst.py
@@ -37,7 +37,7 @@ def get_cu_file_str(
     dtype_out,
     idtype,
 ):
-    cta_tile_q_choice = [128, 64, 32, 16]
+    cta_tile_q_choice = [128, 64, 16]
 
     def get_insts(attention_variant, dtype_out):
         return "\n".join(
diff --git a/aot_build_utils/generate_batch_ragged_prefill_inst.py b/aot_build_utils/generate_batch_ragged_prefill_inst.py
@@ -37,7 +37,7 @@ def get_cu_file_str(
     dtype_out,
     idtype,
 ):
-    cta_tile_q_choice = [128, 64, 32, 16]
+    cta_tile_q_choice = [128, 64, 16]
 
     def get_insts(attention_variant, dtype_out):
         return "\n".join(
diff --git a/csrc/batch_prefill_paged_kernel_inst.jinja b/csrc/batch_prefill_paged_kernel_inst.jinja
@@ -5,7 +5,7 @@ namespace flashinfer {
 
 constexpr auto use_custom_mask = {{ mask_mode }} == MaskMode::kCustom;
 
-{% for cta_tile_q in [16, 32, 64, 128] %}
+{% for cta_tile_q in [16, 64, 128] %}
 template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
     /*CTA_TILE_Q=*/{{cta_tile_q}}, {{head_dim_qk}}, {{head_dim_vo}}, {{pos_encoding_mode}}, {{use_fp16_qk_reduction}}, {{mask_mode}},
     {{ variant_name }}, PagedParams>(PagedParams params, {{ dtype_o }}* tmp_v, float* tmp_s, cudaStream_t stream);
diff --git a/csrc/batch_prefill_ragged_kernel_inst.jinja b/csrc/batch_prefill_ragged_kernel_inst.jinja
@@ -5,7 +5,7 @@ namespace flashinfer {
 
 constexpr auto use_custom_mask = {{ mask_mode }} == MaskMode::kCustom;
 
-{% for cta_tile_q in [16, 32, 64, 128] %}
+{% for cta_tile_q in [16, 64, 128] %}
 template cudaError_t BatchPrefillWithRaggedKVCacheDispatched<
     /*CTA_TILE_Q=*/{{cta_tile_q}}, {{head_dim_qk}}, {{head_dim_vo}}, {{pos_encoding_mode}}, {{use_fp16_qk_reduction}}, {{mask_mode}},
     {{ variant_name }}, RaggedParams>(RaggedParams params, {{ dtype_o }}* tmp_v, float* tmp_s, cudaStream_t stream);
diff --git a/include/flashinfer/utils.cuh b/include/flashinfer/utils.cuh
@@ -108,11 +108,6 @@
       __VA_ARGS__                                          \
       break;                                               \
     }                                                      \
-    case 32: {                                             \
-      constexpr uint32_t CTA_TILE_Q = 32;                  \
-      __VA_ARGS__                                          \
-      break;                                               \
-    }                                                      \
     case 16: {                                             \
       constexpr uint32_t CTA_TILE_Q = 16;                  \
       __VA_ARGS__                                          \
@@ -302,12 +297,9 @@ inline uint32_t FA2DetermineCtaTileQ(int64_t avg_packed_qo_len, uint32_t head_di
     auto compute_capacity = GetCudaComputeCapability();
     if (compute_capacity.first >= 8) {
       // Ampere or newer
-      if (avg_packed_qo_len > 32) {
+      if (avg_packed_qo_len > 16) {
         // avg_packed_qo_len <= 64
         return 64;
-      } else if (avg_packed_qo_len > 16) {
-        // avg_packed_qo_len <= 32
-        return 32;
       } else {
         // avg_packed_qo_len <= 16
         return 16;