perf: fix the iteration bound of SWA in FA2 prefill template (#714)

yzh119 · web-flow · commit 989dbfa65f28 · 2025-01-03T23:31:22.000-08:00
We forgot to divide the packed row index by group_size when computing the sliding window iteration bound, which made the bound larger than its actual value and slowed down execution. Thanks to @Ying1123 for spotting this bug.
diff --git a/include/flashinfer/attention/prefill.cuh b/include/flashinfer/attention/prefill.cuh
@@ -1226,7 +1226,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void SinglePrefillWithKV
         16 * NUM_WARPS_KV * NUM_MMA_KV);
 
     const uint32_t window_iteration =
-        ceil_div(sub_if_greater_or_zero(kv_len + (bx + 1) * num_rows_per_cta,
+        ceil_div(sub_if_greater_or_zero(kv_len + (bx + 1) * num_rows_per_cta / group_size,
                                         qo_len + window_left + chunk_start),
                  (16 * NUM_WARPS_KV * NUM_MMA_KV));
 
@@ -1652,7 +1652,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void BatchPrefillWithRag
         16 * NUM_WARPS_KV * NUM_MMA_KV);
 
     const uint32_t window_iteration =
-        ceil_div(sub_if_greater_or_zero(kv_len + (qo_tile_idx + 1) * num_rows_per_cta,
+        ceil_div(sub_if_greater_or_zero(kv_len + (qo_tile_idx + 1) * num_rows_per_cta / group_size,
                                         qo_len + window_left + chunk_start),
                  (16 * NUM_WARPS_KV * NUM_MMA_KV));
 
@@ -1980,7 +1980,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void BatchPrefillWithPag
         16 * NUM_WARPS_KV * NUM_MMA_KV);
 
     const uint32_t window_iteration =
-        ceil_div(sub_if_greater_or_zero(kv_len + (qo_tile_idx + 1) * num_rows_per_cta,
+        ceil_div(sub_if_greater_or_zero(kv_len + (qo_tile_idx + 1) * num_rows_per_cta / group_size,
                                         qo_len + window_left + chunk_start),
                  (16 * NUM_WARPS_KV * NUM_MMA_KV));