
Commit 30b2838

bugfix: bugfix to #949 (#951)
The sm86/sm89 version of the MLA kernel was not tested after change #942; this PR fixes the issue. This PR also makes the following changes: 1. adds the MLA unittest to CI (on an a10g node); 2. shrinks the MLA unittest so that CI can finish in a reasonable time; 3. changes the skip condition `not is_sm90a_supported(torch.device("cuda"))` to `backend == "fa3" and not is_sm90a_supported(torch.device("cuda"))` so that non-FA3 cases still run on non-Hopper GPUs, as pointed out by @Atream.
1 parent 211dfc6 commit 30b2838
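As a reference for point 3, a minimal sketch of the new skip pattern; the test name and body below are illustrative only, the real tests live in tests/test_deepseek_mla.py further down:

    import pytest
    import torch

    from flashinfer.utils import is_sm90a_supported


    @pytest.mark.parametrize("backend", ["fa2", "fa3"])
    def test_example(backend):
        # Skip only when fa3 is requested on a device without sm90a support,
        # so the fa2 cases still run on non-Hopper GPUs (e.g. the a10g CI node).
        if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
            pytest.skip("FA3 is not supported on this device")
        ...  # actual test body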

File tree (6 files changed: +145 -39)

  Jenkinsfile                                 +10
  include/flashinfer/attention/mla.cuh        +1 -2
  include/flashinfer/attention/prefill.cuh    +21 -15
  scripts/task_jit_run_tests_part4.sh         +11
  tests/conftest.py                           +6 -4
  tests/test_deepseek_mla.py                  +96 -18


Jenkinsfile (+10)

@@ -106,6 +106,16 @@ stage('JIT Unittest') {
           sh(script: "${docker_run} ./scripts/task_jit_run_tests_part2.sh", label: 'JIT Unittest Part 2')
         }
       }
+    },
+    'GPU-G5-Test-4': {
+      node('GPU-G5-SPOT') {
+        ws(per_exec_ws('flashinfer-unittest')) {
+          init_git(true) // we need cutlass submodule
+          sh(script: "ls -alh", label: 'Show work directory')
+          sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
+          sh(script: "${docker_run} ./scripts/task_jit_run_tests_part4.sh", label: 'JIT Unittest Part 4')
+        }
+      }
     }
   )
 }

include/flashinfer/attention/mla.cuh (+1 -2)

@@ -198,8 +198,7 @@ __device__ __forceinline__ void load_kv(
   if constexpr (KTraits::NUM_MMA_KV == 1) {
     if (warpgroup_idx == 0) {
       uint32_t q, r;
-      uint32_t packed_block_iter =
-          packed_block_iter_base + lane_idx / 8 + lane_idx / 8 + warp_idx_in_wg * 4;
+      uint32_t packed_block_iter = packed_block_iter_base + lane_idx / 8 + warp_idx_in_wg * 4;
       block_size.divmod(packed_block_iter, q, r);

       DTypeKV* ckv_ptr = ckv +
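The change here removes an accidentally duplicated `lane_idx / 8` term from the packed block iterator computation in `load_kv`.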

include/flashinfer/attention/prefill.cuh (+21 -15)

@@ -1504,19 +1504,21 @@ cudaError_t SinglePrefillWithKVCacheDispatched(Params params, typename Params::D
   FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(
       &max_smem_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev_id));
   // we expect each sm execute two threadblocks
-  // TODO(Zihao): fix the following computation
-  const int num_ctas_per_sm = max_smem_per_sm > (16 * HEAD_DIM_QK * sizeof(DTypeQ) * 16) ? 2 : 1;
+  const int num_ctas_per_sm =
+      max_smem_per_sm >= 2 * (CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ) +
+                              (HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV))
+          ? 2
+          : 1;
   const int max_smem_per_threadblock = max_smem_per_sm / num_ctas_per_sm;

   const uint32_t max_num_mma_kv_reg =
       (HEAD_DIM_VO >= 128 && NUM_MMA_Q == 2 && POS_ENCODING_MODE == PosEncodingMode::kRoPELlama &&
        !USE_FP16_QK_REDUCTION)
           ? 2
           : (8 / NUM_MMA_Q);
-  // TODO(Zihao): fix the following computation
   const uint32_t max_num_mma_kv_smem =
-      (max_smem_per_threadblock / (16 * HEAD_DIM_QK * sizeof(DTypeQ)) - NUM_MMA_Q * NUM_WARPS_Q) /
-      (2 * NUM_WARPS_KV);
+      (max_smem_per_threadblock - CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ)) /
+      ((HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV));

   // control NUM_MMA_KV for maximum warp occupancy
   DISPATCH_NUM_MMA_KV(min(max_num_mma_kv_smem, max_num_mma_kv_reg), NUM_MMA_KV, {

@@ -2223,19 +2225,21 @@ cudaError_t BatchPrefillWithRaggedKVCacheDispatched(Params params, typename Para
   FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&max_smem_per_sm,
                                               cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev_id));
   // we expect each sm execute two threadblocks
-  // TODO(Zihao): fix the following computation
-  const int num_ctas_per_sm = max_smem_per_sm > (16 * HEAD_DIM_QK * sizeof(DTypeQ) * 16) ? 2 : 1;
+  const int num_ctas_per_sm =
+      max_smem_per_sm >= 2 * (CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ) +
+                              (HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV))
+          ? 2
+          : 1;
   const int max_smem_per_threadblock = max_smem_per_sm / num_ctas_per_sm;

   const uint32_t max_num_mma_kv_reg =
       (HEAD_DIM_VO >= 128 && NUM_MMA_Q == 2 && POS_ENCODING_MODE == PosEncodingMode::kRoPELlama &&
        !USE_FP16_QK_REDUCTION)
           ? 2
           : (8 / NUM_MMA_Q);
-  // TODO(Zihao): fix the following computation
   const uint32_t max_num_mma_kv_smem =
-      (max_smem_per_threadblock / (16 * HEAD_DIM_QK * sizeof(DTypeQ)) - NUM_MMA_Q * NUM_WARPS_Q) /
-      (2 * NUM_WARPS_KV);
+      (max_smem_per_threadblock - CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ)) /
+      ((HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV));

   DISPATCH_NUM_MMA_KV(min(max_num_mma_kv_smem, max_num_mma_kv_reg), NUM_MMA_KV, {
     using KTraits =

@@ -2324,19 +2328,21 @@ cudaError_t BatchPrefillWithPagedKVCacheDispatched(Params params, typename Param
   FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&max_smem_per_sm,
                                               cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev_id));
   // we expect each sm execute two threadblocks
-  // TODO(Zihao): fix the following computation
-  const int num_ctas_per_sm = max_smem_per_sm > (16 * HEAD_DIM_QK * sizeof(DTypeQ) * 16) ? 2 : 1;
+  const int num_ctas_per_sm =
+      max_smem_per_sm >= 2 * (CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ) +
+                              (HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV))
+          ? 2
+          : 1;
   const int max_smem_per_threadblock = max_smem_per_sm / num_ctas_per_sm;

   const uint32_t max_num_mma_kv_reg =
       (HEAD_DIM_VO >= 128 && NUM_MMA_Q == 2 && POS_ENCODING_MODE == PosEncodingMode::kRoPELlama &&
        !USE_FP16_QK_REDUCTION)
           ? 2
           : (8 / NUM_MMA_Q);
-  // TODO(Zihao): fix the following computation
   const uint32_t max_num_mma_kv_smem =
-      (max_smem_per_threadblock / (16 * HEAD_DIM_QK * sizeof(DTypeQ)) - NUM_MMA_Q * NUM_WARPS_Q) /
-      (2 * NUM_WARPS_KV);
+      (max_smem_per_threadblock - CTA_TILE_Q * HEAD_DIM_QK * sizeof(DTypeQ)) /
+      ((HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof(DTypeKV));

   DISPATCH_NUM_MMA_KV(min(max_num_mma_kv_smem, max_num_mma_kv_reg), NUM_MMA_KV, {
     using KTraits =
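As a rough worked illustration of the corrected shared-memory budgeting above, here is a small Python restatement of the new formulas; every numeric value below is an assumption picked for the example, not something read from the dispatcher:

    # Hypothetical values for illustration only; the real values come from the
    # kernel traits selected at dispatch time.
    CTA_TILE_Q = 128
    HEAD_DIM_QK = 128
    HEAD_DIM_VO = 128
    NUM_WARPS_KV = 4
    sizeof_DTypeQ = 2   # fp16
    sizeof_DTypeKV = 2  # fp16
    max_smem_per_sm = 100 * 1024  # assumed per-SM shared-memory budget

    q_smem = CTA_TILE_Q * HEAD_DIM_QK * sizeof_DTypeQ
    kv_smem_per_mma_kv = (HEAD_DIM_QK + HEAD_DIM_VO) * 16 * NUM_WARPS_KV * sizeof_DTypeKV

    # run two CTAs per SM only if two Q tiles plus one KV tile each fit
    num_ctas_per_sm = 2 if max_smem_per_sm >= 2 * (q_smem + kv_smem_per_mma_kv) else 1
    max_smem_per_threadblock = max_smem_per_sm // num_ctas_per_sm

    # shared memory left after the Q tile bounds how many KV tiles (NUM_MMA_KV) fit
    max_num_mma_kv_smem = (max_smem_per_threadblock - q_smem) // kv_smem_per_mma_kv

    print(num_ctas_per_sm, max_num_mma_kv_smem)  # -> 1, 2 with these assumed numbers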

scripts/task_jit_run_tests_part4.sh (+11)

@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -eo pipefail
+set -x
+: ${MAX_JOBS:=$(nproc)}
+: ${CUDA_VISIBLE_DEVICES:=0}
+
+pip install -e . -v
+
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # avoid memory fragmentation
+pytest -s tests/test_deepseek_mla.py
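This is the script invoked by the new 'GPU-G5-Test-4' stage added to the Jenkinsfile above (`${docker_run} ./scripts/task_jit_run_tests_part4.sh`).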

tests/conftest.py (+6 -4)

@@ -121,15 +121,17 @@ def pytest_configure(config):
         _monkeypatch_add_torch_compile(fn)


+def is_cuda_oom_error_str(e: str) -> bool:
+    return "CUDA" in e and "out of memory" in e
+
+
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # skip OOM error
     try:
         item.runtest()
-    except (torch.OutOfMemoryError, RuntimeError) as e:
-        if isinstance(e, torch.OutOfMemoryError) or "CUDA error: out of memory" in str(
-            e
-        ):
+    except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
+        if isinstance(e, torch.cuda.OutOfMemoryError) or is_cuda_oom_error_str(str(e)):
             pytest.skip("Skipping due to OOM")
         else:
             raise
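Compared with the old exact substring check ("CUDA error: out of memory"), the new helper matches any message containing both "CUDA" and "out of memory". A minimal self-contained sketch (the error strings below are illustrative):

    # same predicate as in the conftest above
    def is_cuda_oom_error_str(e: str) -> bool:
        return "CUDA" in e and "out of memory" in e

    assert is_cuda_oom_error_str("CUDA error: out of memory")
    assert is_cuda_oom_error_str("CUDA out of memory. Tried to allocate 2.00 GiB")
    assert not is_cuda_oom_error_str("CUDA error: an illegal memory access was encountered")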

tests/test_deepseek_mla.py (+96 -18)

@@ -20,9 +20,91 @@
 import torch

 import flashinfer
+from flashinfer.jit.attention import (
+    gen_batch_mla_module,
+    gen_batch_prefill_module,
+    gen_single_prefill_module,
+)
 from flashinfer.utils import is_sm90a_supported


+@pytest.fixture(autouse=True, scope="module")
+def warmup_jit():
+    try:
+        modules = []
+        for backend in ["fa2", "fa3"]:
+            if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
+                continue
+
+            modules.append(
+                (
+                    gen_single_prefill_module,
+                    [
+                        backend,
+                        torch.float16,
+                        torch.float16,
+                        torch.float16,
+                        192,
+                        128,
+                        0,
+                        False,
+                        False,
+                        False,
+                    ],
+                )
+            )
+
+        for backend in ["fa2", "fa3"]:
+            if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
+                continue
+
+            modules.append(
+                (
+                    gen_batch_prefill_module,
+                    [
+                        backend,
+                        torch.float16,
+                        torch.float16,
+                        torch.float16,
+                        torch.int32,
+                        192,
+                        128,
+                        0,
+                        False,
+                        False,
+                        False,
+                    ],
+                )
+            )
+
+        for backend in ["fa2", "fa3"]:
+            if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
+                continue
+
+            modules.append(
+                (
+                    gen_batch_mla_module,
+                    [
+                        backend,
+                        torch.float16,
+                        torch.float16,
+                        torch.float16,
+                        torch.int32,
+                        512,
+                        64,
+                        False,
+                    ],
+                )
+            )
+
+        flashinfer.jit.parallel_load_modules(modules)
+    except Exception as e:
+        # abort the test session if warmup fails
+        pytest.exit(str(e))
+    finally:
+        yield
+
+
 def attention_ref(
     batch_size,
     q: torch.Tensor,

@@ -83,7 +165,7 @@ def test_single_prefill_with_kv_cache(
     backend,
     dtype,
 ):
-    if not is_sm90a_supported(torch.device("cuda")):
+    if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
         pytest.skip("FA3 is not supported on this device")
     torch.manual_seed(42)
     head_dim_qk = 192

@@ -117,7 +199,7 @@ def test_batch_prefill_with_ragged_kv_cache(
     backend,
     dtype,
 ):
-    if not is_sm90a_supported(torch.device("cuda")):
+    if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
         pytest.skip("FA3 is not supported on this device")
     torch.manual_seed(42)
     kv_layout = "NHD"

@@ -188,17 +270,15 @@ def generate_kv_from_cache(ckv, kpe, kv_len, batch_size, num_heads):
     return k, v


-@pytest.mark.parametrize("batch_size", [1, 2, 3, 4, 5, 6, 7])
-@pytest.mark.parametrize("kv_len_0", [0, 1, 2, 3, 4, 11])
-@pytest.mark.parametrize("kv_len_1", [17, 19, 33, 79, 114])
+@pytest.mark.parametrize("batch_size", [1, 3, 5, 7])
+@pytest.mark.parametrize("kv_len_0", [0, 1, 3, 11])
+@pytest.mark.parametrize("kv_len_1", [17, 33, 79, 114])
 @pytest.mark.parametrize("kv_len_2", [514, 2743, 8736])
-@pytest.mark.parametrize(
-    "qo_len", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
-)
-@pytest.mark.parametrize("num_heads", [16, 32, 64])
+@pytest.mark.parametrize("qo_len", [1, 3, 5, 7, 9, 11, 13, 15, 17])
+@pytest.mark.parametrize("num_heads", [16, 64])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("page_size", [1])
-@pytest.mark.parametrize("backend", ["fa3"])
+@pytest.mark.parametrize("backend", ["fa2", "fa3"])
 @pytest.mark.parametrize("dtype", [torch.half])
 def test_batch_mla_varlen_page_attention(
     batch_size,

@@ -212,7 +292,7 @@ def test_batch_mla_varlen_page_attention(
     backend,
     dtype,
 ):
-    if not is_sm90a_supported(torch.device("cuda")):
+    if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
         pytest.skip("FA3 is not supported on this device")
     if causal and qo_len > min(kv_len_0, kv_len_1, kv_len_2):
         pytest.skip("qo_len > kv_len not supported for causal attention")

@@ -336,7 +416,7 @@ def test_batch_mla_varlen_page_attention(
 def test_batch_mla_oob_kv_nan(
     batch_size, kv_len, qo_len, num_heads, causal, page_size, backend, dtype
 ):
-    if not is_sm90a_supported(torch.device("cuda")):
+    if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
         pytest.skip("FA3 is not supported on this device")
     if causal and qo_len > kv_len:
         pytest.skip("qo_len > kv_len not supported for causal attention")

@@ -405,16 +485,14 @@ def test_batch_mla_oob_kv_nan(
     torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)


-@pytest.mark.parametrize("batch_size", [1, 2, 3, 4, 5, 6, 7, 157])
+@pytest.mark.parametrize("batch_size", [1, 3, 5, 7, 157])
 @pytest.mark.parametrize("kv_len", [0, 17, 33, 96, 97, 114, 514, 1024])
-@pytest.mark.parametrize(
-    "qo_len", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
-)
+@pytest.mark.parametrize("qo_len", [1, 3, 5, 7, 9, 11, 13, 15, 17])
 @pytest.mark.parametrize("num_heads", [16])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("page_size", [1, 16])
 @pytest.mark.parametrize("backend", ["fa2", "fa3"])
-@pytest.mark.parametrize("use_cuda_graph", [True, False])
+@pytest.mark.parametrize("use_cuda_graph", [False])
 @pytest.mark.parametrize("dtype", [torch.half])
 def test_batch_mla_page_attention(
     batch_size,

@@ -427,7 +505,7 @@ def test_batch_mla_page_attention(
     use_cuda_graph,
     dtype,
 ):
-    if not is_sm90a_supported(torch.device("cuda")):
+    if backend == "fa3" and not is_sm90a_supported(torch.device("cuda")):
         pytest.skip("FA3 is not supported on this device")
     if causal and qo_len > kv_len:
         pytest.skip("qo_len > kv_len not supported for causal attention")
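The new module-scoped `warmup_jit` fixture above pre-builds the required prefill and MLA JIT modules via `flashinfer.jit.parallel_load_modules`, so the shrunken parameter sweeps do not each pay compilation cost during the CI run.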
