@@ -36,7 +36,8 @@ def test_warmpup_llama():
                     torch.float16,
                     torch.float16,
                     torch.int32,
-                    128,
+                    128,  # head_dim_qk
+                    128,  # head_dim_vo
                     PosEncodingMode.NONE.value,
                     False,  # use_sliding_window
                     False,  # use_logits_soft_cap
@@ -45,11 +46,13 @@ def test_warmpup_llama():
             (
                 flashinfer.prefill.gen_batch_prefill_module,
                 [
+                    "fa2",  # backend
                     torch.float16,
                     torch.float16,
                     torch.float16,
                     torch.int32,
-                    128,
+                    128,  # head_dim_qk
+                    128,  # head_dim_vo
                     PosEncodingMode.NONE.value,
                     False,  # use_sliding_window
                     False,  # use_logits_soft_cap
@@ -75,7 +78,8 @@ def test_warmpup_llama_sm90():
                     torch.float16,
                     torch.float16,
                     torch.int32,
-                    128,
+                    128,  # head_dim_qk
+                    128,  # head_dim_vo
                     PosEncodingMode.NONE.value,
                     False,  # use_sliding_window
                     False,  # use_logits_soft_cap
@@ -84,25 +88,29 @@ def test_warmpup_llama_sm90():
             (
                 flashinfer.prefill.gen_batch_prefill_module,
                 [
+                    "fa2",  # backend
                     torch.float16,
                     torch.float16,
                     torch.float16,
                     torch.int32,
-                    128,
+                    128,  # head_dim_qk
+                    128,  # head_dim_vo
                     PosEncodingMode.NONE.value,
                     False,  # use_sliding_window
                     False,  # use_logits_soft_cap
                     False,  # use_fp16_qk_reduction
                 ],
             ),
             (
-                flashinfer.prefill.gen_batch_prefill_sm90_module,
+                flashinfer.prefill.gen_batch_prefill_module,
                 [
+                    "fa3",  # backend
                     torch.float16,
                     torch.float16,
                     torch.float16,
                     torch.int32,
-                    128,
+                    128,  # head_dim_qk
+                    128,  # head_dim_vo
                     PosEncodingMode.NONE.value,
                     False,  # use_sliding_window
                     False,  # use_logits_soft_cap
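
For reference, below is a minimal sketch (not part of the diff) of how one of the updated argument lists above could be fed directly to flashinfer.prefill.gen_batch_prefill_module. The positions of head_dim_qk, head_dim_vo, the backend string, and the trailing boolean flags are taken from the inline comments in the diff; the meaning of the dtype positions, the import path for PosEncodingMode, and the idea that the test's loader simply unpacks the list positionally are assumptions here, not something the diff itself states.

import torch
import flashinfer
from flashinfer.utils import PosEncodingMode

# Argument list mirroring the updated prefill entry in the diff.
args = [
    "fa2",                       # backend ("fa3" is used in the SM90 test)
    torch.float16,               # assumed: query dtype
    torch.float16,               # assumed: key/value dtype
    torch.float16,               # assumed: output dtype
    torch.int32,                 # assumed: index dtype
    128,                         # head_dim_qk
    128,                         # head_dim_vo
    PosEncodingMode.NONE.value,  # positional encoding mode
    False,                       # use_sliding_window
    False,                       # use_logits_soft_cap
    False,                       # use_fp16_qk_reduction
]

# Unpacking the list positionally, as the warmup test presumably does.
flashinfer.prefill.gen_batch_prefill_module(*args)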