Skip to content

Commit 3c17c62

Browse files
committed
More checks
Signed-off-by: kaixih <[email protected]>
1 parent 3efdc58 commit 3c17c62

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

tests/kernels/test_cutlass_mla_decode.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def ref_mla(
4747
@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096])
4848
@pytest.mark.parametrize("bs", [1, 2, 4])
4949
@pytest.mark.parametrize("varlen", [False, True])
50-
@pytest.mark.parametrize("block_size", [16, 128])
50+
@pytest.mark.parametrize("block_size", [16, 64, 128])
5151
def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
5252
varlen: bool, block_size: int):
5353
torch.set_default_dtype(dtype)
@@ -69,6 +69,12 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
6969
max_seq_len = seq_lens.max().item()
7070
block_num = (max_seq_len + block_size - 1) // block_size
7171

72+
# Pad block_num so that small blocks can be packed into full 128-sized
73+
# CUTLASS tiles. One 128-wide tile can hold (128 // block_size) small
74+
# blocks.
75+
pack_factor = 128 // block_size
76+
block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
77+
7278
q = torch.randn(bs, h_q, d)
7379
block_table = torch.randint(0,
7480
bs * block_num, (bs, block_num),

vllm/_custom_ops.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,8 +1440,10 @@ def cutlass_mla_decode(q_nope_and_q_pe: torch.Tensor,
14401440
assert not current_platform.is_rocm()
14411441
assert q_nope_and_q_pe.ndim == 3, f"q_nope_and_q_pe must be a 3D tensor, but got {q_nope_and_q_pe.ndim}"
14421442
assert kv_c_and_k_pe_cache.ndim == 3, f"kv_c_and_k_pe_cache must be a 3D tensor, but got {kv_c_and_k_pe_cache.ndim}"
1443+
assert page_table.ndim == 2, f"page_table must be a 2D tensor, but got {page_table.ndim}"
14431444
B_q, H, D_q = q_nope_and_q_pe.shape
14441445
_, PAGE_SIZE, D_ckv = kv_c_and_k_pe_cache.shape
1446+
B_pt, PAGE_NUM = page_table.shape
14451447

14461448
D_latent = 512
14471449
D_rope = 64
@@ -1453,6 +1455,11 @@ def cutlass_mla_decode(q_nope_and_q_pe: torch.Tensor,
14531455
assert PAGE_SIZE > 0 and (
14541456
PAGE_SIZE & (PAGE_SIZE - 1)
14551457
) == 0, f"PAGE_SIZE must be a power of 2, but got {PAGE_SIZE}"
1458+
assert B_pt == B_q, f"Batch dims must be same for page_table and q_nope_and_q_pe, but got {B_pt} and {B_q}"
1459+
1460+
# The current CUTLASS MLA implementation packs smaller pages into full 128-sized pages.
1461+
assert PAGE_NUM % (128 / PAGE_SIZE) == 0, f"PAGE_NUM must be divisible by 128 / PAGE_SIZE, but got {PAGE_NUM} and {128 / PAGE_SIZE}"
1462+
14561463

14571464
# TODO(kaixih@nvidia): support fp8
14581465
assert q_nope_and_q_pe.dtype in (torch.float16, torch.bfloat16), (

0 commit comments

Comments
 (0)