
Commit cbe65a9

bugfix: fix MLA with new JIT pipeline (#620)
Some of the commits for fixing MLA are missing in #618; this PR adds them back.
1 parent b27a2cc commit cbe65a9

4 files changed (+42, −25 lines)


python/flashinfer/decode.py

Lines changed: 33 additions & 19 deletions
@@ -1270,6 +1270,8 @@ def plan(
             q_data_type = data_type
         q_data_type = canonicalize_torch_dtype(q_data_type)
 
+        indptr_host = indptr.to("cpu")
+
         self._cached_module = get_batch_decode_mla_module(
             q_data_type,
             data_type,
@@ -1284,7 +1286,7 @@ def plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
             self._pin_memory_int_workspace_buffer,
-            indptr,
+            indptr_host,
             batch_size,
             num_qo_heads,
             page_size,
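
The plan() change above builds a host-side copy of the paged-KV indptr before invoking the JIT module, since the scheduling step inspects the per-request page counts on the CPU. Below is a minimal illustration of the CSR-style indptr layout involved; the three-request example and its page counts are made up for illustration and are not taken from this diff.

import torch

# Hypothetical paged KV layout: 3 requests holding 2, 1, and 3 pages.
# indptr[i + 1] - indptr[i] is the number of pages owned by request i.
device = "cuda" if torch.cuda.is_available() else "cpu"
kv_indptr = torch.tensor([0, 2, 3, 6], dtype=torch.int32, device=device)

# Mirror of `indptr_host = indptr.to("cpu")`: planning reads this on the host.
kv_indptr_host = kv_indptr.to("cpu")
pages_per_request = kv_indptr_host[1:] - kv_indptr_host[:-1]
print(pages_per_request.tolist())  # [2, 1, 3]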
@@ -1357,24 +1359,36 @@ def run(
         if rope_theta is None:
             rope_theta = 1e4
 
-        out = self._cached_module.run(
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            self._plan_info,
-            q_nope,
-            q_pe,
-            paged_ckv_cache,
-            paged_kpe_cache,
-            self._paged_kv_indptr_buf,
-            self._paged_kv_indices_buf,
-            self._paged_kv_last_page_len_buf,
-            sm_scale,
-            window_left,
-            logits_soft_cap,
-            rope_scale,
-            rope_theta,
-            return_lse,
-        )
+        with self.device as device:
+            o = torch.empty_like(q_nope, device=device)
+            maybe_lse = (
+                torch.empty(
+                    (q_nope.size(0), q_nope.size(1)), dtype=torch.float32, device=device
+                )
+                if return_lse
+                else None
+            )
+            self._cached_module.run(
+                self._float_workspace_buffer,
+                self._int_workspace_buffer,
+                self._plan_info,
+                q_nope,
+                q_pe,
+                paged_ckv_cache,
+                paged_kpe_cache,
+                self._paged_kv_indptr_buf,
+                self._paged_kv_indices_buf,
+                self._paged_kv_last_page_len_buf,
+                o,
+                sm_scale,
+                window_left,
+                logits_soft_cap,
+                rope_scale,
+                rope_theta,
+                maybe_lse,
+                get_cuda_stream(device),
+            )
+            out = (o, maybe_lse) if return_lse else (o,)
         if v_scale is not None:
             out[0] *= v_scale
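
With the new JIT pipeline, run() follows a caller-allocates convention: the wrapper creates the output tensor (and the optional LSE buffer) up front, hands them to the compiled module together with the raw CUDA stream of the wrapper's device, and assembles the return value itself. The sketch below restates that pattern in plain PyTorch; run_kernel is a hypothetical stand-in for the compiled module, and torch.cuda.current_stream(...).cuda_stream is used in place of flashinfer's internal get_cuda_stream helper.

import torch

def caller_allocates_run(q_nope: torch.Tensor, return_lse: bool, run_kernel):
    # Sketch of the caller-allocates convention; `run_kernel` is a hypothetical
    # stand-in for the JIT-compiled module's run function.
    device = q_nope.device
    out = torch.empty_like(q_nope)  # output buffer owned by the caller
    lse = (
        torch.empty(q_nope.size(0), q_nope.size(1), dtype=torch.float32, device=device)
        if return_lse
        else None
    )
    # Integer handle of the current CUDA stream, passed through to the kernel.
    stream_handle = torch.cuda.current_stream(device).cuda_stream
    run_kernel(q_nope, out, lse, stream_handle)
    return (out, lse) if return_lse else (out,)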

python/flashinfer/jit/attention.py

Lines changed: 3 additions & 1 deletion
@@ -183,7 +183,9 @@ def get_batch_decode_mla_sources(
             "dtype_kv": dtype_map[dtype_kv],
             "dtype_o": dtype_map[dtype_o],
             "dtype_idx": dtype_map[dtype_idx],
-            "head_dim": head_dim,
+            "head_dim_ckv": head_dim,
+            "head_dim_kpe": head_dim
+            // 8,  # fixme: head_dim_ckv(kv_lora_rank) is 8 times the size of head_dim_kpe(qk_rope_head_dim) for all MLA model (DeepSeek-V2-Lite, DeepSeek-V2.5, MiniCPM3) at the time Oct.2024
             "use_sliding_window": "true" if use_sliding_window else "false",
             "use_logits_soft_cap": "true" if use_logits_soft_cap else "false",
         },
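
The template parameters now separate the compressed-KV head dimension (head_dim_ckv) from the rotary positional dimension (head_dim_kpe), with the latter derived as head_dim // 8. The fixme records why this works: the public MLA checkpoints listed there keep an 8:1 ratio between kv_lora_rank and qk_rope_head_dim. A quick sanity check with DeepSeek-V2's published config values (512 and 64, quoted from the model config rather than from this diff):

# Illustrative numbers from DeepSeek-V2's config, not part of this commit.
kv_lora_rank = 512      # -> head_dim_ckv, the compressed (latent) KV dimension
qk_rope_head_dim = 64   # -> head_dim_kpe, the rotary positional dimension

head_dim = kv_lora_rank
head_dim_ckv = head_dim
head_dim_kpe = head_dim // 8

assert (head_dim_ckv, head_dim_kpe) == (kv_lora_rank, qk_rope_head_dim)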

python/flashinfer/jit/batch_decode_mla_templ.py

Lines changed: 4 additions & 3 deletions
@@ -106,8 +106,8 @@
 
   if (maybe_lse) {
     const auto& lse = *maybe_lse;
-    TORCH_CHECK(lse.size(0) == batch_size, lse.size(0), q.size(0));
-    TORCH_CHECK(lse.size(1) == num_qo_heads, lse.size(1), q.size(1));
+    TORCH_CHECK(lse.size(0) == batch_size, lse.size(0), q_nope.size(0));
+    TORCH_CHECK(lse.size(1) == num_qo_heads, lse.size(1), q_nope.size(1));
   }
 
   TORCH_CHECK(logits_soft_cap >= 0.f, "logits_soft_cap must be non-negative");
@@ -146,9 +146,10 @@
   }
   params.padded_batch_size = plan_info.padded_batch_size;
 
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
   cudaError_t status = BatchDecodeWithPagedKVCacheDispatchedMLA<
       {{ head_dim_ckv }}, {{ head_dim_kpe }}, AttentionVariant>(
-      params, tmp_v, tmp_s, /*stream=*/torch_current_stream);
+      params, tmp_v, tmp_s, /*stream=*/stream);
   TORCH_CHECK(status == cudaSuccess, "BatchDecodeWithPagedKVCache failed with error ",
               cudaGetErrorString(status));
 }
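
The generated binding no longer captures torch_current_stream itself; it receives the stream as a plain integer (cuda_stream) and reinterpret_casts it back to cudaStream_t before dispatch. On the Python side, PyTorch already exposes such an integer handle; the small sketch below only illustrates the handle type crossing the boundary and is not flashinfer-specific.

import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    # `.cuda_stream` is the underlying cudaStream_t exposed as a Python int;
    # the generated C++ recovers it via reinterpret_cast<cudaStream_t>(cuda_stream).
    handle = torch.cuda.current_stream(device).cuda_stream
    print(type(handle), hex(handle))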

tests/test_mla_decode_kernel.py

Lines changed: 2 additions & 2 deletions
@@ -367,7 +367,7 @@ def run_proof_of_concept(
 
 dev_id = 1
 
-# torch.manual_seed(666)
+torch.manual_seed(666)
 torch.set_grad_enabled(False)
 
 mla_vanilla = DeepseekV2AttentionVanilla().cuda(device=dev_id)
@@ -436,7 +436,7 @@ def run_proof_of_concept(
     output_vanilla.reshape(-1), output_mat_absorbed_use_torch_f16.reshape(-1)
 )
 print(f"wmape_use_torch_f16 = {wmape_use_torch_f16}")
-assert wmape_use_torch_f16 < 0.02
+assert wmape_use_torch_f16 < 0.03
 
 mse_use_torch_f16 = F.mse_loss(
     output_vanilla.reshape(-1), output_mat_absorbed_use_torch_f16.reshape(-1)
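
The fp16 matrix-absorption tolerance is relaxed from 0.02 to 0.03 and the seed is now fixed so the comparison is reproducible. For reference, here is a minimal sketch of the weighted mean absolute percentage error the assertion bounds, assuming the test's wmape follows the standard definition with the first argument as the reference.

import torch

def wmape(reference: torch.Tensor, estimate: torch.Tensor) -> float:
    # Weighted MAPE: total absolute error normalized by total reference magnitude.
    return (reference - estimate).abs().sum().item() / reference.abs().sum().item()

# Mirroring the test's call order (argument roles assumed):
# wmape(output_vanilla.reshape(-1), output_mat_absorbed_use_torch_f16.reshape(-1)) < 0.03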
