
Commit 504b990 (parent 23413e0)

fix rope logic in mla decoding (#793)
Co-authored-by: pankajroark <[email protected]>

As titled, this unblocks the FlashInfer integration. E2E testing is functioning properly.

cc @yzh119 @pankajroark @merrymercy @Ying1123 @ispobock

```bash
python3 tests/test_mla_decode_kernel.py
```

```
Now use MLA decode kernel!
2025-02-06 22:55:31,946 - INFO - flashinfer.jit: Loading JIT ops: batch_decode_mla_with_kv_cache_dtype_q_f16_dtype_kv_f16_dtype_o_f16_dtype_idx_i32_head_dim_qk_512_head_dim_vo_512_use_swa_False_use_logits_cap_False
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
2025-02-06 22:55:31,960 - INFO - flashinfer.jit: Finished loading JIT ops: batch_decode_mla_with_kv_cache_dtype_q_f16_dtype_kv_f16_dtype_o_f16_dtype_idx_i32_head_dim_qk_512_head_dim_vo_512_use_swa_False_use_logits_cap_False
cos_use_torch_f32 = 1.0
wmape_use_torch_f32 = 1.4899706573821664e-05
mse_use_torch_f32 = 0.004270492121577263
cos_use_torch_f16 = 0.999683678150177
wmape_use_torch_f16 = 0.020623904841957166
mse_use_torch_f16 = 5391.00048828125
cos_use_flashinfer = 0.9999864101409912
wmape_use_flashinfer = 0.004352144090863914
mse_use_flashinfer = 231.20518493652344
```
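For context, the `cos`, `wmape`, and `mse` values printed above compare the decode output against a float32 torch reference. A minimal sketch of how such metrics are typically computed; the helper below is illustrative, not the test's actual code:

```python
import torch

def report_metrics(out: torch.Tensor, ref: torch.Tensor) -> None:
    """Compare a kernel output against a reference implementation."""
    o, r = out.double().flatten(), ref.double().flatten()
    cos = torch.nn.functional.cosine_similarity(o, r, dim=0)  # 1.0 means identical direction
    wmape = (o - r).abs().sum() / r.abs().sum()               # weighted mean absolute percentage error
    mse = torch.mean((o - r) ** 2)                            # mean squared error
    print(f"cos = {cos.item()}, wmape = {wmape.item()}, mse = {mse.item()}")
```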

File tree

2 files changed (+15 -5 lines)

include/flashinfer/attention/decode.cuh (+5 -5)

```diff
@@ -794,8 +794,8 @@ __device__ __forceinline__ void compute_qk_and_update_local_stat_mla(
     ckv_vec.cast_load(ckv_smem + j * head_dim_ckv + tx * vec_size_ckv);

     vec_t<float, vec_size_kpe> kpe_vec;
-    kpe_vec = vec_apply_llama_rope_interleave<vec_size_kpe, bdx>(kpe_smem + j * head_dim_kpe, freq,
-                                                                 kv_idx_base + tz * tile_size + j);
+    kpe_vec.cast_load(kpe_smem + j * head_dim_kpe + tx * vec_size_kpe);
+
     s[j] = 0.f;
 #pragma unroll
     for (uint32_t i = 0; i < vec_size_ckv; ++i) {
@@ -920,9 +920,9 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) {
       q_nope_vec[i].cast_load(q_nope +
                               (mapped_batch_idx * num_qo_heads + qo_head_idx[i]) * head_dim_ckv +
                               tx * vec_size_ckv);
-      q_pe_vec[i] = vec_apply_llama_rope_interleave<vec_size_kpe, bdx>(
-          q_pe + (mapped_batch_idx * num_qo_heads + qo_head_idx[i]) * head_dim_kpe, freq,
-          q_rope_offset_val);
+      q_pe_vec[i].cast_load(q_pe +
+                            (mapped_batch_idx * num_qo_heads + qo_head_idx[i]) * head_dim_kpe +
+                            tx * vec_size_kpe);
     }
   }
```

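The kernel-side change removes the in-kernel calls to `vec_apply_llama_rope_interleave` and replaces them with plain `cast_load`s: the decode kernel now reads `q_pe` and the rope part of the KV cache as-is, so rotary embeddings must be applied before the kernel is invoked (see the test change below). For reference, a rough Python sketch of the interleaved LLaMA-style rotation that the removed device function computed on the fly; names and shapes here are illustrative only:

```python
import torch

def rope_interleave(x: torch.Tensor, pos: int, theta: float = 1e4) -> torch.Tensor:
    """Rotate adjacent pairs (x[2i], x[2i+1]) by angle pos * theta^(-2i/d)."""
    d = x.shape[-1]
    freqs = theta ** (-torch.arange(0, d, 2, dtype=torch.float32) / d)  # one frequency per pair
    angles = pos * freqs
    cos, sin = torch.cos(angles), torch.sin(angles)
    even, odd = x[..., 0::2].float(), x[..., 1::2].float()
    out = torch.empty_like(x, dtype=torch.float32)
    out[..., 0::2] = even * cos - odd * sin
    out[..., 1::2] = odd * cos + even * sin
    return out.to(x.dtype)
```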
tests/test_mla_decode_kernel.py (+10 -0)

```diff
@@ -316,6 +316,16 @@ def run_proof_of_concept(
             raise ValueError(
                 "For simplicity, kv_len should be multiple of page_size."
             )
+        freqs_cis = precompute_freqs_cis(
+            self.qk_rope_head_dim, kv_len, self.rope_theta, use_scaled=False
+        ).to(k_pe_cache.device)
+        q_pe, k_pe_cache = apply_rotary_emb(
+            q_pe.unsqueeze(1).repeat(1, kv_len, 1, 1),
+            k_pe_cache.unsqueeze(2),
+            freqs_cis,
+        )
+        q_pe = q_pe[:, -1:, :, :].squeeze(1).contiguous()
+        k_pe_cache = k_pe_cache.squeeze(2)
         num_pages_per_seq = kv_len // page_size
         total_num_pages = num_pages_per_seq * bsz
```

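The test-side change pre-rotates `q_pe` and `k_pe_cache` with `precompute_freqs_cis` / `apply_rotary_emb` before the kernel call, mirroring the removal of in-kernel RoPE above. Those helpers presumably follow the usual LLaMA complex-number formulation; a self-contained sketch under that assumption (the real helpers in the test module may differ in detail, and the `use_scaled` frequency scaling is omitted here):

```python
import torch

def precompute_freqs_cis(dim: int, seq_len: int, theta: float, use_scaled: bool = False) -> torch.Tensor:
    # Complex rotations e^{i * pos * theta^(-2k/dim)}; `use_scaled` scaling is not implemented in this sketch.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    angles = torch.outer(torch.arange(seq_len, dtype=torch.float32), freqs)  # (seq_len, dim // 2)
    return torch.polar(torch.ones_like(angles), angles)

def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
    # Inputs are (bsz, seq_len, n_heads, head_dim); view the last dim as complex pairs and rotate.
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    shape = [d if i in (1, xq_.ndim - 1) else 1 for i, d in enumerate(xq_.shape)]  # broadcast over batch/heads
    freqs_cis = freqs_cis.view(*shape)
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)
    return xq_out.type_as(xq), xk_out.type_as(xk)
```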