bugfix: MLA decode should multiply sm_scale by math::log2e (#787)

tsu-bin · web-flow · commit 23413e00ac75 · 2025-02-05T11:00:21.000-05:00
diff --git a/flashinfer/jit/attention.py b/flashinfer/jit/attention.py
@@ -180,6 +180,7 @@ def gen_batch_decode_mla_module(
         dtype_o,
         dtype_idx,
         head_dim,
+        head_dim,
         use_sliding_window,
         use_logits_soft_cap,
     )
diff --git a/include/flashinfer/attention/decode.cuh b/include/flashinfer/attention/decode.cuh
@@ -857,6 +857,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) {
   const float rope_rcp_scale = params.rope_rcp_scale;
   const float rope_rcp_theta = params.rope_rcp_theta;
   const bool partition_kv = params.partition_kv;
+  params.sm_scale *= math::log2e;
 
   constexpr uint32_t head_dim_ckv = bdx * vec_size_ckv;
   constexpr uint32_t head_dim_kpe = bdx * vec_size_kpe;

Original file line number	Diff line number	Diff line change
`@@ -180,6 +180,7 @@ def gen_batch_decode_mla_module(`
`180`	`180`	`dtype_o,`
`181`	`181`	`dtype_idx,`
`182`	`182`	`head_dim,`
	`183`	`+ head_dim,`
`183`	`184`	`use_sliding_window,`
`184`	`185`	`use_logits_soft_cap,`
`185`	`186`	`)`