
Commit 106e6fc

yzh119 and abcdabcd987 authored
perf: memory efficient deepseek mla fused page-attention kernel (#804)
**Description:** This PR implements a memory-efficient fused DeepSeek MLA PageAttention kernel for decode, prefill, and chunked-prefill operations. After matrix absorption, MLA can be described as a multi-query attention (MQA) kernel in which all query heads share the same K/V cache, with special head dimensions: `head_dim_qk=576, head_dim_vo=512`.

**Background:** For DeepSeek v2/v3, the large `head_dim` (512) makes it challenging to store the output tensor in registers when using tensor cores. A previous approach ([#551](#551)) split `head_dim` across `gridDim.y`, which led to two main issues:

- **Re-computation overhead:** Multiple blocks had to redundantly compute the Q*K operation.
- **Memory access latency:** Using multiple blocks prevented shared-memory usage, forcing reliance on slower L2 for KV-cache accesses.

**New Design:** We use head-group fusion to increase the operational intensity of the kernel ([appendix A in the paper](https://arxiv.org/pdf/2501.01005)). To address the large head-dimension issue, we redesign the kernel (diagram below) to use two warp groups (WG1 and WG2, each with 4 warps) per CTA:

<img width="688" alt="image" src="https://github.com/user-attachments/assets/be684e8d-eaa7-41fb-8160-35e121711d8d" />

- **QK computation:** Each warp group processes half of the `CTA_TILE_KV` dimension, eliminating redundant computation.
- **Shared-memory broadcast:** Local QK results are written to shared memory (reusing the `kpe` buffer) to efficiently broadcast data for the PV computation.
- **PV computation:** The head dimension is split between the warp groups, with each computing half.

The maximum `CTA_TILE_Q` (bounded by the register file) is 64; in that case the maximum `CTA_TILE_KV` (bounded by the shared-memory limit) is 64 on H100 and 32 on A100 when the number of pipeline stages is set to 2. For a large `num_local_heads` such as 128 (no TP and no MTP), we create a thread-block cluster of size 2 and dispatch the upper 64 heads and the lower 64 heads to the two SMs in the cluster; software-managed multicasting (useful for large page sizes) is left to a later PR.

## Benchmark results

Decoding memory bandwidth on H100 SXM5 (peak bandwidth = 3352 GB/s):

```
Config: batch_size=768, seq_len=1024, num_heads=16
Memory bandwidth: 2002.11 GB/s
Config: batch_size=768, seq_len=1024, num_heads=32
Memory bandwidth: 2035.59 GB/s
Config: batch_size=768, seq_len=1024, num_heads=64
Memory bandwidth: 2082.20 GB/s
Config: batch_size=768, seq_len=2048, num_heads=16
Memory bandwidth: 2064.97 GB/s
Config: batch_size=768, seq_len=2048, num_heads=32
Memory bandwidth: 2080.99 GB/s
Config: batch_size=768, seq_len=2048, num_heads=64
Memory bandwidth: 2082.78 GB/s
```

---------

Co-authored-by: Lequn Chen <[email protected]>
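To make the MQA formulation in the description concrete, below is a minimal, unfused PyTorch reference sketch (not part of this PR) of one decode step after matrix absorption. The shapes follow the description (`head_dim_qk = 512 + 64 = 576`, `head_dim_vo = 512`); batching, paging, and the warp-group tiling of the real kernel are intentionally omitted.

```python
import torch

# Illustrative sketch for a single request: num_heads query heads attend to one
# shared compressed KV cache (MQA). Shapes follow the PR description; this is a
# reference computation, not the fused kernel.
num_heads, kv_len = 16, 1024
head_dim_ckv, head_dim_kpe = 512, 64  # head_dim_vo = 512, head_dim_qk = 576

q_nope = torch.randn(num_heads, head_dim_ckv, dtype=torch.half, device="cuda")
q_pe = torch.randn(num_heads, head_dim_kpe, dtype=torch.half, device="cuda")
ckv = torch.randn(kv_len, head_dim_ckv, dtype=torch.half, device="cuda")  # shared "K" and "V"
kpe = torch.randn(kv_len, head_dim_kpe, dtype=torch.half, device="cuda")  # rope part of K

sm_scale = 1.0 / (head_dim_ckv + head_dim_kpe) ** 0.5
# QK over the concatenated 576-dim key: [num_heads, kv_len]
logits = (q_nope @ ckv.T + q_pe @ kpe.T).float() * sm_scale
p = torch.softmax(logits, dim=-1).half()
# PV over the 512-dim value (the compressed KV itself): [num_heads, 512]
o = p @ ckv
```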
1 parent 8b91e95 commit 106e6fc

17 files changed: +1918 −247 lines changed

benchmarks/bench_deepseek_mla.py

+82
@@ -0,0 +1,82 @@
```python
"""
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import torch
import triton

import flashinfer


def bench_deepseek_mla_decode(batch_size, seq_len, num_heads):
    head_dim_ckv = 512
    head_dim_kpe = 64
    page_size = 1
    q_nope = torch.randn(
        batch_size * 1, num_heads, head_dim_ckv, dtype=torch.half, device="cuda"
    )
    q_pe = torch.zeros(
        batch_size * 1, num_heads, head_dim_kpe, dtype=torch.half, device="cuda"
    )
    ckv = torch.randn(
        batch_size * seq_len, 1, head_dim_ckv, dtype=torch.half, device="cuda"
    )
    kpe = torch.zeros(
        batch_size * seq_len, 1, head_dim_kpe, dtype=torch.half, device="cuda"
    )
    sm_scale = 1.0 / ((head_dim_ckv + head_dim_kpe) ** 0.5)
    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
    wrapper = flashinfer.mla.BatchMLAPageAttentionWrapper(
        workspace_buffer, backend="fa2"
    )
    q_indptr = torch.arange(0, batch_size + 1).to(0).int()
    kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * seq_len
    kv_indices = torch.arange(0, batch_size * seq_len).to(0).int()
    kv_lens = torch.full((batch_size,), seq_len, dtype=torch.int32).to(0)
    wrapper.plan(
        q_indptr,
        kv_indptr,
        kv_indices,
        kv_lens,
        num_heads,
        head_dim_ckv,
        head_dim_kpe,
        page_size,
        False,  # causal
        sm_scale,
        q_nope.dtype,
        ckv.dtype,
    )
    o = wrapper.run(q_nope, q_pe, ckv, kpe, return_lse=False)

    ms = triton.testing.do_bench(
        lambda: wrapper.run(q_nope, q_pe, ckv, kpe),
        warmup=100,
        rep=1000,
    )

    io = sum([_.numel() * _.element_size() for _ in [q_nope, q_pe, ckv, kpe, o]])

    print(f"Config: batch_size={batch_size}, seq_len={seq_len}, num_heads={num_heads}")
    print(f"Memory bandwidth: {io * 1e-6 / ms:.2f} GB/s")


if __name__ == "__main__":
    bench_deepseek_mla_decode(768, 1024, 16)
    bench_deepseek_mla_decode(768, 1024, 32)
    bench_deepseek_mla_decode(768, 1024, 64)
    bench_deepseek_mla_decode(768, 2048, 16)
    bench_deepseek_mla_decode(768, 2048, 32)
    bench_deepseek_mla_decode(768, 2048, 64)
```
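As a sanity check on the reported numbers, the `io`-based bandwidth formula in this script can be evaluated by hand. For the largest decode config the `ckv` reads dominate the traffic, and the reported ~2082 GB/s corresponds to roughly 62% of the 3352 GB/s H100 SXM5 peak. A back-of-the-envelope sketch (not output of the script above):

```python
# Traffic estimate for batch_size=768, seq_len=1024, num_heads=64, fp16 (2 bytes/element).
batch, seq_len, heads = 768, 1024, 64
bytes_per = 2
q_nope = batch * heads * 512 * bytes_per    #  ~50 MB
q_pe   = batch * heads * 64 * bytes_per     #   ~6 MB
ckv    = batch * seq_len * 512 * bytes_per  # ~805 MB (dominant term)
kpe    = batch * seq_len * 64 * bytes_per   # ~101 MB
o      = batch * heads * 512 * bytes_per    #  ~50 MB
io_bytes = q_nope + q_pe + ckv + kpe + o
print(io_bytes / 1e9, "GB moved per call")  # ~1.01 GB
print(2082.20 / 3352 * 100, "% of peak")    # ~62% of H100 SXM5 HBM bandwidth
```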

csrc/batch_mla_config.jinja

+24
@@ -0,0 +1,24 @@
```cpp
#pragma once
#include <flashinfer/page.cuh>
#include <flashinfer/math.cuh>
#include <flashinfer/layout.cuh>
#include <flashinfer/utils.cuh>
#include <flashinfer/pos_enc.cuh>
#include <flashinfer/fastdiv.cuh>
#include <flashinfer/attention/variant_helper.cuh>
#include <flashinfer/attention/mla_params.cuh>

using namespace flashinfer;

using DTypeQ = {{ dtype_q }};
using DTypeKV = {{ dtype_kv }};
using DTypeO = {{ dtype_o }};
using IdType = {{ dtype_idx }};
constexpr int HEAD_DIM_CKV = {{ head_dim_ckv }};
constexpr int HEAD_DIM_KPE = {{ head_dim_kpe }};

#define DISPATCH_context(DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_CKV, HEAD_DIM_KPE, Params, ...) \
  DISPATCH_MASK_MODE(mask_mode, MASK_MODE, {                    \
    using Params = MLAParams<DTypeQ, DTypeKV, DTypeO, IdType>;  \
    __VA_ARGS__();                                              \
  })
```
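For illustration only, the template above can be rendered with `jinja2` to see what a specialized config header looks like. The concrete type strings (`half`, `int32_t`) and head dimensions below are assumptions chosen to match the DeepSeek decode benchmark, not necessarily the exact strings the JIT generator emits:

```python
# Hypothetical rendering of csrc/batch_mla_config.jinja (requires a repo checkout and jinja2).
import jinja2

with open("csrc/batch_mla_config.jinja") as f:
    template = jinja2.Template(f.read())

print(
    template.render(
        dtype_q="half",       # assumed fp16 type string
        dtype_kv="half",
        dtype_o="half",
        dtype_idx="int32_t",  # assumed index type string
        head_dim_ckv=512,
        head_dim_kpe=64,
    )
)
```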

csrc/batch_mla_plan.cu

+51
@@ -0,0 +1,51 @@
```cpp
/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <flashinfer/attention/scheduler.cuh>
#include <optional>

#include "batch_mla_config.inc"
#include "pytorch_extension_utils.h"

using namespace flashinfer;

std::vector<int64_t> BatchMLAPageAttentionPlan(at::Tensor float_workspace_buffer,
                                               at::Tensor int_workspace_buffer,
                                               at::Tensor page_locked_int_workspace_buffer,
                                               at::Tensor qo_indptr, at::Tensor kv_indptr,
                                               at::Tensor kv_len, unsigned int num_heads,
                                               unsigned int head_dim_o, bool causal,
                                               int64_t cuda_stream) {
  size_t float_workspace_size_in_bytes =
      float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
  size_t int_workspace_size_in_bytes =
      int_workspace_buffer.size(0) * int_workspace_buffer.element_size();

  MLAPlanInfo plan_info;

  int batch_size = kv_len.size(0);

  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
  cudaError_t status =
      MLAPlan(float_workspace_buffer.data_ptr(), float_workspace_size_in_bytes,
              int_workspace_buffer.data_ptr(), page_locked_int_workspace_buffer.data_ptr(),
              int_workspace_size_in_bytes, plan_info, static_cast<IdType*>(qo_indptr.data_ptr()),
              static_cast<IdType*>(kv_indptr.data_ptr()), static_cast<IdType*>(kv_len.data_ptr()),
              batch_size, num_heads, head_dim_o, causal, stream);

  TORCH_CHECK(status == cudaSuccess, "Failed to plan MLA, error: ", cudaGetErrorString(status));

  return plan_info.ToVector();
}
```
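The planner above consumes ragged index arrays (`qo_indptr`, `kv_indptr`, `kv_len`) rather than padded tensors. A small illustrative sketch of how these arrays relate on the Python side, assuming `page_size=1` as in the decode benchmark and variable KV lengths per request:

```python
import torch

# Three decode requests (one query token each) with different KV lengths.
kv_lens = torch.tensor([5, 3, 7], dtype=torch.int32, device="cuda")
batch_size = kv_lens.numel()

# One query row per request -> q_indptr = [0, 1, 2, 3].
q_indptr = torch.arange(batch_size + 1, dtype=torch.int32, device="cuda")

# With page_size=1 each KV token occupies one page; kv_indptr is the exclusive
# prefix sum of per-request page counts, and kv_indices enumerates the pages.
kv_indptr = torch.cat(
    [
        torch.zeros(1, dtype=torch.int32, device="cuda"),
        torch.cumsum(kv_lens, dim=0).int(),
    ]
)
kv_indices = torch.arange(int(kv_lens.sum()), dtype=torch.int32, device="cuda")

print(q_indptr.tolist())   # [0, 1, 2, 3]
print(kv_indptr.tolist())  # [0, 5, 8, 15]
print(kv_indices.numel())  # 15 page ids in total
```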

csrc/batch_mla_pybind.cu

+37
@@ -0,0 +1,37 @@
```cpp
/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "batch_mla_config.inc"
#include "pytorch_extension_utils.h"

std::vector<int64_t> BatchMLAPageAttentionPlan(at::Tensor float_workspace_buffer,
                                               at::Tensor int_workspace_buffer,
                                               at::Tensor page_locked_int_workspace_buffer,
                                               at::Tensor qo_indptr, at::Tensor kv_indptr,
                                               at::Tensor kv_len, unsigned int num_heads,
                                               unsigned int head_dim_o, bool causal,
                                               int64_t cuda_stream);

void BatchMLAPageAttentionRun(at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
                              std::vector<int64_t> plan_info_vec, at::Tensor q_nope,
                              at::Tensor q_pe, at::Tensor ckv_cache, at::Tensor kpe_cache,
                              at::Tensor kv_indices, at::Tensor o,
                              std::optional<at::Tensor> maybe_lse, int mask_mode_code,
                              int num_heads, int page_size, float sm_scale, int64_t cuda_stream);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("plan", &BatchMLAPageAttentionPlan, "Batch MLA Page Attention Plan");
  m.def("run", &BatchMLAPageAttentionRun, "Batch MLA Page Attention Run");
}
```

csrc/batch_mla_run.cu

+114
@@ -0,0 +1,114 @@
```cpp
/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <driver_types.h>

#include <flashinfer/attention/mla_fa2.cuh>
#include <flashinfer/attention/scheduler.cuh>
#include <flashinfer/fastdiv.cuh>
#include <optional>

#include "batch_mla_config.inc"
#include "pytorch_extension_utils.h"

using namespace flashinfer;

void BatchMLAPageAttentionRun(at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
                              std::vector<int64_t> plan_info_vec, at::Tensor q_nope,
                              at::Tensor q_pe, at::Tensor ckv_cache, at::Tensor kpe_cache,
                              at::Tensor kv_indices, at::Tensor o,
                              std::optional<at::Tensor> maybe_lse, int mask_mode_code,
                              int num_heads, int page_size, float sm_scale, int64_t cuda_stream) {
  // q_nope: [n, num_heads, head_dim_ckv]
  // q_pe: [n, num_heads, head_dim_kpe]
  // ckv_cache: [num_pages, page_size, head_dim_ckv]
  // kpe_cache: [num_pages, page_size, head_dim_kpe]
  MLAPlanInfo plan_info;
  plan_info.FromVector(plan_info_vec);

  auto device = q_nope.device();

  void* float_buffer_ptr = float_workspace_buffer.data_ptr();
  void* int_buffer_ptr = int_workspace_buffer.data_ptr();

  const MaskMode mask_mode = static_cast<MaskMode>(mask_mode_code);

  auto q_scalar_type = q_nope.scalar_type();
  auto kv_scalar_type = ckv_cache.scalar_type();

  unsigned int q_nope_stride_n = q_nope.stride(0);
  unsigned int q_nope_stride_h = q_nope.stride(1);
  unsigned int q_pe_stride_n = q_pe.stride(0);
  unsigned int q_pe_stride_h = q_pe.stride(1);
  unsigned int ckv_stride_page = ckv_cache.stride(0);
  unsigned int ckv_stride_n = ckv_cache.stride(1);
  unsigned int kpe_stride_page = kpe_cache.stride(0);
  unsigned int kpe_stride_n = kpe_cache.stride(1);
  unsigned int o_stride_n = o.stride(0);
  unsigned int o_stride_h = o.stride(1);

  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);

  DISPATCH_context(
      DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_CKV, HEAD_DIM_KPE, Params, [&] {
        Params params;

        params.q_nope = static_cast<DTypeQ*>(q_nope.data_ptr());
        params.q_pe = static_cast<DTypeQ*>(q_pe.data_ptr());
        params.ckv = static_cast<DTypeKV*>(ckv_cache.data_ptr());
        params.kpe = static_cast<DTypeKV*>(kpe_cache.data_ptr());
        params.kv_indices = static_cast<IdType*>(kv_indices.data_ptr());

        params.q_indptr = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.q_indptr_offset);
        params.kv_indptr = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.kv_indptr_offset);
        params.kv_indices = static_cast<IdType*>(kv_indices.data_ptr());
        params.q_len = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.q_len_offset);
        params.kv_len = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.kv_len_offset);
        params.q_start = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.q_start_offset);
        params.kv_start = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.kv_start_offset);
        params.kv_end = GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.kv_end_offset);
        params.work_indptr =
            GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.work_indptr_offset);
        params.final_o = static_cast<DTypeO*>(o.data_ptr());
        params.final_lse =
            maybe_lse.has_value() ? static_cast<float*>(maybe_lse->data_ptr()) : nullptr;
        params.partial_o =
            GetPtrFromBaseOffset<float>(float_buffer_ptr, plan_info.partial_o_offset);
        params.partial_lse =
            GetPtrFromBaseOffset<float>(float_buffer_ptr, plan_info.partial_lse_offset);

        params.num_heads = uint_fastdiv(num_heads);
        params.block_size = uint_fastdiv(page_size);

        params.q_nope_stride_n = q_nope_stride_n;
        params.q_nope_stride_h = q_nope_stride_h;
        params.q_pe_stride_n = q_pe_stride_n;
        params.q_pe_stride_h = q_pe_stride_h;
        params.ckv_stride_page = ckv_stride_page;
        params.ckv_stride_n = ckv_stride_n;
        params.kpe_stride_page = kpe_stride_page;
        params.kpe_stride_n = kpe_stride_n;
        params.o_stride_n = o_stride_n;
        params.o_stride_h = o_stride_h;

        params.sm_scale = sm_scale;

        cudaError_t status = mla::BatchMLAPageAttention<MASK_MODE, HEAD_DIM_CKV, HEAD_DIM_KPE>(
            params, plan_info.num_blks_x, plan_info.num_blks_y, stream);

        TORCH_CHECK(status == cudaSuccess,
                    "Failed to run MLA, error: ", cudaGetErrorString(status));
      });
}
```
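The PR description states that the same kernel covers prefill and chunked prefill as well as decode. Below is a hedged sketch of a causal, prefill-style call through the same Python wrapper used in the decode benchmark; the multi-token query layout, the `causal=True` flag, and the `return_lse=True` return shape are assumptions based on the plan/run signatures above, not a tested configuration from this PR:

```python
import torch
import flashinfer

batch_size, qo_len, kv_len, num_heads = 4, 128, 1024, 16
head_dim_ckv, head_dim_kpe, page_size = 512, 64, 1

q_nope = torch.randn(batch_size * qo_len, num_heads, head_dim_ckv, dtype=torch.half, device="cuda")
q_pe = torch.randn(batch_size * qo_len, num_heads, head_dim_kpe, dtype=torch.half, device="cuda")
ckv = torch.randn(batch_size * kv_len, 1, head_dim_ckv, dtype=torch.half, device="cuda")
kpe = torch.randn(batch_size * kv_len, 1, head_dim_kpe, dtype=torch.half, device="cuda")

workspace = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
wrapper = flashinfer.mla.BatchMLAPageAttentionWrapper(workspace, backend="fa2")

# qo_len query tokens per request; page_size=1 paging as in the decode benchmark.
q_indptr = (torch.arange(batch_size + 1, device="cuda") * qo_len).int()
kv_indptr = (torch.arange(batch_size + 1, device="cuda") * kv_len).int()
kv_indices = torch.arange(batch_size * kv_len, device="cuda").int()
kv_lens = torch.full((batch_size,), kv_len, dtype=torch.int32, device="cuda")

wrapper.plan(
    q_indptr, kv_indptr, kv_indices, kv_lens,
    num_heads, head_dim_ckv, head_dim_kpe, page_size,
    True,  # causal: queries are masked against later KV positions
    1.0 / (head_dim_ckv + head_dim_kpe) ** 0.5,
    q_nope.dtype, ckv.dtype,
)
# Assumed: return_lse=True yields (output, log-sum-exp), matching maybe_lse in the binding above.
o, lse = wrapper.run(q_nope, q_pe, ckv, kpe, return_lse=True)
```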

flashinfer/__init__.py

+2-2
```diff
@@ -14,6 +14,7 @@
 limitations under the License.
 """
 
+from ._build_meta import __version__ as __version__
 from .activation import gelu_and_mul as gelu_and_mul
 from .activation import gelu_tanh_and_mul as gelu_tanh_and_mul
 from .activation import silu_and_mul as silu_and_mul
@@ -41,6 +42,7 @@
 from .decode import single_decode_with_kv_cache as single_decode_with_kv_cache
 from .gemm import SegmentGEMMWrapper as SegmentGEMMWrapper
 from .gemm import bmm_fp8 as bmm_fp8
+from .mla import BatchMLAPageAttentionWrapper as BatchMLAPageAttentionWrapper
 from .norm import fused_add_rmsnorm as fused_add_rmsnorm
 from .norm import gemma_fused_add_rmsnorm as gemma_fused_add_rmsnorm
 from .norm import gemma_rmsnorm as gemma_rmsnorm
@@ -87,5 +89,3 @@
 from .sampling import top_p_renorm_probs as top_p_renorm_probs
 from .sampling import top_p_sampling_from_probs as top_p_sampling_from_probs
 from .sparse import BlockSparseAttentionWrapper as BlockSparseAttentionWrapper
-
-from ._build_meta import __version__ as __version__
```
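With this re-export, the new wrapper is reachable from the package root as well as from `flashinfer.mla`:

```python
import flashinfer

# Both names refer to the same class after this change.
assert flashinfer.BatchMLAPageAttentionWrapper is flashinfer.mla.BatchMLAPageAttentionWrapper
```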

flashinfer/jit/__init__.py

+2
```diff
@@ -19,6 +19,7 @@
 from .activation import get_act_and_mul_cu_str as get_act_and_mul_cu_str
 from .attention import gen_batch_decode_mla_module as gen_batch_decode_mla_module
 from .attention import gen_batch_decode_module as gen_batch_decode_module
+from .attention import gen_batch_mla_module as gen_batch_mla_module
 from .attention import gen_batch_prefill_module as gen_batch_prefill_module
 from .attention import (
     gen_customize_batch_decode_module as gen_customize_batch_decode_module,
@@ -36,6 +37,7 @@
 from .attention import gen_single_prefill_module as gen_single_prefill_module
 from .attention import get_batch_decode_mla_uri as get_batch_decode_mla_uri
 from .attention import get_batch_decode_uri as get_batch_decode_uri
+from .attention import get_batch_mla_uri as get_batch_mla_uri
 from .attention import get_batch_prefill_uri as get_batch_prefill_uri
 from .attention import get_single_decode_uri as get_single_decode_uri
 from .attention import get_single_prefill_uri as get_single_prefill_uri
```
