
Commit 88fa03f

doc: improve mla related documentation (#818)
1 parent 5094eb7 commit 88fa03f

7 files changed: +48 -23 lines


docs/api/decode.rst (-5)

@@ -26,8 +26,3 @@ Batch Decoding
     :members:

     .. automethod:: __init__
-
-.. autoclass:: BatchDecodeMlaWithPagedKVCacheWrapper
-    :members:
-
-    .. automethod:: __init__

docs/tutorials/kv_layout.rst (+35 -10)

@@ -24,16 +24,14 @@ by default).
 Ragged Tensor
 -------------

-In batched inference/serving, the input sequence length may vary across different samples.
-When there is no need to change the sequence length (e.g. in prefilling stage), we can use ``RaggedTensor``
-with a single ragged (variable length) dimension to store the key/value tensors in KV-Cache:
+We use Ragged Tensor to store the variable-length Q/K/V tensors in FlashInfer for batch prefill self-attention:

 .. image:: https://raw.githubusercontent.com/flashinfer-ai/web-data/main/tutorials/ragged.png
   :width: 400
   :align: center
   :alt: Data structure of Ragged KV-Cache.

-The keys (or values) of all requests are packed into a single ``data`` tensor without padding,
+In a Ragged Tensor, all requests' Q/K/V are packed into a single ``data`` tensor without padding,
 we use a ``indptr`` array (``num_requests+1`` elements, the first element is always zero)
 to store the information of variable sequence lengths of each request
 (``indptr[i+1]-indptr[i]`` is the sequence length of request ``i``), the ``data`` tensor has
@@ -42,7 +40,7 @@ shape ``(indptr[-1], num_heads, head_dim)`` when the layout is ``NHD``.
 We can use ``data[indptr[i]:indptr[i+1]]`` to slice the keys (or values) of request ``i``.

 .. note::
-  ``indptr`` arrays across the flashinfer library should be of type ``int32``. Arrays of type ``int64`` can cause indexing errors.
+    ``indptr`` arrays across the flashinfer library should be of type ``int32``. Arrays of type ``int64`` can cause indexing errors.

 FlashInfer APIs
 ~~~~~~~~~~~~~~~
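
To make the ragged layout in this hunk concrete, here is a small illustrative sketch (not part of the commit; the sequence lengths and head sizes are made up) of building ``indptr`` and slicing out one request's keys:

import torch

# Three requests with lengths 3, 5, 2; indptr is kept in int32, as the note above requires.
seq_lens = torch.tensor([3, 5, 2], dtype=torch.int32)
indptr = torch.cat([torch.zeros(1, dtype=torch.int32),
                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32)])  # [0, 3, 8, 10]

num_heads, head_dim = 8, 128
data = torch.randn(int(indptr[-1]), num_heads, head_dim)  # packed keys (or values), NHD layout, no padding

i = 1
keys_i = data[indptr[i]:indptr[i + 1]]  # keys of request i, shape (5, num_heads, head_dim)
assert keys_i.shape[0] == int(seq_lens[i])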
@@ -127,21 +125,48 @@ when stored in a single tensor, ``kv_data`` has shape:

 .. code:: python

-    (max_num_pages, 2, page_size, num_heads, head_dim) # NHD layout
-    (max_num_pages, 2, num_heads, page_size, head_dim) # HND layout
+    kv_cache_nhd = torch.empty(max_num_pages, 2, page_size, num_heads, head_dim, dtype=torch.bfloat16)  # NHD layout
+    kv_cache_hnd = torch.empty(max_num_pages, 2, num_heads, page_size, head_dim, dtype=torch.bfloat16)  # HND layout

 when stored in a tuple of tensors, ``kv_data = (k_data, v_data)``, and each one of them has shape:

 .. code:: python

-    (max_num_pages, page_size, num_heads, head_dim) # NHD layout
-    (max_num_pages, num_heads, page_size, head_dim) # HND layout
+    k_cache_nhd = torch.empty(max_num_pages, page_size, num_heads, head_dim, dtype=torch.bfloat16)  # NHD layout
+    k_cache_hnd = torch.empty(max_num_pages, num_heads, page_size, head_dim, dtype=torch.bfloat16)  # HND layout
+    v_cache_nhd = torch.empty(max_num_pages, page_size, num_heads, head_dim, dtype=torch.bfloat16)  # NHD layout
+    v_cache_hnd = torch.empty(max_num_pages, num_heads, page_size, head_dim, dtype=torch.bfloat16)  # HND layout
+

 where ``max_num_pages`` is the maximum number of pages used by all requests, ``page_size`` is the number of tokens
 we fit into each page. ``2`` in single tensor storage means K/V (the first one for keys, the second one for values).

 .. note::
-  ``indptr`` arrays across the flashinfer library should be of type ``int32``. Arrays of type ``int64`` can cause indexing errors. This is also true of the ``kv_page_indices`` and ``kv_last_page_lens`` arrays.
+    ``indptr`` arrays across the flashinfer library should be of type ``int32``. Arrays of type ``int64`` can cause indexing errors. This is also true of the ``kv_page_indices`` and ``kv_last_page_lens`` arrays.
+
+.. _mla-page-layout:
+
+Multi-head Latent Attention Page Layout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Multi-head Latent Attention (MLA) is a new attention mechanism proposed in `DeepSeek v2 <https://arxiv.org/abs/2405.04434>`_ and
+used in later DeepSeek models. MLA unifies the key cache and value cache into a single tensor, so there is no need to store them separately.
+Compared to multi-head attention or grouped query attention, the KV-Cache of MLA does not have the ``num_heads`` dimension,
+so there is no distinction between ``NHD`` and ``HND`` layouts.
+
+MLA separates the RoPE (Rotary Positional Encoding) dimensions from the other head dimensions. We use ``kpe`` (key w/ positional encoding) and ``ckv`` (compressed key/value)
+to name these two components. Users can store them in a single Paged KV-Cache:
+
+.. code:: python
+
+    head_dim_ckv = 512
+    head_dim_kpe = 64
+    mla_paged_kv_cache = torch.empty(max_num_pages, page_size, head_dim_ckv + head_dim_kpe, dtype=torch.bfloat16)
+    ckv = mla_paged_kv_cache[:, :, :head_dim_ckv]  # Slicing here does not copy or move data
+    kpe = mla_paged_kv_cache[:, :, head_dim_ckv:]  # Slicing here does not copy or move data
+
+
+and ``ckv`` and ``kpe`` can then be fed into the MLA attention kernel :class:`flashinfer.mla.BatchMLAPagedAttentionWrapper`.

 FlashInfer APIs
 ~~~~~~~~~~~~~~~
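
To illustrate how the per-request page table indexes into the paged layout above, here is an illustrative sketch only (not part of the commit): ``kv_page_indptr`` and all concrete values below are hypothetical, with the index arrays kept in ``int32`` as the note requires.

import torch

max_num_pages, page_size, num_heads, head_dim = 16, 4, 8, 128
kv_cache_nhd = torch.empty(max_num_pages, 2, page_size, num_heads, head_dim, dtype=torch.bfloat16)

# Request i owns pages kv_page_indices[kv_page_indptr[i]:kv_page_indptr[i+1]];
# only the first kv_last_page_lens[i] tokens of its last page are valid.
kv_page_indptr = torch.tensor([0, 2, 5], dtype=torch.int32)
kv_page_indices = torch.tensor([3, 7, 0, 4, 9], dtype=torch.int32)
kv_last_page_lens = torch.tensor([2, 3], dtype=torch.int32)

i = 1
pages = kv_page_indices[kv_page_indptr[i]:kv_page_indptr[i + 1]].long()
k_pages = kv_cache_nhd[pages, 0]                               # (3, page_size, num_heads, head_dim)
keys_i = torch.cat([k_pages[:-1].flatten(0, 1),                # full pages
                    k_pages[-1, :int(kv_last_page_lens[i])]])  # valid prefix of the last page
# keys_i: (2 * page_size + 3, num_heads, head_dim) tokens of request i's keys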

flashinfer/__init__.py (-3)

@@ -30,9 +30,6 @@
 from .cascade import merge_state as merge_state
 from .cascade import merge_state_in_place as merge_state_in_place
 from .cascade import merge_states as merge_states
-from .decode import (
-    BatchDecodeMlaWithPagedKVCacheWrapper as BatchDecodeMlaWithPagedKVCacheWrapper,
-)
 from .decode import (
     BatchDecodeWithPagedKVCacheWrapper as BatchDecodeWithPagedKVCacheWrapper,
 )

flashinfer/decode.py (+5)

@@ -1243,6 +1243,11 @@ def __init__(


 class BatchDecodeMlaWithPagedKVCacheWrapper:
+    r"""Warning: this class is deprecated and will be removed in a future release.
+    Please use :class:`flashinfer.mla.BatchMLAPagedAttentionWrapper` instead, which provides
+    a more efficient and general MLA implementation that supports decode and incremental prefill.
+    """
+
     def __init__(
         self,
         float_workspace_buffer: torch.Tensor,
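
For code still using the deprecated wrapper, a minimal migration sketch (construction mirrors tests/test_deepseek_mla.py in this commit; the ``plan``/``run`` arguments are only hinted at here, see the docstring example in flashinfer/mla.py):

import torch
import flashinfer

workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(workspace_buffer)
# Then: mla_wrapper.plan(q_indptr, kv_indptr, ...) followed by mla_wrapper.run(...),
# with ckv/kpe stored in the paged layout described in docs/tutorials/kv_layout.rst.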

flashinfer/mla.py (+5 -2)

@@ -42,7 +42,10 @@ class BatchMLAPagedAttentionWrapper:
     absorbed with :math:`W_{O}`.
     For MLA attention without Matrix Absorption (``head_dim_qk=192`` and ``head_dim_vo=128``, which is
     used in prefilling self-attention stage), please use
-    :class:`flashinfer.prefill.BatchPrefillWithRaggedAttentionWrapper`.
+    :class:`flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper`.
+
+    More information about the Paged KV-Cache layout in MLA can be found in our tutorial
+    :ref:`MLA Page Layout <mla-page-layout>`.

     For more details about the MLA computation, Matrix Absorption and FlashInfer's MLA implementation,
     please refer to our `blog post <http://flashinfer.ai/2025/02/10/flashinfer-deepseek-mla.html>`_.
@@ -76,7 +79,7 @@ class BatchMLAPagedAttentionWrapper:
     >>> kpe = torch.zeros(
     ...     batch_size * 999, 1, head_dim_kpe, dtype=torch.bfloat16, device="cuda"
     ... )
-    >>> sm_scale = 1.0 / ((head_dim_ckv + head_dim_kpe) ** 0.5)
+    >>> sm_scale = 1.0 / ((128 + 64) ** 0.5)  # use head dimension before matrix absorption
     >>> mla_wrapper.plan(
     ...     q_indptr,
     ...     kv_indptr,
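
The ``sm_scale`` fix above (and the matching one in tests/test_deepseek_mla.py) boils down to the following arithmetic, shown here as a worked check that is not part of the commit:

head_dim_ckv, head_dim_kpe = 512, 64        # compressed KV and RoPE dims stored in the MLA KV-Cache
head_dim_qk = 128 + 64                      # per-head QK dim before matrix absorption (nope + rope)

old_sm_scale = 1.0 / ((head_dim_ckv + head_dim_kpe) ** 0.5)  # 1/sqrt(576) ~= 0.0417 (too small)
sm_scale = 1.0 / (head_dim_qk ** 0.5)                        # 1/sqrt(192) ~= 0.0722 (correct scale)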

flashinfer/prefill.py (+2 -2)

@@ -897,7 +897,7 @@ class BatchPrefillWithPagedKVCacheWrapper:
     r"""Wrapper class for prefill/append attention with paged kv-cache for batch of
     requests.

-    Check :ref:`our tutorial<page-layout>` for page table layout.
+    Check :ref:`our tutorial <page-layout>` for page table layout.

     Example
     -------
@@ -1711,7 +1711,7 @@ class BatchPrefillWithRaggedKVCacheWrapper:
     r"""Wrapper class for prefill/append attention with ragged (tensor) kv-cache for
     batch of requests.

-    Check :ref:`our tutorial<ragged-layout>` for ragged kv-cache layout.
+    Check :ref:`our tutorial <ragged-layout>` for ragged kv-cache layout.

     Example
     -------

tests/test_deepseek_mla.py (+1 -1)

@@ -188,7 +188,7 @@ def test_batch_mla_page_attention(
         dtype=torch.half,
         device="cuda",
     )
-    sm_scale = 1.0 / ((head_dim_ckv + head_dim_kpe) ** 0.5)
+    sm_scale = 1.0 / ((128 + 64) ** 0.5)  # use head dimension before matrix absorption
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
    wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
         workspace_buffer, backend=backend
