Commit 3fff008

Fix PagedPrefill Python API and some typos (#441)

Fix two small bugs:

1. `"NHD"` and `"HND"` were used confusingly: several docstrings and comments described the `HND` layout but labeled it `NHD`.
2. `BatchPrefillWithPagedKVCacheWrapper` uses `self._custom_mask_buf` to judge whether a custom mask was provided, but `begin_forward` left it uninitialized (it wrote to `self._custom_mask` instead), so the custom mask was silently ignored.

Here is the code snippet to reproduce the 2nd bug:

```python
import torch
import flashinfer

# Try to reproduce the bug under a speculative-decoding case.
device = torch.device("cuda:0")
num_heads = 32
num_qo_heads = num_heads
num_kv_heads = 32
head_dim = 128
page_size = 4
max_num_pages = 4
batch_size = 1
seq_len = 4

query = torch.randn(seq_len, num_heads, head_dim, dtype=torch.bfloat16, device=device)
packed_kv_cache = torch.randn(
    max_num_pages, 2, page_size, num_kv_heads, head_dim,
    dtype=torch.bfloat16, device=device,
)
ragged_key_cache = packed_kv_cache[:, 0].reshape(-1, num_kv_heads, head_dim)
ragged_value_cache = packed_kv_cache[:, 1].reshape(-1, num_kv_heads, head_dim)

# [4, 15]-shaped custom attention mask.
attn_mask = torch.tensor(
    [
        [True, True, True, True, True, True, True, True, False, False, False, True, False, False, False],
        [True, True, True, True, True, True, True, False, True, False, False, False, True, False, False],
        [True, True, True, True, True, True, True, True, False, False, False, False, False, True, False],
        [True, True, True, True, True, True, True, False, False, True, False, False, False, False, True],
    ],
    device=device,
)
mask = attn_mask.reshape(-1)
# packed_mask = flashinfer.quantization.packbits(mask)

workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
paged_prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
    workspace_buffer, "NHD"
)
kv_page_indices = torch.arange(max_num_pages).int().to("cuda:0")
kv_page_indptr = torch.tensor([0, 4], dtype=torch.int32, device="cuda:0")
# 1 <= kv_last_page_len <= page_size
kv_last_page_len = torch.tensor([3], dtype=torch.int32, device="cuda:0")
qo_indptr = torch.tensor([0, 4], dtype=torch.int32, device="cuda:0")

# Create auxiliary data structures for batch prefill attention.
paged_prefill_wrapper.begin_forward(
    qo_indptr,
    kv_page_indptr,
    kv_page_indices,
    kv_last_page_len,
    num_qo_heads,
    num_kv_heads,
    head_dim,
    page_size,
    mask,
    q_data_type=torch.bfloat16,
)
# assert torch.equal(paged_prefill_wrapper._custom_mask, packed_mask)
# assert paged_prefill_wrapper._custom_mask_buf is not None
q = query
o = paged_prefill_wrapper.forward(q, packed_kv_cache, causal=False)
paged_prefill_wrapper.end_forward()

# Ragged attention over the same K/V for comparison.
workspace_buffer_ragged = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
ragged_prefill_wrapper = flashinfer.BatchPrefillWithRaggedKVCacheWrapper(
    workspace_buffer_ragged, "NHD"
)
kv_indptr = torch.tensor([0, 15], dtype=torch.int32, device="cuda:0")
ragged_prefill_wrapper.begin_forward(
    qo_indptr,
    kv_indptr,
    num_qo_heads,
    num_kv_heads,
    head_dim,
    mask,
    q_data_type="bfloat16",
)
ragged_o = ragged_prefill_wrapper.forward(q, ragged_key_cache, ragged_value_cache)
ragged_prefill_wrapper.end_forward()

print("query shape: ", q.shape)
print("paged vs ragged allclose: ", torch.allclose(o, ragged_o, rtol=1e-3, atol=1e-3))
print("paged vs ragged equal: ", torch.equal(o, ragged_o))
assert torch.allclose(o, ragged_o, rtol=1e-3, atol=1e-3)
assert torch.equal(o, ragged_o)
```
1 parent 6ac28f4 commit 3fff008
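
For reference, the two KV-cache layouts that the corrected docstrings and comments distinguish differ only in the order of the `page_size` and `num_kv_heads` dimensions. A minimal sketch of the relationship (illustrative sizes, not from the commit):

```python
import torch

max_num_pages, page_size, num_kv_heads, head_dim = 4, 4, 32, 128

# NHD: [max_num_pages, 2, page_size, num_kv_heads, head_dim]
nhd_cache = torch.randn(max_num_pages, 2, page_size, num_kv_heads, head_dim)

# HND: [max_num_pages, 2, num_kv_heads, page_size, head_dim], i.e. the same
# data with the page_size and num_kv_heads dimensions swapped.
hnd_cache = nhd_cache.permute(0, 1, 3, 2, 4).contiguous()

assert hnd_cache.shape == (max_num_pages, 2, num_kv_heads, page_size, head_dim)
```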

5 files changed: +11 −11 lines

Diff for: python/csrc/batch_prefill.cu (+2 −2)

```diff
@@ -289,11 +289,11 @@ std::vector<torch::Tensor> BatchPrefillWithPagedKVCachePyTorchWrapper::ForwardCu
 
   if (paged_kv_defined) {
     // [max_num_pages, 2, num_kv_heads, page_size, head_dim] for HND
-    // [max_num_pages, 2, page_size, num_kv_heads, head_dim] for HND
+    // [max_num_pages, 2, page_size, num_kv_heads, head_dim] for NHD
     CHECK_DIM(5, paged_kv_cache.value());
   } else {
     // [max_num_pages, num_kv_heads, page_size, head_dim] for HND
-    // [max_num_pages, page_size, num_kv_heads, head_dim] for HND
+    // [max_num_pages, page_size, num_kv_heads, head_dim] for NHD
     CHECK_DIM(4, paged_k_cache.value());
     CHECK_DIM(4, paged_v_cache.value());
   }
```
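
The corrected comments sit above `CHECK_DIM` guards: the wrapper accepts either a single packed 5-D KV cache or separate 4-D key and value caches. A minimal shape sketch of the two conventions (NHD layout, illustrative sizes; not part of the commit):

```python
import torch

max_num_pages, page_size, num_kv_heads, head_dim = 4, 4, 32, 128

# Packed convention: one 5-D tensor; dim 1 selects key (0) or value (1).
paged_kv_cache = torch.randn(max_num_pages, 2, page_size, num_kv_heads, head_dim)

# Separate convention: two 4-D tensors, matching the CHECK_DIM(4, ...) branch.
paged_k_cache = paged_kv_cache[:, 0].clone()
paged_v_cache = paged_kv_cache[:, 1].clone()

assert paged_kv_cache.dim() == 5
assert paged_k_cache.dim() == 4 and paged_v_cache.dim() == 4
```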

Diff for: python/flashinfer/cascade.py (+2 −2)

```diff
@@ -374,7 +374,7 @@ def forward(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         allow_fp16_qk_reduction : bool
@@ -631,7 +631,7 @@ def forward(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool
```

Diff for: python/flashinfer/decode.py (+2 −2)

```diff
@@ -577,7 +577,7 @@ def forward(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         pos_encoding_mode : str
@@ -696,7 +696,7 @@ def forward_return_lse(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         pos_encoding_mode : str
```

Diff for: python/flashinfer/page.py (+1 −1)

```diff
@@ -65,7 +65,7 @@ def append_paged_kv_cache(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         kv_indices : torch.Tensor
```

Diff for: python/flashinfer/prefill.py (+4 −4)

```diff
@@ -778,8 +778,8 @@ def begin_forward(
         self._paged_kv_indices_buf = paged_kv_indices
         self._paged_kv_last_page_len_buf = paged_kv_last_page_len
         if packed_custom_mask is not None:
-            self._custom_mask = packed_custom_mask
-            self._qk_indptr = qk_indptr
+            self._custom_mask_buf = packed_custom_mask
+            self._qk_indptr_buf = qk_indptr
         empty_q_data = torch.empty(
             0,
             dtype=(
@@ -843,7 +843,7 @@ def forward(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool
@@ -969,7 +969,7 @@ def forward_return_lse(
             ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
             :attr:`kv_layout` is ``NHD``, and
             ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-            :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+            :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
             ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool
```
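
Why the old attribute names broke custom masks: `forward` decides whether to run the masked kernel by checking `self._custom_mask_buf`, which the pre-fix `begin_forward` never set. A toy model of that control flow (plain Python, not the real wrapper):

```python
class WrapperSketch:
    """Toy stand-in for BatchPrefillWithPagedKVCacheWrapper's mask handling."""

    def __init__(self):
        self._custom_mask_buf = None  # the attribute forward() inspects

    def begin_forward_old(self, packed_custom_mask):
        self._custom_mask = packed_custom_mask  # pre-fix: wrong attribute

    def begin_forward_fixed(self, packed_custom_mask):
        self._custom_mask_buf = packed_custom_mask  # post-fix

    def forward_uses_custom_mask(self):
        return self._custom_mask_buf is not None


w = WrapperSketch()
w.begin_forward_old(b"\xff")
assert not w.forward_uses_custom_mask()  # mask silently ignored (the bug)

w = WrapperSketch()
w.begin_forward_fixed(b"\xff")
assert w.forward_uses_custom_mask()      # masked path taken after the fix
```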
