
Commit 9907bc1

bugfix: Fix cudagraph mode of BatchPrefillWithRaggedKVCacheWrapper (#412)
The computation of `fixed_batch_size` is not correct.
Parent commit: 58d3593

1 file changed: +2 −2 lines

python/flashinfer/prefill.py (+2 −2)
```diff
@@ -1215,8 +1215,8 @@ def __init__(
                 raise ValueError(
                     "kv_indptr_buf should be a torch.Tensor in cuda graph mode"
                 )
-            self._fixed_batch_size = len(qo_indptr_buf)
-            if len(kv_indptr_buf) != self._fixed_batch_size:
+            self._fixed_batch_size = len(qo_indptr_buf) - 1
+            if len(kv_indptr_buf) != self._fixed_batch_size + 1:
                 raise ValueError(
                     "The length of kv_indptr_buf ({}) should be the same as qo_indptr_buf ({}).".format(
                         len(kv_indptr_buf), self._fixed_batch_size
```
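
Context for the fix: `qo_indptr_buf` and `kv_indptr_buf` are CSR-style offset arrays, so a batch of B requests is described by B + 1 cumulative boundaries. The batch size is therefore the buffer length minus one, which is what the corrected lines compute. A minimal sketch with hypothetical buffer values (not taken from this commit) illustrating the invariant:

```python
import torch

# Hypothetical CSR-style indptr buffers for a batch of 3 requests:
# each buffer stores batch_size + 1 cumulative offsets.
qo_indptr_buf = torch.tensor([0, 3, 7, 12], dtype=torch.int32)
kv_indptr_buf = torch.tensor([0, 5, 9, 20], dtype=torch.int32)

# The old computation, len(qo_indptr_buf), would report a batch size of 4.
# The corrected computation matches the diff above:
fixed_batch_size = len(qo_indptr_buf) - 1          # 3
assert len(kv_indptr_buf) == fixed_batch_size + 1  # both buffers hold B + 1 entries
```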
