
Commit b10eb95

Revise as per comment
1 parent: e97f72a

2 files changed (+364, −266 lines)

vllm/attention/backends/flashinfer.py

Lines changed: 3 additions & 0 deletions
@@ -271,6 +271,9 @@ def begin_forward(self, model_input):
         state = self
         use_cuda_graph = model_input.attn_metadata.use_cuda_graph
         is_decode = model_input.attn_metadata.num_prefills == 0
+        # In case of multistep chunked-prefill, there might be prefill requests
+        # scheduled while CUDA graph mode is enabled. We don't run graph in that
+        # case.
         if use_cuda_graph and is_decode:
             batch_size = model_input.input_tokens.shape[0]
             state = self.runner.graph_runners[
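
For context, a self-contained sketch of the guard this hunk documents. The AttnMetadata and ModelInput dataclasses and the should_use_graph helper below are illustrative assumptions, not vLLM's actual classes; only the field names (use_cuda_graph, num_prefills) and the condition itself come from the diff, and the stated reason for the eager fallback (captured graphs being recorded for decode-only batches) is an inference from the added comment.

from dataclasses import dataclass

# Hypothetical, simplified stand-ins for the metadata touched in the diff;
# only the field names and the guard condition mirror the real code.
@dataclass
class AttnMetadata:
    use_cuda_graph: bool
    num_prefills: int

@dataclass
class ModelInput:
    attn_metadata: AttnMetadata

def should_use_graph(model_input: ModelInput) -> bool:
    """True only for pure-decode batches while CUDA graph mode is enabled.

    With multistep chunked-prefill, prefill requests can be scheduled even
    though CUDA graph mode is on; those batches fall back to eager
    execution, presumably because the captured graphs are recorded for
    decode-only batches.
    """
    md = model_input.attn_metadata
    return md.use_cuda_graph and md.num_prefills == 0

# Decode-only batch under graph mode: dispatch to the captured graph runner.
assert should_use_graph(ModelInput(AttnMetadata(use_cuda_graph=True, num_prefills=0)))
# Multistep chunked-prefill step under graph mode: run eagerly instead.
assert not should_use_graph(ModelInput(AttnMetadata(use_cuda_graph=True, num_prefills=2)))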
