Skip to content

Commit fe883bd

single wg fixes
Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent f547f06 · commit fe883bd

2 files changed (+6, −4)

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG e93779c59ba4905e56e5c39dc2c1904ada71fa21
+        GIT_TAG e46f09441a8ee3231e54551cc8994ca768178e69
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

vllm/v1/attention/backends/flash_attn.py

Lines changed: 5 additions & 3 deletions
@@ -286,7 +286,9 @@ def __init__(self, runner: "GPUModelRunner"):

         self.runner = runner
         self.aot_schedule = (get_flash_attn_version() == 3)
-        self.num_heads = model_config.get_num_attention_heads(
+        self.num_heads_q = model_config.get_num_attention_heads(
+            runner.parallel_config)
+        self.num_heads_kv = model_config.get_num_kv_heads(
             runner.parallel_config)
         self.headdim = model_config.get_head_size()
         self.page_size = self.runner.block_size
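
This hunk splits the builder's single head count into separate query and KV head counts. For grouped-query attention (GQA) and multi-query attention (MQA) models, get_num_attention_heads and get_num_kv_heads return different values, so reusing the query-head count on the KV side mis-describes the model. Below is a minimal sketch of the distinction, using hypothetical stand-in config classes (not vLLM's real ModelConfig/ParallelConfig):

from dataclasses import dataclass


@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 1


@dataclass
class ModelConfig:
    num_attention_heads: int   # query heads
    num_key_value_heads: int   # KV heads; fewer than query heads under GQA/MQA

    def get_num_attention_heads(self, parallel: ParallelConfig) -> int:
        # Query heads are sharded evenly across tensor-parallel ranks.
        return self.num_attention_heads // parallel.tensor_parallel_size

    def get_num_kv_heads(self, parallel: ParallelConfig) -> int:
        # KV heads are sharded too, but never drop below one per rank.
        return max(1, self.num_key_value_heads // parallel.tensor_parallel_size)


# A Llama-3-8B-like shape (assumed for illustration): 32 query heads
# sharing 8 KV heads, with tensor parallelism of 2.
cfg = ModelConfig(num_attention_heads=32, num_key_value_heads=8)
tp = ParallelConfig(tensor_parallel_size=2)
print(cfg.get_num_attention_heads(tp))  # 16 query heads per rank
print(cfg.get_num_kv_heads(tp))         # 4 KV heads per rank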
@@ -340,8 +342,8 @@ def schedule(cu_query_lens, max_query_len, seqlens, max_seq_len,
             max_seqlen_q=max_query_len,
             max_seqlen_k=max_seq_len,
             cache_seqlens=seqlens,
-            num_heads_q=self.num_heads,
-            num_heads_kv=self.num_heads,
+            num_heads_q=self.num_heads_q,
+            num_heads_kv=self.num_heads_kv,
             headdim=self.headdim,
             page_size=self.page_size,
             cu_seqlens_q=cu_query_lens,
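
With the attributes split, the FlashAttention 3 ahead-of-time scheduler is now told the true KV head count instead of receiving self.num_heads for both arguments. A hedged illustration of what the old call got wrong; plan_splits here is a hypothetical stand-in, not flash-attn's actual scheduler, and the 32-query/8-KV shape is assumed for the example:

def plan_splits(num_heads_q: int, num_heads_kv: int) -> int:
    # A GQA-aware scheduler sizes its work by KV heads: each KV head is
    # shared by num_heads_q // num_heads_kv query heads.
    assert num_heads_q % num_heads_kv == 0, "query heads must tile KV heads"
    return num_heads_q // num_heads_kv


# Before the fix: passing the query-head count as num_heads_kv describes
# a 32q/8kv GQA model as if it were plain multi-head attention.
print(plan_splits(32, 32))  # 1 (group size as seen by the buggy call)
# After the fix: the true KV head count yields the real group size.
print(plan_splits(32, 8))   # 4 (each KV head serves 4 query heads)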
