
Commit 2700c11

sarckk authored and Mu Huai committed
Add warning for Attention backends that do not support irope yet (vllm-project#16212)
Signed-off-by: Mu Huai <[email protected]>
1 parent 87b5521 · commit 2700c11

File tree

8 files changed: +52, -0 lines changed

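Every file in this commit applies the same pattern: the backend's __init__ gains a use_irope: bool = False keyword and, when it is set, emits a one-time warning that irope is not supported yet and the backend falls back to global attention for long context. The snippet below is a minimal standalone sketch of that pattern only; it uses the standard logging module and a hypothetical warn_once helper in place of vLLM's init_logger / logger.warning_once, and DummyAttentionImpl is an illustrative stand-in, not a vLLM class.

# Standalone sketch (not vLLM code): the shared pattern these diffs add
# to each attention backend, approximated with the standard library.
import logging
from functools import cache

logger = logging.getLogger(__name__)


@cache
def warn_once(msg: str) -> None:
    # Rough stand-in for vLLM's logger.warning_once: functools.cache
    # deduplicates repeated calls with the same message.
    logger.warning(msg)


class DummyAttentionImpl:
    # Hypothetical backend implementation; real backends take many more args.
    def __init__(self, num_heads: int, head_size: int,
                 use_irope: bool = False) -> None:
        if use_irope:
            warn_once("Using irope in this backend is not supported yet, it "
                      "will fall back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    # The warning is logged only once even if several layers request irope.
    DummyAttentionImpl(num_heads=8, head_size=64, use_irope=True)
    DummyAttentionImpl(num_heads=8, head_size=64, use_irope=True)

In the diffs themselves, backends that did not already have a module-level logger (flashinfer, ipex_attn, and both pallas files) also add the init_logger import and a logger = init_logger(__name__) line, which is why those files show 8 added lines instead of 5.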

vllm/attention/backends/flashinfer.py

Lines changed: 8 additions & 0 deletions
@@ -38,9 +38,12 @@
 from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)
 
+logger = init_logger(__name__)
+
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)
@@ -907,7 +910,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/attention/backends/hpu_attn.py

Lines changed: 5 additions & 0 deletions
@@ -108,8 +108,13 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if use_irope:
+            logger.warning_once(
+                "Using irope in HPU is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.kv_cache_dtype = kv_cache_dtype
         self.num_heads = num_heads
         self.head_size = head_size

vllm/attention/backends/ipex_attn.py

Lines changed: 8 additions & 0 deletions
@@ -14,6 +14,9 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 _PARTITION_SIZE = 512
 
@@ -119,7 +122,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Ipex is not supported yet, it will fall"
+                " back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "IPEX backend does not support block-sparse attention.")

vllm/attention/backends/pallas.py

Lines changed: 8 additions & 0 deletions
@@ -11,6 +11,9 @@
                                               AttentionMetadata, AttentionType,
                                               is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class PallasAttentionBackend(AttentionBackend):
@@ -105,7 +108,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 5 additions & 0 deletions
@@ -462,7 +462,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in ROCm Flash Attention is not supported yet, it "
+                "will fall back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")

vllm/attention/backends/torch_sdpa.py

Lines changed: 5 additions & 0 deletions
@@ -404,13 +404,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("Torch SPDA does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Torch SPDA is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/attention/backends/xformers.py

Lines changed: 5 additions & 0 deletions
@@ -389,13 +389,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("XFormers does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in XFormers is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/v1/attention/backends/pallas.py

Lines changed: 8 additions & 0 deletions
@@ -10,6 +10,9 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class PallasAttentionBackend(AttentionBackend):
@@ -80,7 +83,12 @@ def __init__(
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError("Paged attention Pallas kernel does "
                              "not support block-sparse attention.")
