8 files changed: +52 −0 lines changed
@@ -38,9 +38,12 @@
 from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)

+logger = init_logger(__name__)
+
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)

@@ -907,7 +910,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
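
Every backend touched by this change follows the same pattern: the implementation's __init__ gains a use_irope: bool = False keyword and, when the flag is set, logs a one-time warning that the backend will silently fall back to global attention for long context. Below is a minimal, self-contained sketch of that pattern; the UnsupportedIropeAttentionImpl class, its parameter list, and the use of the standard-library logging module are illustrative assumptions, not vLLM's actual classes or logger.

import logging
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)


class UnsupportedIropeAttentionImpl:
    """Illustrative attention impl that accepts use_irope but cannot honor it."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            # Log that irope is not supported here and that the layer will
            # run as plain global attention instead.
            logger.warning(
                "Using irope in this backend is not supported yet, it will "
                "fall back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)


# Constructing with use_irope=True logs the warning; the object is still
# usable, it just behaves as a global-attention layer.
impl = UnsupportedIropeAttentionImpl(num_heads=8, head_size=64, scale=0.125,
                                     use_irope=True)
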
@@ -108,8 +108,13 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if use_irope:
+            logger.warning_once(
+                "Using irope in HPU is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.kv_cache_dtype = kv_cache_dtype
         self.num_heads = num_heads
         self.head_size = head_size
@@ -14,6 +14,9 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)

 _PARTITION_SIZE = 512

@@ -119,7 +122,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Ipex is not supported yet, it will fall"
+                " back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "IPEX backend does not support block-sparse attention.")
@@ -11,6 +11,9 @@
                                               AttentionMetadata, AttentionType,
                                               is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class PallasAttentionBackend(AttentionBackend):

@@ -105,7 +108,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -462,7 +462,12 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in ROCm Flash Attention is not supported yet, it "
+                "will fall back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
@@ -404,13 +404,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("Torch SPDA does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Torch SPDA is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -389,13 +389,18 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
             logger.warning_once("XFormers does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in XFormers is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -10,6 +10,9 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class PallasAttentionBackend(AttentionBackend):

@@ -80,7 +83,12 @@ def __init__(
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError("Paged attention Pallas kernel does "
                              "not support block-sparse attention.")