vllm/v1/attention/backends/mla (1 file changed, +7 -5 lines)
@@ -222,8 +222,7 @@
     Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import (
-    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.utils import cdiv, round_down
 
 try:
@@ -626,9 +625,12 @@ def __init__(
         self.qk_head_dim = qk_head_dim
         self.v_head_dim = v_head_dim
 
-        self.rotary_emb = rotary_emb
-        self.use_yarn_rope = isinstance(rotary_emb,
-                                        DeepseekScalingRotaryEmbedding)
+        # Hack for V1 for now to avoid torch library overhead (since we are
+        # already inside an attention custom op), pull out the forward
+        # method from the rotary embedding and call it directly
+        # TODO(lucas): we should probably find a cleaner way to do this
+        self.rotary_emb = rotary_emb._forward_method
+
         self.q_proj = q_proj
         self.kv_b_proj = kv_b_proj
         self.o_proj = o_proj
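A minimal sketch of the before/after call pattern this change enables. It assumes a RotaryEmbedding-like module whose resolved forward implementation is exposed as `_forward_method` (which is what the diff relies on); the class name, method names, and shapes below are illustrative stand-ins, not taken from vLLM:

```python
import torch
from torch import nn


class RotaryEmbeddingLike(nn.Module):
    """Illustrative stand-in for a CustomOp-style rotary embedding."""

    def __init__(self):
        super().__init__()
        # Assumption: the base class resolves a platform-specific forward
        # (e.g. a native or CUDA path) into `_forward_method` at init time.
        self._forward_method = self.forward_native

    def forward_native(self, positions, q, k):
        # Real RoPE math omitted; this sketch only shows the call path.
        return q, k

    def forward(self, positions, q, k):
        # Going through nn.Module.__call__ / the custom-op wrapper adds
        # per-call dispatch overhead on top of the actual RoPE kernel.
        return self._forward_method(positions, q, k)


rope = RotaryEmbeddingLike()
positions = torch.arange(4)
q = torch.randn(4, 8)
k = torch.randn(4, 8)

# Before this change: call the module, paying the wrapper overhead each time.
q1, k1 = rope(positions, q, k)

# After this change: bind the resolved method once and call it directly,
# mirroring `self.rotary_emb = rotary_emb._forward_method` in the diff.
apply_rope = rope._forward_method
q2, k2 = apply_rope(positions, q, k)
```

The trade-off is that binding the bare method skips whatever the module call wrapper would otherwise do, which is presumably why the diff's own TODO asks for a cleaner approach.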