
Commit cb5b704

[attn] fix device of tensors

Signed-off-by: MengqingCao <[email protected]>

1 parent: c59375c

2 files changed, 8 insertions(+), 10 deletions(-)

examples/offline_distributed_inference_npu.py (1 addition, 2 deletions)

@@ -29,11 +29,10 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
-# TODO (cmq): ray is not supported currently, need some fixes
 llm = LLM(
     model="facebook/opt-125m",
     tensor_parallel_size=2,
-    distributed_executor_backend="mp",
+    distributed_executor_backend="ray",
     trust_remote_code=True,
 )
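The example's only functional change is switching the distributed executor backend from "mp" to "ray" and dropping the TODO that said Ray was unsupported. For context, here is a minimal sketch of how such a script is typically driven end to end; the prompt list and the generate loop are assumptions about the rest of the file, not part of this hunk.

# Illustrative driver for the example above; prompts and the print loop
# are assumptions, only the LLM(...) arguments come from this diff.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    # Requires a Ray runtime; with tensor_parallel_size=2 the two model
    # shards run in Ray worker processes instead of multiprocessing ("mp") workers.
    distributed_executor_backend="ray",
    trust_remote_code=True,
)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")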
vllm_ascend/attention.py (7 additions, 8 deletions)

@@ -458,8 +458,7 @@ def __init__(
         self.sliding_window = sliding_window
         if alibi_slopes is not None:
             alibi_slopes = torch.tensor(alibi_slopes,
-                                        dtype=torch.float32,
-                                        device="npu")
+                                        dtype=torch.float32)
         self.alibi_slopes = alibi_slopes
         self.attn_type = attn_type

@@ -520,13 +519,13 @@ def forward(
                 attn_metadata.sparse_mode = 2
                 attention_mask = gen_input_mask(
                     attn_metadata.max_prefill_seq_len, self.sliding_window,
-                    num_tokens)
+                    num_tokens, query.device)
                 attn_metadata.attn_mask = attention_mask

             if (self.alibi_slopes is not None
                     and attn_metadata.pse_shift is None):
                 attn_metadata.pse_shift = _make_alibi_bias(
-                    self.alibi_slopes,
+                    self.alibi_slopes.to(query.device),
                     self.num_kv_heads,
                     dtype=query.dtype,
                     seq_len=attn_metadata.max_prefill_seq_len,
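Taken together, the two hunks above stop pinning the alibi-slope tensor to a hard-coded "npu" device: it is now created on the default device in __init__ and moved to the query's device only when the bias is materialized in forward. A minimal sketch of that pattern follows; the class and method names are illustrative, not the actual vllm_ascend code.

import torch

class AlibiBiasHolder:
    """Illustrative holder showing deferred device placement."""

    def __init__(self, alibi_slopes):
        # No device argument: the slopes stay on the default (CPU) device
        # until they are needed next to the activations.
        self.alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)

    def slopes_for(self, query: torch.Tensor) -> torch.Tensor:
        # .to() returns the tensor unchanged when it already lives on
        # query.device, so no copy happens once placement matches.
        return self.alibi_slopes.to(query.device)

# Usage: works the same whether query lives on CPU or an NPU.
holder = AlibiBiasHolder([0.5, 0.25])
q = torch.randn(2, 4)
print(holder.slopes_for(q).device)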
@@ -571,7 +570,7 @@ def forward(
             query = query.view(query.shape[0], -1,
                                self.num_heads * self.head_size)
             output = torch.zeros(query.shape,
-                                 device="npu",
+                                 device=query.device,
                                  dtype=query.dtype)
             # TODO (Mengqing Cao): torch_npu.npu_incre_flash_attention
             # support only when `S == 1`, OPTIMIZE ME when prefix caching
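The same idea applies to the decode-path output buffer: allocating it with device=query.device (and the query's dtype) keeps it wherever the inputs already live instead of assuming an NPU. A tiny illustration, with a hypothetical helper name, runnable on CPU:

import torch

def alloc_output_like(query: torch.Tensor) -> torch.Tensor:
    # Match both device and dtype of the incoming query, so the same code
    # works unmodified for CPU, CUDA, or NPU tensors.
    return torch.zeros(query.shape, device=query.device, dtype=query.dtype)

q = torch.randn(3, 2, 64, dtype=torch.float16)
out = alloc_output_like(q)
assert out.device == q.device and out.dtype == q.dtype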
@@ -621,7 +620,7 @@ def forward(
         return output


-def gen_input_mask(seq_len, sliding_window, len):
+def gen_input_mask(seq_len, sliding_window, len, device):
     """
     Generating lower triangular matrix
     """
@@ -630,15 +629,15 @@ def gen_input_mask(seq_len, sliding_window, len):
         global SHARE_MASK_TRIL_PREFIX_CACHE
         if SHARE_MASK_TRIL_PREFIX_CACHE is None:
             SHARE_MASK_TRIL_PREFIX_CACHE = torch.triu(
-                torch.ones(1, 1, 2048, 2048, dtype=bool, device="npu"),
+                torch.ones(1, 1, 2048, 2048, dtype=bool, device=device),
                 diagonal=1,
             )
             attention_mask = SHARE_MASK_TRIL_PREFIX_CACHE
         else:
             global SHARE_MASK_TRIL
             if SHARE_MASK_TRIL is None or SHARE_MASK_TRIL.shape[0] < seq_len:
                 SHARE_MASK_TRIL = ~torch.tril(
-                    torch.ones(seq_len, seq_len, dtype=bool, device="npu"))
+                    torch.ones(seq_len, seq_len, dtype=bool, device=device))

             attention_mask = SHARE_MASK_TRIL
             if sliding_window is not None:
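With this change, gen_input_mask receives the target device from its caller (query.device in forward) instead of assuming "npu" when it builds its cached triangular masks. A standalone sketch of the non-cached construction, runnable on CPU; the helper name is illustrative:

import torch

def make_causal_mask(seq_len: int, device: torch.device) -> torch.Tensor:
    # True marks positions to mask out: everything above the diagonal,
    # i.e. the complement of the lower triangular matrix, as in the diff.
    return ~torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))

# The device comes from the tensors being attended over, not a constant.
query = torch.randn(4, 64)   # CPU here; would be an NPU tensor in vllm_ascend
mask = make_causal_mask(query.shape[0], query.device)
print(mask.int())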
