Commit 2d19893

add alibi_slopes to paged attention (#5483)

* add alibi_slopes to paged attention
* format
* remove -inf
* remove -inf
* format

1 parent 3e98d9a · commit 2d19893

8 files changed, 144 insertions(+), 41 deletions(-)
csrc/gpu/aten/operators/transformers/attention.cpp  (+34 −8)

@@ -1471,10 +1471,21 @@ void xetla_paged_attention_impl_v1(
   uint32_t num_kv_heads = key_cache.size(1);
   uint32_t max_num_blocks_per_seq = block_tables.size(1);

-  // TODO(zw): alibi_slopes is optional, not used currently.
-  const float* alibi_slopes_ptr = alibi_slopes
-      ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-      : nullptr;
+  if (alibi_slopes.has_value()) {
+    TORCH_CHECK(alibi_slopes->is_xpu(), "alibi_slopes_ must on XPU");
+    TORCH_CHECK(
+        alibi_slopes->is_contiguous(), "alibi_slopes_ must be contiguous");
+    TORCH_CHECK(
+        alibi_slopes->scalar_type() == at::kFloat,
+        "XeTLA VarlenAttention: The datatype of alibi_slopes should be float");
+    int ndim = alibi_slopes->ndimension();
+    TORCH_CHECK(
+        ndim == 1, "XeTLA VarlenAttention: only support 1 dim alibi tensor!");
+    int last_dim = alibi_slopes->size(-1);
+    TORCH_CHECK(
+        last_dim == num_heads,
+        "XeTLA VarlenAttention: The shape of alibi tensor should equal to [num_head]");
+  }

   auto dpcpp_queue = dpcppGetCurrentQueue();
 #if defined(USE_XETLA)
@@ -1490,6 +1501,8 @@ void xetla_paged_attention_impl_v1(
       reinterpret_cast<void*>(query.data_ptr()),
       reinterpret_cast<void*>(key_cache.data_ptr()),
       reinterpret_cast<void*>(value_cache.data_ptr()),
+      alibi_slopes.has_value() ? alibi_slopes.value().data_ptr()
+                               : (void*)nullptr,
       reinterpret_cast<void*>(block_tables.data_ptr()),
       reinterpret_cast<void*>(context_lens.data_ptr()),
       num_queries_per_tokens,
@@ -1560,10 +1573,21 @@ void xetla_paged_attention_impl_v2(
   uint32_t num_kv_heads = key_cache.size(1);
   uint32_t max_num_blocks_per_seq = block_tables.size(1);

-  // TODO(zw): alibi_slopes is optional, not used currently.
-  const float* alibi_slopes_ptr = alibi_slopes
-      ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-      : nullptr;
+  if (alibi_slopes.has_value()) {
+    TORCH_CHECK(alibi_slopes->is_xpu(), "alibi_slopes_ must on XPU");
+    TORCH_CHECK(
+        alibi_slopes->is_contiguous(), "alibi_slopes_ must be contiguous");
+    TORCH_CHECK(
+        alibi_slopes->scalar_type() == at::kFloat,
+        "XeTLA VarlenAttention: The datatype of alibi_slopes should be float");
+    int ndim = alibi_slopes->ndimension();
+    TORCH_CHECK(
+        ndim == 1, "XeTLA VarlenAttention: only support 1 dim alibi tensor!");
+    int last_dim = alibi_slopes->size(-1);
+    TORCH_CHECK(
+        last_dim == num_heads,
+        "XeTLA VarlenAttention: The shape of alibi tensor should equal to [num_head]");
+  }

   auto dpcpp_queue = dpcppGetCurrentQueue();
 #if defined(USE_XETLA)
@@ -1579,6 +1603,8 @@ void xetla_paged_attention_impl_v2(
       reinterpret_cast<void*>(query.data_ptr()),
       reinterpret_cast<void*>(key_cache.data_ptr()),
       reinterpret_cast<void*>(value_cache.data_ptr()),
+      alibi_slopes.has_value() ? alibi_slopes.value().data_ptr()
+                               : (void*)nullptr,
       reinterpret_cast<void*>(block_tables.data_ptr()),
       reinterpret_cast<void*>(context_lens.data_ptr()),
       num_queries_per_tokens,
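
Note: the new checks require alibi_slopes to be a contiguous 1-D float tensor of length num_heads on the XPU. For reference only (not part of this commit), below is a minimal host-side sketch of how such per-head ALiBi slopes are commonly generated, following the geometric-sequence rule from the ALiBi paper for power-of-two head counts; the helper name make_alibi_slopes is an assumption, and the paper's interpolation rule for non-power-of-two counts is omitted.

// Sketch only: build the [num_heads] ALiBi slope table on the host.
// The caller would copy this into a contiguous 1-D float tensor on the XPU
// so it satisfies the TORCH_CHECKs above.
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> make_alibi_slopes(uint32_t num_heads) {
  // Geometric sequence: slope_h = 2^(-8 * (h + 1) / num_heads).
  std::vector<float> slopes(num_heads);
  const float step = -8.0f / static_cast<float>(num_heads);
  for (uint32_t h = 0; h < num_heads; ++h) {
    slopes[h] = std::exp2f(step * static_cast<float>(h + 1));
  }
  return slopes;
}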

csrc/gpu/aten/operators/xetla/kernels/SDP/fmha_utils.h  (−4)

@@ -270,8 +270,6 @@ struct tile_mask_t {
 #pragma unroll
     for (int k = 0; k < block_size_y; k++) {
       src_sub.row(k) += (blk_seq_x * alibi_slopes);
-      xetla_mask<block_size_x> mask = blk_seq_x > blk_start_y + k;
-      src_sub.row(k).xetla_merge(kNegInfinity, mask);
     }
   }
 }
@@ -296,8 +294,6 @@ struct tile_mask_t {
 #pragma unroll
     for (int k = 0; k < tail_size_y; k++) {
       src_sub.row(k) += (blk_seq_x * alibi_slopes);
-      xetla_mask<block_size_x> mask = blk_seq_x > blk_start_y + k;
-      src_sub.row(k).xetla_merge(kNegInfinity, mask);
     }
   }
 }
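
The deleted lines merged −inf into the score tile under a causal-style mask inside the ALiBi branch; per the commit message ("remove -inf"), the branch now applies only the position-scaled bias. A scalar sketch of what the surviving loop computes, with assumed flat layout and names rather than the real XeTLA tile types:

// Scalar reference for the simplified loop: add the ALiBi bias to every row
// of the score sub-tile; no causal clamp here anymore.
// 'scores' is a block_size_y x block_size_x tile stored row-major,
// 'key_pos' holds the absolute key positions covered by this tile,
// 'slope' is the per-head ALiBi slope.
void add_alibi_bias(float* scores, const int* key_pos, float slope,
                    int block_size_y, int block_size_x) {
  for (int k = 0; k < block_size_y; ++k) {
    for (int j = 0; j < block_size_x; ++j) {
      scores[k * block_size_x + j] += slope * static_cast<float>(key_pos[j]);
    }
  }
}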

csrc/gpu/aten/operators/xetla/kernels/SDP/paged_attention_kernel.hpp  (+23)

@@ -292,6 +292,7 @@ class paged_attention_kernel {
   scalar_t* query; // [num_seqs, num_heads, head_size]
   scalar_t* key_cache; // [num_blocks, num_kv_heads, head_size, block_size]
   scalar_t* value_cache; // [num_blocks, num_kv_heads, head_size, block_size]
+  float* alibi_slopes; // [num_heads] - alibi_slopes

   // Index
   index_t* block_tables; // [num_seqs, max_blocks_per_seq]
@@ -318,6 +319,7 @@ class paged_attention_kernel {
       scalar_t* query,
       scalar_t* key_cache,
       scalar_t* value_cache,
+      float* alibi_slopes,
       index_t* block_tables,
       index_t* context_lens,
       uint32_t num_queries_per_tokens,
@@ -334,6 +336,7 @@ class paged_attention_kernel {
         query(query),
         key_cache(key_cache),
         value_cache(value_cache),
+        alibi_slopes(alibi_slopes),
         block_tables(block_tables),
         context_lens(context_lens),
         num_queries_per_tokens(num_queries_per_tokens),
@@ -404,6 +407,8 @@ class paged_attention_kernel {
     int end_block_id;
     int loop_count;

+    float alibi_slopes;
+
     xetla_nbarrier_t<wg_size, wg_size, arch_tag> nbarrier;

     inline context_t() = default;
@@ -415,6 +420,10 @@ class paged_attention_kernel {
       partition_id = item.get_group(2);
       max_num_partitions = item.get_group_range(2);

+      if (args.alibi_slopes != nullptr) {
+        alibi_slopes = args.alibi_slopes[head_id];
+      }
+
      context_len = args.context_lens[seq_id];
      block_table = args.block_tables + seq_id * args.max_blocks_per_seq;
      num_blocks_per_sg = 0;
@@ -611,6 +620,15 @@ class paged_attention_kernel {
            xetla_tanh<typename score_tile_t::dtype, block_size>(score_sub);
        score_sub *= args.softcap;
      }
+
+      if (args.alibi_slopes != nullptr) {
+        int32_t mat_real_x = bid * block_size;
+        int32_t mat_real_y = ctx.seq_id;
+        xetla_vector<float, block_size> pos_id =
+            xetla_vector_gen<float, block_size>(mat_real_x, 1);
+        score_sub += (pos_id * ctx.alibi_slopes);
+      }
+
      uint32_t remained_len = ctx.context_len - bid * block_size;
      if (remained_len < block_size) {
        xetla_mask<block_size> mask =
@@ -646,6 +664,11 @@ class paged_attention_kernel {
      accum_t group_sum = wg_reduce_sum(mat_score);
      mat_score.reg /= group_sum;

+      if (use_partition && group_max == neg_infinity) {
+        mat_score.reg = 0.f;
+        group_sum = 0.f;
+      }
+
      if (use_partition && ctx.sg_id == 0) {
        // store the max and exp_sum
        using tile_desc_t = subgroup::tile_desc_t<1, 1, 1, 1>;
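
For clarity (not part of this commit), a scalar sketch of the bias the kernel adds per key block in the hunk above: key positions are generated from bid * block_size with unit stride and scaled by the slope picked up in context init (args.alibi_slopes[head_id]); the flat-array form below is an illustration only.

// Scalar sketch of the per-block ALiBi bias applied to one head's scores.
// 'score_sub' holds block_size raw attention scores for key block 'bid',
// 'slope' is ctx.alibi_slopes (the per-head slope loaded in context init).
void add_block_alibi_bias(float* score_sub, int bid, int block_size,
                          float slope) {
  for (int j = 0; j < block_size; ++j) {
    const float pos_id = static_cast<float>(bid * block_size + j);
    score_sub[j] += pos_id * slope;  // mirrors score_sub += pos_id * slope
  }
}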

csrc/gpu/aten/operators/xetla/kernels/SDP/paged_attention_v1.cpp  (+1)

@@ -98,6 +98,7 @@ cgfs_t launch_kernels(paged_attention_fwd_kernel_args_t fwd_args) {
       reinterpret_cast<T*>(fwd_args.query),
       reinterpret_cast<T*>(fwd_args.key_cache),
       reinterpret_cast<T*>(fwd_args.value_cache),
+      reinterpret_cast<float*>(fwd_args.alibi_slopes),
       reinterpret_cast<U*>(fwd_args.block_tables),
       reinterpret_cast<U*>(fwd_args.context_lens),
       fwd_args.num_queries_per_tokens,

csrc/gpu/aten/operators/xetla/kernels/SDP/paged_attention_v2.cpp  (+1)

@@ -62,6 +62,7 @@ std::vector<std::function<void(sycl::handler&)>> launch_split_kv_kernels(
       reinterpret_cast<T*>(fwd_args.query),
       reinterpret_cast<T*>(fwd_args.key_cache),
       reinterpret_cast<T*>(fwd_args.value_cache),
+      reinterpret_cast<float*>(fwd_args.alibi_slopes),
       reinterpret_cast<U*>(fwd_args.block_tables),
       reinterpret_cast<U*>(fwd_args.context_lens),
       fwd_args.num_queries_per_tokens,

csrc/gpu/aten/operators/xetla/mha.h  (+1)

@@ -67,6 +67,7 @@ struct paged_attention_fwd_kernel_args_t {
   void* query;
   void* key_cache;
   void* value_cache;
+  void* alibi_slopes;
   void* block_tables;
   void* context_lens;
   uint32_t num_queries_per_tokens;
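
As a usage illustration (field names taken from the struct above, other members omitted), this is roughly how a caller would fill the new field before launching the paged-attention kernels; make_fwd_args is a hypothetical helper, and a null pointer disables the ALiBi path inside the kernel.

// Sketch: wiring an optional alibi_slopes tensor into the forward args.
// Only the fields visible in the diff above are set.
#include <ATen/ATen.h>
#include <c10/util/Optional.h>

paged_attention_fwd_kernel_args_t make_fwd_args(
    const at::Tensor& query,
    const at::Tensor& key_cache,
    const at::Tensor& value_cache,
    const c10::optional<at::Tensor>& alibi_slopes,
    const at::Tensor& block_tables,
    const at::Tensor& context_lens,
    uint32_t num_queries_per_tokens) {
  paged_attention_fwd_kernel_args_t fwd_args{};
  fwd_args.query = query.data_ptr();
  fwd_args.key_cache = key_cache.data_ptr();
  fwd_args.value_cache = value_cache.data_ptr();
  fwd_args.alibi_slopes =
      alibi_slopes.has_value() ? alibi_slopes->data_ptr() : (void*)nullptr;
  fwd_args.block_tables = block_tables.data_ptr();
  fwd_args.context_lens = context_lens.data_ptr();
  fwd_args.num_queries_per_tokens = num_queries_per_tokens;
  return fwd_args;
}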
