microsoft
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
+6-6 b/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
+6-6
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
+8-7 b/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
+8-7
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
+9-4 b/‎onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
+9-4
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
+6-4 b/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
+6-4
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
+8-8 b/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
+8-8
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
+9-5 b/‎onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
+9-5
diff --git a/‎onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
+2-2 b/‎onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
+2-2
diff --git a/‎onnxruntime/core/optimizer/attention_fusion_helper.h
+3-1 b/‎onnxruntime/core/optimizer/attention_fusion_helper.h
+3-1
diff --git a/‎onnxruntime/core/providers/xnnpack/detail/utils.cc
+3-2 b/‎onnxruntime/core/providers/xnnpack/detail/utils.cc
+3-2
@@ -98,7 +98,7 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi
       for (int m = 0; m < size<1>(tOgO); ++m) {
         const int row = get<0>(tOcO(0, m, 0));
         if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-          gLSE(row) = INFINITY;
+          gLSE(row) = std::numeric_limits<ElementAccum>::infinity();
         }
       }
       return;
@@ -499,7 +499,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params& params, cons
     for (int m = 0; m < size<1>(tOgOaccum); ++m) {
       const int row = get<0>(tOcO(0, m, 0));
       if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-        gLSEaccum(row) = Split ? -INFINITY : INFINITY;
+        gLSEaccum(row) = Split ? -std::numeric_limits<ElementAccum>::infinity() : std::numeric_limits<ElementAccum>::infinity();
       }
     }
     return;
@@ -1061,7 +1061,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
     const int col = tidx % kBlockM;
-    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY;
+    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -std::numeric_limits<ElementAccum>::infinity();
     if (row < kMaxSplits) {
       sLSE[row][col] = lse;
     }
@@ -1082,7 +1082,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
     const int col = tidx / kRowsPerLoadTranspose;
-    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY;
+    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -std::numeric_limits<ElementAccum>::infinity();
     // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); }
   }
 
@@ -1094,7 +1094,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   }
   MaxOp<float> max_op;
   lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-  lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  lse_max = lse_max == -std::numeric_limits<ElementAccum>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
   float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
   for (int l = 1; l < kNLsePerThread; ++l) {
@@ -1104,7 +1104,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
   // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
   // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? std::numeric_limits<ElementAccum>::infinity() : logf(lse_sum) + lse_max;
   // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); }
   if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) {
     gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum;
 
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const
 // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,
         for (int j = 0; j < size<1, 0>(tensor); ++j) {
           const int col_idx = col_idx_base + j;
           if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-            tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+            tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
           }
         }
       }
@@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -151,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -181,18 +182,18 @@ struct Mask {
                 }
                 if constexpr (Causal_mask) {
                   if (col_idx >= col_idx_limit_right) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
                 if constexpr (Is_local) {
                   if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
                 if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                   // Causal and Local already handles MN masking
                   if (col_idx >= max_seqlen_k) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
               }
 
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <cmath>
+#include <limits>
 
 #include <cute/tensor.hpp>
 
@@ -71,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0>& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
       // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -99,7 +102,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0>& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -143,7 +146,7 @@ struct Softmax {
       for (int mi = 0; mi < size(row_max); ++mi) {
         float scores_max_cur = !Check_inf
                                    ? row_max(mi)
-                                   : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                   : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
         float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
         row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -169,7 +172,9 @@ struct Softmax {
     for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
       float sum = smooth_softmax ? row_sum(mi) + expf(-row_max(mi) * softmax_scale) : row_sum(mi);
       float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = inv_sum;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
 
@@ -825,7 +825,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
         const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
         const int col = tidx % kBlockM;
         // We skip the first row = 0, as we already populated it in shared memory.
-        ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -INFINITY;
+        ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -std::numeric_limits<ElementAccum>::infinity();
         if (row > 0 && row < kMaxSplits) {
           sLSE(row, col) = lse;
 
@@ -857,7 +857,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
       for (int l = 0; l < kNLsePerThread; ++l) {
         const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
         const int col = tidx / kRowsPerLoadTranspose;
-        lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -INFINITY;
+        lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -std::numeric_limits<ElementAccum>::infinity();
 
 #if defined(DEBUG_LEAN_ATTENTION)
         if (threadIdx.x == 0 && blockIdx.z == tracing_block) {
@@ -874,7 +874,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
       }
       MaxOp<float> max_op;
       lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-      lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+      lse_max = lse_max == -std::numeric_limits<ElementAccum>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
       float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
       for (int l = 1; l < kNLsePerThread; ++l) {
@@ -884,7 +884,9 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
       lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
       // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
       // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-      ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+      ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum)
+                                    ? std::numeric_limits<ElementAccum>::infinity()
+                                    : logf(lse_sum) + lse_max;
 // if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; }
 // Store the scales exp(lse - lse_logsum) in shared memory.
 #pragma unroll
 
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const
 // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,
         for (int j = 0; j < size<1, 0>(tensor); ++j) {
           const int col_idx = col_idx_base + j;
           if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-            tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+            tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
           }
         }
       }
@@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -152,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -182,18 +182,18 @@ struct Mask {
                 }
                 if constexpr (Causal_mask) {
                   if (col_idx >= col_idx_limit_right) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
                 if constexpr (Is_local) {
                   if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
                 if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                   // Causal and Local already handles MN masking
                   if (col_idx >= max_seqlen_k) {
-                    tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                   }
                 }
               }
 
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cmath>
 
 #include <cute/tensor.hpp>
@@ -72,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0>& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
 // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -107,7 +109,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0>& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -151,7 +153,7 @@ struct Softmax {
       for (int mi = 0; mi < size(row_max); ++mi) {
         float scores_max_cur = !Check_inf
                                    ? row_max(mi)
-                                   : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                   : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
         float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
         row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -181,7 +183,9 @@ struct Softmax {
       //     printf("sum: %f, inv_sum: %f\n", sum, inv_sum);
       //     printf("mi %d row_max %f softmax_scale %f\n", mi, row_max(mi), softmax_scale);
       // }
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
 
@@ -6,7 +6,7 @@ Licensed under the MIT License.
 /*
 Kernel implementation for blocking repeated n-grams.
 */
-
+#include <limits>
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h"
 
@@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
   }
   if (is_banned == true) {
     auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
-    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
+    lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();
   }
 }
 
 
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include <limits>
 #include "onnx/defs/shape_inference.h"
 #include "onnx/defs/tensor_proto_util.h"
 #include "core/framework/tensorprotoutils.h"
@@ -767,7 +768,8 @@ bool MatchInputMaskSubgraph(const Graph& graph, const Node& layer_norm, const No
   }
 
   // check where has X=-Infinity
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), -INFINITY, true)) {
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]),
+                                                       -std::numeric_limits<float>::infinity(), true)) {
     DEBUG_LOG("where const not matched.");
     return false;
   }
 
@@ -5,6 +5,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include <limits>
 
 #include "core/common/common.h"
 #include "core/common/safeint.h"
@@ -239,8 +240,8 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un
   def.attributes = node_unit.GetNode().GetAttributes();
 
   // use infinity as the default as that's what xnnpack uses if min/max are not set
-  float min = -INFINITY;
-  float max = INFINITY;
+  float min = -std::numeric_limits<float>::infinity();
+  float max = std::numeric_limits<float>::infinity();
 
   const auto& activation_type = activation.OpType();
   if (activation_type == "Clip") {
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`
`5`	`5`	`#pragma once`
`6`	`6`
	`7`	`+#include <limits>`
`7`	`8`	`#include <cute/tensor.hpp>`
`8`	`9`
`9`	`10`	`namespace onnxruntime {`
`@@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const`
`28`	`29`	`// Without the "make_coord" we get wrong results`
`29`	`30`	`#pragma unroll`
`30`	`31`	`for (int mi = 0; mi < size<0>(tensor); ++mi) {`
`31`		`- tensor(mi, make_coord(j, nj)) = -INFINITY;`
	`32`	`+ tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`32`	`33`	`}`
`33`	`34`	`}`
`34`	`35`	`}`
`@@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,`
`59`	`60`	`for (int j = 0; j < size<1, 0>(tensor); ++j) {`
`60`	`61`	`const int col_idx = col_idx_base + j;`
`61`	`62`	`if (col_idx >= col_idx_limit_right \|\| (HasWSLeft && col_idx < col_idx_limit_left)) {`
`62`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`63`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`63`	`64`	`}`
`64`	`65`	`}`
`65`	`66`	`}`
`@@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(`
`96`	`97`	`#pragma unroll`
`97`	`98`	`for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {`
`98`	`99`	`if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {`
`99`		`- tensor(mi, ni) = -INFINITY;`
	`100`	`+ tensor(mi, ni) = -std::numeric_limits<float>::infinity();`
`100`	`101`	`}`
`101`	`102`	`}`
`102`	`103`	`// if (cute::thread0()) {`
`@@ -151,7 +152,7 @@ struct Mask {`
`151`	`152`	`}`
`152`	`153`	`if constexpr (!Is_even_MN) {`
`153`	`154`	`if (col_idx >= max_seqlen_k) {`
`154`		`- tensor(mi, make_coord(j, nj)) = -INFINITY;`
	`155`	`+ tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`155`	`156`	`}`
`156`	`157`	`}`
`157`	`158`	`}`
`@@ -181,18 +182,18 @@ struct Mask {`
`181`	`182`	`}`
`182`	`183`	`if constexpr (Causal_mask) {`
`183`	`184`	`if (col_idx >= col_idx_limit_right) {`
`184`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`185`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`185`	`186`	`}`
`186`	`187`	`}`
`187`	`188`	`if constexpr (Is_local) {`
`188`	`189`	`if (col_idx >= col_idx_limit_right \|\| col_idx < col_idx_limit_left) {`
`189`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`190`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`190`	`191`	`}`
`191`	`192`	`}`
`192`	`193`	`if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {`
`193`	`194`	`// Causal and Local already handles MN masking`
`194`	`195`	`if (col_idx >= max_seqlen_k) {`
`195`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`196`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`196`	`197`	`}`
`197`	`198`	`}`
`198`	`199`	`}`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`	`******************************************************************************/`
`4`	`4`
`5`	`5`	`#pragma once`
`6`		`-`
	`6`	`+#include <limits>`
`7`	`7`	`#include <cute/tensor.hpp>`
`8`	`8`
`9`	`9`	`namespace onnxruntime {`
`@@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const`
`28`	`28`	`// Without the "make_coord" we get wrong results`
`29`	`29`	`#pragma unroll`
`30`	`30`	`for (int mi = 0; mi < size<0>(tensor); ++mi) {`
`31`		`- tensor(mi, make_coord(j, nj)) = -INFINITY;`
	`31`	`+ tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`32`	`32`	`}`
`33`	`33`	`}`
`34`	`34`	`}`
`@@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,`
`59`	`59`	`for (int j = 0; j < size<1, 0>(tensor); ++j) {`
`60`	`60`	`const int col_idx = col_idx_base + j;`
`61`	`61`	`if (col_idx >= col_idx_limit_right \|\| (HasWSLeft && col_idx < col_idx_limit_left)) {`
`62`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`62`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`63`	`63`	`}`
`64`	`64`	`}`
`65`	`65`	`}`
`@@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(`
`96`	`96`	`#pragma unroll`
`97`	`97`	`for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {`
`98`	`98`	`if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {`
`99`		`- tensor(mi, ni) = -INFINITY;`
	`99`	`+ tensor(mi, ni) = -std::numeric_limits<float>::infinity();`
`100`	`100`	`}`
`101`	`101`	`}`
`102`	`102`	`// if (cute::thread0()) {`
`@@ -152,7 +152,7 @@ struct Mask {`
`152`	`152`	`}`
`153`	`153`	`if constexpr (!Is_even_MN) {`
`154`	`154`	`if (col_idx >= max_seqlen_k) {`
`155`		`- tensor(mi, make_coord(j, nj)) = -INFINITY;`
	`155`	`+ tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`156`	`156`	`}`
`157`	`157`	`}`
`158`	`158`	`}`
`@@ -182,18 +182,18 @@ struct Mask {`
`182`	`182`	`}`
`183`	`183`	`if constexpr (Causal_mask) {`
`184`	`184`	`if (col_idx >= col_idx_limit_right) {`
`185`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`185`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`186`	`186`	`}`
`187`	`187`	`}`
`188`	`188`	`if constexpr (Is_local) {`
`189`	`189`	`if (col_idx >= col_idx_limit_right \|\| col_idx < col_idx_limit_left) {`
`190`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`190`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`191`	`191`	`}`
`192`	`192`	`}`
`193`	`193`	`if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {`
`194`	`194`	`// Causal and Local already handles MN masking`
`195`	`195`	`if (col_idx >= max_seqlen_k) {`
`196`		`- tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;`
	`196`	`+ tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();`
`197`	`197`	`}`
`198`	`198`	`}`
`199`	`199`	`}`
Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ Licensed under the MIT License.`
`6`	`6`	`/*`
`7`	`7`	`Kernel implementation for blocking repeated n-grams.`
`8`	`8`	`*/`
`9`		`-`
	`9`	`+#include <limits>`
`10`	`10`	`#include "core/providers/cuda/cu_inc/common.cuh"`
`11`	`11`	`#include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h"`
`12`	`12`
`@@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,`
`48`	`48`	`}`
`49`	`49`	`if (is_banned == true) {`
`50`	`50`	`auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];`
`51`		`- lprobs[lprob_start + token_to_be_banned] = -INFINITY;`
	`51`	`+ lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();`
`52`	`52`	`}`
`53`	`53`	`}`
`54`	`54`