perf: dual pivot top-p/top-k renorm (#974)

xslingcn · yzh119 · web-flow · commit e19cb7bafbd7 · 2025-03-27T02:04:55.000-07:00
Uses one more pivot and a fixed Block Threads of 1024 in the sampling.renorm kernels. Benchmark and comparisons can be found [here](https://docs.google.com/spreadsheets/d/15Qxh7fRxO2HiYaSxpyZs7sWQKoJ-8gscQFTG3rW_hZI/edit?usp=sharing). --------- Co-authored-by: Zihao Ye <expye@outlook.com>
diff --git a/include/flashinfer/sampling.cuh b/include/flashinfer/sampling.cuh
@@ -943,9 +943,15 @@ struct RenormTempStorage {
     float max_val;
     float min_val;
     union {
-      float value;
-      int count;
-      ValueCount<float> pair;
+      struct {
+        float values[2];
+      };
+      struct {
+        int counts[2];
+      };
+      struct {
+        ValueCount<float> pairs[2];
+      };
     } block_aggregate;
   };
 };
@@ -964,7 +970,6 @@ __global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob, float*
       reinterpret_cast<RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>&>(smem_renorm);
   temp_storage.max_val = 0;
   vec_t<float, VEC_SIZE> probs_vec;
-  float probs_greater_than_pivot[VEC_SIZE];  // pivot initialized to 0
 
   float max_val = GetMaxValue<VEC_SIZE, BLOCK_THREADS, REDUCE_ALGORITHM,
                               RenormTempStorage<BLOCK_THREADS, REDUCE_ALGORITHM>>(probs, row_idx, d,
@@ -981,8 +986,10 @@ __global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob, float*
   // stopping condition
   // - f(low) >= p, f(min_gt_low) == f(max_le_high) == f(high) < p
   do {
-    float threadlocal_sum = 0;
-    double mid = (low + high) / 2;
+    double pivot_0 = (high + 2 * low) / 3;
+    double pivot_1 = (2 * high + low) / 3;
+
+    float aggregate_gt_pivot_0 = 0, aggregate_gt_pivot_1 = 0;
     min_gt_low = high;
     max_le_high = low;
 #pragma unroll 2
@@ -991,19 +998,29 @@ __global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob, float*
       if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
         probs_vec.cast_load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
       }
+
+      float probs_gt_pivot_0[VEC_SIZE], probs_gt_pivot_1[VEC_SIZE];
 #pragma unroll
       for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-        probs_greater_than_pivot[j] = (probs_vec[j] > mid) ? probs_vec[j] : 0;
+        probs_gt_pivot_0[j] = (probs_vec[j] > pivot_0) ? probs_vec[j] : 0;
+        probs_gt_pivot_1[j] = (probs_vec[j] > pivot_1) ? probs_vec[j] : 0;
+
         if (probs_vec[j] > low && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
           min_gt_low = min(min_gt_low, probs_vec[j]);
         }
         if (probs_vec[j] <= high && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
           max_le_high = max(max_le_high, probs_vec[j]);
         }
       }
-      threadlocal_sum +=
+
+      aggregate_gt_pivot_0 +=
+          BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
+              .Sum<VEC_SIZE>(probs_gt_pivot_0);
+      __syncthreads();
+
+      aggregate_gt_pivot_1 +=
           BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
-              .Sum<VEC_SIZE>(probs_greater_than_pivot);
+              .Sum<VEC_SIZE>(probs_gt_pivot_1);
       __syncthreads();
     }
     min_gt_low = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
@@ -1013,19 +1030,26 @@ __global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob, float*
         BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
             .Reduce(max_le_high, cub::Max());
     if (tx == 0) {
-      temp_storage.block_aggregate.value = threadlocal_sum;
+      temp_storage.block_aggregate.values[0] = aggregate_gt_pivot_0;
+      temp_storage.block_aggregate.values[1] = aggregate_gt_pivot_1;
       temp_storage.min_val = min_gt_low;
       temp_storage.max_val = max_le_high;
     }
     __syncthreads();
-    threadlocal_sum = temp_storage.block_aggregate.value;
+    aggregate_gt_pivot_0 = temp_storage.block_aggregate.values[0];
+    aggregate_gt_pivot_1 = temp_storage.block_aggregate.values[1];
     min_gt_low = temp_storage.min_val;
     max_le_high = temp_storage.max_val;
-    if (threadlocal_sum >= p) {
-      low = mid;
-      sum_low = threadlocal_sum;
+
+    if (aggregate_gt_pivot_1 >= p) {
+      low = pivot_1;
+      sum_low = aggregate_gt_pivot_1;
+    } else if (aggregate_gt_pivot_0 >= p) {
+      low = pivot_0;
+      high = min(pivot_1, max_le_high);
+      sum_low = aggregate_gt_pivot_0;
     } else {
-      high = min(mid, max_le_high);
+      high = min(pivot_0, max_le_high);
     }
   } while (min_gt_low != max_le_high);
 
@@ -1079,9 +1103,10 @@ __global__ void TopKMaskLogitsKernel(DType* logits, DType* masked_logits, IdType
     // stopping condition: min_gt_low == max_le_high
     // - f(low) >= k, f(min_gt_low) == f(max_le_high) == f(high) < k
     do {
-      int threadlocal_count_sum = 0;
-      int probs_greater_than_pivot_count[VEC_SIZE];  // pivot initialized to 0
-      double mid = (low + high) / 2;
+      double pivot_0 = (high + 2 * low) / 3;
+      double pivot_1 = (2 * high + low) / 3;
+
+      int aggregate_gt_pivot_0 = 0, aggregate_gt_pivot_1 = 0;
       min_gt_low = high;
       max_le_high = low;
 #pragma unroll 2
@@ -1090,20 +1115,30 @@ __global__ void TopKMaskLogitsKernel(DType* logits, DType* masked_logits, IdType
         if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
           logits_vec.cast_load(logits + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
         }
+        int probs_gt_pivot_0_count[VEC_SIZE], probs_gt_pivot_1_count[VEC_SIZE];
 #pragma unroll
         for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-          probs_greater_than_pivot_count[j] =
-              logits_vec[j] > mid && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
+          probs_gt_pivot_0_count[j] =
+              logits_vec[j] > pivot_0 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
+          probs_gt_pivot_1_count[j] =
+              logits_vec[j] > pivot_1 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
+
           if (logits_vec[j] > low && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             min_gt_low = min(min_gt_low, logits_vec[j]);
           }
           if (logits_vec[j] <= high && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             max_le_high = max(max_le_high, logits_vec[j]);
           }
         }
-        threadlocal_count_sum +=
+
+        aggregate_gt_pivot_0 +=
+            BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce_int)
+                .Sum<VEC_SIZE>(probs_gt_pivot_0_count);
+        __syncthreads();
+
+        aggregate_gt_pivot_1 +=
             BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce_int)
-                .Sum<VEC_SIZE>(probs_greater_than_pivot_count);
+                .Sum<VEC_SIZE>(probs_gt_pivot_1_count);
         __syncthreads();
       }
       min_gt_low =
@@ -1114,18 +1149,24 @@ __global__ void TopKMaskLogitsKernel(DType* logits, DType* masked_logits, IdType
           BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
               .Reduce(max_le_high, cub::Max());
       if (tx == 0) {
-        temp_storage.block_aggregate.count = threadlocal_count_sum;
+        temp_storage.block_aggregate.counts[0] = aggregate_gt_pivot_0;
+        temp_storage.block_aggregate.counts[1] = aggregate_gt_pivot_1;
         temp_storage.min_val = min_gt_low;
         temp_storage.max_val = max_le_high;
       }
       __syncthreads();
-      threadlocal_count_sum = temp_storage.block_aggregate.count;
+      aggregate_gt_pivot_0 = temp_storage.block_aggregate.counts[0];
+      aggregate_gt_pivot_1 = temp_storage.block_aggregate.counts[1];
       min_gt_low = temp_storage.min_val;
       max_le_high = temp_storage.max_val;
-      if (threadlocal_count_sum >= k) {
-        low = mid;
+
+      if (aggregate_gt_pivot_1 >= k) {
+        low = pivot_1;
+      } else if (aggregate_gt_pivot_0 >= k) {
+        low = pivot_0;
+        high = min(pivot_1, max_le_high);
       } else {
-        high = min(mid, max_le_high);
+        high = min(pivot_0, max_le_high);
       }
     } while (min_gt_low != max_le_high);
     pivot = low;
@@ -1164,7 +1205,6 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
     auto& temp_storage =
         reinterpret_cast<RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>&>(smem_renorm);
     temp_storage.max_val = 0;
-    float probs_greater_than_pivot[VEC_SIZE];  // pivot initialized to 0
 
     float max_val = GetMaxValue<VEC_SIZE, BLOCK_THREADS, REDUCE_ALGORITHM,
                                 RenormTempStorage<BLOCK_THREADS, REDUCE_ALGORITHM>>(
@@ -1181,9 +1221,10 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
     // stopping condition: min_gt_low == max_le_high
     // - f(low) >= k, f(min_gt_low) == f(max_le_high) == f(high) < k
     do {
-      ValueCount<float> threadlocal_sum{0, 0};
-      ValueCount<float> probs_greater_than_pivot_pair[VEC_SIZE];  // pivot initialized to 0
-      double mid = (low + high) / 2;
+      double pivot_0 = (high + 2 * low) / 3;
+      double pivot_1 = (2 * high + low) / 3;
+
+      ValueCount<float> aggregate_gt_pivot_0{0, 0}, aggregate_gt_pivot_1{0, 0};
       min_gt_low = high;
       max_le_high = low;
 #pragma unroll 2
@@ -1192,21 +1233,32 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
         if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
           probs_vec.cast_load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
         }
+        ValueCount<float> probs_gt_pivot_0_pair[VEC_SIZE], probs_gt_pivot_1_pair[VEC_SIZE];
 #pragma unroll
         for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-          probs_greater_than_pivot_pair[j] = {
-              (probs_vec[j] > mid) ? probs_vec[j] : 0,
-              (probs_vec[j] > mid && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+          probs_gt_pivot_0_pair[j] = {
+              (probs_vec[j] > pivot_0) ? probs_vec[j] : 0,
+              (probs_vec[j] > pivot_0 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+          probs_gt_pivot_1_pair[j] = {
+              (probs_vec[j] > pivot_1) ? probs_vec[j] : 0,
+              (probs_vec[j] > pivot_1 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+
           if (probs_vec[j] > low && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             min_gt_low = min(min_gt_low, probs_vec[j]);
           }
           if (probs_vec[j] <= high && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             max_le_high = max(max_le_high, probs_vec[j]);
           }
         }
-        threadlocal_sum += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
-                               temp_storage.block_prim.reduce_value_count)
-                               .Sum<VEC_SIZE>(probs_greater_than_pivot_pair);
+
+        aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                                    temp_storage.block_prim.reduce_value_count)
+                                    .Sum<VEC_SIZE>(probs_gt_pivot_0_pair);
+        __syncthreads();
+
+        aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                                    temp_storage.block_prim.reduce_value_count)
+                                    .Sum<VEC_SIZE>(probs_gt_pivot_1_pair);
         __syncthreads();
       }
       min_gt_low =
@@ -1217,19 +1269,26 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
           BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
               .Reduce(max_le_high, cub::Max());
       if (tx == 0) {
-        temp_storage.block_aggregate.pair = threadlocal_sum;
+        temp_storage.block_aggregate.pairs[0] = aggregate_gt_pivot_0;
+        temp_storage.block_aggregate.pairs[1] = aggregate_gt_pivot_1;
         temp_storage.min_val = min_gt_low;
         temp_storage.max_val = max_le_high;
       }
       __syncthreads();
-      threadlocal_sum = temp_storage.block_aggregate.pair;
+      aggregate_gt_pivot_0 = temp_storage.block_aggregate.pairs[0];
+      aggregate_gt_pivot_1 = temp_storage.block_aggregate.pairs[1];
       min_gt_low = temp_storage.min_val;
       max_le_high = temp_storage.max_val;
-      if (threadlocal_sum.count >= k) {
-        low = mid;
-        sum_low = float(threadlocal_sum.value);
+
+      if (aggregate_gt_pivot_1.count >= k) {
+        low = pivot_1;
+        sum_low = float(aggregate_gt_pivot_1.value);
+      } else if (aggregate_gt_pivot_0.count >= k) {
+        low = pivot_0;
+        high = min(pivot_1, max_le_high);
+        sum_low = float(aggregate_gt_pivot_0.value);
       } else {
-        high = min(mid, max_le_high);
+        high = min(pivot_0, max_le_high);
       }
     } while (min_gt_low != max_le_high);