
Commit b27a2cc

bugfix: fix the rope correctness issue introduced in #609 (#619)
As observed by @james-p-xu, #609 produces wrong results for some input shapes. This PR fixes the correctness issue and adds an optimization: dispatching to different parallelism modes for different input shapes. For large input shapes, the original implementation (re-using sin/cos values across heads) is better; for small input shapes, head parallelism is better. Some results:

```
Before #609 (no head-parallelism, re-use sin/cos value)
-----------------
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 27us, throughput: 0.762GB/s
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 22us, throughput: 0.919GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 27us, throughput: 95.699GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 28us, throughput: 95.244GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 31us, throughput: 670.254GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 31us, throughput: 667.253GB/s
---
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 27us, throughput: 14.490GB/s
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 27us, throughput: 14.466GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 37us, throughput: 1344.086GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 37us, throughput: 1344.902GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 148us, throughput: 2699.475GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 147us, throughput: 2701.897GB/s
---
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 27us, throughput: 74.322GB/s
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 27us, throughput: 74.568GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 110us, throughput: 2352.352GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 110us, throughput: 2365.580GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 718us, throughput: 2893.608GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 717us, throughput: 2894.859GB/s
---
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 27us, throughput: 95.373GB/s
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 27us, throughput: 95.810GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 130us, throughput: 2583.872GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 129us, throughput: 2595.944GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 923us, throughput: 2907.408GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 924us, throughput: 2905.533GB/s

Head parallelism only (no dispatch)
---------------------
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 6us, throughput: 3.321GB/s
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 6us, throughput: 3.391GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 358.862GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 362.361GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 15us, throughput: 1413.175GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 15us, throughput: 1437.332GB/s
---
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 6us, throughput: 60.526GB/s
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 6us, throughput: 60.127GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 26us, throughput: 1897.923GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 24us, throughput: 2050.075GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 164us, throughput: 2431.650GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 147us, throughput: 2709.333GB/s
---
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 284.641GB/s
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 302.815GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 109us, throughput: 2391.712GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 97us, throughput: 2671.150GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 860us, throughput: 2413.211GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 828us, throughput: 2508.817GB/s
---
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 349.795GB/s
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 376.624GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 139us, throughput: 2413.690GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 124us, throughput: 2705.994GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 1110us, throughput: 2417.480GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 1063us, throughput: 2525.976GB/s

This PR (shape dispatch)
---------------------
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 28us, throughput: 0.728GB/s
batch_size: 1, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 6us, throughput: 3.451GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 359.759GB/s
batch_size: 1, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 361.286GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 15us, throughput: 1426.267GB/s
batch_size: 1, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 15us, throughput: 1433.691GB/s
---
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 6us, throughput: 60.390GB/s
batch_size: 19, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 6us, throughput: 59.937GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 26us, throughput: 1892.575GB/s
batch_size: 19, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 24us, throughput: 2049.735GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 148us, throughput: 2698.780GB/s
batch_size: 19, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 147us, throughput: 2701.558GB/s
---
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 285.335GB/s
batch_size: 99, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 303.373GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 110us, throughput: 2351.126GB/s
batch_size: 99, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 110us, throughput: 2362.898GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 717us, throughput: 2893.713GB/s
batch_size: 99, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 717us, throughput: 2894.902GB/s
---
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 7us, throughput: 350.720GB/s
batch_size: 128, append_len: 1, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 7us, throughput: 376.690GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 130us, throughput: 2584.221GB/s
batch_size: 128, append_len: 128, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 129us, throughput: 2596.612GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: False, latency: 924us, throughput: 2906.480GB/s
batch_size: 128, append_len: 1024, num_qo_heads: 32, num_kv_heads: 8, head_dim: 128, use_cos_sin_cache: True, latency: 924us, throughput: 2905.134GB/s
```

cc @nandor @james-p-xu
1 parent eaf73fd commit b27a2cc
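To make the dispatch rule concrete, here is a minimal, self-contained sketch of the heuristic (not the library code: the kernels are empty placeholders with hypothetical names, and the shape constants are illustrative; only the occupancy and device-attribute queries mirror the diff below). If one wave of token-level blocks already fills the device, the original kernel that re-uses sin/cos across heads wins; otherwise the head-parallel kernel spreads heads across `blockIdx.y`.

```cuda
// Hedged sketch of the shape-dispatch heuristic; placeholder kernels only.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

__global__ void SequentialHeadsKernel() {}  // stand-in: each block loops over all heads
__global__ void HeadParallelKernel() {}     // stand-in: blockIdx.y selects a single head

int main() {
  int dev_id = 0, num_sms = 0, blocks_per_sm = 0;
  cudaGetDevice(&dev_id);
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, SequentialHeadsKernel,
                                                /*blockSize=*/128, /*dynamicSMemSize=*/0);
  const uint32_t num_ctas = blocks_per_sm * num_sms;  // CTAs in one full wave

  // Illustrative shape: a single decoded token (nnz = 1), 32 QO + 8 KV heads.
  const uint32_t nnz = 1, bdy = 8, num_qo_heads = 32, num_kv_heads = 8;
  const uint32_t nblks_x = (nnz + bdy - 1) / bdy;
  if (nblks_x >= num_ctas) {
    // Large input: token blocks alone saturate the GPU; re-use sin/cos across heads.
    SequentialHeadsKernel<<<dim3(nblks_x), dim3(16, bdy)>>>();
  } else {
    // Small input: add a head axis to the grid to expose more parallel work.
    HeadParallelKernel<<<dim3(nblks_x, num_qo_heads + num_kv_heads), dim3(16, bdy)>>>();
  }
  cudaDeviceSynchronize();
  printf("took the %s path\n", nblks_x >= num_ctas ? "sequential-heads" : "head-parallel");
  return 0;
}
```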

File tree

1 file changed: +218 −51 lines changed


include/flashinfer/pos_enc.cuh

Lines changed: 218 additions & 51 deletions
```diff
@@ -168,6 +168,56 @@ __device__ __forceinline__ vec_t<float, vec_size> vec_apply_llama_rope_cos_sin_i
   return vec;
 }
 
+template <bool interleave, uint32_t head_dim, uint32_t vec_size, uint32_t bdx, typename DType,
+          typename IdType>
+__global__ void BatchQKApplyRotaryPosIdsCosSinCacheHeadParallelismKernel(
+    DType* q, DType* k, DType* q_rope, DType* k_rope, float* __restrict__ cos_cache,
+    float* __restrict__ sin_cache, IdType* __restrict__ pos_ids, uint32_t nnz,
+    uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t rotary_dim, size_t q_stride_n,
+    size_t q_stride_h, size_t k_stride_n, size_t k_stride_h, size_t q_rope_stride_n,
+    size_t q_rope_stride_h, size_t k_rope_stride_n, size_t k_rope_stride_h) {
+  uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  uint32_t by = blockIdx.y;
+  const uint32_t bdy = blockDim.y;
+
+  vec_t<float, vec_size> cos, sin;
+  if (bx * bdy + ty < nnz) {
+    const uint32_t idx = bx * bdy + ty;
+    const IdType pos = pos_ids[idx];
+
+    if (tx * vec_size < rotary_dim) {
+      cos.load(cos_cache + pos * rotary_dim + tx * vec_size);
+      sin.load(sin_cache + pos * rotary_dim + tx * vec_size);
+    }
+
+    if (by < num_qo_heads) {
+      uint32_t qo_head_idx = by;
+      DType* q_ptr = q + get_elem_offset_impl(idx, qo_head_idx, 0, q_stride_n, q_stride_h);
+      DType* q_rope_ptr =
+          q_rope + get_elem_offset_impl(idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
+      vec_t<float, vec_size> q_vec;
+      if constexpr (interleave) {
+        q_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      } else {
+        q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      }
+      q_vec.cast_store(q_rope_ptr + tx * vec_size);
+    } else {
+      uint32_t kv_head_idx = by - num_qo_heads;
+      DType* k_ptr = k + get_elem_offset_impl(idx, kv_head_idx, 0, k_stride_n, k_stride_h);
+      DType* k_rope_ptr =
+          k_rope + get_elem_offset_impl(idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
+      vec_t<float, vec_size> k_vec;
+      if constexpr (interleave) {
+        k_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      } else {
+        k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      }
+      k_vec.cast_store(k_rope_ptr + tx * vec_size);
+    }
+  }
+}
+
 template <bool interleave, uint32_t head_dim, uint32_t vec_size, uint32_t bdx, typename DType,
           typename IdType>
 __global__ void BatchQKApplyRotaryPosIdsCosSinCacheKernel(
```
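The new kernel reads `cos_cache`/`sin_cache` as row-major `[num_positions, rotary_dim]` tables indexed by `pos` (`cos_cache + pos * rotary_dim + tx * vec_size`). Below is a hypothetical host-side sketch of building such a cache; the frequency formula is taken from the non-interleaved branch of the pos-ids kernels further down, and `rope_theta = 1e4` is an assumed default, not something this diff fixes.

```cpp
// Hypothetical construction of a [num_positions, rotary_dim] cos cache matching
// the kernel's row-major indexing; the sin cache is built the same way with std::sin.
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> build_cos_cache(uint32_t num_positions, uint32_t rotary_dim,
                                   float rope_theta = 1e4f) {
  std::vector<float> cache(size_t(num_positions) * rotary_dim);
  for (uint32_t pos = 0; pos < num_positions; ++pos) {
    for (uint32_t i = 0; i < rotary_dim; ++i) {
      // Non-interleaved layout: elements i and i + rotary_dim/2 share a frequency.
      float freq = std::pow(rope_theta, -2.0f * float(i % (rotary_dim / 2)) / float(rotary_dim));
      cache[size_t(pos) * rotary_dim + i] = std::cos(float(pos) * freq);
    }
  }
  return cache;
}
```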
```diff
@@ -221,69 +271,144 @@ __global__ void BatchQKApplyRotaryPosIdsCosSinCacheKernel(
 
 template <bool interleave, uint32_t head_dim, uint32_t vec_size, uint32_t bdx, typename DType,
           typename IdType>
-__global__ void BatchQKApplyRotaryPosIdsKernel(
+__global__ void BatchQKApplyRotaryPosIdsHeadParallelismKernel(
     DType* q, DType* k, DType* q_rope, DType* k_rope, IdType* __restrict__ pos_ids, uint32_t nnz,
     uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t rotary_dim, size_t q_stride_n,
     size_t q_stride_h, size_t k_stride_n, size_t k_stride_h, size_t q_rope_stride_n,
     size_t q_rope_stride_h, size_t k_rope_stride_n, size_t k_rope_stride_h, float smooth_a,
     float smooth_b, float rope_rcp_scale, float rope_rcp_theta) {
   // NOTE: q and q_rope may be the same ptr, so do k and k_rope
   uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  uint32_t by = blockIdx.y;
+  const uint32_t bdy = blockDim.y;
+  vec_t<float, vec_size> freq;
+  if (tx * vec_size < rotary_dim) {
+#pragma unroll
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      if constexpr (interleave) {
+        freq[i] = __powf(rope_rcp_theta, float(2 * ((tx * vec_size + i) / 2)) / float(rotary_dim));
+      } else {
+        freq[i] = __powf(rope_rcp_theta,
+                         float(2 * ((tx * vec_size + i) % (rotary_dim / 2))) / float(rotary_dim));
+      }
 
-  const uint32_t idx = bx * blockDim.y + ty;
-  const uint32_t pos_idx = idx / (num_qo_heads + num_kv_heads);
-  if (pos_idx >= nnz) {
-    return;
+      float smooth = freq[i] * smooth_a + smooth_b;
+      smooth = max(0.0f, min(1.0f, smooth));  // clamp to [0, 1]
+      freq[i] = (1 - smooth) * (freq[i] * rope_rcp_scale) + smooth * freq[i];
+    }
   }
 
-  const IdType pos = pos_ids[pos_idx];
-
   vec_t<float, vec_size> cos, sin;
+
+  if (bx * bdy + ty < nnz) {
+    const uint32_t idx = bx * bdy + ty;
+    const IdType pos = pos_ids[idx];
+
+    if (tx * vec_size < rotary_dim) {
+#pragma unroll
+      for (uint32_t i = 0; i < vec_size; ++i) {
+        float embed = float(pos) * freq[i];
+        __sincosf(embed, &sin[i], &cos[i]);
+      }
+    }
+
+    if (by < num_qo_heads) {
+      uint32_t qo_head_idx = by;
+      DType* q_ptr = q + get_elem_offset_impl(idx, qo_head_idx, 0, q_stride_n, q_stride_h);
+      DType* q_rope_ptr =
+          q_rope + get_elem_offset_impl(idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
+      vec_t<float, vec_size> q_vec;
+      if constexpr (interleave) {
+        q_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      } else {
+        q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      }
+      q_vec.cast_store(q_rope_ptr + tx * vec_size);
+    } else {
+      uint32_t kv_head_idx = by - num_qo_heads;
+      DType* k_ptr = k + get_elem_offset_impl(idx, kv_head_idx, 0, k_stride_n, k_stride_h);
+      DType* k_rope_ptr =
+          k_rope + get_elem_offset_impl(idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
+      vec_t<float, vec_size> k_vec;
+      if constexpr (interleave) {
+        k_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      } else {
+        k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      }
+      k_vec.cast_store(k_rope_ptr + tx * vec_size);
+    }
+  }
+}
+
+template <bool interleave, uint32_t head_dim, uint32_t vec_size, uint32_t bdx, typename DType,
+          typename IdType>
+__global__ void BatchQKApplyRotaryPosIdsKernel(
+    DType* q, DType* k, DType* q_rope, DType* k_rope, IdType* __restrict__ pos_ids, uint32_t nnz,
+    uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t rotary_dim, size_t q_stride_n,
+    size_t q_stride_h, size_t k_stride_n, size_t k_stride_h, size_t q_rope_stride_n,
+    size_t q_rope_stride_h, size_t k_rope_stride_n, size_t k_rope_stride_h, float smooth_a,
+    float smooth_b, float rope_rcp_scale, float rope_rcp_theta) {
+  // NOTE: q and q_rope may be the same ptr, so do k and k_rope
+  uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  const uint32_t bdy = blockDim.y;
+  vec_t<float, vec_size> freq;
   if (tx * vec_size < rotary_dim) {
-#pragma unroll
+#pragma unroll
     for (uint32_t i = 0; i < vec_size; ++i) {
-      float freq;
       if constexpr (interleave) {
-        freq = __powf(rope_rcp_theta, float(2 * ((tx * vec_size + i) / 2)) / float(rotary_dim));
+        freq[i] = __powf(rope_rcp_theta, float(2 * ((tx * vec_size + i) / 2)) / float(rotary_dim));
       } else {
-        freq = __powf(rope_rcp_theta,
-                      float(2 * ((tx * vec_size + i) % (rotary_dim / 2))) / float(rotary_dim));
+        freq[i] = __powf(rope_rcp_theta,
+                         float(2 * ((tx * vec_size + i) % (rotary_dim / 2))) / float(rotary_dim));
       }
 
-      float smooth = freq * smooth_a + smooth_b;
+      float smooth = freq[i] * smooth_a + smooth_b;
       smooth = max(0.0f, min(1.0f, smooth));  // clamp to [0, 1]
-      freq = (1 - smooth) * (freq * rope_rcp_scale) + smooth * freq;
-
-      const float embed = float(pos) * freq;
-      __sincosf(embed, &sin[i], &cos[i]);
+      freq[i] = (1 - smooth) * (freq[i] * rope_rcp_scale) + smooth * freq[i];
     }
   }
 
-  const uint32_t head_idx = idx % (num_qo_heads + num_kv_heads);
-  if (head_idx < num_qo_heads) {
-    const uint32_t qo_head_idx = head_idx;
-    DType* q_ptr = q + get_elem_offset_impl(pos_idx, qo_head_idx, 0, q_stride_n, q_stride_h);
-    DType* q_rope_ptr =
-        q_rope + get_elem_offset_impl(pos_idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
-    vec_t<float, vec_size> q_vec;
-    if constexpr (interleave) {
-      q_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
-    } else {
-      q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+  vec_t<float, vec_size> cos, sin;
+
+  if (bx * bdy + ty < nnz) {
+    const uint32_t idx = bx * bdy + ty;
+    const IdType pos = pos_ids[idx];
+
+    if (tx * vec_size < rotary_dim) {
+#pragma unroll
+      for (uint32_t i = 0; i < vec_size; ++i) {
+        float embed = float(pos) * freq[i];
+        __sincosf(embed, &sin[i], &cos[i]);
+      }
     }
-    q_vec.cast_store(q_rope_ptr + tx * vec_size);
-  } else {
-    const uint32_t kv_head_idx = head_idx - num_qo_heads;
-    DType* k_ptr = k + get_elem_offset_impl(pos_idx, kv_head_idx, 0, k_stride_n, k_stride_h);
-    DType* k_rope_ptr =
-        k_rope + get_elem_offset_impl(pos_idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
-    vec_t<float, vec_size> k_vec;
-    if constexpr (interleave) {
-      k_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
-    } else {
-      k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+
+#pragma unroll 1
+    for (uint32_t qo_head_idx = 0; qo_head_idx < num_qo_heads; ++qo_head_idx) {
+      DType* q_ptr = q + get_elem_offset_impl(idx, qo_head_idx, 0, q_stride_n, q_stride_h);
+      DType* q_rope_ptr =
+          q_rope + get_elem_offset_impl(idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
+      vec_t<float, vec_size> q_vec;
+      if constexpr (interleave) {
+        q_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      } else {
+        q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      }
+      q_vec.cast_store(q_rope_ptr + tx * vec_size);
+    }
+
+#pragma unroll 1
+    for (uint32_t kv_head_idx = 0; kv_head_idx < num_kv_heads; ++kv_head_idx) {
+      DType* k_ptr = k + get_elem_offset_impl(idx, kv_head_idx, 0, k_stride_n, k_stride_h);
+      DType* k_rope_ptr =
+          k_rope + get_elem_offset_impl(idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
+      vec_t<float, vec_size> k_vec;
+      if constexpr (interleave) {
+        k_vec = vec_apply_llama_rope_cos_sin_interleave<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      } else {
+        k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      }
+      k_vec.cast_store(k_rope_ptr + tx * vec_size);
     }
-    k_vec.cast_store(k_rope_ptr + tx * vec_size);
   }
 }
 
```
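The rewritten kernels hoist the position-independent work: `freq[i]` depends only on the element index, so it is computed once up front, and only the `pos`-dependent `__sincosf` stays inside the `nnz`-guarded block; the old flattened (token, head) block indexing is replaced by an explicit bounds check plus per-head loops. For reference, here is a scalar restatement of the per-element frequency, read straight off the non-interleaved branch above (with `smooth_a = smooth_b = 0` the blend reduces to plain scaled RoPE); it is illustrative only, not library code.

```cpp
// Scalar form of the freq[i] computation in the kernels above; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

float rope_freq(uint32_t elem_idx, uint32_t rotary_dim, float rope_rcp_scale,
                float rope_rcp_theta, float smooth_a, float smooth_b) {
  // theta^(-2 * (i mod d/2) / d): element pairs (i, i + d/2) share one frequency.
  float freq =
      std::pow(rope_rcp_theta, 2.0f * float(elem_idx % (rotary_dim / 2)) / float(rotary_dim));
  // Llama-3.1-style smoothing: clamp the blend factor to [0, 1], then mix the
  // position-interpolated (freq * rope_rcp_scale) and original frequencies.
  float smooth = std::clamp(freq * smooth_a + smooth_b, 0.0f, 1.0f);
  return (1.0f - smooth) * (freq * rope_rcp_scale) + smooth * freq;
}
```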
```diff
@@ -383,16 +508,18 @@ cudaError_t BatchQKApplyRotaryPosIdsCosSinCache(
     uint32_t rotary_dim, uint32_t head_dim, size_t q_stride_n, size_t q_stride_h, size_t k_stride_n,
     size_t k_stride_h, size_t q_rope_stride_n, size_t q_rope_stride_h, size_t k_rope_stride_n,
     size_t k_rope_stride_h, bool interleave, cudaStream_t stream = nullptr) {
+  int dev_id = 0;
+  int num_sms = 0;
+  FLASHINFER_CUDA_CALL(cudaGetDevice(&dev_id));
+  FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+
   DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
     DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
       constexpr uint32_t vec_size = std::max(16 / sizeof(DType), HEAD_DIM / 32);
       constexpr uint32_t bdx = HEAD_DIM / vec_size;
       uint32_t num_threads = std::max(128U, bdx);
       uint32_t bdy = num_threads / bdx;
-      dim3 nblks((nnz + bdy - 1) / bdy);
-      dim3 nthrs(bdx, bdy);
-      auto kernel = BatchQKApplyRotaryPosIdsCosSinCacheKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx,
-                                                              DType, IdType>;
+      uint32_t nblks_x = (nnz + bdy - 1) / bdy;
       void* args[] = {(void*)&q,
                       (void*)&k,
                       (void*)&q_rope,
```
```diff
@@ -412,7 +539,26 @@ cudaError_t BatchQKApplyRotaryPosIdsCosSinCache(
                       (void*)&q_rope_stride_h,
                       (void*)&k_rope_stride_n,
                       (void*)&k_rope_stride_h};
-      FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, 0, stream));
+      auto kernel_0 = BatchQKApplyRotaryPosIdsCosSinCacheKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx,
+                                                                DType, IdType>;
+
+      int num_blocks_per_sm_0 = 0;
+      FLASHINFER_CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &num_blocks_per_sm_0, kernel_0, num_threads, /*smem_size=*/0));
+      uint32_t num_ctas_0 = num_blocks_per_sm_0 * num_sms;
+
+      if ((nnz + bdy - 1) / bdy >= num_ctas_0) {
+        dim3 nblks(nblks_x);
+        dim3 nthrs(bdx, bdy);
+        FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel_0, nblks, nthrs, args, 0, stream));
+      } else {
+        dim3 nblks(nblks_x, num_qo_heads + num_kv_heads);
+        dim3 nthrs(bdx, bdy);
+        auto kernel_1 =
+            BatchQKApplyRotaryPosIdsCosSinCacheHeadParallelismKernel<INTERLEAVE, HEAD_DIM, vec_size,
+                                                                     bdx, DType, IdType>;
+        FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel_1, nblks, nthrs, args, 0, stream));
+      }
     });
   });
 
```
```diff
@@ -430,17 +576,19 @@ cudaError_t BatchQKApplyRotaryPosIds(
   float rope_rcp_theta = 1.0f / rope_theta;
   float smooth_a = 0.f;
   float smooth_b = 0.f;
+  int dev_id = 0;
+  int num_sms = 0;
+  FLASHINFER_CUDA_CALL(cudaGetDevice(&dev_id));
+  FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
   DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
     DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
       constexpr uint32_t vec_size = std::max(16 / sizeof(DType), HEAD_DIM / 32);
       constexpr uint32_t bdx = HEAD_DIM / vec_size;
       uint32_t num_threads = std::max(128U, bdx);
       uint32_t bdy = num_threads / bdx;
-      dim3 nblks((nnz + bdy - 1) / bdy);
-      dim3 nthrs(bdx, bdy);
-      auto kernel =
-          BatchQKApplyRotaryPosIdsKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx, DType, IdType>;
+      uint32_t nblks_x = (nnz + bdy - 1) / bdy;
+
       void* args[] = {(void*)&q,
                       (void*)&k,
                       (void*)&q_rope,
```
```diff
@@ -462,7 +610,26 @@ cudaError_t BatchQKApplyRotaryPosIds(
                       (void*)&smooth_b,
                       (void*)&rope_rcp_scale,
                       (void*)&rope_rcp_theta};
-      FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, 0, stream));
+      auto kernel_0 =
+          BatchQKApplyRotaryPosIdsKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx, DType, IdType>;
+
+      int num_blocks_per_sm_0 = 0;
+      FLASHINFER_CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &num_blocks_per_sm_0, kernel_0, num_threads, /*smem_size=*/0));
+      uint32_t num_ctas_0 = num_blocks_per_sm_0 * num_sms;
+      if (nblks_x >= num_ctas_0) {
+        dim3 nblks(nblks_x);
+        dim3 nthrs(bdx, bdy);
+
+        FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel_0, nblks, nthrs, args, 0, stream));
+      } else {
+        dim3 nblks(nblks_x, num_qo_heads + num_kv_heads);
+        dim3 nthrs(bdx, bdy);
+        auto kernel_1 = BatchQKApplyRotaryPosIdsHeadParallelismKernel<INTERLEAVE, HEAD_DIM,
+                                                                      vec_size, bdx, DType, IdType>;
+
+        FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel_1, nblks, nthrs, args, 0, stream));
+      }
     });
   });
 
```
```diff
@@ -606,7 +773,7 @@ cudaError_t BatchQKApplyLlama31RotaryPosIds(
       constexpr uint32_t bdx = HEAD_DIM / vec_size;
       uint32_t num_threads = std::max(128U, bdx);
       uint32_t bdy = num_threads / bdx;
-      dim3 nblks((nnz + bdy - 1) / bdy * (num_qo_heads + num_kv_heads));
+      dim3 nblks((nnz + bdy - 1) / bdy);
       dim3 nthrs(bdx, bdy);
       auto kernel =
           BatchQKApplyRotaryPosIdsKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx, DType, IdType>;
```

(The Llama 3.1 path launches `BatchQKApplyRotaryPosIdsKernel`, which after this PR iterates over all heads inside each block, so the `(num_qo_heads + num_kv_heads)` factor is dropped from the grid size.)
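As a worked example of the dispatch threshold, consider the benchmarked shapes under assumed numbers: fp16 inputs with `head_dim = 128` give `vec_size = 8`, `bdx = 16`, and 128 threads, hence `bdy = 8`; the wave size `num_ctas_0 = 512` (e.g. 128 SMs times 4 resident CTAs) is hypothetical and device-dependent.

```cpp
// Which launch branch do the benchmarked shapes take? Constants assumed as above.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t num_ctas_0 = 512;  // assumed occupancy wave size, varies by GPU
  const uint32_t bdy = 8;           // assumed for fp16, head_dim 128
  const uint32_t shapes[][2] = {
      {1, 1}, {19, 128}, {19, 1024}, {128, 1024}};  // {batch_size, append_len}
  for (const auto& s : shapes) {
    const uint32_t nnz = s[0] * s[1];
    const uint32_t nblks_x = (nnz + bdy - 1) / bdy;
    printf("nnz=%7u nblks_x=%6u -> %s\n", nnz, nblks_x,
           nblks_x >= num_ctas_0 ? "kernel_0 (re-use sin/cos)" : "kernel_1 (head-parallel)");
  }
  return 0;
}
```

With these assumptions, the small shapes (single tokens, and batch 19 with append_len 128) take the head-parallel branch, consistent with the ~6-7us and 26us rows in the commit message, while batch 19 with append_len 1024 and larger fall back to `kernel_0`, consistent with the unchanged 148us and 924us rows.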
