Commit 74ffba1

feat: non-inplace rope operators (#405)
As requested in #403, this PR implements non-inplace rope operators.
1 parent 2496f5b commit 74ffba1
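
For orientation, here is a minimal sketch of how the new operators are intended to be used from Python, next to the existing in-place variants. Tensor shapes and argument names follow the checks and bindings added below in python/csrc/rope.cu and python/csrc/flashinfer_ops.cu; the keyword defaults of the Python wrappers are assumptions, since python/flashinfer/rope.py is not part of the excerpt shown here.

import torch
import flashinfer

num_qo_heads, num_kv_heads, head_dim = 32, 8, 128
seq_lens = [7, 12]           # ragged batch of two sequences
nnz = sum(seq_lens)          # total number of tokens across the batch

# q: (nnz, H_Q, D), k: (nnz, H_K, D) -- ragged layout, see python/csrc/rope.cu below
q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.float16, device="cuda")
# indptr: (B + 1,) prefix sums of sequence lengths; offsets: (B,) per-sequence position offsets
indptr = torch.tensor([0, 7, 19], dtype=torch.int32, device="cuda")
offsets = torch.tensor([0, 0], dtype=torch.int32, device="cuda")

# Existing operator: rotates q and k in place.
# flashinfer.apply_rope_inplace(q, k, indptr, offsets)

# New operator: leaves q and k untouched and returns rotated copies.
# (interleave / rope_scale / rope_theta values here are illustrative, not library defaults.)
q_rope, k_rope = flashinfer.apply_rope(
    q, k, indptr, offsets, interleave=False, rope_scale=1.0, rope_theta=1e4
)
assert q_rope.shape == q.shape and k_rope.shape == k.shape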

10 files changed: +555 -17 lines changed

docs/api/python/rope.rst (+2)

@@ -12,3 +12,5 @@ Kernels for applying rotary embeddings.
 
    apply_rope_inplace
    apply_llama31_rope_inplace
+   apply_rope
+   apply_llama31_rope

include/flashinfer/attention/prefill.cuh (+2 -2)

@@ -1440,7 +1440,7 @@ __launch_bounds__(num_warps_x* num_warps_z* warp_size) void BatchPrefillWithRagg
       write_o_reg_gmem<num_warps_x, num_warps_z, num_frags_x, num_frags_y>(
           o_frag, &qo_smem, o_ptr_base, qo_packed_idx_base, qo_len,
           /*o_stride_n=*/
-          partition_kv ? num_qo_heads * head_dim * num_kv_chunks : num_qo_heads * head_dim,
+          partition_kv ? num_qo_heads * head_dim * num_kv_chunks : num_qo_heads * head_dim,
           /*o_stride_h=*/head_dim, group_size);

       // write lse

@@ -1732,7 +1732,7 @@ __launch_bounds__(num_warps_x* num_warps_z* warp_size) void BatchPrefillWithPage
       write_o_reg_gmem<num_warps_x, num_warps_z, num_frags_x, num_frags_y>(
           o_frag, &qo_smem, o_ptr_base, qo_packed_idx_base, qo_len,
           /*o_stride_n=*/
-          partition_kv ? num_qo_heads * head_dim * num_kv_chunks : num_qo_heads * head_dim,
+          partition_kv ? num_qo_heads * head_dim * num_kv_chunks : num_qo_heads * head_dim,
           /*o_stride_h=*/head_dim, group_size);

       // write lse

include/flashinfer/pos_enc.cuh (+174)

@@ -191,6 +191,86 @@ __global__ void BatchQKApplyRotaryInPlaceKernel(
   }
 }

+template <bool interleave, uint32_t head_dim, uint32_t vec_size, uint32_t bdx, typename DType,
+          typename IdType>
+__global__ void BatchQKApplyRotaryKernel(DType* __restrict__ q, DType* __restrict__ k,
+                                         DType* __restrict__ q_rope, DType* __restrict__ k_rope,
+                                         IdType* __restrict__ indptr, IdType* __restrict__ offsets,
+                                         uint32_t batch_size, uint32_t num_qo_heads,
+                                         uint32_t num_kv_heads, size_t q_stride_n,
+                                         size_t q_stride_h, size_t k_stride_n, size_t k_stride_h,
+                                         float smooth_a, float smooth_b, float rope_rcp_scale,
+                                         float rope_rcp_theta) {
+  uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  const uint32_t bdy = blockDim.y;
+  vec_t<float, vec_size> freq;
+#pragma unroll
+  for (uint32_t i = 0; i < vec_size; ++i) {
+    if constexpr (interleave) {
+      freq[i] = __powf(rope_rcp_theta, float(2 * ((tx * vec_size + i) / 2)) / float(head_dim));
+    } else {
+      freq[i] = __powf(rope_rcp_theta,
+                       float(2 * ((tx * vec_size + i) % (head_dim / 2))) / float(head_dim));
+    }
+
+    float smooth = freq[i] * smooth_a + smooth_b;
+    smooth = max(0.0f, min(1.0f, smooth));  // clamp to [0, 1]
+    freq[i] = (1 - smooth) * (freq[i] * rope_rcp_scale) + smooth * freq[i];
+  }
+
+  if (bx < batch_size * num_qo_heads) {
+    // apply rotary to q
+    const uint32_t batch_idx = bx / num_qo_heads;
+    const uint32_t qo_head_idx = bx % num_qo_heads;
+    const uint32_t seq_len = indptr[batch_idx + 1] - indptr[batch_idx];
+    const uint32_t offset = offsets[batch_idx];
+#pragma unroll 2
+    for (uint32_t i = 0; i < (seq_len + bdy - 1) / bdy; ++i) {
+      vec_t<float, vec_size> q_vec;
+      if (i * bdy + ty < seq_len) {
+        DType* q_ptr = q + get_elem_offset_impl(indptr[batch_idx] + i * bdy + ty, qo_head_idx, 0,
+                                                q_stride_n, q_stride_h);
+        DType* q_rope_ptr =
+            q_rope + get_elem_offset_impl(indptr[batch_idx] + i * bdy + ty, qo_head_idx, 0,
+                                          /*q_stride_n=*/num_qo_heads * head_dim,
+                                          /*q_stride_h=*/head_dim);
+        if constexpr (interleave) {
+          q_vec =
+              vec_apply_llama_rope_interleave<vec_size, bdx>(q_ptr, freq, offset + i * bdy + ty);
+        } else {
+          q_vec = vec_apply_llama_rope<vec_size, bdx>(q_ptr, freq, offset + i * bdy + ty);
+        }
+        q_vec.cast_store(q_rope_ptr + tx * vec_size);
+      }
+    }
+  } else {
+    // apply rotary to k
+    uint32_t batch_idx = (bx - batch_size * num_qo_heads) / num_kv_heads;
+    uint32_t kv_head_idx = (bx - batch_size * num_qo_heads) % num_kv_heads;
+    const uint32_t seq_len = indptr[batch_idx + 1] - indptr[batch_idx];
+    const uint32_t offset = offsets[batch_idx];
+#pragma unroll 2
+    for (uint32_t i = 0; i < (seq_len + bdy - 1) / bdy; ++i) {
+      vec_t<float, vec_size> k_vec;
+      if (i * bdy + ty < seq_len) {
+        DType* k_ptr = k + get_elem_offset_impl(indptr[batch_idx] + i * bdy + ty, kv_head_idx, 0,
+                                                k_stride_n, k_stride_h);
+        DType* k_rope_ptr =
+            k_rope + get_elem_offset_impl(indptr[batch_idx] + i * bdy + ty, kv_head_idx, 0,
+                                          /*kv_stride_n=*/num_kv_heads * head_dim,
+                                          /*kv_stride_h=*/head_dim);
+        if constexpr (interleave) {
+          k_vec =
+              vec_apply_llama_rope_interleave<vec_size, bdx>(k_ptr, freq, offset + i * bdy + ty);
+        } else {
+          k_vec = vec_apply_llama_rope<vec_size, bdx>(k_ptr, freq, offset + i * bdy + ty);
+        }
+        k_vec.cast_store(k_rope_ptr + tx * vec_size);
+      }
+    }
+  }
+}
+
 #define DISPATCH_INTERLEAVE(interleave, INTERLEAVE, ...) \
   if (interleave) {                                      \
     const bool INTERLEAVE = true;                        \

@@ -289,6 +369,100 @@ cudaError_t BatchQKApplyLlama31RotaryInPlace(
   return cudaSuccess;
 }

+template <typename DType, typename IdType>
+cudaError_t BatchQKApplyRotary(DType* __restrict__ q, DType* __restrict__ k,
+                               DType* __restrict__ q_rope, DType* __restrict__ k_rope,
+                               IdType* __restrict__ indptr, IdType* __restrict__ offsets,
+                               uint32_t batch_size, uint32_t num_qo_heads, uint32_t num_kv_heads,
+                               uint32_t head_dim, size_t q_stride_n, size_t q_stride_h,
+                               size_t k_stride_n, size_t k_stride_h, bool interleave,
+                               float rope_scale, float rope_theta, cudaStream_t stream = nullptr) {
+  float rope_rcp_scale = 1.0f / rope_scale;
+  float rope_rcp_theta = 1.0f / rope_theta;
+  float smooth_a = 0.f;
+  float smooth_b = 0.f;
+
+  DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+    DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
+      constexpr uint32_t vec_size = std::max(16 / sizeof(DType), HEAD_DIM / 32);
+      constexpr uint32_t bdx = HEAD_DIM / vec_size;
+      uint32_t num_threads = std::max(128U, bdx);
+      uint32_t bdy = num_threads / bdx;
+      dim3 nblks(batch_size * (num_qo_heads + num_kv_heads));
+      dim3 nthrs(bdx, bdy);
+      auto kernel = BatchQKApplyRotaryKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx, DType, IdType>;
+      void* args[] = {(void*)&q,
+                      (void*)&k,
+                      (void*)&q_rope,
+                      (void*)&k_rope,
+                      (void*)&indptr,
+                      (void*)&offsets,
+                      (void*)&batch_size,
+                      (void*)&num_qo_heads,
+                      (void*)&num_kv_heads,
+                      (void*)&q_stride_n,
+                      (void*)&q_stride_h,
+                      (void*)&k_stride_n,
+                      (void*)&k_stride_h,
+                      (void*)&smooth_a,
+                      (void*)&smooth_b,
+                      (void*)&rope_rcp_scale,
+                      (void*)&rope_rcp_theta};
+      FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, 0, stream));
+    });
+  });
+
+  return cudaSuccess;
+}
+
+template <typename DType, typename IdType>
+cudaError_t BatchQKApplyLlama31Rotary(DType* __restrict__ q, DType* __restrict__ k,
+                                      DType* __restrict__ q_rope, DType* __restrict__ k_rope,
+                                      IdType* __restrict__ indptr, IdType* __restrict__ offsets,
+                                      uint32_t batch_size, uint32_t num_qo_heads,
+                                      uint32_t num_kv_heads, uint32_t head_dim, size_t q_stride_n,
+                                      size_t q_stride_h, size_t k_stride_n, size_t k_stride_h,
+                                      bool interleave, float rope_scale, float rope_theta,
+                                      float low_freq_factor, float high_freq_factor,
+                                      float old_context_length, cudaStream_t stream = nullptr) {
+  float rope_rcp_scale = 1.0f / rope_scale;
+  float rope_rcp_theta = 1.0f / rope_theta;
+  float smooth_a = old_context_length / (2 * M_PI * high_freq_factor - 2 * M_PI * low_freq_factor);
+  float smooth_b = -1.0f / (high_freq_factor / low_freq_factor - 1.0f);
+
+  DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+    DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
+      constexpr uint32_t vec_size = std::max(16 / sizeof(DType), HEAD_DIM / 32);
+      constexpr uint32_t bdx = HEAD_DIM / vec_size;
+      uint32_t num_threads = std::max(128U, bdx);
+      uint32_t bdy = num_threads / bdx;
+      dim3 nblks(batch_size * (num_qo_heads + num_kv_heads));
+      dim3 nthrs(bdx, bdy);
+      auto kernel = BatchQKApplyRotaryKernel<INTERLEAVE, HEAD_DIM, vec_size, bdx, DType, IdType>;
+      void* args[] = {(void*)&q,
+                      (void*)&k,
+                      (void*)&q_rope,
+                      (void*)&k_rope,
+                      (void*)&indptr,
+                      (void*)&offsets,
+                      (void*)&batch_size,
+                      (void*)&num_qo_heads,
+                      (void*)&num_kv_heads,
+                      (void*)&q_stride_n,
+                      (void*)&q_stride_h,
+                      (void*)&k_stride_n,
+                      (void*)&k_stride_h,
+                      (void*)&smooth_a,
+                      (void*)&smooth_b,
+                      (void*)&rope_rcp_scale,
+                      (void*)&rope_rcp_theta};
+      FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, 0, stream));
+    });
+  });
+
+  return cudaSuccess;
+}
+
 }  // namespace flashinfer

 #endif  // FLASHINFER_POS_ENC_CUH_
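
To make the per-dimension math at the top of BatchQKApplyRotaryKernel easier to follow, the following NumPy sketch reproduces the same frequency schedule, including the Llama 3.1 smoothing controlled by smooth_a and smooth_b. It is purely illustrative; the function name and signature are made up here and mirror the kernel, not any public API.

import numpy as np

def rope_freqs(head_dim, rope_scale, rope_theta, interleave=False,
               low_freq_factor=None, high_freq_factor=None, old_context_length=None):
    # Mirrors the freq[] computation in BatchQKApplyRotaryKernel (illustration only).
    rope_rcp_scale = 1.0 / rope_scale
    rope_rcp_theta = 1.0 / rope_theta

    idx = np.arange(head_dim)
    if interleave:
        exponent = 2.0 * (idx // 2) / head_dim
    else:
        exponent = 2.0 * (idx % (head_dim // 2)) / head_dim
    freq = rope_rcp_theta ** exponent

    if low_freq_factor is None:
        # Plain RoPE path: BatchQKApplyRotary passes smooth_a = smooth_b = 0,
        # so the blend below reduces to freq * rope_rcp_scale.
        smooth_a, smooth_b = 0.0, 0.0
    else:
        # Llama 3.1 path: matches smooth_a / smooth_b set up in BatchQKApplyLlama31Rotary.
        smooth_a = old_context_length / (2 * np.pi * (high_freq_factor - low_freq_factor))
        smooth_b = -1.0 / (high_freq_factor / low_freq_factor - 1.0)

    smooth = np.clip(freq * smooth_a + smooth_b, 0.0, 1.0)
    return (1.0 - smooth) * (freq * rope_rcp_scale) + smooth * freq

With smooth == 0 a dimension uses the scaled frequency (long-context interpolation); with smooth == 1 it keeps the original frequency. This is exactly the per-element blend the kernel computes before rotating q and k.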

python/csrc/flashinfer_ops.cu (+2)

@@ -45,6 +45,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("apply_rope_inplace", &apply_rope_inplace, "Apply RoPE in-place");
   m.def("apply_llama31_rope_inplace", &apply_llama31_rope_inplace,
         "Apply Llama 3.1 style RoPE in-place");
+  m.def("apply_rope", &apply_rope, "Apply RoPE");
+  m.def("apply_llama31_rope", &apply_llama31_rope, "Apply Llama 3.1 style RoPE");
   m.def("packbits", &packbits, "GPU packbits operator");
   m.def("segment_packbits", &segment_packbits, "GPU segment packbits operator");
   py::class_<BatchDecodeWithPagedKVCachePyTorchWrapper>(m,

python/csrc/flashinfer_ops.h (+10)

@@ -83,6 +83,16 @@ void apply_llama31_rope_inplace(torch::Tensor q, torch::Tensor k, torch::Tensor
                                 float rope_theta, float low_freq_factor, float high_freq_factor,
                                 float old_context_length);

+std::vector<torch::Tensor> apply_rope(torch::Tensor q, torch::Tensor k, torch::Tensor indptr,
+                                      torch::Tensor offsets, bool interleave, float rope_scale,
+                                      float rope_theta);
+
+std::vector<torch::Tensor> apply_llama31_rope(torch::Tensor q, torch::Tensor k,
+                                              torch::Tensor indptr, torch::Tensor offsets,
+                                              bool interleave, float rope_scale, float rope_theta,
+                                              float low_freq_factor, float high_freq_factor,
+                                              float old_context_length);
+
 torch::Tensor packbits(torch::Tensor x, const std::string& bitorder);

 torch::Tensor segment_packbits(torch::Tensor x, torch::Tensor input_indptr,

python/csrc/rope.cu (+99 -1)

@@ -102,4 +102,102 @@ void apply_llama31_rope_inplace(torch::Tensor q, torch::Tensor k, torch::Tensor
                 std::string(cudaGetErrorString(status)));
     return true;
   });
-}
+}
+
+std::vector<torch::Tensor> apply_rope(torch::Tensor q, torch::Tensor k, torch::Tensor indptr,
+                                      torch::Tensor offsets, bool interleave, float rope_scale,
+                                      float rope_theta) {
+  CHECK_CUDA(q);  // not necessarily contiguous
+  CHECK_CUDA(k);  // not necessarily contiguous
+  CHECK_INPUT(indptr);
+  CHECK_INPUT(offsets);
+
+  auto device = q.device();
+  CHECK_EQ(k.device(), device);
+  CHECK_DIM(3, q);        // q: (nnz, H_Q, D)
+  CHECK_DIM(3, k);        // k: (nnz, H_K, D)
+  CHECK_DIM(1, indptr);   // indptr: (B + 1)
+  CHECK_DIM(1, offsets);  // offsets: (B)
+  CHECK_EQ(q.size(0), k.size(0));
+  CHECK_EQ(q.size(2), k.size(2));
+  unsigned int num_qo_heads = q.size(1);
+  unsigned int num_kv_heads = k.size(1);
+  unsigned int head_dim = q.size(2);
+  unsigned int batch_size = offsets.size(0);
+  CHECK_EQ(indptr.size(0), batch_size + 1);
+  size_t q_stride_n = q.stride(0);
+  size_t q_stride_h = q.stride(1);
+  size_t k_stride_n = k.stride(0);
+  size_t k_stride_h = k.stride(1);
+  indptr = indptr.to(torch::kInt32);
+  offsets = offsets.to(torch::kInt32);
+  // NOTE(Zihao): empty_like does not copy strides, so it's okay to use it here.
+  auto q_rope = torch::empty_like(q);
+  auto k_rope = torch::empty_like(k);
+
+  cudaStream_t torch_current_stream = c10::cuda::getCurrentCUDAStream(device.index());
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q.scalar_type(), c_type, [&] {
+    cudaError_t status = BatchQKApplyRotary(
+        static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k.data_ptr()),
+        static_cast<c_type*>(q_rope.data_ptr()), static_cast<c_type*>(k_rope.data_ptr()),
+        static_cast<int32_t*>(indptr.data_ptr()), static_cast<int32_t*>(offsets.data_ptr()),
+        batch_size, num_qo_heads, num_kv_heads, head_dim, q_stride_n, q_stride_h, k_stride_n,
+        k_stride_h, interleave, rope_scale, rope_theta, torch_current_stream);
+    TORCH_CHECK(status == cudaSuccess, "BatchQKApplyRotary failed with error code " +
+                                           std::string(cudaGetErrorString(status)));
+    return true;
+  });
+
+  return {q_rope, k_rope};
+}
+
+std::vector<torch::Tensor> apply_llama31_rope(torch::Tensor q, torch::Tensor k,
+                                              torch::Tensor indptr, torch::Tensor offsets,
+                                              bool interleave, float rope_scale, float rope_theta,
+                                              float low_freq_factor, float high_freq_factor,
+                                              float old_context_length) {
+  CHECK_CUDA(q);  // not necessarily contiguous
+  CHECK_CUDA(k);  // not necessarily contiguous
+  CHECK_INPUT(indptr);
+  CHECK_INPUT(offsets);
+
+  auto device = q.device();
+  CHECK_EQ(k.device(), device);
+  CHECK_DIM(3, q);        // q: (nnz, H_Q, D)
+  CHECK_DIM(3, k);        // k: (nnz, H_K, D)
+  CHECK_DIM(1, indptr);   // indptr: (B + 1)
+  CHECK_DIM(1, offsets);  // offsets: (B)
+  CHECK_EQ(q.size(0), k.size(0));
+  CHECK_EQ(q.size(2), k.size(2));
+  unsigned int num_qo_heads = q.size(1);
+  unsigned int num_kv_heads = k.size(1);
+  unsigned int head_dim = q.size(2);
+  unsigned int batch_size = offsets.size(0);
+  CHECK_EQ(indptr.size(0), batch_size + 1);
+  size_t q_stride_n = q.stride(0);
+  size_t q_stride_h = q.stride(1);
+  size_t k_stride_n = k.stride(0);
+  size_t k_stride_h = k.stride(1);
+  indptr = indptr.to(torch::kInt32);
+  offsets = offsets.to(torch::kInt32);
+
+  // NOTE(Zihao): empty_like does not copy strides, so it's okay to use it here.
+  auto q_rope = torch::empty_like(q);
+  auto k_rope = torch::empty_like(k);
+
+  cudaStream_t torch_current_stream = c10::cuda::getCurrentCUDAStream(device.index());
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q.scalar_type(), c_type, [&] {
+    cudaError_t status = BatchQKApplyLlama31Rotary(
+        static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k.data_ptr()),
+        static_cast<c_type*>(q_rope.data_ptr()), static_cast<c_type*>(k_rope.data_ptr()),
+        static_cast<int32_t*>(indptr.data_ptr()), static_cast<int32_t*>(offsets.data_ptr()),
+        batch_size, num_qo_heads, num_kv_heads, head_dim, q_stride_n, q_stride_h, k_stride_n,
+        k_stride_h, interleave, rope_scale, rope_theta, low_freq_factor, high_freq_factor,
+        old_context_length, torch_current_stream);
+    TORCH_CHECK(status == cudaSuccess, "BatchQKApplyLlama31Rotary failed with error code " +
+                                           std::string(cudaGetErrorString(status)));
+    return true;
+  });
+
+  return {q_rope, k_rope};
+}
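
A detail worth calling out in the two functions above: q and k only go through CHECK_CUDA, not CHECK_INPUT, and their strides are forwarded to the kernel, so non-contiguous views are accepted. Below is a hypothetical Python sketch of why that is convenient, taking q and k as zero-copy slices of a fused qkv projection; it assumes the Python wrapper forwards strides unchanged, as the C++ binding does.

import torch
import flashinfer

nnz, num_qo_heads, num_kv_heads, head_dim = 19, 32, 8, 128
num_heads_total = num_qo_heads + 2 * num_kv_heads

# Fused QKV projection output, viewed as (nnz, H_total, D).
qkv = torch.randn(nnz, num_heads_total, head_dim, dtype=torch.float16, device="cuda")
q = qkv[:, :num_qo_heads, :]                              # non-contiguous view
k = qkv[:, num_qo_heads:num_qo_heads + num_kv_heads, :]   # non-contiguous view

indptr = torch.tensor([0, 7, 19], dtype=torch.int32, device="cuda")
offsets = torch.tensor([0, 0], dtype=torch.int32, device="cuda")

# No .contiguous() copy is needed before applying RoPE.
q_rope, k_rope = flashinfer.apply_rope(
    q, k, indptr, offsets, interleave=False, rope_scale=1.0, rope_theta=1e4
)
# The outputs are newly allocated; the kernel writes them in a packed (nnz, H, D) layout.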

python/csrc/single_prefill.cu (+1 -1)

@@ -71,7 +71,7 @@ std::vector<torch::Tensor> single_prefill_with_kv_cache(
   TORCH_CHECK(logits_soft_cap >= 0.f, "logits_soft_cap must be non-negative");
   const LogitsPostHook logits_post_hook =
       logits_soft_cap > 0.f ? LogitsPostHook::kSoftCap : LogitsPostHook::kNone;
-
+
   bool success = DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q.scalar_type(), c_type, [&] {
     return DISPATCH_head_dim(head_dim, HEAD_DIM, [&] {
       return DISPATCH_mask_mode(mask_mode, MASK_MODE, [&] {

python/flashinfer/__init__.py (+6 -1)

@@ -44,7 +44,12 @@
     chain_speculative_sampling,
 )
 from .norm import rmsnorm
-from .rope import apply_rope_inplace, apply_llama31_rope_inplace
+from .rope import (
+    apply_rope_inplace,
+    apply_llama31_rope_inplace,
+    apply_rope,
+    apply_llama31_rope,
+)
 from .group_gemm import SegmentGEMMWrapper
 from .quantization import packbits, segment_packbits
5055
