Skip to content

Commit 61e049a

Browse files
authored
perf: Fix python API overhead when CUDAGraph is not enabled (#969)
This PR fixes issue #960. We identified several performance bottlenecks in our Python APIs when kernels are not captured by CUDAGraph: 1. The device guard in Python is slow (`with input.device as device:`). 2. Getting the current CUDA stream in Python is time-consuming. These issues were introduced in the JIT refactor after v0.1.6 (mainly to accelerate JIT compilation speed). In this PR, we changed back to getting the stream and applying the device guard in C++. @MichoChan @xiaoqi35
1 parent f65b93f commit 61e049a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+1218
-1278
lines changed

csrc/activation.cu

+10-7
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@ __device__ __forceinline__ float gelu_tanh(const float& val) {
3232
return val * cdf;
3333
}
3434

35-
void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl, int64_t cuda_stream) {
35+
void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
3636
int d = input.size(-1) / 2;
3737
int64_t num_tokens = input.numel() / input.size(-1);
3838

39-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
39+
const c10::cuda::OptionalCUDAGuard device_guard(out.device());
40+
auto stream = at::cuda::getCurrentCUDAStream();
4041

4142
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(input.scalar_type(), c_type, [&] {
4243
uint32_t vec_size = 16 / sizeof(c_type);
@@ -63,11 +64,13 @@ void silu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl, int64_t c
6364
});
6465
}
6566

66-
void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl, int64_t cuda_stream) {
67+
void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
6768
int d = input.size(-1) / 2;
6869
int64_t num_tokens = input.numel() / input.size(-1);
6970

70-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
71+
const c10::cuda::OptionalCUDAGuard device_guard(out.device());
72+
auto stream = at::cuda::getCurrentCUDAStream();
73+
7174
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(input.scalar_type(), c_type, [&] {
7275
uint32_t vec_size = 16 / sizeof(c_type);
7376
cudaLaunchConfig_t config;
@@ -93,12 +96,12 @@ void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl, int6
9396
});
9497
}
9598

96-
void gelu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl, int64_t cuda_stream) {
99+
void gelu_and_mul(at::Tensor& out, at::Tensor& input, bool enable_pdl) {
97100
int d = input.size(-1) / 2;
98101
int64_t num_tokens = input.numel() / input.size(-1);
99-
dim3 grid(num_tokens);
102+
const c10::cuda::OptionalCUDAGuard device_guard(out.device());
103+
auto stream = at::cuda::getCurrentCUDAStream();
100104

101-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
102105
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(input.scalar_type(), c_type, [&] {
103106
uint32_t vec_size = 16 / sizeof(c_type);
104107
cudaLaunchConfig_t config;

csrc/batch_decode.cu

+13-9
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ at::Tensor BatchDecodeWithPagedKVCachePlan(
3838
at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, int64_t batch_size,
3939
int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph,
4040
int64_t window_left, double logits_soft_cap, int64_t head_dim_qk, int64_t head_dim_vo,
41-
at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream) {
41+
at::Tensor empty_q_data, at::Tensor empty_kv_data) {
4242
size_t float_workspace_size_in_bytes =
4343
float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
4444
size_t int_workspace_size_in_bytes =
@@ -53,7 +53,8 @@ at::Tensor BatchDecodeWithPagedKVCachePlan(
5353
"CUDA cores template only supports equal head dim for QK and VO, please use tensor "
5454
"cores template for different head dim");
5555

56-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
56+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
57+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
5758
DISPATCH_context(
5859
DTypeQ, DTypeKV, DTypeO, IdType, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE,
5960
USE_SLIDING_WINDOW, USE_LOGITS_SOFT_CAP, AttentionVariant, Params, [&] {
@@ -77,12 +78,14 @@ at::Tensor BatchDecodeWithPagedKVCachePlan(
7778
return vec_to_tensor(plan_info.ToVector());
7879
}
7980

80-
void BatchDecodeWithPagedKVCacheRun(
81-
at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
82-
at::Tensor q, at::Tensor paged_k_cache, at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
83-
at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o,
84-
std::optional<at::Tensor> maybe_lse, int64_t kv_layout_code,
85-
int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream) {
81+
void BatchDecodeWithPagedKVCacheRun(at::Tensor float_workspace_buffer,
82+
at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
83+
at::Tensor q, at::Tensor paged_k_cache,
84+
at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
85+
at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len,
86+
at::Tensor o, std::optional<at::Tensor> maybe_lse,
87+
int64_t kv_layout_code,
88+
int64_t window_left ADDITIONAL_FUNC_PARAMS) {
8689
DecodePlanInfo plan_info;
8790
plan_info.FromVector(tensor_to_vec(plan_info_vec));
8891
QKVLayout kv_layout = static_cast<QKVLayout>(kv_layout_code);
@@ -129,7 +132,8 @@ void BatchDecodeWithPagedKVCacheRun(
129132
TORCH_CHECK(k_strides == v_strides, "k/v strides must be identical");
130133
kv_cache_strides = k_strides.data();
131134

132-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
135+
const c10::cuda::OptionalCUDAGuard device_guard(device);
136+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
133137

134138
DISPATCH_context(
135139
DTypeQ, DTypeKV, DTypeO, IdType, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE,

csrc/batch_decode_jit_pybind.cu

+9-7
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ at::Tensor BatchDecodeWithPagedKVCachePlan(
2121
at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, int64_t batch_size,
2222
int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph,
2323
int64_t window_left, double logits_soft_cap, int64_t head_dim_qk, int64_t head_dim_vo,
24-
at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream);
24+
at::Tensor empty_q_data, at::Tensor empty_kv_data);
2525

26-
void BatchDecodeWithPagedKVCacheRun(
27-
at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
28-
at::Tensor q, at::Tensor paged_k_cache, at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
29-
at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o,
30-
std::optional<at::Tensor> maybe_lse, int64_t kv_layout_code,
31-
int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
26+
void BatchDecodeWithPagedKVCacheRun(at::Tensor float_workspace_buffer,
27+
at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
28+
at::Tensor q, at::Tensor paged_k_cache,
29+
at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
30+
at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len,
31+
at::Tensor o, std::optional<at::Tensor> maybe_lse,
32+
int64_t kv_layout_code,
33+
int64_t window_left ADDITIONAL_FUNC_PARAMS);
3234

3335
TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
3436
// Batched decode with paged KV-Cache plan

csrc/batch_mla_plan.cu

+4-3
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ at::Tensor BatchMLAPagedAttentionPlan(at::Tensor float_workspace_buffer,
2626
at::Tensor int_workspace_buffer,
2727
at::Tensor page_locked_int_workspace_buffer,
2828
at::Tensor qo_indptr, at::Tensor kv_indptr, at::Tensor kv_len,
29-
int64_t num_heads, int64_t head_dim_o, bool causal,
30-
int64_t cuda_stream) {
29+
int64_t num_heads, int64_t head_dim_o, bool causal) {
3130
size_t float_workspace_size_in_bytes =
3231
float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
3332
size_t int_workspace_size_in_bytes =
@@ -37,7 +36,9 @@ at::Tensor BatchMLAPagedAttentionPlan(at::Tensor float_workspace_buffer,
3736

3837
int batch_size = kv_len.size(0);
3938

40-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
39+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
40+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
41+
4142
cudaError_t status =
4243
MLAPlan(float_workspace_buffer.data_ptr(), float_workspace_size_in_bytes,
4344
int_workspace_buffer.data_ptr(), page_locked_int_workspace_buffer.data_ptr(),

csrc/batch_mla_pybind.cu

+2-3
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,14 @@ at::Tensor BatchMLAPagedAttentionPlan(at::Tensor float_workspace_buffer,
2020
at::Tensor int_workspace_buffer,
2121
at::Tensor page_locked_int_workspace_buffer,
2222
at::Tensor qo_indptr, at::Tensor kv_indptr, at::Tensor kv_len,
23-
int64_t num_heads, int64_t head_dim_o, bool causal,
24-
int64_t cuda_stream);
23+
int64_t num_heads, int64_t head_dim_o, bool causal);
2524

2625
void BatchMLAPagedAttentionRun(at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
2726
at::Tensor plan_info_vec, at::Tensor q_nope, at::Tensor q_pe,
2827
at::Tensor ckv_cache, at::Tensor kpe_cache, at::Tensor kv_indices,
2928
at::Tensor o, std::optional<at::Tensor> maybe_lse,
3029
int64_t mask_mode_code, int64_t num_heads, int64_t page_size,
31-
double sm_scale, int64_t cuda_stream);
30+
double sm_scale);
3231

3332
TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
3433
m.def("plan", &BatchMLAPagedAttentionPlan);

csrc/batch_mla_run.cu

+3-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ void BatchMLAPagedAttentionRun(at::Tensor float_workspace_buffer, at::Tensor int
2929
at::Tensor ckv_cache, at::Tensor kpe_cache, at::Tensor kv_indices,
3030
at::Tensor o, std::optional<at::Tensor> maybe_lse,
3131
int64_t mask_mode_code, int64_t num_heads, int64_t page_size,
32-
double sm_scale, int64_t cuda_stream) {
32+
double sm_scale) {
3333
// q_nope: [n, num_heads, head_dim_ckv]
3434
// q_pe: [n, num_heads, head_dim_kpe]
3535
// ckv_cache: [num_pages, page_size, head_dim_ckv]
@@ -58,7 +58,8 @@ void BatchMLAPagedAttentionRun(at::Tensor float_workspace_buffer, at::Tensor int
5858
unsigned int o_stride_n = o.stride(0);
5959
unsigned int o_stride_h = o.stride(1);
6060

61-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
61+
const c10::cuda::OptionalCUDAGuard device_guard(device);
62+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
6263

6364
DISPATCH_context(
6465
DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_CKV, HEAD_DIM_KPE, Params, [&] {

csrc/batch_mla_sm90_plan.cu

+4-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ at::Tensor BatchMLAPagedAttentionSM90Plan(at::Tensor float_workspace_buffer,
2727
at::Tensor page_locked_int_workspace_buffer,
2828
at::Tensor qo_indptr, at::Tensor kv_indptr,
2929
at::Tensor kv_len, int64_t num_heads, int64_t head_dim_o,
30-
bool causal, int64_t cuda_stream) {
30+
bool causal) {
3131
size_t float_workspace_size_in_bytes =
3232
float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
3333
size_t int_workspace_size_in_bytes =
@@ -37,7 +37,9 @@ at::Tensor BatchMLAPagedAttentionSM90Plan(at::Tensor float_workspace_buffer,
3737

3838
int batch_size = kv_len.size(0);
3939

40-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
40+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
41+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
42+
4143
cudaError_t status =
4244
MLAPlan(float_workspace_buffer.data_ptr(), float_workspace_size_in_bytes,
4345
int_workspace_buffer.data_ptr(), page_locked_int_workspace_buffer.data_ptr(),

csrc/batch_mla_sm90_pybind.cu

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ at::Tensor BatchMLAPagedAttentionSM90Plan(at::Tensor float_workspace_buffer,
2121
at::Tensor page_locked_int_workspace_buffer,
2222
at::Tensor qo_indptr, at::Tensor kv_indptr,
2323
at::Tensor kv_len, int64_t num_heads, int64_t head_dim_o,
24-
bool causal, int64_t cuda_stream);
24+
bool causal);
2525

2626
void BatchMLAPagedAttentionSM90Run(at::Tensor float_workspace_buffer,
2727
at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
2828
at::Tensor q_nope, at::Tensor q_pe, at::Tensor ckv_cache,
2929
at::Tensor kpe_cache, at::Tensor kv_indices, at::Tensor o,
3030
std::optional<at::Tensor> maybe_lse, int64_t mask_mode_code,
3131
int64_t num_heads, int64_t page_size,
32-
double sm_scale ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
32+
double sm_scale ADDITIONAL_FUNC_PARAMS);
3333

3434
TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
3535
m.def("plan", &BatchMLAPagedAttentionSM90Plan);

csrc/batch_mla_sm90_run.cu

+3-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ void BatchMLAPagedAttentionSM90Run(at::Tensor float_workspace_buffer,
3030
at::Tensor kpe_cache, at::Tensor kv_indices, at::Tensor o,
3131
std::optional<at::Tensor> maybe_lse, int64_t mask_mode_code,
3232
int64_t num_heads, int64_t page_size,
33-
double sm_scale ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream) {
33+
double sm_scale ADDITIONAL_FUNC_PARAMS) {
3434
// q_nope: [n, num_heads, head_dim_ckv]
3535
// q_pe: [n, num_heads, head_dim_kpe]
3636
// ckv_cache: [num_pages, page_size, head_dim_ckv]
@@ -59,7 +59,8 @@ void BatchMLAPagedAttentionSM90Run(at::Tensor float_workspace_buffer,
5959
unsigned int o_stride_n = o.stride(0);
6060
unsigned int o_stride_h = o.stride(1);
6161

62-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
62+
const c10::cuda::OptionalCUDAGuard device_guard(device);
63+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
6364

6465
DISPATCH_context(
6566
DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_CKV, HEAD_DIM_KPE, Params, [&] {

csrc/batch_prefill.cu

+16-12
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,16 @@ at::Tensor BatchPrefillWithKVCachePlan(
4545
at::Tensor page_locked_int_workspace_buffer, at::Tensor qo_indptr, at::Tensor kv_indptr,
4646
at::Tensor kv_len_arr, int64_t total_num_rows, int64_t batch_size, int64_t num_qo_heads,
4747
int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph, int64_t head_dim_qk,
48-
int64_t head_dim_vo, bool causal, int64_t cuda_stream) {
48+
int64_t head_dim_vo, bool causal) {
4949
size_t float_workspace_size_in_bytes =
5050
float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
5151
size_t int_workspace_size_in_bytes =
5252
int_workspace_buffer.size(0) * int_workspace_buffer.element_size();
5353

5454
PrefillPlanInfo plan_info;
5555

56-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
56+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
57+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
5758
cudaError_t status = PrefillPlan<IdType>(
5859
float_workspace_buffer.data_ptr(), float_workspace_size_in_bytes,
5960
int_workspace_buffer.data_ptr(), page_locked_int_workspace_buffer.data_ptr(),
@@ -72,8 +73,7 @@ void BatchPrefillWithRaggedKVCacheRun(at::Tensor float_workspace_buffer,
7273
at::Tensor q, at::Tensor k, at::Tensor v,
7374
at::Tensor qo_indptr, at::Tensor kv_indptr, at::Tensor o,
7475
std::optional<at::Tensor> maybe_lse, int64_t mask_mode_code,
75-
int64_t layout, int64_t window_left ADDITIONAL_FUNC_PARAMS,
76-
int64_t cuda_stream) {
76+
int64_t layout, int64_t window_left ADDITIONAL_FUNC_PARAMS) {
7777
PrefillPlanInfo plan_info;
7878
plan_info.FromVector(tensor_to_vec(plan_info_vec));
7979
QKVLayout kv_layout = static_cast<QKVLayout>(layout);
@@ -109,7 +109,8 @@ void BatchPrefillWithRaggedKVCacheRun(at::Tensor float_workspace_buffer,
109109
auto q_scalar_type = q.scalar_type();
110110
auto kv_scalar_type = k.scalar_type();
111111

112-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
112+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
113+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
113114

114115
DISPATCH_context(
115116
DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE,
@@ -193,12 +194,14 @@ void BatchPrefillWithRaggedKVCacheRun(at::Tensor float_workspace_buffer,
193194
});
194195
}
195196

196-
void BatchPrefillWithPagedKVCacheRun(
197-
at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
198-
at::Tensor q, at::Tensor paged_k_cache, at::Tensor paged_v_cache, at::Tensor qo_indptr,
199-
at::Tensor paged_kv_indptr, at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len,
200-
at::Tensor o, std::optional<at::Tensor> maybe_lse, int64_t mask_mode_code, int64_t layout,
201-
int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream) {
197+
void BatchPrefillWithPagedKVCacheRun(at::Tensor float_workspace_buffer,
198+
at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
199+
at::Tensor q, at::Tensor paged_k_cache,
200+
at::Tensor paged_v_cache, at::Tensor qo_indptr,
201+
at::Tensor paged_kv_indptr, at::Tensor paged_kv_indices,
202+
at::Tensor paged_kv_last_page_len, at::Tensor o,
203+
std::optional<at::Tensor> maybe_lse, int64_t mask_mode_code,
204+
int64_t layout, int64_t window_left ADDITIONAL_FUNC_PARAMS) {
202205
PrefillPlanInfo plan_info;
203206
plan_info.FromVector(tensor_to_vec(plan_info_vec));
204207
QKVLayout kv_layout = static_cast<QKVLayout>(layout);
@@ -239,7 +242,8 @@ void BatchPrefillWithPagedKVCacheRun(
239242
TORCH_CHECK(k_strides == v_strides, "k/v strides must be identical");
240243
kv_cache_strides = k_strides.data();
241244

242-
cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
245+
const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
246+
const cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
243247

244248
DISPATCH_context(
245249
DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE,

0 commit comments

Comments
 (0)