@@ -29,7 +29,8 @@ namespace norm {

template <uint32_t VEC_SIZE, typename T>
__global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T* __restrict__ output,
-                              const uint32_t d, float weight_bias, float eps) {
+                              const uint32_t d, const uint32_t stride_input,
+                              const uint32_t stride_output, float weight_bias, float eps) {
  const uint32_t bx = blockIdx.x;
  const uint32_t tx = threadIdx.x, ty = threadIdx.y;
  constexpr uint32_t warp_size = 32;
@@ -46,7 +47,7 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T*
    vec_t<T, VEC_SIZE> input_vec;
    input_vec.fill(0.f);
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.load(input + bx * stride_input + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
    }
#pragma unroll
    for (uint32_t j = 0; j < VEC_SIZE; j++) {
@@ -82,22 +83,24 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T*
    input_vec.fill(0.f);
    weight_vec.fill(0.f);
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.load(input + bx * stride_input + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
      weight_vec.load(weight + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
    }
#pragma unroll
    for (uint32_t j = 0; j < VEC_SIZE; j++) {
      output_vec[j] = float(input_vec[j]) * rms_rcp * (weight_bias + float(weight_vec[j]));
    }
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      output_vec.store(output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      output_vec.store(output + bx * stride_output + i * num_threads * VEC_SIZE +
+                       thread_id * VEC_SIZE);
    }
  }
}

template <typename T>
cudaError_t RMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_t d,
-                    float eps = 1e-5, cudaStream_t stream = 0) {
+                    uint32_t stride_input, uint32_t stride_output, float eps = 1e-5,
+                    cudaStream_t stream = 0) {
  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);

  const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
@@ -106,7 +109,7 @@ cudaError_t RMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_
  dim3 nthrs(32, num_warps);
  const uint32_t smem_size = num_warps * sizeof(float);
  float weight_bias = 0.f;
-  void* args[] = {&input, &weight, &output, &d, &weight_bias, &eps};
+  void* args[] = {&input, &weight, &output, &d, &stride_input, &stride_output, &weight_bias, &eps};

  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
    auto kernel = RMSNormKernel<VEC_SIZE, T>;
@@ -117,8 +120,9 @@ cudaError_t RMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_

template <uint32_t VEC_SIZE, typename T>
__global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ residual,
-                                      T* __restrict__ weight, const uint32_t d, float weight_bias,
-                                      float eps) {
+                                      T* __restrict__ weight, const uint32_t d,
+                                      const uint32_t stride_input, const uint32_t stride_residual,
+                                      float weight_bias, float eps) {
  const uint32_t bx = blockIdx.x;
  const uint32_t tx = threadIdx.x, ty = threadIdx.y;
  constexpr uint32_t warp_size = 32;
@@ -139,8 +143,9 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res
    vec_t<float, VEC_SIZE> x_vec;
    x_vec.fill(0.f);
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
-      residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.load(input + bx * stride_input + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      residual_vec.load(residual + bx * stride_residual + i * num_threads * VEC_SIZE +
+                        thread_id * VEC_SIZE);
    }
#pragma unroll
    for (uint32_t j = 0; j < VEC_SIZE; j++) {
@@ -151,7 +156,8 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res
      x_vec[j] = x;
    }
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      residual_vec.store(residual + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      residual_vec.store(residual + bx * stride_residual + i * num_threads * VEC_SIZE +
+                         thread_id * VEC_SIZE);
      x_vec.store(smem_x + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
    }
  }
@@ -193,14 +199,16 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res
      input_vec[j] = x_vec[j] * rms_rcp * (weight_bias + float(weight_vec[j]));
    }
    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.store(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.store(input + bx * stride_input + i * num_threads * VEC_SIZE +
+                      thread_id * VEC_SIZE);
    }
  }
}

template <typename T>
cudaError_t FusedAddRMSNorm(T* input, T* residual, T* weight, uint32_t batch_size, uint32_t d,
-                            float eps = 1e-5, cudaStream_t stream = 0) {
+                            uint32_t stride_input, uint32_t stride_residual, float eps = 1e-5,
+                            cudaStream_t stream = 0) {
  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);

  const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
@@ -209,11 +217,13 @@ cudaError_t FusedAddRMSNorm(T* input, T* residual, T* weight, uint32_t batch_siz
  dim3 nthrs(32, num_warps);
  const uint32_t smem_size = (ceil_div(num_warps, 4) * 4 + d) * sizeof(float);
  float weight_bias = 0.f;
-  void* args[] = {&input, &residual, &weight, &d, &weight_bias, &eps};
+  void* args[] = {&input, &residual, &weight, &d,
+                  &stride_input, &stride_residual, &weight_bias, &eps};

  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
    auto kernel = FusedAddRMSNormKernel<VEC_SIZE, T>;
-    FLASHINFER_CUDA_CALL(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    FLASHINFER_CUDA_CALL(
+        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
    FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
  });

@@ -222,7 +232,8 @@ cudaError_t FusedAddRMSNorm(T* input, T* residual, T* weight, uint32_t batch_siz

template <typename T>
cudaError_t GemmaRMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_t d,
-                         float eps = 1e-5, cudaStream_t stream = 0) {
+                         uint32_t stride_input, uint32_t stride_output, float eps = 1e-5,
+                         cudaStream_t stream = 0) {
  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);

  const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
@@ -231,7 +242,7 @@ cudaError_t GemmaRMSNorm(T* input, T* weight, T* output, uint32_t batch_size, ui
  dim3 nthrs(32, num_warps);
  const uint32_t smem_size = num_warps * sizeof(float);
  float weight_bias = 1.f;
-  void* args[] = {&input, &weight, &output, &d, &weight_bias, &eps};
+  void* args[] = {&input, &weight, &output, &d, &stride_input, &stride_output, &weight_bias, &eps};

  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
    auto kernel = RMSNormKernel<VEC_SIZE, T>;
@@ -242,7 +253,8 @@ cudaError_t GemmaRMSNorm(T* input, T* weight, T* output, uint32_t batch_size, ui

template <typename T>
cudaError_t GemmaFusedAddRMSNorm(T* input, T* residual, T* weight, uint32_t batch_size, uint32_t d,
-                                 float eps = 1e-5, cudaStream_t stream = 0) {
+                                 uint32_t stride_input, uint32_t stride_residual, float eps = 1e-5,
+                                 cudaStream_t stream = 0) {
  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);

  const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
@@ -252,11 +264,13 @@ cudaError_t GemmaFusedAddRMSNorm(T* input, T* residual, T* weight, uint32_t batc
  // NOTE(Zihao): use ceil_div(num_warps, 4) * 4 for address alignment to 16 bytes
  const uint32_t smem_size = (ceil_div(num_warps, 4) * 4 + d) * sizeof(float);
  float weight_bias = 1.f;
-  void* args[] = {&input, &residual, &weight, &d, &weight_bias, &eps};
+  void* args[] = {&input, &residual, &weight, &d,
+                  &stride_input, &stride_residual, &weight_bias, &eps};

  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
    auto kernel = FusedAddRMSNormKernel<VEC_SIZE, T>;
-    FLASHINFER_CUDA_CALL(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    FLASHINFER_CUDA_CALL(
+        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
    FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
  });

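// ----------------------------------------------------------------------------
// Usage sketch (not part of the diff above): how a caller might supply the new
// stride arguments. This is a hedged illustration only; the buffer names, the
// half element type, and the helper run_norms() are hypothetical. It assumes
// the updated header above is included and that the functions are reachable
// through the `norm` namespace shown in the hunk headers (the full
// qualification may differ). For contiguous row-major tensors the row stride
// equals d; a padded or sliced view would pass its actual row pitch instead.
#include <cuda_fp16.h>

cudaError_t run_norms(half* input, half* residual, half* weight, half* output,
                      uint32_t batch_size, uint32_t d, cudaStream_t stream) {
  // Contiguous layout: consecutive rows start d elements apart.
  const uint32_t stride = d;

  // Plain RMSNorm: reads `input`, writes the normalized rows to `output`.
  cudaError_t status = norm::RMSNorm<half>(input, weight, output, batch_size, d,
                                           /*stride_input=*/stride,
                                           /*stride_output=*/stride,
                                           /*eps=*/1e-5f, stream);
  if (status != cudaSuccess) return status;

  // Fused residual add + RMSNorm: per the kernel in the diff, this updates
  // both `input` and `residual` in place.
  return norm::FusedAddRMSNorm<half>(input, residual, weight, batch_size, d,
                                     /*stride_input=*/stride,
                                     /*stride_residual=*/stride,
                                     /*eps=*/1e-5f, stream);
}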