Skip to content

Commit 00ae32d

Browse files
satyajandhyala authored and guschmue committed
[WebGPU-EP Native] Add ReduceMean (#23860)
### Description <!-- Describe your changes. --> ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
1 parent 88d6af5 commit 00ae32d

File tree

3 files changed

+234
-4
lines changed

3 files changed

+234
-4
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "core/providers/webgpu/reduction/reduction_ops.h"
5+
#include <sstream>
6+
#include "core/framework/data_transfer_manager.h"
7+
#include "core/providers/webgpu/data_transfer.h"
8+
#include "core/providers/webgpu/shader_helper.h"
9+
#include "core/providers/webgpu/webgpu_supported_types.h"
10+
11+
namespace onnxruntime {
12+
namespace webgpu {
13+
14+
#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \
15+
ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
16+
ReduceOp, \
17+
kOnnxDomain, \
18+
begin, end, \
19+
kWebGpuExecutionProvider, \
20+
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \
21+
ReduceOp);
22+
23+
#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \
24+
ONNX_OPERATOR_KERNEL_EX( \
25+
ReduceOp, \
26+
kOnnxDomain, \
27+
version, \
28+
kWebGpuExecutionProvider, \
29+
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
30+
ReduceOp);
31+
32+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
33+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
34+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
35+
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
36+
37+
// Generates the WGSL for a reduction. Each invocation computes one output element:
// the code wraps the op-specific loop body in nested `for` loops over every reduced
// axis, while the kept axes are fixed from this invocation's output indices.
Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
  const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
  const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
  // Empty axes with no_op_with_empty_axes == false means "reduce over all axes".
  bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
  // code_[0]/[1]/[2] are the op-specific loop header / body / footer snippets.
  std::string loop_header = code_[0];
  std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
  std::string loop_footer = code_[2];
  const auto input_rank = input.Rank();
  // i walks the input axes; l tracks the matching output axis. Reduced axes are
  // absent from the output unless keepdims_ keeps them as size-1 dims (then l still
  // advances).
  for (int i = 0, l = 0; i < input_rank; ++i) {
    if (reduce_on_all_axes || std::find(axes_.begin(), axes_.end(), i) != axes_.end()) {
      // Reduced axis: wrap the accumulated loop body in one more `for` loop.
      if (keepdims_) {
        l++;
      }
      std::stringstream ss;
      std::string index = "i" + std::to_string(i);
      ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n";
      ss << input.IndicesSet("input_indices", i, index) << ";\n";
      ss << loop_body << "\n";
      ss << "}\n";
      loop_body = ss.str();
    } else {
      // Kept axis: its input index comes straight from the output indices and is
      // set once in the loop header, before any reduction loop runs.
      std::stringstream ss;
      ss << loop_header << "\n";
      std::string index = "i" + std::to_string(i);
      ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
      ss << input.IndicesSet("input_indices", i, index) << ";\n";
      loop_header = ss.str();
      l++;
    }
  }
  // Build the "0, 0, ..., 0" initializer for input_indices. NOTE(review): for a
  // rank-0 input this still emits a single "0" — presumably input_indices_t degrades
  // to a scalar there; confirm against ShaderHelper's indices-type generation.
  std::stringstream input_indices_init_value;
  for (int i = 0; i < input_rank - 1; ++i) {
    input_indices_init_value << "0, ";
  }
  input_indices_init_value << "0";
  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
                            << "let output_indices: output_indices_t = " << output.OffsetToIndices("global_idx") << ";\n"
                            << "var input_indices: input_indices_t = input_indices_t(" << input_indices_init_value.str() << ");\n"
                            << loop_header << loop_body << loop_footer;
  // The loop footer is required to have defined `output_value`.
  shader.MainFunctionBody() << output.SetByOffset("global_idx", "output_value");
  return Status::OK();
}
79+
80+
template <bool allow_multi_axes>
81+
Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
82+
const auto* input_tensor = context.Input(0);
83+
InlinedVector<uint32_t> input_axes;
84+
auto rank = input_tensor->Shape().NumDimensions();
85+
auto transform_axis = [rank](int64_t axis) {
86+
if (axis < 0) {
87+
axis += rank;
88+
}
89+
if (axis < 0 || static_cast<size_t>(axis) >= rank) {
90+
ORT_THROW("Axes values must be in the range [-rank, rank-1]. Got: ", axis);
91+
}
92+
return static_cast<uint32_t>(axis);
93+
};
94+
// Check if axes input is provided and copy the axes values to input_axes
95+
if (context.InputCount() > 1) {
96+
ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
97+
const Tensor* axes_tensor = context.Input<Tensor>(1);
98+
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
99+
const auto* data = axes_tensor->Data<int64_t>();
100+
input_axes.reserve(size);
101+
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
102+
} else {
103+
input_axes.reserve(axes_.size());
104+
std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
105+
}
106+
if (input_axes.empty()) {
107+
if (noop_with_empty_axes_ || rank == 0) {
108+
// If axes is empty and noop_with_empty_axes_ is true, it is a no-op according to the spec
109+
// If input tensor is a scalar, return the input tensor as is.
110+
// This is not correct for ReduceLogSum and ReduceSumSquare
111+
// TODO handle these cases separately.
112+
auto output = context.Output(0, input_tensor->Shape());
113+
if (output->DataRaw() != input_tensor->DataRaw()) {
114+
ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
115+
}
116+
return Status::OK();
117+
} else {
118+
// If axes is empty and noop_with_empty_axes_ is false, it is a reduction over all axes
119+
input_axes.resize(rank);
120+
std::iota(input_axes.begin(), input_axes.end(), 0);
121+
}
122+
}
123+
const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
124+
// Compute output shape
125+
std::vector<int64_t> output_shape;
126+
for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
127+
if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
128+
if (keepdims_) {
129+
output_shape.push_back(1);
130+
}
131+
} else {
132+
output_shape.push_back(input_tensor->Shape()[i]);
133+
}
134+
}
135+
TensorShape output_tensor_shape(output_shape);
136+
int64_t output_size = output_tensor_shape.Size();
137+
ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
138+
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
139+
.AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
140+
.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
141+
.AddUniformVariables({{static_cast<uint32_t>(output_size)},
142+
{static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
143+
{input_axes},
144+
{static_cast<uint32_t>(input_axes.size())}});
145+
146+
return context.RunProgram(program);
147+
}
148+
149+
// Supplies the WGSL fragments specific to ReduceMean: accumulate a running f32 sum
// over the reduced elements, then divide by the reduced element count in the footer.
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
  const TensorShape& shape = input_tensor->Shape();
  size_t rank = shape.NumDimensions();
  // Footer: the element count is computed at runtime from the axes uniform so the
  // same shader works for any set of reduced axes of the same rank.
  std::string footer = "var size: u32 = 1;\n";
  footer += "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n";
  footer += "  let index = " + GetElementAt("uniforms.axes", "i", axes_size) + ";\n";
  footer += "  size = size * " + GetElementAt("uniforms.input_shape", "index", rank) + ";\n";
  footer += "}\n";
  footer += "let output_value = output_value_t(sum / f32(size));";
  // {loop header, loop body, loop footer}
  return ReduceOpSpecificCode{"var sum = f32(0);", "sum += f32(current_element);", footer};
}
162+
163+
// ReduceMean delegates to the shared multi-axis reduction implementation; the
// mean-specific WGSL snippets are injected via GetOpSpecificCode().
Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
  return ReduceKernel<true>::ComputeInternal(ctx);
}

}  // namespace webgpu
}  // namespace onnxruntime
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
#include "core/common/optional.h"
6+
#include "core/providers/webgpu/webgpu_supported_types.h"
7+
#include "core/providers/webgpu/webgpu_kernel.h"
8+
#include "core/providers/cpu/reduction/reduction_kernel_base.h"
9+
#include "core/providers/webgpu/program.h"
10+
#include "core/providers/webgpu/shader_helper.h"
11+
namespace onnxruntime {
12+
namespace webgpu {
13+
// reduceOpSpecificCode is a 3-element array of strings that represent the op specific code for the reduce operation.
14+
// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
15+
// The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
16+
// The loop footer is the code that is executed after the loop ends.
17+
typedef std::array<std::string, 3> ReduceOpSpecificCode;
18+
class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
19+
public:
20+
ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code) {}
21+
Status GenerateShaderCode(ShaderHelper& wgpuShaderModuleAddRef) const override;
22+
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
23+
{"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
24+
{"axes", ProgramUniformVariableDataType::Uint32},
25+
{"axes_size", ProgramUniformVariableDataType::Uint32});
26+
27+
private:
28+
const bool keepdims_;
29+
const bool no_op_with_empty_axes_;
30+
InlinedVector<uint32_t> axes_;
31+
ReduceOpSpecificCode code_;
32+
};
33+
34+
template <bool allow_multi_axes = true>
35+
class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_axes> {
36+
protected:
37+
using ReduceKernelBase<allow_multi_axes>::axes_;
38+
using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
39+
using ReduceKernelBase<allow_multi_axes>::keepdims_;
40+
using ReduceKernelBase<allow_multi_axes>::select_last_index_;
41+
42+
ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
43+
: WebGpuKernel(info),
44+
ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
45+
name_(name) {
46+
}
47+
Status ComputeInternal(ComputeContext& ctx) const;
48+
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
49+
50+
private:
51+
std::string name_;
52+
};
53+
54+
// ReduceMean: arithmetic mean over the selected axes (ONNX ReduceMean, opsets 1-18+).
class ReduceMean final : public ReduceKernel<true> {
 public:
  ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
  Status ComputeInternal(ComputeContext& ctx) const override;
};

}  // namespace webgpu
}  // namespace onnxruntime

onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc

+4-4
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
516516
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
517517
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
518518

519-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
520-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
521-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
522-
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
519+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
520+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
521+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
522+
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
523523

524524
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Unsqueeze)>,
525525
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Unsqueeze)>,

0 commit comments

Comments
 (0)