Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WebGPU-EP Native] Add ReduceMean #23860

Merged
merged 4 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/webgpu/reduction/reduction_ops.h"
#include <sstream>
#include "core/framework/data_transfer_manager.h"
#include "core/providers/webgpu/data_transfer.h"
#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"

namespace onnxruntime {
namespace webgpu {

// Registers a reduce kernel for the ONNX opset version range [begin, end].
// For these older opsets the reduction axes come from the "axes" attribute.
#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \
ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
begin, end, \
kWebGpuExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \
ReduceOp);

// Registers a reduce kernel for a single ONNX opset version onwards.
// The axes are the (optional) second input; it is pinned to CPU memory
// (InputMemoryType) so the kernel can read the axis values on the host.
#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \
ONNX_OPERATOR_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
version, \
kWebGpuExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
ReduceOp);

// ReduceMean: opsets 1-17 take axes as an attribute, opset 18+ as an input.
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);

// Generates the WGSL shader for a generic axis-wise reduction.
// Each invocation produces one output element: the output offset (global_idx)
// is mapped to output indices, non-reduced axes copy their index from the
// output, and reduced axes are iterated with nested loops built inside-out.
// The op-specific fragments in code_ ({loop header, loop body, loop footer})
// are spliced around the generated loops.
// NOTE(review): this body also removes CI-annotation text that was embedded
// into the extracted source; the statements themselves are unchanged.
Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
  const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
  const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
  // Empty axes (and no no-op flag) means "reduce over all axes".
  bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
  std::string loop_header = code_[0];
  std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
  std::string loop_footer = code_[2];
  const auto input_rank = input.Rank();
  // i walks the input axes; l tracks the corresponding output axis.
  for (int i = 0, l = 0; i < input_rank; ++i) {
    if (reduce_on_all_axes || std::find(axes_.begin(), axes_.end(), i) != axes_.end()) {
      // Reduced axis: wrap the accumulated loop body in a loop over this axis.
      if (keepdims_) {
        l++;  // with keepdims the reduced axis remains in the output (size 1)
      }
      std::stringstream ss;
      std::string index = "i" + std::to_string(i);
      ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n";
      ss << input.IndicesSet("input_indices", i, index) << ";\n";
      ss << loop_body << "\n";
      ss << "}\n";
      loop_body = ss.str();
    } else {
      // Non-reduced axis: its input index comes straight from the output indices.
      std::stringstream ss;
      ss << loop_header << "\n";
      std::string index = "i" + std::to_string(i);
      ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
      ss << input.IndicesSet("input_indices", i, index) << ";\n";
      loop_header = ss.str();
      l++;
    }
  }
  // Zero-initialize input_indices with one literal per input dimension.
  std::stringstream input_indices_init_value;
  for (int i = 0; i < input_rank - 1; ++i) {
    input_indices_init_value << "0, ";
  }
  input_indices_init_value << "0";
  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
                            << "let output_indices: output_indices_t = " << output.OffsetToIndices("global_idx") << ";\n"
                            << "var input_indices: input_indices_t = input_indices_t(" << input_indices_init_value.str() << ");\n"
                            << loop_header << loop_body << loop_footer;
  shader.MainFunctionBody() << output.SetByOffset("global_idx", "output_value");
  return Status::OK();
}

// Shared implementation for all WebGPU reduce kernels.
// Resolves the reduction axes (attribute or optional second input), handles
// the empty-axes / scalar no-op cases, computes the output shape, and
// dispatches a ReduceKernelProgram built from the subclass's shader snippets.
template <bool allow_multi_axes>
Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
  const auto* input_tensor = context.Input(0);
  InlinedVector<uint32_t> input_axes;
  auto rank = input_tensor->Shape().NumDimensions();
  // Normalizes a (possibly negative) ONNX axis into [0, rank) and validates it.
  auto transform_axis = [rank](int64_t axis) {
    if (axis < 0) {
      axis += rank;
    }
    if (axis < 0 || static_cast<size_t>(axis) >= rank) {
      ORT_THROW("Axes values must be in the range [-rank, rank-1]. Got: ", axis);
    }
    return static_cast<uint32_t>(axis);
  };
  // Axes come either from the optional second input (opset 18+) or from the
  // "axes" attribute (older opsets); the two are mutually exclusive.
  if (context.InputCount() > 1) {
    ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
    const Tensor* axes_tensor = context.Input<Tensor>(1);
    auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
    const auto* data = axes_tensor->Data<int64_t>();
    input_axes.reserve(size);
    std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
  } else {
    input_axes.reserve(axes_.size());
    std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
  }
  if (input_axes.empty()) {
    if (noop_with_empty_axes_ || rank == 0) {
      // If axes is empty and noop_with_empty_axes_ is true, it is a no-op according to the spec.
      // If the input tensor is a scalar, return the input tensor as is.
      // This is not correct for ReduceLogSum and ReduceSumSquare.
      // TODO(webgpu-ep): handle those ops separately instead of copying through.
      auto output = context.Output(0, input_tensor->Shape());
      if (output->DataRaw() != input_tensor->DataRaw()) {
        ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
      }
      return Status::OK();
    } else {
      // Empty axes with noop_with_empty_axes_ == false means reduce over all axes.
      input_axes.resize(rank);
      std::iota(input_axes.begin(), input_axes.end(), 0);
    }
  }
  const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
  // Compute the output shape: reduced axes are dropped, or kept as 1 with keepdims.
  std::vector<int64_t> output_shape;
  for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
    if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
      if (keepdims_) {
        output_shape.push_back(1);
      }
    } else {
      output_shape.push_back(input_tensor->Shape()[i]);
    }
  }
  TensorShape output_tensor_shape(output_shape);
  int64_t output_size = output_tensor_shape.Size();
  // Use the concrete kernel name (e.g. "ReduceMean") instead of a hard-coded
  // string so different reduce ops don't share one cached shader program.
  ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code);
  program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
      .AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
                            {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
                            {input_axes},
                            {static_cast<uint32_t>(input_axes.size())}});

  return context.RunProgram(program);
}

// ReduceMean shader snippets: accumulate a running f32 sum over the reduced
// elements, then divide by the number of reduced elements in the footer.
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
  const TensorShape& shape = input_tensor->Shape();
  const size_t rank = shape.NumDimensions();
  // Footer: compute the reduced element count at shader run time from the
  // axes uniform, then turn the accumulated sum into a mean.
  std::stringstream footer;
  footer << "var size: u32 = 1;\n"
         << "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n"
         << "  let index = " << GetElementAt("uniforms.axes", "i", axes_size) << ";\n"
         << "  size = size * " << GetElementAt("uniforms.input_shape", "index", rank) << ";\n"
         << "}\n"
         << "let output_value = output_value_t(sum / f32(size));";
  return ReduceOpSpecificCode{"var sum = f32(0);", "sum += f32(current_element);", footer.str()};
}

// Delegates to the shared reduce implementation in ReduceKernel; the
// ReduceMean-specific shader code is supplied via GetOpSpecificCode().
Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
  return ReduceKernel<true>::ComputeInternal(ctx);
}

} // namespace webgpu
} // namespace onnxruntime
62 changes: 62 additions & 0 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <array>
#include <string>

#include "core/common/optional.h"
#include "core/providers/webgpu/webgpu_supported_types.h"
#include "core/providers/webgpu/webgpu_kernel.h"
#include "core/providers/cpu/reduction/reduction_kernel_base.h"
#include "core/providers/webgpu/program.h"
#include "core/providers/webgpu/shader_helper.h"
namespace onnxruntime {
namespace webgpu {
// ReduceOpSpecificCode is a 3-element array of strings holding the op-specific
// parts of the generated reduction shader:
//   [0] loop header - emitted before the reduction loop (accumulator init),
//   [1] loop body   - emitted once per reduced element,
//   [2] loop footer - emitted after the loop (finalizes the output value).
using ReduceOpSpecificCode = std::array<std::string, 3>;

// Shader program implementing a generic axis-wise reduction; the op-specific
// behavior is injected through a ReduceOpSpecificCode triple.
class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
 public:
  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code)
      : Program{std::move(name)},
        keepdims_(keepdims),
        no_op_with_empty_axes_(no_op_with_empty_axes),
        axes_(axes.begin(), axes.end()),
        code_(std::move(code)) {}
  Status GenerateShaderCode(ShaderHelper& shader) const override;
  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
                                          {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
                                          {"axes", ProgramUniformVariableDataType::Uint32},
                                          {"axes_size", ProgramUniformVariableDataType::Uint32});

 private:
  const bool keepdims_;
  const bool no_op_with_empty_axes_;
  InlinedVector<uint32_t> axes_;
  ReduceOpSpecificCode code_;
};

// Common base for WebGPU reduce kernels. ReduceKernelBase parses the
// keepdims / noop_with_empty_axes / axes attributes; this class drives a
// ReduceKernelProgram, while subclasses supply the op-specific shader
// snippets through GetOpSpecificCode().
template <bool allow_multi_axes = true>
class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_axes> {
 protected:
  using ReduceKernelBase<allow_multi_axes>::axes_;
  using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
  using ReduceKernelBase<allow_multi_axes>::keepdims_;
  using ReduceKernelBase<allow_multi_axes>::select_last_index_;

  // `name` is the concrete op name (e.g. "ReduceMean"), used to label the
  // generated shader program. `keepdims_override` forces keepdims for ops
  // whose default differs from the attribute.
  ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
      : WebGpuKernel(info),
        ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
        name_(std::move(name)) {
  }
  // Shared reduction implementation used by all concrete reduce ops.
  Status ComputeInternal(ComputeContext& ctx) const;
  // Returns the op-specific {loop header, loop body, loop footer} shader code.
  virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;

 private:
  std::string name_;
};

// ReduceMean: reduces the selected axes by averaging their elements.
class ReduceMean final : public ReduceKernel<true> {
 public:
  // explicit: an OpKernelInfo should never implicitly convert to a kernel.
  explicit ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
  Status ComputeInternal(ComputeContext& ctx) const override;
};

} // namespace webgpu
} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -516,10 +516,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,

BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Unsqueeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Unsqueeze)>,
Expand Down
Loading