Skip to content

Commit 09a3a5a

Browse files
authored
Revert "Extract parallel_for_each_reduce_over_dim_output_index from argmin parallelization PoC (#9139)" (#9273)
This reverts commit 8334bb6.
1 parent: cfd0bc4 · commit: 09a3a5a

File tree

4 files changed

+13
-24
lines changed

4 files changed

+13
-24
lines changed

kernels/portable/cpu/op_argmin.cpp

+12-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
1414
#include <executorch/runtime/kernel/kernel_includes.h>
15+
#include <executorch/runtime/kernel/thread_parallel_interface.h>
1516
#include <executorch/runtime/platform/assert.h>
1617

1718
namespace torch {
@@ -47,8 +48,17 @@ Tensor& argmin_out(
4748
ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
4849
long* out_data = out.mutable_data_ptr<long>();
4950

50-
const bool success = parallel_for_each_reduce_over_dim_output_index(
51-
in, dim, out, [&](const auto begin, const auto end) {
51+
// REVIEW: this is the parallelization strategy ATen uses
52+
// specifically when the reduction is along the last dimension and
53+
// that dimension is contiguous. Is there any particular reason we
54+
// shouldn't just always use this strategy since we aren't
55+
// otherwise capable of parallelizing reductions?
56+
const int64_t reduction_size = get_reduced_dim_product(in, dim);
57+
const auto grain_size = std::max(
58+
static_cast<int64_t>(1),
59+
executorch::extension::internal::GRAIN_SIZE / reduction_size);
60+
const bool success = executorch::extension::parallel_for(
61+
0, out.numel(), grain_size, [&](const auto begin, const auto end) {
5262
for (const auto out_ix : c10::irange(begin, end)) {
5363
std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
5464
[](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {

kernels/portable/cpu/util/reduce_util.h

-19
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
#include <executorch/runtime/core/exec_aten/exec_aten.h>
1212
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
13-
#include <executorch/runtime/kernel/thread_parallel_interface.h>
1413
#include <cstring>
1514
#include <tuple>
1615

@@ -812,23 +811,5 @@ bool check_prod_out_args(
812811

813812
#endif
814813

815-
/**
816-
* parallel_for wrapper for reductions that call reduce_over_dim or
817-
* map_reduce_over_dim for each output element. Automatically
818-
* calculates appropriate grain size.
819-
*/
820-
template <typename Func>
821-
[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index(
822-
const Tensor& in,
823-
optional<int64_t> dim,
824-
const Tensor& out,
825-
const Func& func) {
826-
const int64_t reduction_size = get_reduced_dim_product(in, dim);
827-
const auto grain_size = std::max(
828-
static_cast<int64_t>(1),
829-
executorch::extension::internal::GRAIN_SIZE / reduction_size);
830-
return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
831-
}
832-
833814
} // namespace executor
834815
} // namespace torch

kernels/portable/cpu/util/targets.bzl

-3
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,6 @@ def define_common_targets():
314314
"//executorch/runtime/kernel:kernel_includes{}".format(suffix),
315315
"//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
316316
],
317-
exported_deps = [
318-
"//executorch/runtime/kernel:thread_parallel_interface",
319-
],
320317
exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
321318
visibility = [
322319
"//executorch/extension/llm/custom_ops/...",

shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl

+1
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ ATEN_OPS = (
284284
name = "op_argmin",
285285
deps = [
286286
"//executorch/kernels/portable/cpu/util:reduce_util",
287+
"//executorch/runtime/kernel:thread_parallel_interface",
287288
],
288289
),
289290
op_target(

0 commit comments

Comments (0)