From 2c8542a25204f74d41199087465a5bc5da06618a Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov
Date: Wed, 29 Apr 2020 22:06:16 -0700
Subject: [PATCH] [SYCL] Add faster reduction implementations using atomic
 or/and intel::reduce()

Signed-off-by: Vyacheslav N Klochkov
---
 sycl/include/CL/sycl/handler.hpp         |  88 ++++++-
 sycl/include/CL/sycl/intel/reduction.hpp | 317 ++++++++++++++++++++++-
 2 files changed, 386 insertions(+), 19 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 49767539bc683..7feaecd6f7102 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -117,13 +117,37 @@ template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
 class reduction_impl;
 
+using cl::sycl::detail::enable_if_t;
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range, Reduction &Redu);
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu);
 
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                   size_t KernelRun, Reduction &Redu);
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu);
 } // namespace detail
 } // namespace intel
@@ -761,6 +785,48 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
+  /// Implements parallel_for() accepting nd_range and 1 reduction variable
+  /// having 'read_write' access mode.
+  /// This version uses fast sycl::atomic operations to update user's reduction
+  /// variable at the end of each work-group work.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
+                      Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                          Redu.MAcc);
+  }
+
+  /// Implements parallel_for() accepting nd_range and 1 reduction variable
+  /// having 'discard_write' access mode.
+  /// This version uses fast sycl::atomic operations to update user's reduction
+  /// variable at the end of each work-group work.
+  ///
+  /// The reduction variable must be initialized before the kernel is started
+  /// because atomic operations only update the value, but never initialize it.
+  /// Thus, an additional 'read_write' accessor is created/initialized with
+  /// identity value and then passed to the kernel. After running the kernel it
+  /// is copied to user's 'discard_write' accessor.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
+                      Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+    auto QueueCopy = MQueue;
+    auto RWAcc = Redu.getReadWriteScalarAcc(*this);
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                          RWAcc);
+    this->finalize();
+
+    // Copy the reduction result from RWAcc to user's 'discard_write' accessor.
+    handler CopyHandler(QueueCopy, MIsHost);
+    CopyHandler.saveCodeLoc(MCodeLoc);
+    CopyHandler.associateWithHandler(RWAcc);
+    CopyHandler.copy(RWAcc, Redu.MAcc);
+    MLastEvent = CopyHandler.finalize();
+  }
+
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   /// Performs reduction operation specified in \param Redu.
   ///
@@ -771,11 +837,6 @@ class __SYCL_EXPORT handler {
   /// globally visible, there is no need for the developer to provide
   /// a kernel name for it.
   ///
-  /// TODO: currently it calls only those versions of kernels that can handle
-  /// custom types and operations. Some of types and operations may use faster
-  /// implementations that use intel::reduce() and/or sycl::atomic.fetch_()
-  /// functions and thus provide much better performance. Those variants exist,
-  /// are fully functional. They just wait for their time for code-review.
   /// TODO: Need to handle more than 1 reduction in parallel_for().
   /// TODO: Support HOST. The kernels called by this parallel_for() may use
   /// some functionality that is not yet supported on HOST such as:
@@ -783,8 +844,8 @@
   /// optimized implementations waiting for their turn of code-review.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
-  void parallel_for(nd_range<Dims> Range, Reduction &Redu,
-                    KernelType KernelFunc) {
+  detail::enable_if_t<!Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
     size_t NWorkGroups = Range.get_group_range().size();
 
     // This parallel_for() is lowered to the following sequence:
@@ -803,7 +864,7 @@
     // 1. Call the kernel that includes user's lambda function.
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     auto QueueCopy = MQueue;
-    MLastEvent = this->finalize();
+    this->finalize();
 
     // 2. Run the additional aux kernel as many times as needed to reduce
     // all partial sums into one scalar.
@@ -823,8 +884,7 @@
       // The last group may be not fully loaded. Still register it as a group.
      if ((NWorkItems % WGSize) != 0)
        ++NWorkGroups;
-      auto Range =
-          nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
+      nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
 
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index a22f7f38249d7..cd4200884e47f 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include
+#include
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -24,6 +25,25 @@
 using cl::sycl::detail::is_geninteger64bit;
 using cl::sycl::detail::is_geninteger8bit;
 using cl::sycl::detail::remove_AS;
+template <typename T, class BinaryOperation>
+using IsReduOptForFastAtomicFetch = detail::bool_constant<
+    (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
+     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
+     std::is_same<BinaryOperation, intel::maximum<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_xor<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_and<T>>::value)>;
+
+template <typename T, class BinaryOperation>
+using IsReduOptForFastReduce = detail::bool_constant<
+    (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
+     std::is_same<T, half>::value || std::is_same<T, float>::value ||
+     std::is_same<T, double>::value) &&
+    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
+     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
+     std::is_same<BinaryOperation, intel::maximum<T>>::value)>;
+
 // Identity = 0
 template <typename T, class BinaryOperation>
 using IsZeroIdentityOp = bool_constant<
@@ -278,9 +298,16 @@ class reduction_impl {
   using binary_operation = BinaryOperation;
   using accessor_type =
       accessor<T, Dims, AccMode, access::target::global_buffer,
               access::placeholder::false_t>;
+  using rw_accessor_type =
+      accessor<T, Dims, access::mode::read_write,
+               access::target::global_buffer, access::placeholder::false_t>;
   static constexpr access::mode accessor_mode = AccMode;
   static constexpr int accessor_dim = Dims;
   static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
+  static constexpr bool has_fast_atomics =
+      IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
+  static constexpr bool has_fast_reduce =
+      IsReduOptForFastReduce<T, BinaryOperation>::value;
 
   // Only scalar (i.e. 0-dim and 1-dim with 1 element) reductions supported now.
   // TODO: suport (Dims > 1) and placeholder accessors/reductions.
@@ -367,6 +394,20 @@ class reduction_impl {
     CGH.addReduction(MOutBufPtr);
     return accessor_type(*MOutBufPtr, CGH);
   }
+
+  /// Creates 1-element global buffer initialized with identity value and
+  /// returns an accessor to that buffer.
+  accessor<T, buffer_dim, access::mode::read_write,
+           access::target::global_buffer>
+  getReadWriteScalarAcc(handler &CGH) const {
+    auto RWReduVal = std::make_shared<T>(MIdentity);
+    CGH.addReduction(RWReduVal);
+    auto RWReduBuf =
+        std::make_shared<buffer<T, 1>>(RWReduVal.get(), range<1>(1));
+    CGH.addReduction(RWReduBuf);
+    return accessor<T, buffer_dim, access::mode::read_write,
+                    access::target::global_buffer>(*RWReduBuf, CGH);
+  }
+
   /// User's accessor to where the reduction must be written.
   accessor_type MAcc;
@@ -403,6 +444,213 @@ struct get_reduction_aux_2nd_kernel_name_t {
       typename sycl::detail::get_kernel_name_t<KernelName>::name>;
 };
 
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function KernelFunc and also does one iteration of reduction
+/// of elements computed in user's lambda function.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups, then it calls fast sycl atomic operations to update
+/// user's reduction variable.
+///
+/// Briefly: calls user's lambda, intel::reduce() + atomic, INT + ADD/MIN/MAX.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: all work-groups are uniform.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's function. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      typename Reduction::binary_operation BOp;
+      Reducer.MValue = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Reducer.atomic_combine(Out.get_pointer().get());
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      typename Reduction::binary_operation BOp;
+      size_t GID = NDIt.get_global_linear_id();
+      auto Val = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      Reducer.MValue = intel::reduce(NDIt.get_group(), Val, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Reducer.atomic_combine(Out.get_pointer().get());
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function KernelFunc and also does one iteration of reduction
+/// of elements computed in user's lambda function.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups, then it calls fast sycl atomic operations to update
+/// user's reduction variable.
+///
+/// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // Use local memory to reduce elements in work-groups into zero-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm used in the kernel.
+  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
+  size_t NLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
+
+  if (IsEfficientCase) {
+    // Efficient case: uniform work-groups and WGSize is power of two.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      size_t LID = NDIt.get_local_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      LocalReds[LID] = Reducer.MValue;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      if (LID == 0) {
+        Reducer.MValue = LocalReds[0];
+        Reducer.atomic_combine(Out.get_pointer().get());
+      }
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups or WGSize is not power of two.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users who
+    // really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      auto ReduIdentity = Reducer.getIdentity();
+      LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of the tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      if (LID == 0) {
+        Reducer.MValue = BOp(LocalReds[0], LocalReds[WGSize]);
+        Reducer.atomic_combine(Out.get_pointer().get());
+      }
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that
+/// calls user's lambda function and does one iteration of reduction
+/// of elements in each of work-groups.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: user's lambda, intel::reduce(), FP + ADD/MIN/MAX.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // This additional check is needed for 'read_write' accessor case only.
+  // It does not slow-down the kernel writing to 'discard_write' accessor as
+  // the condition seems to be resolved at compile time for 'discard_write'.
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: uniform work-groups.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      // Compute the partial sum/reduction for the work-group.
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      auto V = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups require additional checks.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      // Compute the partial sum/reduction for the work-group.
+      typename Reduction::binary_operation BOp;
+      size_t GID = NDIt.get_global_linear_id();
+      size_t WGID = NDIt.get_group_linear_id();
+      auto V = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      V = intel::reduce(NDIt.get_group(), V, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  }
+}
+
 /// Implements a command group function that enqueues a kernel that calls
 /// user's lambda function \param KernelFunc and does one iteration of
 /// reduction of elements in each of work-groups.
@@ -412,8 +660,9 @@ struct get_reduction_aux_2nd_kernel_name_t {
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range, Reduction &Redu) {
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu) {
 
   size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
@@ -422,7 +671,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   // The last work-group may be not fully loaded with work, or the work group
   // size may be not power of two. Those two cases considered inefficient
   // as they require additional code and checks in the kernel.
-  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
   bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
 
   bool IsUpdateOfUserAcc =
@@ -511,6 +760,63 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   }
 }
 
+/// Implements a command group function that enqueues a kernel that does one
+/// iteration of reduction of elements in each of work-groups.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: aux kernel, intel::reduce(), reproducible results, FP + ADD/MIN/MAX
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu) {
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Get read accessor to the buffer that was used as output
+  // in the previous kernel. After that create new output buffer and
+  // get accessor to it to store the new partial sum(s).
+  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, CGH);
+
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: uniform work-groups.
+    using AuxName =
+        typename get_reduction_aux_1st_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      auto V = intel::reduce(NDIt.get_group(), In[GID], BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups require additional checks.
+    using AuxName =
+        typename get_reduction_aux_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      typename Reduction::reducer_type Reducer;
+      auto V =
+          (GID < NWorkItems) ? In[GID] : Reduction::reducer_type::getIdentity();
+      V = intel::reduce(NDIt.get_group(), V, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  }
+}
+
 /// Implements a command group function that enqueues a kernel that does one
 /// iteration of reduction of elements in each of work-groups.
 /// This version uses tree-reduction algorithm to reduce elements in each
@@ -519,8 +825,9 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
 ///
 /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                   size_t KernelRun, Reduction &Redu) {
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu) {
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
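
Reviewer note (not part of the patch): a minimal usage sketch of how the new dispatch is expected to behave. It assumes the intel::reduction(accessor, op) helper and reducer::combine() from reduction.hpp; the buffer sizes, work-group size, and kernel name below are arbitrary. An int + intel::plus<int> reduction satisfies IsReduOptForFastAtomicFetch, so handler::parallel_for() should resolve to the single-kernel fast-atomics overload added above, while a custom type or operation falls back to the tree-reduction path plus the auxiliary kernel.

// Illustrative sketch only; assumes the intel::reduction() helper and the
// accessor-based reduction API from this header.
#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  constexpr size_t N = 1024;
  buffer<int, 1> InBuf{range<1>(N)};   // (Input data initialization omitted.)
  buffer<int, 1> SumBuf{range<1>(1)};

  Q.submit([&](handler &CGH) {
    auto In = InBuf.get_access<access::mode::read>(CGH);
    auto Sum = SumBuf.get_access<access::mode::read_write>(CGH);
    // int + intel::plus<int> satisfies IsReduOptForFastAtomicFetch, so this
    // call is expected to pick the enable_if_t<...has_fast_atomics> overload:
    // one kernel, intel::reduce() per work-group, then one atomic update of
    // Sum[0] per work-group.
    CGH.parallel_for<class FastIntSum>(
        nd_range<1>(range<1>(N), range<1>(64)),
        intel::reduction(Sum, intel::plus<int>()),
        [=](nd_item<1> NDIt, auto &Reducer) {
          Reducer.combine(In[NDIt.get_global_linear_id()]);
        });
  });
  Q.wait();
  return 0;
}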
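
Also for reviewers, a small host-side model (not part of the patch) of the WGSize+1 tree-reduction used in the non-power-of-two / non-uniform kernels above: Vals plays the role of LocalReds, Op of BOp, and the extra slot of LocalReds[WGSize]. The helper name treeReduceWithExtraSlot and its parameters are made up for this sketch. It shows why the element at index PrevStep - 1 is folded into the extra slot whenever PrevStep is odd: without that, its value would be dropped once CurStep halves.

// Host-side model of the WGSize+1 tree-reduction; illustrative only.
#include <cstddef>
#include <vector>

template <typename T, typename BinOp>
T treeReduceWithExtraSlot(std::vector<T> Vals, T Identity, BinOp Op) {
  std::size_t WGSize = Vals.size();
  Vals.push_back(Identity); // The extra element, LocalReds[WGSize] in the patch.
  std::size_t PrevStep = WGSize;
  for (std::size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (std::size_t LID = 0; LID < WGSize; ++LID) {
      if (LID < CurStep)
        Vals[LID] = Op(Vals[LID], Vals[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        // When PrevStep is odd, element PrevStep-1 has no partner; fold it
        // into the extra slot so it is not lost.
        Vals[WGSize] = Op(Vals[WGSize], Vals[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  return Op(Vals[0], Vals[WGSize]);
}

For example, with WGSize == 5 and a plus operation the helper returns the sum of all five inputs even though 5 is not a power of two, which mirrors what the kernel computes before calling atomic_combine().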