From 2c8542a25204f74d41199087465a5bc5da06618a Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov
Date: Wed, 29 Apr 2020 22:06:16 -0700
Subject: [PATCH] [SYCL] Add faster reduction implementations using atomic
 or/and intel::reduce()

Signed-off-by: Vyacheslav N Klochkov
---
 sycl/include/CL/sycl/handler.hpp         |  88 ++++++-
 sycl/include/CL/sycl/intel/reduction.hpp | 317 ++++++++++++++++++++++-
 2 files changed, 386 insertions(+), 19 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 49767539bc683..7feaecd6f7102 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -117,13 +117,37 @@ template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
 class reduction_impl;
 
+using cl::sycl::detail::enable_if_t;
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range, Reduction &Redu);
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu);
 
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                   size_t KernelRun, Reduction &Redu);
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu);
 } // namespace detail
 } // namespace intel
@@ -761,6 +785,48 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
+  /// Implements parallel_for() accepting nd_range and 1 reduction variable
+  /// having 'read_write' access mode.
+  /// This version uses fast sycl::atomic operations to update user's reduction
+  /// variable at the end of each work-group work.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
+                      Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                          Redu.MAcc);
+  }
+
+  /// Implements parallel_for() accepting nd_range and 1 reduction variable
+  /// having 'discard_write' access mode.
+  /// This version uses fast sycl::atomic operations to update user's reduction
+  /// variable at the end of each work-group work.
+  ///
+  /// The reduction variable must be initialized before the kernel is started
+  /// because atomic operations only update the value, but never initialize it.
+  /// Thus, an additional 'read_write' accessor is created/initialized with
+  /// identity value and then passed to the kernel. After running the kernel it
+  /// is copied to user's 'discard_write' accessor.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
+                      Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+    auto QueueCopy = MQueue;
+    auto RWAcc = Redu.getReadWriteScalarAcc(*this);
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                          RWAcc);
+    this->finalize();
+
+    // Copy the reduction result from RWAcc to user's 'discard_write' accessor.
+    handler CopyHandler(QueueCopy, MIsHost);
+    CopyHandler.saveCodeLoc(MCodeLoc);
+    CopyHandler.associateWithHandler(RWAcc);
+    CopyHandler.copy(RWAcc, Redu.MAcc);
+    MLastEvent = CopyHandler.finalize();
+  }
+
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   /// Performs reduction operation specified in \param Redu.
   ///
@@ -771,11 +837,6 @@ class __SYCL_EXPORT handler {
   /// globally visible, there is no need for the developer to provide
   /// a kernel name for it.
   ///
-  /// TODO: currently it calls only those versions of kernels that can handle
-  /// custom types and operations. Some of types and operations may use faster
-  /// implementations that use intel::reduce() and/or sycl::atomic.fetch_()
-  /// functions and thus provide much better performance. Those variants exist,
-  /// are fully functional. They just wait for their time for code-review.
   /// TODO: Need to handle more than 1 reduction in parallel_for().
   /// TODO: Support HOST. The kernels called by this parallel_for() may use
   /// some functionality that is not yet supported on HOST such as:
@@ -783,8 +844,8 @@
   /// optimized implementations waiting for their turn of code-review.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
-  void parallel_for(nd_range<Dims> Range, Reduction &Redu,
-                    KernelType KernelFunc) {
+  detail::enable_if_t<!Reduction::has_fast_atomics>
+  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
     size_t NWorkGroups = Range.get_group_range().size();
 
     // This parallel_for() is lowered to the following sequence:
@@ -803,7 +864,7 @@
     // 1. Call the kernel that includes user's lambda function.
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     auto QueueCopy = MQueue;
-    MLastEvent = this->finalize();
+    this->finalize();
 
     // 2. Run the additional aux kernel as many times as needed to reduce
     // all partial sums into one scalar.
@@ -823,8 +884,7 @@
       // The last group may be not fully loaded. Still register it as a group.
      if ((NWorkItems % WGSize) != 0)
        ++NWorkGroups;
-      auto Range =
-          nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
+      nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
 
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index a22f7f38249d7..cd4200884e47f 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include
+#include
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -24,6 +25,25 @@
 using cl::sycl::detail::is_geninteger64bit;
 using cl::sycl::detail::is_geninteger8bit;
 using cl::sycl::detail::remove_AS;
+template <typename T, class BinaryOperation>
+using IsReduOptForFastAtomicFetch = detail::bool_constant<
+    (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
+     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
+     std::is_same<BinaryOperation, intel::maximum<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_xor<T>>::value ||
+     std::is_same<BinaryOperation, intel::bit_and<T>>::value)>;
+
+template <typename T, class BinaryOperation>
+using IsReduOptForFastReduce = detail::bool_constant<
+    (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
+     std::is_same<T, half>::value || std::is_same<T, float>::value ||
+     std::is_same<T, double>::value) &&
+    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
+     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
+     std::is_same<BinaryOperation, intel::maximum<T>>::value)>;
+
 // Identity = 0
 template <typename T, class BinaryOperation>
 using IsZeroIdentityOp = bool_constant<
@@ -278,9 +298,16 @@ class reduction_impl {
   using binary_operation = BinaryOperation;
   using accessor_type =
       accessor<T, Dims, AccMode, access::target::global_buffer,
               access::placeholder::false_t>;
+  using rw_accessor_type =
+      accessor<T, Dims, access::mode::read_write,
+               access::target::global_buffer, access::placeholder::false_t>;
   static constexpr access::mode accessor_mode = AccMode;
   static constexpr int accessor_dim = Dims;
   static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
+  static constexpr bool has_fast_atomics =
+      IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
+  static constexpr bool has_fast_reduce =
+      IsReduOptForFastReduce<T, BinaryOperation>::value;
 
   // Only scalar (i.e. 0-dim and 1-dim with 1 element) reductions supported now.
   // TODO: suport (Dims > 1) and placeholder accessors/reductions.
@@ -367,6 +394,20 @@ class reduction_impl {
     CGH.addReduction(MOutBufPtr);
     return accessor_type(*MOutBufPtr, CGH);
   }
+
+  /// Creates 1-element global buffer initialized with identity value and
+  /// returns an accessor to that buffer.
+  accessor<T, buffer_dim, access::mode::read_write,
+           access::target::global_buffer>
+  getReadWriteScalarAcc(handler &CGH) const {
+    auto RWReduVal = std::make_shared<T>(MIdentity);
+    CGH.addReduction(RWReduVal);
+    auto RWReduBuf =
+        std::make_shared<buffer<T, 1>>(RWReduVal.get(), range<1>(1));
+    CGH.addReduction(RWReduBuf);
+    return accessor<T, buffer_dim, access::mode::read_write,
+                    access::target::global_buffer>(*RWReduBuf, CGH);
+  }
+
   /// User's accessor to where the reduction must be written.
   accessor_type MAcc;
@@ -403,6 +444,213 @@ struct get_reduction_aux_2nd_kernel_name_t {
       typename sycl::detail::get_kernel_name_t<KernelName>::name>;
 };
 
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function KernelFunc and also does one iteration of reduction
+/// of elements computed in user's lambda function.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups, then it calls fast sycl atomic operations to update
+/// user's reduction variable.
+///
+/// Briefly: calls user's lambda, intel::reduce() + atomic, INT + ADD/MIN/MAX.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: all work-groups are uniform.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's function. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      typename Reduction::binary_operation BOp;
+      Reducer.MValue = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Reducer.atomic_combine(Out.get_pointer().get());
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      typename Reduction::binary_operation BOp;
+      size_t GID = NDIt.get_global_linear_id();
+      auto Val = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      Reducer.MValue = intel::reduce(NDIt.get_group(), Val, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Reducer.atomic_combine(Out.get_pointer().get());
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function KernelFunc and also does one iteration of reduction
+/// of elements computed in user's lambda function.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups, then it calls fast sycl atomic operations to update
+/// user's reduction variable.
+///
+/// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu, typename Reduction::rw_accessor_type &Out) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // Use local memory to reduce elements in work-groups into zero-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm used in the kernel.
+  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
+  size_t NLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
+
+  if (IsEfficientCase) {
+    // Efficient case: uniform work-groups and WGSize is power of two.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      size_t LID = NDIt.get_local_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      LocalReds[LID] = Reducer.MValue;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      if (LID == 0) {
+        Reducer.MValue = LocalReds[0];
+        Reducer.atomic_combine(Out.get_pointer().get());
+      }
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups or WGSize is not power of two.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users who
+    // really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      auto ReduIdentity = Reducer.getIdentity();
+      LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of the tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      if (LID == 0) {
+        Reducer.MValue = BOp(LocalReds[0], LocalReds[WGSize]);
+        Reducer.atomic_combine(Out.get_pointer().get());
+      }
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that
+/// calls user's lambda function and does one iteration of reduction
+/// of elements in each of work-groups.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: user's lambda, intel::reduce(), FP + ADD/MIN/MAX.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // This additional check is needed for 'read_write' accessor case only.
+  // It does not slow-down the kernel writing to 'discard_write' accessor as
+  // the condition seems to be resolved at compile time for 'discard_write'.
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: uniform work-groups.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      // Compute the partial sum/reduction for the work-group.
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      auto V = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups require additional checks.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      KernelFunc(NDIt, Reducer);
+
+      // Compute the partial sum/reduction for the work-group.
+      typename Reduction::binary_operation BOp;
+      size_t GID = NDIt.get_global_linear_id();
+      size_t WGID = NDIt.get_group_linear_id();
+      auto V = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      V = intel::reduce(NDIt.get_group(), V, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  }
+}
+
 /// Implements a command group function that enqueues a kernel that calls
 /// user's lambda function \param KernelFunc and does one iteration of
 /// reduction of elements in each of work-groups.
@@ -412,8 +660,9 @@ struct get_reduction_aux_2nd_kernel_name_t {
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range, Reduction &Redu) {
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
+           Reduction &Redu) {
 
   size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
@@ -422,7 +671,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   // The last work-group may be not fully loaded with work, or the work group
   // size may be not power of two. Those two cases considered inefficient
   // as they require additional code and checks in the kernel.
-  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
   bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
 
   bool IsUpdateOfUserAcc =
@@ -511,6 +760,63 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   }
 }
 
+/// Implements a command group function that enqueues a kernel that does one
+/// iteration of reduction of elements in each of work-groups.
+/// This version uses intel::reduce() algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: aux kernel, intel::reduce(), reproducible results, FP + ADD/MIN/MAX
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu) {
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Get read accessor to the buffer that was used as output
+  // in the previous kernel. After that create new output buffer and
+  // get accessor to it to store the new partial sum(s).
+  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, CGH);
+
+  if (NWorkGroups * WGSize == NWorkItems) {
+    // Efficient case: uniform work-groups.
+    using AuxName =
+        typename get_reduction_aux_1st_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      auto V = intel::reduce(NDIt.get_group(), In[GID], BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  } else {
+    // Inefficient case: non-uniform work-groups require additional checks.
+    using AuxName =
+        typename get_reduction_aux_2nd_kernel_name_t<KernelName>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      typename Reduction::binary_operation BOp;
+      size_t WGID = NDIt.get_group_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      typename Reduction::reducer_type Reducer;
+      auto V =
+          (GID < NWorkItems) ? In[GID] : Reduction::reducer_type::getIdentity();
+      V = intel::reduce(NDIt.get_group(), V, BOp);
+      if (NDIt.get_local_linear_id() == 0)
+        Out.get_pointer().get()[WGID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+    });
+  }
+}
+
 /// Implements a command group function that enqueues a kernel that does one
 /// iteration of reduction of elements in each of work-groups.
 /// This version uses tree-reduction algorithm to reduce elements in each
@@ -519,8 +825,9 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
 ///
 /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                   size_t KernelRun, Reduction &Redu) {
+enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+              size_t KernelRun, Reduction &Redu) {
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
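
Reviewer note (not part of the patch): a minimal usage sketch of how the new dispatch is expected to behave. It assumes the intel::reduction(accessor, op) helper and reducer::combine() from reduction.hpp; the buffer sizes, work-group size, and kernel name below are arbitrary. An int + intel::plus<int> reduction satisfies IsReduOptForFastAtomicFetch, so handler::parallel_for() should resolve to the single-kernel fast-atomics overload added above, while a custom type or operation falls back to the tree-reduction path plus the auxiliary kernel.

// Illustrative sketch only; assumes the intel::reduction() helper and the
// accessor-based reduction API from this header.
#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  constexpr size_t N = 1024;
  buffer<int, 1> InBuf{range<1>(N)};   // (Input data initialization omitted.)
  buffer<int, 1> SumBuf{range<1>(1)};

  Q.submit([&](handler &CGH) {
    auto In = InBuf.get_access<access::mode::read>(CGH);
    auto Sum = SumBuf.get_access<access::mode::read_write>(CGH);
    // int + intel::plus<int> satisfies IsReduOptForFastAtomicFetch, so this
    // call is expected to pick the enable_if_t<...has_fast_atomics> overload:
    // one kernel, intel::reduce() per work-group, then one atomic update of
    // Sum[0] per work-group.
    CGH.parallel_for<class FastIntSum>(
        nd_range<1>(range<1>(N), range<1>(64)),
        intel::reduction(Sum, intel::plus<int>()),
        [=](nd_item<1> NDIt, auto &Reducer) {
          Reducer.combine(In[NDIt.get_global_linear_id()]);
        });
  });
  Q.wait();
  return 0;
}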
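
Also for reviewers, a small host-side model (not part of the patch) of the WGSize+1 tree-reduction used in the non-power-of-two / non-uniform kernels above: Vals plays the role of LocalReds, Op of BOp, and the extra slot of LocalReds[WGSize]. The helper name treeReduceWithExtraSlot and its parameters are made up for this sketch. It shows why the element at index PrevStep - 1 is folded into the extra slot whenever PrevStep is odd: without that, its value would be dropped once CurStep halves.

// Host-side model of the WGSize+1 tree-reduction; illustrative only.
#include <cstddef>
#include <vector>

template <typename T, typename BinOp>
T treeReduceWithExtraSlot(std::vector<T> Vals, T Identity, BinOp Op) {
  std::size_t WGSize = Vals.size();
  Vals.push_back(Identity); // The extra element, LocalReds[WGSize] in the patch.
  std::size_t PrevStep = WGSize;
  for (std::size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (std::size_t LID = 0; LID < WGSize; ++LID) {
      if (LID < CurStep)
        Vals[LID] = Op(Vals[LID], Vals[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        // When PrevStep is odd, element PrevStep-1 has no partner; fold it
        // into the extra slot so it is not lost.
        Vals[WGSize] = Op(Vals[WGSize], Vals[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  return Op(Vals[0], Vals[WGSize]);
}

For example, with WGSize == 5 and a plus operation the helper returns the sum of all five inputs even though 5 is not a power of two, which mirrors what the kernel computes before calling atomic_combine().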