[SYCL] Implement SYCL2020 reductions with initialize_to_identity prop (#3410)

v-klochkov · web-flow · commit 3473c1a4501b · 2021-03-30T21:26:06.000-07:00
* [SYCL] Implement SYCL2020 reductions with set initialize_to_identity property The corresponding LIT tests PR: intel/llvm-test-suite#194 SYCL2020 reductions use dynamic information (instead of static info used previously) to ask to do discard-write to user's reduction variable or USM memory. That caused big changes removing/re-structuring the code that previously relied on accessor-mode (read-write vs discard-write). With this patch the SYCL-2020 reductions become on par with ONEAPI::reduction Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
diff --git a/sycl/include/CL/sycl/detail/property_helper.hpp b/sycl/include/CL/sycl/detail/property_helper.hpp
@@ -31,8 +31,8 @@ enum DataLessPropKind {
   NoInit = 4,
   BufferUsePinnedHostMemory = 5,
   UsePrimaryContext = 6,
-  DataLessPropKindSize = 7,
-  InitializeToIdentity = 8
+  InitializeToIdentity = 7,
+  DataLessPropKindSize = 8
 };
 
 // List of all properties with data IDs
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -223,17 +223,16 @@ checkValueRange(const T &V) {
 namespace ONEAPI {
 namespace detail {
 template <typename T, class BinaryOperation, int Dims, bool IsUSM,
-          access::mode AccMode, access::placeholder IsPlaceholder>
+          access::placeholder IsPlaceholder>
 class reduction_impl;
 
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;
 
-template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          typename OutputT>
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
-           Reduction &Redu, OutputT Out);
+           Reduction &Redu);
 
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<!Reduction::has_fast_atomics>
@@ -258,6 +257,26 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
                      std::tuple<Reductions...> &ReduTuple,
                      std::index_sequence<Is...>);
 
+template <typename KernelName, class Reduction>
+std::enable_if_t<!Reduction::is_usm>
+reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
+
+template <typename KernelName, class Reduction>
+std::enable_if_t<Reduction::is_usm>
+reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
+
+template <typename... Reduction, size_t... Is>
+shared_ptr_class<event>
+reduSaveFinalResultToUserMem(shared_ptr_class<detail::queue_impl> Queue,
+                             bool IsHost, std::tuple<Reduction...> &ReduTuple,
+                             std::index_sequence<Is...>);
+
+template <typename Reduction, typename... RestT>
+std::enable_if_t<!Reduction::is_usm>
+reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
+                                   shared_ptr_class<detail::queue_impl> Queue,
+                                   bool IsHost, Reduction &Redu, RestT... Rest);
+
 __SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
                                       size_t LocalMemBytesPerWorkItem);
 
@@ -1159,73 +1178,43 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
-  /// Implements parallel_for() accepting nd_range and 1 reduction variable
-  /// having 'read_write' access mode.
-  /// This version uses fast sycl::atomic operations to update user's reduction
+  /// Implements parallel_for() accepting nd_range \p Range and one reduction
+  /// object. This version uses fast sycl::atomic operations to update reduction
   /// variable at the end of each work-group work.
+  //
+  // If the reduction variable must be initialized with the identity value
+  // before the kernel run, then an additional working accessor is created,
+  // initialized with the identity value and used in the kernel. That working
+  // accessor is then copied to user's accessor or USM pointer after
+  // the kernel run.
+  // For USM pointers without initialize_to_identity properties the same scheme
+  // with working accessor is used as re-using user's USM pointer in the kernel
+  // would require creation of another variant of user's kernel, which does not
+  // seem efficient.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
-  detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
-                      Reduction::has_fast_atomics && !Reduction::is_usm>
-  parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
-    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                           Redu.getUserAccessor());
-  }
-
-  /// Implements parallel_for() accepting nd_range and 1 reduction variable
-  /// having 'read_write' access mode.
-  /// This version uses fast sycl::atomic operations to update user's reduction
-  /// variable at the end of each work-group work.
-  template <typename KernelName = detail::auto_name, typename KernelType,
-            int Dims, typename Reduction>
-  detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
-                      Reduction::has_fast_atomics && Reduction::is_usm>
-  parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
-    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                           Redu.getUSMPointer());
-  }
-
-  /// Implements parallel_for() accepting nd_range and 1 reduction variable
-  /// having 'discard_write' access mode.
-  /// This version uses fast sycl::atomic operations to update user's reduction
-  /// variable at the end of each work-group work.
-  ///
-  /// The reduction variable must be initialized before the kernel is started
-  /// because atomic operations only update the value, but never initialize it.
-  /// Thus, an additional 'read_write' accessor is created/initialized with
-  /// identity value and then passed to the kernel. After running the kernel it
-  /// is copied to user's 'discard_write' accessor.
-  template <typename KernelName = detail::auto_name, typename KernelType,
-            int Dims, typename Reduction>
-  detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
-                      Reduction::has_fast_atomics>
+  detail::enable_if_t<Reduction::has_fast_atomics>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
                _KERNELFUNCPARAM(KernelFunc)) {
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
-    auto RWAcc = Redu.getReadWriteScalarAcc(*this);
-    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                           RWAcc);
-    this->finalize();
+    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
 
-    // Copy from RWAcc to user's reduction accessor.
-    handler CopyHandler(QueueCopy, MIsHost);
-    CopyHandler.saveCodeLoc(MCodeLoc);
-#ifndef __SYCL_DEVICE_ONLY__
-    CopyHandler.associateWithHandler(&RWAcc, access::target::global_buffer);
-    Redu.associateWithHandler(CopyHandler);
-#endif
-    CopyHandler.copy(RWAcc, Redu.getUserAccessor());
-    MLastEvent = CopyHandler.finalize();
+    if (Reduction::is_usm || Redu.initializeToIdentity()) {
+      this->finalize();
+      handler CopyHandler(QueueCopy, MIsHost);
+      CopyHandler.saveCodeLoc(MCodeLoc);
+      ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                               Redu);
+      MLastEvent = CopyHandler.finalize();
+    }
   }
 
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
-  /// Performs reduction operation specified in \param Redu.
+  /// Performs reduction operation specified in \p Redu.
   ///
   /// The SYCL kernel function is defined as a lambda function or a named
   /// function object type and given an id or item for indexing in the indexing
-  /// space defined by range.
+  /// space defined by \p Range.
   /// If it is a named function object and the function object type is
   /// globally visible, there is no need for the developer to provide
   /// a kernel name for it.
@@ -1300,13 +1289,50 @@ class __SYCL_EXPORT handler {
           AuxHandler, NWorkItems, MaxWGSize, Redu);
       MLastEvent = AuxHandler.finalize();
     } // end while (NWorkItems > 1)
+
+    if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
+      handler CopyHandler(QueueCopy, MIsHost);
+      CopyHandler.saveCodeLoc(MCodeLoc);
+      ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                               Redu);
+      MLastEvent = CopyHandler.finalize();
+    }
   }
 
   // This version of parallel_for may handle one or more reductions packed in
   // \p Rest argument. Note thought that the last element in \p Rest pack is
   // the kernel function.
   // TODO: this variant is currently enabled for 2+ reductions only as the
   // versions handling 1 reduction variable are more efficient right now.
+  //
+  // Algorithm:
+  // 1) discard_write accessor (DWAcc), InitializeToIdentity = true:
+  //    a) Create uninitialized buffer and read_write accessor (RWAcc).
+  //    b) discard-write partial sums to RWAcc.
+  //    c) Repeat the steps (a) and (b) to get one final sum.
+  //    d) Copy RWAcc to DWAcc.
+  // 2) read_write accessor (RWAcc), InitializeToIdentity = false:
+  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
+  //       re-use user's RWAcc (if #work-groups is 1).
+  //    b) discard-write to RWAcc (#WG > 1), or update-write (#WG == 1).
+  //    c) Repeat the steps (a) and (b) to get one final sum.
+  // 3) read_write accessor (RWAcc), InitializeToIdentity = true:
+  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
+  //       re-use user's RWAcc (if #work-groups is 1).
+  //    b) discard-write to RWAcc.
+  //    c) Repeat the steps (a) and (b) to get one final sum.
+  // 4) USM pointer, InitializeToIdentity = false:
+  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
+  //       re-use user's USM pointer (if #work-groups is 1).
+  //    b) discard-write to RWAcc (#WG > 1) or
+  //       update-write to USM pointer (#WG == 1).
+  //    c) Repeat the steps (a) and (b) to get one final sum.
+  // 5) USM pointer, InitializeToIdentity = true:
+  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
+  //       re-use user's USM pointer (if #work-groups is 1).
+  //    b) discard-write to RWAcc (#WG > 1) or
+  //       discard-write to USM pointer (#WG == 1).
+  //    c) Repeat the steps (a) and (b) to get one final sum.
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
   std::enable_if_t<(sizeof...(RestT) >= 3 &&
@@ -1348,6 +1374,11 @@ class __SYCL_EXPORT handler {
               AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
       MLastEvent = AuxHandler.finalize();
     } // end while (NWorkItems > 1)
+
+    auto CopyEvent = ONEAPI::detail::reduSaveFinalResultToUserMem(
+        QueueCopy, MIsHost, ReduTuple, ReduIndices);
+    if (CopyEvent)
+      MLastEvent = *CopyEvent;
   }
 
   /// Hierarchical kernel invocation method of a kernel defined as a lambda
@@ -2085,9 +2116,17 @@ class __SYCL_EXPORT handler {
   // Make reduction_impl friend to store buffers and arrays created for it
   // in handler from reduction_impl methods.
   template <typename T, class BinaryOperation, int Dims, bool IsUSM,
-            access::mode AccMode, access::placeholder IsPlaceholder>
+            access::placeholder IsPlaceholder>
   friend class ONEAPI::detail::reduction_impl;
 
+  // This method needs to call the method finalize().
+  template <typename Reduction, typename... RestT>
+  std::enable_if_t<!Reduction::is_usm> friend ONEAPI::detail::
+      reduSaveFinalResultToUserMemHelper(
+          std::vector<event> &Events,
+          shared_ptr_class<detail::queue_impl> Queue, bool IsHost, Reduction &,
+          RestT...);
+
   friend void detail::associateWithHandler(handler &,
                                            detail::AccessorBaseHost *,
                                            access::target);
diff --git a/sycl/include/CL/sycl/reduction.hpp b/sycl/include/CL/sycl/reduction.hpp
@@ -41,16 +41,12 @@ inline constexpr AccumulatorT known_identity_v =
 template <typename T, typename AllocatorT, typename BinaryOperation>
 std::enable_if_t<has_known_identity<BinaryOperation, T>::value,
                  ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
-                                                access::mode::read_write,
                                                 access::placeholder::true_t>>
 reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
           const property_list &PropList = {}) {
-  // TODO: need to handle 'PropList'.
-  if (PropList.has_property<property::reduction::initialize_to_identity>())
-    throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
-                        "property is not supported yet.",
-                        PI_INVALID_VALUE);
-  return {Var, CGH};
+  bool InitializeToIdentity =
+      PropList.has_property<property::reduction::initialize_to_identity>();
+  return {Var, CGH, InitializeToIdentity};
 }
 
 /// Constructs a reduction object using the given buffer \p Var, handler \p CGH,
@@ -60,7 +56,6 @@ reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
 template <typename T, typename AllocatorT, typename BinaryOperation>
 std::enable_if_t<!has_known_identity<BinaryOperation, T>::value,
                  ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
-                                                access::mode::read_write,
                                                 access::placeholder::true_t>>
 reduction(buffer<T, 1, AllocatorT>, handler &, BinaryOperation,
           const property_list &PropList = {}) {
@@ -76,14 +71,11 @@ reduction(buffer<T, 1, AllocatorT>, handler &, BinaryOperation,
 /// \p Combiner, and optional reduction properties.
 template <typename T, typename BinaryOperation>
 std::enable_if_t<has_known_identity<BinaryOperation, T>::value,
-                 ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
-                                                access::mode::read_write>>
+                 ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>>
 reduction(T *Var, BinaryOperation, const property_list &PropList = {}) {
-  if (PropList.has_property<property::reduction::initialize_to_identity>())
-    throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
-                        "property is not supported yet.",
-                        PI_INVALID_VALUE);
-  return {Var};
+  bool InitializeToIdentity =
+      PropList.has_property<property::reduction::initialize_to_identity>();
+  return {Var, InitializeToIdentity};
 }
 
 /// Constructs a reduction object using the reduction variable referenced by
@@ -93,8 +85,7 @@ reduction(T *Var, BinaryOperation, const property_list &PropList = {}) {
 /// reduction identity is not known statically and it is not provided by user.
 template <typename T, typename BinaryOperation>
 std::enable_if_t<!has_known_identity<BinaryOperation, T>::value,
-                 ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
-                                                access::mode::read_write>>
+                 ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>>
 reduction(T *, BinaryOperation, const property_list &PropList = {}) {
   // TODO: implement reduction that works even when identity is not known.
   (void)PropList;
@@ -108,32 +99,24 @@ reduction(T *, BinaryOperation, const property_list &PropList = {}) {
 /// and optional reduction properties.
 template <typename T, typename AllocatorT, typename BinaryOperation>
 ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
-                               access::mode::read_write,
                                access::placeholder::true_t>
 reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
           BinaryOperation Combiner, const property_list &PropList = {}) {
-  // TODO: need to handle 'PropList'.
-  if (PropList.has_property<property::reduction::initialize_to_identity>())
-    throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
-                        "property is not supported yet.",
-                        PI_INVALID_VALUE);
-  return {Var, CGH, Identity, Combiner};
+  bool InitializeToIdentity =
+      PropList.has_property<property::reduction::initialize_to_identity>();
+  return {Var, CGH, Identity, Combiner, InitializeToIdentity};
 }
 
 /// Constructs a reduction object using the reduction variable referenced by
 /// the given USM pointer \p Var, reduction identity value \p Identity,
 /// binary operation \p Combiner, and optional reduction properties.
 template <typename T, typename BinaryOperation>
-ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
-                               access::mode::read_write>
+ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>
 reduction(T *Var, const T &Identity, BinaryOperation Combiner,
           const property_list &PropList = {}) {
-  // TODO: need to handle 'PropList'.
-  if (PropList.has_property<property::reduction::initialize_to_identity>())
-    throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
-                        "property is not supported yet.",
-                        PI_INVALID_VALUE);
-  return {Var, Identity, Combiner};
+  bool InitializeToIdentity =
+      PropList.has_property<property::reduction::initialize_to_identity>();
+  return {Var, Identity, Combiner, InitializeToIdentity};
 }
 
 } // namespace sycl
diff --git a/sycl/test/abi/layout_buffer.cpp b/sycl/test/abi/layout_buffer.cpp
@@ -28,7 +28,7 @@ void foo(sycl::buffer<int, 2>) {}
 // CHECK-NEXT:  24 |                 class sycl::detail::SYCLMemObjAllocator * _M_head_impl
 // CHECK-NEXT:  32 |     class sycl::property_list MProps
 // CHECK-NEXT:  32 |       class sycl::detail::PropertyListBase (base)
-// CHECK-NEXT:  32 |         class std::bitset<7> MDataLessProps
+// CHECK-NEXT:  32 |         class std::bitset<8> MDataLessProps
 // CHECK-NEXT:  32 |           struct std::_Base_bitset<1> (base)
 // CHECK-NEXT:  32 |             std::_Base_bitset<1>::_WordT _M_w
 // CHECK-NEXT:  40 |         class std::vector<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > MPropsWithData
diff --git a/sycl/test/abi/layout_image.cpp b/sycl/test/abi/layout_image.cpp
@@ -29,7 +29,7 @@ sycl::image<2> Img{sycl::image_channel_order::rgba, sycl::image_channel_type::fp
 // CHECK-NEXT: 24 |                 class sycl::detail::SYCLMemObjAllocator * _M_head_impl
 // CHECK-NEXT: 32 |     class sycl::property_list MProps
 // CHECK-NEXT: 32 |       class sycl::detail::PropertyListBase (base)
-// CHECK-NEXT: 32 |         class std::bitset<7> MDataLessProps
+// CHECK-NEXT: 32 |         class std::bitset<8> MDataLessProps
 // CHECK-NEXT: 32 |           struct std::_Base_bitset<1> (base)
 // CHECK-NEXT: 32 |             std::_Base_bitset<1>::_WordT _M_w
 // CHECK-NEXT: 40 |         class std::vector<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > MPropsWithData