@@ -107,32 +107,6 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
__SYCL_EXPORT device getDeviceFromHandler(handler &);
- /// These are the forward declarations for the classes that help to create
- /// names for additional kernels. They are used only when there is more
- /// than one kernel in one parallel_for() implementing a SYCL reduction.
- template <typename Type> class __sycl_reduction_main_2nd_kernel;
- template <typename Type> class __sycl_reduction_aux_1st_kernel;
- template <typename Type> class __sycl_reduction_aux_2nd_kernel;
-
- /// Helper structs to get additional kernel name types based on given
- /// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
- /// \c Type becomes the \c Name.
- template <typename Name, typename Type>
- struct get_reduction_main_2nd_kernel_name_t {
-   using name = __sycl_reduction_main_2nd_kernel<
-       typename get_kernel_name_t<Name, Type>::name>;
- };
- template <typename Name, typename Type>
- struct get_reduction_aux_1st_kernel_name_t {
-   using name = __sycl_reduction_aux_1st_kernel<
-       typename get_kernel_name_t<Name, Type>::name>;
- };
- template <typename Name, typename Type>
- struct get_reduction_aux_2nd_kernel_name_t {
-   using name = __sycl_reduction_aux_2nd_kernel<
-       typename get_kernel_name_t<Name, Type>::name>;
- };
-
device getDeviceFromHandler(handler &);
} // namespace detail
@@ -142,6 +116,14 @@ namespace detail {
template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
          access::placeholder IsPlaceholder>
class reduction_impl;
+
+ template <typename KernelName, typename KernelType, int Dims, class Reduction>
+ void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                 const nd_range<Dims> &Range, Reduction &Redu);
+
+ template <typename KernelName, typename KernelType, int Dims, class Reduction>
+ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+                    size_t KernelRun, Reduction &Redu);
} // namespace detail
} // namespace intel
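Editor's sketch (not part of the patch): these forward declarations turn the reduction command-group builders into free functions in intel::detail that receive the handler explicitly. The hypothetical wrapper below mirrors the call pattern the handler code adopts later in this diff; the name reduction_driver_sketch and the assumption that the patched <CL/sycl.hpp> is on the include path are illustrative only.

#include <CL/sycl.hpp>

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduction_driver_sketch(cl::sycl::handler &MainCGH, KernelType KernelFunc,
                             const cl::sycl::nd_range<Dims> &Range,
                             Reduction &Redu) {
  // Main kernel: runs the user's lambda and leaves one partial sum per
  // work-group (previously handler::reduCGFunc, now a free function).
  cl::sycl::intel::detail::reduCGFunc<KernelName>(MainCGH, KernelFunc, Range,
                                                  Redu);
  // Auxiliary kernels are then enqueued through separate handlers via
  // cl::sycl::intel::detail::reduAuxCGFunc until one value remains
  // (see the last two hunks of this diff).
}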
@@ -231,13 +213,6 @@ class __SYCL_EXPORT handler {
/// usage in finalize() method.
void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }

- /// Stores the given \param Event to the \param Queue.
- /// Even though MQueue is a field of handler, the method addEvent() of
- /// queue_impl class cannot be called inside this handler.hpp file
- /// as queue_impl is an incomplete class for handler.
- static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                             cl::sycl::event Event);
-
/// Constructs CG object of specific type, passes it to Scheduler and
/// returns sycl::event object representing the command group.
/// It's expected that the method is the latest method executed before
@@ -288,30 +263,6 @@ class __SYCL_EXPORT handler {
                     /*index*/ 0);
}

- template <typename DataT, int Dims, access::mode AccessMode,
-           access::target AccessTarget>
- void dissociateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
-                                     access::placeholder::false_t>
-                                Acc) {
-   detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
-   detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
-   detail::Requirement *Req = AccImpl.get();
-
-   // Remove accessor from the list of requirements, accessors storage,
-   // and from the list of associated accessors.
-   auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
-   auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
-   auto It =
-       std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
-                    [Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
-   assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
-           It != MAssociatedAccesors.end()) &&
-          "Cannot dissociate accessor.");
-   MRequirements.erase(ReqIt);
-   MAccStorage.erase(AccIt);
-   MAssociatedAccesors.erase(It);
- }
-
// Recursively calls itself until arguments pack is fully processed.
// The version for regular(standard layout) argument.
template <typename T, typename... Ts>
@@ -810,219 +761,6 @@ class __SYCL_EXPORT handler {
#endif
}

- /// Implements a command group function that enqueues a kernel that calls
- /// user's lambda function \param KernelFunc and does one iteration of
- /// reduction of elements in each of work-groups.
- /// This version uses tree-reduction algorithm to reduce elements in each
- /// of work-groups. At the end of each work-group the partial sum is written
- /// to a global buffer.
- ///
- /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
- template <typename KernelName, typename KernelType, int Dims, class Reduction>
- void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
-                 Reduction &Redu) {
-
-   size_t NWorkItems = Range.get_global_range().size();
-   size_t WGSize = Range.get_local_range().size();
-   size_t NWorkGroups = Range.get_group_range().size();
-
-   bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
-   bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
-
-   bool IsUpdateOfUserAcc =
-       Reduction::accessor_mode == access::mode::read_write &&
-       NWorkGroups == 1;
-
-   // Use local memory to reduce elements in work-groups into 0-th element.
-   // If WGSize is not a power of two, then WGSize+1 elements are allocated.
-   // The additional last element is used to catch elements that could
-   // otherwise be lost in the tree-reduction algorithm.
-   size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-   auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
-   auto ReduIdentity = Redu.getIdentity();
-   if (IsEfficientCase) {
-     // Efficient case: work-groups are fully loaded and work-group size
-     // is power of two.
-     parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
-       // Call user's functions. Reducer.MValue gets initialized there.
-       typename Reduction::reducer_type Reducer(ReduIdentity);
-       KernelFunc(NDIt, Reducer);
-
-       // Copy the element to local memory to prepare it for tree-reduction.
-       size_t LID = NDIt.get_local_linear_id();
-       LocalReds[LID] = Reducer.MValue;
-       NDIt.barrier();
-
-       // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-       typename Reduction::binary_operation BOp;
-       size_t WGSize = NDIt.get_local_range().size();
-       for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-         if (LID < CurStep)
-           LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-         NDIt.barrier();
-       }
-
-       // Compute the partial sum/reduction for the work-group.
-       if (LID == 0)
-         Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                               : LocalReds[0];
-     });
-   } else {
-     // Inefficient case: work-groups are not fully loaded
-     // or WGSize is not a power of two.
-     // These two inefficient cases are handled by one kernel, which
-     // can be split later into two separate kernels, if there are users who
-     // really need more efficient code for them.
-     using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
-         KernelName, KernelType>::name;
-     parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-       // Call user's functions. Reducer.MValue gets initialized there.
-       typename Reduction::reducer_type Reducer(ReduIdentity);
-       KernelFunc(NDIt, Reducer);
-
-       size_t WGSize = NDIt.get_local_range().size();
-       size_t LID = NDIt.get_local_linear_id();
-       size_t GID = NDIt.get_global_linear_id();
-       // Copy the element to local memory to prepare it for tree-reduction.
-       LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-       LocalReds[WGSize] = ReduIdentity;
-       NDIt.barrier();
-
-       // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-       // LocalReds[WGSize] accumulates last/odd elements when the step
-       // of the tree-reduction loop is not even.
-       typename Reduction::binary_operation BOp;
-       size_t PrevStep = WGSize;
-       for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-         if (LID < CurStep)
-           LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-         else if (LID == CurStep && (PrevStep & 0x1))
-           LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-         NDIt.barrier();
-         PrevStep = CurStep;
-       }
-
-       // Compute the partial sum/reduction for the work-group.
-       if (LID == 0) {
-         auto GrID = NDIt.get_group_linear_id();
-         auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-         Out.get_pointer().get()[GrID] =
-             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-       }
-     });
-   }
- }
-
- /// Implements a command group function that enqueues a kernel that does one
- /// iteration of reduction of elements in each of work-groups.
- /// This version uses tree-reduction algorithm to reduce elements in each
- /// of work-groups. At the end of each work-group the partial sum is written
- /// to a global buffer.
- ///
- /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
- template <typename KernelName, typename KernelType, int Dims, class Reduction>
- void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
-                    size_t KernelRun, Reduction &Redu) {
-   size_t WGSize = Range.get_local_range().size();
-   size_t NWorkGroups = Range.get_group_range().size();
-
-   // The last work-group may not be fully loaded with work, or the work-group
-   // size may not be a power of two. Those two cases are considered
-   // inefficient as they require additional code and checks in the kernel.
-   bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
-   bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
-
-   bool IsUpdateOfUserAcc =
-       Reduction::accessor_mode == access::mode::read_write &&
-       NWorkGroups == 1;
-
-   // Use local memory to reduce elements in work-groups into 0-th element.
-   // If WGSize is not a power of two, then WGSize+1 elements are allocated.
-   // The additional last element is used to catch elements that could
-   // otherwise be lost in the tree-reduction algorithm.
-   size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-   // Get read accessor to the buffer that was used as output
-   // in the previous kernel. After that create new output buffer if needed
-   // and get accessor to it (or use reduction's accessor if the kernel
-   // is the last one).
-   auto In = Redu.getReadAccToPreviousPartialReds(*this);
-   auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
-
-   if (IsEfficientCase) {
-     // Efficient case: work-groups are fully loaded and work-group size
-     // is power of two.
-     using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
-         KernelName, KernelType>::name;
-     parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-       // Copy the element to local memory to prepare it for tree-reduction.
-       size_t LID = NDIt.get_local_linear_id();
-       size_t GID = NDIt.get_global_linear_id();
-       LocalReds[LID] = In[GID];
-       NDIt.barrier();
-
-       // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-       typename Reduction::binary_operation BOp;
-       size_t WGSize = NDIt.get_local_range().size();
-       for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-         if (LID < CurStep)
-           LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-         NDIt.barrier();
-       }
-
-       // Compute the partial sum/reduction for the work-group.
-       if (LID == 0)
-         Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                               : LocalReds[0];
-     });
-   } else {
-     // Inefficient case: work-groups are not fully loaded
-     // or WGSize is not a power of two.
-     // These two inefficient cases are handled by one kernel, which
-     // can be split later into two separate kernels, if there are users
-     // who really need more efficient code for them.
-     using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
-         KernelName, KernelType>::name;
-     auto ReduIdentity = Redu.getIdentity();
-     parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-       size_t WGSize = NDIt.get_local_range().size();
-       size_t LID = NDIt.get_local_linear_id();
-       size_t GID = NDIt.get_global_linear_id();
-       // Copy the element to local memory to prepare it for tree-reduction.
-       LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
-       LocalReds[WGSize] = ReduIdentity;
-       NDIt.barrier();
-
-       // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-       // LocalReds[WGSize] accumulates last/odd elements when the step
-       // of the tree-reduction loop is not even.
-       typename Reduction::binary_operation BOp;
-       size_t PrevStep = WGSize;
-       for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-         if (LID < CurStep)
-           LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-         else if (LID == CurStep && (PrevStep & 0x1))
-           LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-         NDIt.barrier();
-         PrevStep = CurStep;
-       }
-
-       // Compute the partial sum/reduction for the work-group.
-       if (LID == 0) {
-         auto GrID = NDIt.get_group_linear_id();
-         auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-         Out.get_pointer().get()[GrID] =
-             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-       }
-     });
-   }
- }
-
/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \param Redu.
///
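Editor's note (not part of the patch): the subtlest piece of the kernels removed above is the extra LocalReds[WGSize] slot used when the work-group size is not a power of two. The host-only sketch below replays that loop sequentially for one work-group; WGSize = 6, the input values, and std::plus are illustrative choices, and the sequential inner loop stands in for the work-items running between barriers.

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  const std::size_t WGSize = 6; // deliberately not a power of two
  std::vector<int> LocalReds = {1, 2, 3, 4, 5, 6, /*extra slot*/ 0};
  std::plus<int> BOp;

  std::size_t PrevStep = WGSize;
  for (std::size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (std::size_t LID = 0; LID < PrevStep; ++LID) {
      if (LID < CurStep)
        LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        // Odd step: fold in the element that would otherwise be dropped.
        LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  // The work-group's partial sum combines the tree result with the extra slot.
  std::cout << BOp(LocalReds[0], LocalReds[WGSize]) << "\n"; // prints 21
}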
@@ -1063,30 +801,23 @@ class __SYCL_EXPORT handler {
// necessary to reduce all partial sums into one final sum.

// 1. Call the kernel that includes user's lambda function.
- // If this kernel is not going to be the last one, i.e. it does not write
- // to user's accessor, then detach user's accessor from this kernel
- // to make the dependencies between accessors and kernels more clean and
- // correct.
- if (NWorkGroups > 1)
-   dissociateWithHandler(Redu.MAcc);
-
- reduCGFunc<KernelName>(KernelFunc, Range, Redu);
+ intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
auto QueueCopy = MQueue;
MLastEvent = this->finalize();

// 2. Run the additional aux kernel as many times as needed to reduce
// all partial sums into one scalar.
+
+ // TODO: user's nd_range and the work-group size specified there must
+ // be honored only for the main kernel that calls user's lambda functions.
+ // There is no need to use the same work-group size in these additional
+ // kernels. Thus, the better strategy here is to make the work-group size
+ // as big as possible to converge/reduce the partial sums into the last
+ // sum faster.
size_t WGSize = Range.get_local_range().size();
size_t NWorkItems = NWorkGroups;
size_t KernelRun = 1;
while (NWorkItems > 1) {
-   // Before creating another kernel, add the event from the previous kernel
-   // to the queue.
-   addEventToQueue(QueueCopy, MLastEvent);
-
-   // TODO: here the work-group size is not limited by user's needs,
-   // the better strategy here is to make the work-group-size as big
-   // as possible.
  WGSize = std::min(WGSize, NWorkItems);
  NWorkGroups = NWorkItems / WGSize;
  // The last group may not be fully loaded. Still register it as a group.
@@ -1102,8 +833,8 @@ class __SYCL_EXPORT handler {
// Associate it with handler manually.
if (NWorkGroups == 1)
  AuxHandler.associateWithHandler(Redu.MAcc);
- AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
-                                                  KernelRun, Redu);
+ intel::detail::reduAuxCGFunc<KernelName, KernelType>(
+     AuxHandler, Range, NWorkItems, KernelRun, Redu);
MLastEvent = AuxHandler.finalize();

NWorkItems = NWorkGroups;
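Editor's note (not part of the patch): to see how quickly this auxiliary-kernel loop converges, the host-only sketch below replays the iteration with illustrative numbers (1000 partial sums, work-group size 256). Rounding up for the partially loaded last group is an assumption taken from the comment above; the exact adjustment line falls outside the shown hunks.

#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
  std::size_t WGSize = 256;
  std::size_t NWorkItems = 1000; // partial sums left by the main kernel
  std::size_t KernelRun = 1;
  while (NWorkItems > 1) {
    WGSize = std::min(WGSize, NWorkItems);
    std::size_t NWorkGroups = NWorkItems / WGSize;
    if (NWorkGroups * WGSize < NWorkItems)
      ++NWorkGroups; // register the partially loaded tail group as well
    std::cout << "aux run " << KernelRun << ": " << NWorkItems << " -> "
              << NWorkGroups << " partial sums\n";
    NWorkItems = NWorkGroups;
    ++KernelRun;
  }
  // Prints: aux run 1: 1000 -> 4, then aux run 2: 4 -> 1.
}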