Skip to content

Commit abe533c

Browse files
authored
[SYCL] Move the reduction command group funcs out of handler.hpp (#1602)
Moved the reduction command group functions from handler.hpp to reduction.hpp; Removed handler::dissociateWithHandler(); Removed handler::addEventToQueue() and made queue_impl::addEvent() private again; Minor changes in comments; Replaced 'auto' with 'size_t' in a few places. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 5788b98 commit abe533c

File tree

5 files changed

+265
-298
lines changed

5 files changed

+265
-298
lines changed

sycl/include/CL/sycl/handler.hpp

+18-287
Original file line numberDiff line numberDiff line change
@@ -107,32 +107,6 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
107107

108108
__SYCL_EXPORT device getDeviceFromHandler(handler &);
109109

110-
/// These are the forward declaration for the classes that help to create
111-
/// names for additional kernels. It is used only when there are
112-
/// more than 1 kernel in one parallel_for() implementing SYCL reduction.
113-
template <typename Type> class __sycl_reduction_main_2nd_kernel;
114-
template <typename Type> class __sycl_reduction_aux_1st_kernel;
115-
template <typename Type> class __sycl_reduction_aux_2nd_kernel;
116-
117-
/// Helper structs to get additional kernel name types based on given
118-
/// \c Name and \c Type types: if \c Name is undefined (is an \c auto_name) then
119-
/// \c Type becomes the \c Name.
120-
template <typename Name, typename Type>
121-
struct get_reduction_main_2nd_kernel_name_t {
122-
using name = __sycl_reduction_main_2nd_kernel<
123-
typename get_kernel_name_t<Name, Type>::name>;
124-
};
125-
template <typename Name, typename Type>
126-
struct get_reduction_aux_1st_kernel_name_t {
127-
using name = __sycl_reduction_aux_1st_kernel<
128-
typename get_kernel_name_t<Name, Type>::name>;
129-
};
130-
template <typename Name, typename Type>
131-
struct get_reduction_aux_2nd_kernel_name_t {
132-
using name = __sycl_reduction_aux_2nd_kernel<
133-
typename get_kernel_name_t<Name, Type>::name>;
134-
};
135-
136110
device getDeviceFromHandler(handler &);
137111

138112
} // namespace detail
@@ -142,6 +116,14 @@ namespace detail {
142116
template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
143117
access::placeholder IsPlaceholder>
144118
class reduction_impl;
119+
120+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
121+
void reduCGFunc(handler &CGH, KernelType KernelFunc,
122+
const nd_range<Dims> &Range, Reduction &Redu);
123+
124+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
125+
void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
126+
size_t KernelRun, Reduction &Redu);
145127
} // namespace detail
146128
} // namespace intel
147129

@@ -231,13 +213,6 @@ class __SYCL_EXPORT handler {
231213
/// usage in finalize() method.
232214
void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
233215

234-
/// Stores the given \param Event to the \param Queue.
235-
/// Even though MQueue is a field of handler, the method addEvent() of
236-
/// queue_impl class cannot be called inside this handler.hpp file
237-
/// as queue_impl is incomplete class for handler.
238-
static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
239-
cl::sycl::event Event);
240-
241216
/// Constructs CG object of specific type, passes it to Scheduler and
242217
/// returns sycl::event object representing the command group.
243218
/// It's expected that the method is the latest method executed before
@@ -288,30 +263,6 @@ class __SYCL_EXPORT handler {
288263
/*index*/ 0);
289264
}
290265

291-
template <typename DataT, int Dims, access::mode AccessMode,
292-
access::target AccessTarget>
293-
void dissociateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
294-
access::placeholder::false_t>
295-
Acc) {
296-
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
297-
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
298-
detail::Requirement *Req = AccImpl.get();
299-
300-
// Remove accessor from the list of requirements, accessors storage,
301-
// and from the list of associated accessors.
302-
auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
303-
auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
304-
auto It =
305-
std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
306-
[Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
307-
assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
308-
It != MAssociatedAccesors.end()) &&
309-
"Cannot dissociate accessor.");
310-
MRequirements.erase(ReqIt);
311-
MAccStorage.erase(AccIt);
312-
MAssociatedAccesors.erase(It);
313-
}
314-
315266
// Recursively calls itself until arguments pack is fully processed.
316267
// The version for regular(standard layout) argument.
317268
template <typename T, typename... Ts>
@@ -810,219 +761,6 @@ class __SYCL_EXPORT handler {
810761
#endif
811762
}
812763

813-
/// Implements a command group function that enqueues a kernel that calls
814-
/// user's lambda function \param KernelFunc and does one iteration of
815-
/// reduction of elements in each of work-groups.
816-
/// This version uses tree-reduction algorithm to reduce elements in each
817-
/// of work-groups. At the end of each work-group the partial sum is written
818-
/// to a global buffer.
819-
///
820-
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
821-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
822-
void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
823-
Reduction &Redu) {
824-
825-
size_t NWorkItems = Range.get_global_range().size();
826-
size_t WGSize = Range.get_local_range().size();
827-
size_t NWorkGroups = Range.get_group_range().size();
828-
829-
bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
830-
bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
831-
832-
bool IsUpdateOfUserAcc =
833-
Reduction::accessor_mode == access::mode::read_write &&
834-
NWorkGroups == 1;
835-
836-
// Use local memory to reduce elements in work-groups into 0-th element.
837-
// If WGSize is not power of two, then WGSize+1 elements are allocated.
838-
// The additional last element is used to catch elements that could
839-
// otherwise be lost in the tree-reduction algorithm.
840-
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
841-
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
842-
843-
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
844-
auto ReduIdentity = Redu.getIdentity();
845-
if (IsEfficientCase) {
846-
// Efficient case: work-groups are fully loaded and work-group size
847-
// is power of two.
848-
parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
849-
// Call user's functions. Reducer.MValue gets initialized there.
850-
typename Reduction::reducer_type Reducer(ReduIdentity);
851-
KernelFunc(NDIt, Reducer);
852-
853-
// Copy the element to local memory to prepare it for tree-reduction.
854-
size_t LID = NDIt.get_local_linear_id();
855-
LocalReds[LID] = Reducer.MValue;
856-
NDIt.barrier();
857-
858-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
859-
typename Reduction::binary_operation BOp;
860-
size_t WGSize = NDIt.get_local_range().size();
861-
for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
862-
if (LID < CurStep)
863-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
864-
NDIt.barrier();
865-
}
866-
867-
// Compute the partial sum/reduction for the work-group.
868-
if (LID == 0)
869-
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
870-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
871-
: LocalReds[0];
872-
});
873-
} else {
874-
// Inefficient case: work-groups are not fully loaded
875-
// or WGSize is not power of two.
876-
// These two inefficient cases are handled by one kernel, which
877-
// can be split later into two separate kernels, if there are users who
878-
// really need more efficient code for them.
879-
using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
880-
KernelName, KernelType>::name;
881-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
882-
// Call user's functions. Reducer.MValue gets initialized there.
883-
typename Reduction::reducer_type Reducer(ReduIdentity);
884-
KernelFunc(NDIt, Reducer);
885-
886-
size_t WGSize = NDIt.get_local_range().size();
887-
size_t LID = NDIt.get_local_linear_id();
888-
size_t GID = NDIt.get_global_linear_id();
889-
// Copy the element to local memory to prepare it for tree-reduction.
890-
LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
891-
LocalReds[WGSize] = ReduIdentity;
892-
NDIt.barrier();
893-
894-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
895-
// LocalReds[WGSize] accumulates last/odd elements when the step
896-
// of tree-reduction loop is not even.
897-
typename Reduction::binary_operation BOp;
898-
size_t PrevStep = WGSize;
899-
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
900-
if (LID < CurStep)
901-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
902-
else if (LID == CurStep && (PrevStep & 0x1))
903-
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
904-
NDIt.barrier();
905-
PrevStep = CurStep;
906-
}
907-
908-
// Compute the partial sum/reduction for the work-group.
909-
if (LID == 0) {
910-
auto GrID = NDIt.get_group_linear_id();
911-
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
912-
Out.get_pointer().get()[GrID] =
913-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
914-
}
915-
});
916-
}
917-
}
918-
919-
/// Implements a command group function that enqueues a kernel that does one
920-
/// iteration of reduction of elements in each of work-groups.
921-
/// This version uses tree-reduction algorithm to reduce elements in each
922-
/// of work-groups. At the end of each work-group the partial sum is written
923-
/// to a global buffer.
924-
///
925-
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
926-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
927-
void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
928-
size_t KernelRun, Reduction &Redu) {
929-
size_t WGSize = Range.get_local_range().size();
930-
size_t NWorkGroups = Range.get_group_range().size();
931-
932-
// The last work-group may be not fully loaded with work, or the work group
933-
// size may be not a power of two. Those two cases are considered inefficient
934-
// as they require additional code and checks in the kernel.
935-
bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
936-
bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
937-
938-
bool IsUpdateOfUserAcc =
939-
Reduction::accessor_mode == access::mode::read_write &&
940-
NWorkGroups == 1;
941-
942-
// Use local memory to reduce elements in work-groups into 0-th element.
943-
// If WGSize is not power of two, then WGSize+1 elements are allocated.
944-
// The additional last element is used to catch elements that could
945-
// otherwise be lost in the tree-reduction algorithm.
946-
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
947-
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
948-
949-
// Get read accessor to the buffer that was used as output
950-
// in the previous kernel. After that create new output buffer if needed
951-
// and get accessor to it (or use reduction's accessor if the kernel
952-
// is the last one).
953-
auto In = Redu.getReadAccToPreviousPartialReds(*this);
954-
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
955-
956-
if (IsEfficientCase) {
957-
// Efficient case: work-groups are fully loaded and work-group size
958-
// is power of two.
959-
using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
960-
KernelName, KernelType>::name;
961-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
962-
// Copy the element to local memory to prepare it for tree-reduction.
963-
size_t LID = NDIt.get_local_linear_id();
964-
size_t GID = NDIt.get_global_linear_id();
965-
LocalReds[LID] = In[GID];
966-
NDIt.barrier();
967-
968-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
969-
typename Reduction::binary_operation BOp;
970-
size_t WGSize = NDIt.get_local_range().size();
971-
for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
972-
if (LID < CurStep)
973-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
974-
NDIt.barrier();
975-
}
976-
977-
// Compute the partial sum/reduction for the work-group.
978-
if (LID == 0)
979-
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
980-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
981-
: LocalReds[0];
982-
});
983-
} else {
984-
// Inefficient case: work-groups are not fully loaded
985-
// or WGSize is not power of two.
986-
// These two inefficient cases are handled by one kernel, which
987-
// can be split later into two separate kernels, if there are users
988-
// who really need more efficient code for them.
989-
using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
990-
KernelName, KernelType>::name;
991-
auto ReduIdentity = Redu.getIdentity();
992-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
993-
size_t WGSize = NDIt.get_local_range().size();
994-
size_t LID = NDIt.get_local_linear_id();
995-
size_t GID = NDIt.get_global_linear_id();
996-
// Copy the element to local memory to prepare it for tree-reduction
997-
LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
998-
LocalReds[WGSize] = ReduIdentity;
999-
NDIt.barrier();
1000-
1001-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1002-
// LocalReds[WGSize] accumulates last/odd elements when the step
1003-
// of tree-reduction loop is not even.
1004-
typename Reduction::binary_operation BOp;
1005-
size_t PrevStep = WGSize;
1006-
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1007-
if (LID < CurStep)
1008-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1009-
else if (LID == CurStep && (PrevStep & 0x1))
1010-
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1011-
NDIt.barrier();
1012-
PrevStep = CurStep;
1013-
}
1014-
1015-
// Compute the partial sum/reduction for the work-group.
1016-
if (LID == 0) {
1017-
auto GrID = NDIt.get_group_linear_id();
1018-
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
1019-
Out.get_pointer().get()[GrID] =
1020-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
1021-
}
1022-
});
1023-
}
1024-
}
1025-
1026764
/// Defines and invokes a SYCL kernel function for the specified nd_range.
1027765
/// Performs reduction operation specified in \param Redu.
1028766
///
@@ -1063,30 +801,23 @@ class __SYCL_EXPORT handler {
1063801
// necessary to reduce all partial sums into one final sum.
1064802

1065803
// 1. Call the kernel that includes user's lambda function.
1066-
// If this kernel is going to be now last one, i.e. it does not write
1067-
// to user's accessor, then detach user's accessor from this kernel
1068-
// to make the dependencies between accessors and kernels more clean and
1069-
// correct.
1070-
if (NWorkGroups > 1)
1071-
dissociateWithHandler(Redu.MAcc);
1072-
1073-
reduCGFunc<KernelName>(KernelFunc, Range, Redu);
804+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1074805
auto QueueCopy = MQueue;
1075806
MLastEvent = this->finalize();
1076807

1077808
// 2. Run the additional aux kernel as many times as needed to reduce
1078809
// all partial sums into one scalar.
810+
811+
// TODO: user's nd_range and the work-group size specified there must
812+
// be honored only for the main kernel that calls user's lambda functions.
813+
// There is no need to use the same work-group size in these additional
814+
// kernels. Thus, the better strategy here is to make the work-group size
815+
// as big as possible to converge/reduce the partial sums into the last
816+
// sum faster.
1079817
size_t WGSize = Range.get_local_range().size();
1080818
size_t NWorkItems = NWorkGroups;
1081819
size_t KernelRun = 1;
1082820
while (NWorkItems > 1) {
1083-
// Before creating another kernel, add the event from the previous kernel
1084-
// to queue.
1085-
addEventToQueue(QueueCopy, MLastEvent);
1086-
1087-
// TODO: here the work-group size is not limited by user's needs,
1088-
// the better strategy here is to make the work-group-size as big
1089-
// as possible.
1090821
WGSize = std::min(WGSize, NWorkItems);
1091822
NWorkGroups = NWorkItems / WGSize;
1092823
// The last group may be not fully loaded. Still register it as a group.
@@ -1102,8 +833,8 @@ class __SYCL_EXPORT handler {
1102833
// Associate it with handler manually.
1103834
if (NWorkGroups == 1)
1104835
AuxHandler.associateWithHandler(Redu.MAcc);
1105-
AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
1106-
KernelRun, Redu);
836+
intel::detail::reduAuxCGFunc<KernelName, KernelType>(
837+
AuxHandler, Range, NWorkItems, KernelRun, Redu);
1107838
MLastEvent = AuxHandler.finalize();
1108839

1109840
NWorkItems = NWorkGroups;

0 commit comments

Comments
 (0)