Skip to content

Commit 3473c1a

Browse files
authored
[SYCL] Implement SYCL2020 reductions with initialize_to_identity prop (#3410)
* [SYCL] Implement SYCL2020 reductions with set initialize_to_identity property The corresponding LIT tests PR: intel/llvm-test-suite#194 SYCL2020 reductions use dynamic information (instead of static info used previously) to ask to do discard-write to user's reduction variable or USM memory. That caused big changes removing/re-structuring the code that previously relied on accessor-mode (read-write vs discard-write). With this patch the SYCL-2020 reductions become on par with ONEAPI::reduction Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent a102ed3 commit 3473c1a

File tree

6 files changed

+492
-348
lines changed

6 files changed

+492
-348
lines changed

sycl/include/CL/sycl/ONEAPI/reduction.hpp

+375-253
Large diffs are not rendered by default.

sycl/include/CL/sycl/detail/property_helper.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ enum DataLessPropKind {
3131
NoInit = 4,
3232
BufferUsePinnedHostMemory = 5,
3333
UsePrimaryContext = 6,
34-
DataLessPropKindSize = 7,
35-
InitializeToIdentity = 8
34+
InitializeToIdentity = 7,
35+
DataLessPropKindSize = 8
3636
};
3737

3838
// List of all properties with data IDs

sycl/include/CL/sycl/handler.hpp

+98-59
Original file line numberDiff line numberDiff line change
@@ -223,17 +223,16 @@ checkValueRange(const T &V) {
223223
namespace ONEAPI {
224224
namespace detail {
225225
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
226-
access::mode AccMode, access::placeholder IsPlaceholder>
226+
access::placeholder IsPlaceholder>
227227
class reduction_impl;
228228

229229
using cl::sycl::detail::enable_if_t;
230230
using cl::sycl::detail::queue_impl;
231231

232-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
233-
typename OutputT>
232+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
234233
enable_if_t<Reduction::has_fast_atomics>
235234
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
236-
Reduction &Redu, OutputT Out);
235+
Reduction &Redu);
237236

238237
template <typename KernelName, typename KernelType, int Dims, class Reduction>
239238
enable_if_t<!Reduction::has_fast_atomics>
@@ -258,6 +257,26 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
258257
std::tuple<Reductions...> &ReduTuple,
259258
std::index_sequence<Is...>);
260259

260+
template <typename KernelName, class Reduction>
261+
std::enable_if_t<!Reduction::is_usm>
262+
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
263+
264+
template <typename KernelName, class Reduction>
265+
std::enable_if_t<Reduction::is_usm>
266+
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
267+
268+
template <typename... Reduction, size_t... Is>
269+
shared_ptr_class<event>
270+
reduSaveFinalResultToUserMem(shared_ptr_class<detail::queue_impl> Queue,
271+
bool IsHost, std::tuple<Reduction...> &ReduTuple,
272+
std::index_sequence<Is...>);
273+
274+
template <typename Reduction, typename... RestT>
275+
std::enable_if_t<!Reduction::is_usm>
276+
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
277+
shared_ptr_class<detail::queue_impl> Queue,
278+
bool IsHost, Reduction &Redu, RestT... Rest);
279+
261280
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
262281
size_t LocalMemBytesPerWorkItem);
263282

@@ -1159,73 +1178,43 @@ class __SYCL_EXPORT handler {
11591178
#endif
11601179
}
11611180

1162-
/// Implements parallel_for() accepting nd_range and 1 reduction variable
1163-
/// having 'read_write' access mode.
1164-
/// This version uses fast sycl::atomic operations to update user's reduction
1181+
/// Implements parallel_for() accepting nd_range \p Range and one reduction
1182+
/// object. This version uses fast sycl::atomic operations to update reduction
11651183
/// variable at the end of each work-group work.
1184+
//
1185+
// If the reduction variable must be initialized with the identity value
1186+
// before the kernel run, then an additional working accessor is created,
1187+
// initialized with the identity value and used in the kernel. That working
1188+
// accessor is then copied to user's accessor or USM pointer after
1189+
// the kernel run.
1190+
// For USM pointers without initialize_to_identity properties the same scheme
1191+
// with working accessor is used as re-using user's USM pointer in the kernel
1192+
// would require creation of another variant of user's kernel, which does not
1193+
// seem efficient.
11661194
template <typename KernelName = detail::auto_name, typename KernelType,
11671195
int Dims, typename Reduction>
1168-
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
1169-
Reduction::has_fast_atomics && !Reduction::is_usm>
1170-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1171-
_KERNELFUNCPARAM(KernelFunc)) {
1172-
ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
1173-
Redu.getUserAccessor());
1174-
}
1175-
1176-
/// Implements parallel_for() accepting nd_range and 1 reduction variable
1177-
/// having 'read_write' access mode.
1178-
/// This version uses fast sycl::atomic operations to update user's reduction
1179-
/// variable at the end of each work-group work.
1180-
template <typename KernelName = detail::auto_name, typename KernelType,
1181-
int Dims, typename Reduction>
1182-
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
1183-
Reduction::has_fast_atomics && Reduction::is_usm>
1184-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1185-
_KERNELFUNCPARAM(KernelFunc)) {
1186-
ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
1187-
Redu.getUSMPointer());
1188-
}
1189-
1190-
/// Implements parallel_for() accepting nd_range and 1 reduction variable
1191-
/// having 'discard_write' access mode.
1192-
/// This version uses fast sycl::atomic operations to update user's reduction
1193-
/// variable at the end of each work-group work.
1194-
///
1195-
/// The reduction variable must be initialized before the kernel is started
1196-
/// because atomic operations only update the value, but never initialize it.
1197-
/// Thus, an additional 'read_write' accessor is created/initialized with
1198-
/// identity value and then passed to the kernel. After running the kernel it
1199-
/// is copied to user's 'discard_write' accessor.
1200-
template <typename KernelName = detail::auto_name, typename KernelType,
1201-
int Dims, typename Reduction>
1202-
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
1203-
Reduction::has_fast_atomics>
1196+
detail::enable_if_t<Reduction::has_fast_atomics>
12041197
parallel_for(nd_range<Dims> Range, Reduction Redu,
12051198
_KERNELFUNCPARAM(KernelFunc)) {
12061199
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
1207-
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
1208-
ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
1209-
RWAcc);
1210-
this->finalize();
1200+
ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
12111201

1212-
// Copy from RWAcc to user's reduction accessor.
1213-
handler CopyHandler(QueueCopy, MIsHost);
1214-
CopyHandler.saveCodeLoc(MCodeLoc);
1215-
#ifndef __SYCL_DEVICE_ONLY__
1216-
CopyHandler.associateWithHandler(&RWAcc, access::target::global_buffer);
1217-
Redu.associateWithHandler(CopyHandler);
1218-
#endif
1219-
CopyHandler.copy(RWAcc, Redu.getUserAccessor());
1220-
MLastEvent = CopyHandler.finalize();
1202+
if (Reduction::is_usm || Redu.initializeToIdentity()) {
1203+
this->finalize();
1204+
handler CopyHandler(QueueCopy, MIsHost);
1205+
CopyHandler.saveCodeLoc(MCodeLoc);
1206+
ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1207+
Redu);
1208+
MLastEvent = CopyHandler.finalize();
1209+
}
12211210
}
12221211

12231212
/// Defines and invokes a SYCL kernel function for the specified nd_range.
1224-
/// Performs reduction operation specified in \param Redu.
1213+
/// Performs reduction operation specified in \p Redu.
12251214
///
12261215
/// The SYCL kernel function is defined as a lambda function or a named
12271216
/// function object type and given an id or item for indexing in the indexing
1228-
/// space defined by range.
1217+
/// space defined by \p Range.
12291218
/// If it is a named function object and the function object type is
12301219
/// globally visible, there is no need for the developer to provide
12311220
/// a kernel name for it.
@@ -1300,13 +1289,50 @@ class __SYCL_EXPORT handler {
13001289
AuxHandler, NWorkItems, MaxWGSize, Redu);
13011290
MLastEvent = AuxHandler.finalize();
13021291
} // end while (NWorkItems > 1)
1292+
1293+
if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
1294+
handler CopyHandler(QueueCopy, MIsHost);
1295+
CopyHandler.saveCodeLoc(MCodeLoc);
1296+
ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1297+
Redu);
1298+
MLastEvent = CopyHandler.finalize();
1299+
}
13031300
}
13041301

13051302
// This version of parallel_for may handle one or more reductions packed in
13061303
// \p Rest argument. Note thought that the last element in \p Rest pack is
13071304
// the kernel function.
13081305
// TODO: this variant is currently enabled for 2+ reductions only as the
13091306
// versions handling 1 reduction variable are more efficient right now.
1307+
//
1308+
// Algorithm:
1309+
// 1) discard_write accessor (DWAcc), InitializeToIdentity = true:
1310+
// a) Create uninitialized buffer and read_write accessor (RWAcc).
1311+
// b) discard-write partial sums to RWAcc.
1312+
// c) Repeat the steps (a) and (b) to get one final sum.
1313+
// d) Copy RWAcc to DWAcc.
1314+
// 2) read_write accessor (RWAcc), InitializeToIdentity = false:
1315+
// a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1316+
// re-use user's RWAcc (if #work-groups is 1).
1317+
// b) discard-write to RWAcc (#WG > 1), or update-write (#WG == 1).
1318+
// c) Repeat the steps (a) and (b) to get one final sum.
1319+
// 3) read_write accessor (RWAcc), InitializeToIdentity = true:
1320+
// a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1321+
// re-use user's RWAcc (if #work-groups is 1).
1322+
// b) discard-write to RWAcc.
1323+
// c) Repeat the steps (a) and (b) to get one final sum.
1324+
// 4) USM pointer, InitializeToIdentity = false:
1325+
// a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1326+
// re-use user's USM pointer (if #work-groups is 1).
1327+
// b) discard-write to RWAcc (#WG > 1) or
1328+
// update-write to USM pointer (#WG == 1).
1329+
// c) Repeat the steps (a) and (b) to get one final sum.
1330+
// 5) USM pointer, InitializeToIdentity = true:
1331+
// a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1332+
// re-use user's USM pointer (if #work-groups is 1).
1333+
// b) discard-write to RWAcc (#WG > 1) or
1334+
// discard-write to USM pointer (#WG == 1).
1335+
// c) Repeat the steps (a) and (b) to get one final sum.
13101336
template <typename KernelName = detail::auto_name, int Dims,
13111337
typename... RestT>
13121338
std::enable_if_t<(sizeof...(RestT) >= 3 &&
@@ -1348,6 +1374,11 @@ class __SYCL_EXPORT handler {
13481374
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
13491375
MLastEvent = AuxHandler.finalize();
13501376
} // end while (NWorkItems > 1)
1377+
1378+
auto CopyEvent = ONEAPI::detail::reduSaveFinalResultToUserMem(
1379+
QueueCopy, MIsHost, ReduTuple, ReduIndices);
1380+
if (CopyEvent)
1381+
MLastEvent = *CopyEvent;
13511382
}
13521383

13531384
/// Hierarchical kernel invocation method of a kernel defined as a lambda
@@ -2085,9 +2116,17 @@ class __SYCL_EXPORT handler {
20852116
// Make reduction_impl friend to store buffers and arrays created for it
20862117
// in handler from reduction_impl methods.
20872118
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
2088-
access::mode AccMode, access::placeholder IsPlaceholder>
2119+
access::placeholder IsPlaceholder>
20892120
friend class ONEAPI::detail::reduction_impl;
20902121

2122+
// This method needs to call the method finalize().
2123+
template <typename Reduction, typename... RestT>
2124+
std::enable_if_t<!Reduction::is_usm> friend ONEAPI::detail::
2125+
reduSaveFinalResultToUserMemHelper(
2126+
std::vector<event> &Events,
2127+
shared_ptr_class<detail::queue_impl> Queue, bool IsHost, Reduction &,
2128+
RestT...);
2129+
20912130
friend void detail::associateWithHandler(handler &,
20922131
detail::AccessorBaseHost *,
20932132
access::target);

sycl/include/CL/sycl/reduction.hpp

+15-32
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,12 @@ inline constexpr AccumulatorT known_identity_v =
4141
template <typename T, typename AllocatorT, typename BinaryOperation>
4242
std::enable_if_t<has_known_identity<BinaryOperation, T>::value,
4343
ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
44-
access::mode::read_write,
4544
access::placeholder::true_t>>
4645
reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
4746
const property_list &PropList = {}) {
48-
// TODO: need to handle 'PropList'.
49-
if (PropList.has_property<property::reduction::initialize_to_identity>())
50-
throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
51-
"property is not supported yet.",
52-
PI_INVALID_VALUE);
53-
return {Var, CGH};
47+
bool InitializeToIdentity =
48+
PropList.has_property<property::reduction::initialize_to_identity>();
49+
return {Var, CGH, InitializeToIdentity};
5450
}
5551

5652
/// Constructs a reduction object using the given buffer \p Var, handler \p CGH,
@@ -60,7 +56,6 @@ reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
6056
template <typename T, typename AllocatorT, typename BinaryOperation>
6157
std::enable_if_t<!has_known_identity<BinaryOperation, T>::value,
6258
ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
63-
access::mode::read_write,
6459
access::placeholder::true_t>>
6560
reduction(buffer<T, 1, AllocatorT>, handler &, BinaryOperation,
6661
const property_list &PropList = {}) {
@@ -76,14 +71,11 @@ reduction(buffer<T, 1, AllocatorT>, handler &, BinaryOperation,
7671
/// \p Combiner, and optional reduction properties.
7772
template <typename T, typename BinaryOperation>
7873
std::enable_if_t<has_known_identity<BinaryOperation, T>::value,
79-
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
80-
access::mode::read_write>>
74+
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>>
8175
reduction(T *Var, BinaryOperation, const property_list &PropList = {}) {
82-
if (PropList.has_property<property::reduction::initialize_to_identity>())
83-
throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
84-
"property is not supported yet.",
85-
PI_INVALID_VALUE);
86-
return {Var};
76+
bool InitializeToIdentity =
77+
PropList.has_property<property::reduction::initialize_to_identity>();
78+
return {Var, InitializeToIdentity};
8779
}
8880

8981
/// Constructs a reduction object using the reduction variable referenced by
@@ -93,8 +85,7 @@ reduction(T *Var, BinaryOperation, const property_list &PropList = {}) {
9385
/// reduction identity is not known statically and it is not provided by user.
9486
template <typename T, typename BinaryOperation>
9587
std::enable_if_t<!has_known_identity<BinaryOperation, T>::value,
96-
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
97-
access::mode::read_write>>
88+
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>>
9889
reduction(T *, BinaryOperation, const property_list &PropList = {}) {
9990
// TODO: implement reduction that works even when identity is not known.
10091
(void)PropList;
@@ -108,32 +99,24 @@ reduction(T *, BinaryOperation, const property_list &PropList = {}) {
10899
/// and optional reduction properties.
109100
template <typename T, typename AllocatorT, typename BinaryOperation>
110101
ONEAPI::detail::reduction_impl<T, BinaryOperation, 1, false,
111-
access::mode::read_write,
112102
access::placeholder::true_t>
113103
reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
114104
BinaryOperation Combiner, const property_list &PropList = {}) {
115-
// TODO: need to handle 'PropList'.
116-
if (PropList.has_property<property::reduction::initialize_to_identity>())
117-
throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
118-
"property is not supported yet.",
119-
PI_INVALID_VALUE);
120-
return {Var, CGH, Identity, Combiner};
105+
bool InitializeToIdentity =
106+
PropList.has_property<property::reduction::initialize_to_identity>();
107+
return {Var, CGH, Identity, Combiner, InitializeToIdentity};
121108
}
122109

123110
/// Constructs a reduction object using the reduction variable referenced by
124111
/// the given USM pointer \p Var, reduction identity value \p Identity,
125112
/// binary operation \p Combiner, and optional reduction properties.
126113
template <typename T, typename BinaryOperation>
127-
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true,
128-
access::mode::read_write>
114+
ONEAPI::detail::reduction_impl<T, BinaryOperation, 0, true>
129115
reduction(T *Var, const T &Identity, BinaryOperation Combiner,
130116
const property_list &PropList = {}) {
131-
// TODO: need to handle 'PropList'.
132-
if (PropList.has_property<property::reduction::initialize_to_identity>())
133-
throw runtime_error("SYCL-2020 reduction with initialize_to_identity "
134-
"property is not supported yet.",
135-
PI_INVALID_VALUE);
136-
return {Var, Identity, Combiner};
117+
bool InitializeToIdentity =
118+
PropList.has_property<property::reduction::initialize_to_identity>();
119+
return {Var, Identity, Combiner, InitializeToIdentity};
137120
}
138121

139122
} // namespace sycl

sycl/test/abi/layout_buffer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ void foo(sycl::buffer<int, 2>) {}
2828
// CHECK-NEXT: 24 | class sycl::detail::SYCLMemObjAllocator * _M_head_impl
2929
// CHECK-NEXT: 32 | class sycl::property_list MProps
3030
// CHECK-NEXT: 32 | class sycl::detail::PropertyListBase (base)
31-
// CHECK-NEXT: 32 | class std::bitset<7> MDataLessProps
31+
// CHECK-NEXT: 32 | class std::bitset<8> MDataLessProps
3232
// CHECK-NEXT: 32 | struct std::_Base_bitset<1> (base)
3333
// CHECK-NEXT: 32 | std::_Base_bitset<1>::_WordT _M_w
3434
// CHECK-NEXT: 40 | class std::vector<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > MPropsWithData

sycl/test/abi/layout_image.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ sycl::image<2> Img{sycl::image_channel_order::rgba, sycl::image_channel_type::fp
2929
// CHECK-NEXT: 24 | class sycl::detail::SYCLMemObjAllocator * _M_head_impl
3030
// CHECK-NEXT: 32 | class sycl::property_list MProps
3131
// CHECK-NEXT: 32 | class sycl::detail::PropertyListBase (base)
32-
// CHECK-NEXT: 32 | class std::bitset<7> MDataLessProps
32+
// CHECK-NEXT: 32 | class std::bitset<8> MDataLessProps
3333
// CHECK-NEXT: 32 | struct std::_Base_bitset<1> (base)
3434
// CHECK-NEXT: 32 | std::_Base_bitset<1>::_WordT _M_w
3535
// CHECK-NEXT: 40 | class std::vector<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > MPropsWithData

0 commit comments

Comments
 (0)