Skip to content

Commit 8af1eb3

Browse files
[SYCL] Use custom type to pass CGF around instead of std::function (#16668)
* `std::function` is affecting compile-time too much * our `queue::submit` is a *synchronous* operation, so we don't need to make any copy. Maybe `std::function_ref` would be a good choice here, but that's C++26. * Try to convert typed CGFO into type-erased version as soon as possible to limit number of template instantiations FE needs to perform
1 parent 6965142 commit 8af1eb3

File tree

10 files changed

+179
-86
lines changed

10 files changed

+179
-86
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,16 @@ template <typename LCRangeT, typename LCPropertiesT> struct LaunchConfigAccess {
9999
template <typename CommandGroupFunc, typename PropertiesT>
100100
void submit_impl(queue &Q, PropertiesT Props, CommandGroupFunc &&CGF,
101101
const sycl::detail::code_location &CodeLoc) {
102-
Q.submit_without_event(Props, std::forward<CommandGroupFunc>(CGF), CodeLoc);
102+
Q.submit_without_event<__SYCL_USE_FALLBACK_ASSERT>(
103+
Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc);
103104
}
104105

105106
template <typename CommandGroupFunc, typename PropertiesT>
106107
event submit_with_event_impl(queue &Q, PropertiesT Props,
107108
CommandGroupFunc &&CGF,
108109
const sycl::detail::code_location &CodeLoc) {
109-
return Q.submit_with_event(Props, std::forward<CommandGroupFunc>(CGF),
110-
nullptr, CodeLoc);
110+
return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT>(
111+
Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc);
111112
}
112113
} // namespace detail
113114

sycl/include/sycl/handler.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,38 @@ class graph_impl;
162162
} // namespace ext::oneapi::experimental::detail
163163
namespace detail {
164164

165+
class type_erased_cgfo_ty {
166+
// From SYCL 2020, command group function object:
167+
// A type which is callable with operator() that takes a reference to a
168+
// command group handler, that defines a command group which can be submitted
169+
// by a queue. The function object can be a named type, lambda function or
170+
// std::function.
171+
template <typename T> struct invoker {
172+
static void call(void *object, handler &cgh) {
173+
(*static_cast<T *>(object))(cgh);
174+
}
175+
};
176+
void *object;
177+
using invoker_ty = void (*)(void *, handler &);
178+
const invoker_ty invoker_f;
179+
180+
public:
181+
template <class T>
182+
type_erased_cgfo_ty(T &f)
183+
// NOTE: Even if `T` is a pointer to a function, `&f` is a pointer to a
184+
// pointer to a function and as such can be casted to `void *` (pointer to
185+
// a function cannot be casted).
186+
: object(static_cast<void *>(&f)), invoker_f(&invoker<T>::call) {}
187+
~type_erased_cgfo_ty() = default;
188+
189+
type_erased_cgfo_ty(const type_erased_cgfo_ty &) = delete;
190+
type_erased_cgfo_ty(type_erased_cgfo_ty &&) = delete;
191+
type_erased_cgfo_ty &operator=(const type_erased_cgfo_ty &) = delete;
192+
type_erased_cgfo_ty &operator=(type_erased_cgfo_ty &&) = delete;
193+
194+
void operator()(sycl::handler &cgh) const { invoker_f(object, cgh); }
195+
};
196+
165197
class kernel_bundle_impl;
166198
class work_group_memory_impl;
167199
class handler_impl;

sycl/include/sycl/queue.hpp

Lines changed: 65 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,8 @@ auto get_native(const SyclObjectT &Obj)
7575
namespace detail {
7676
class queue_impl;
7777

78-
#if __SYCL_USE_FALLBACK_ASSERT
7978
inline event submitAssertCapture(queue &, event &, queue *,
8079
const detail::code_location &);
81-
#endif
8280

8381
// Function to postprocess submitted command
8482
// Arguments:
@@ -375,8 +373,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
375373
std::enable_if_t<std::is_invocable_r_v<void, T, handler &>, event> submit(
376374
T CGF,
377375
const detail::code_location &CodeLoc = detail::code_location::current()) {
378-
return submit_with_event(
379-
sycl::ext::oneapi::experimental::empty_properties_t{}, CGF,
376+
return submit_with_event<__SYCL_USE_FALLBACK_ASSERT>(
377+
sycl::ext::oneapi::experimental::empty_properties_t{},
378+
detail::type_erased_cgfo_ty{CGF},
380379
/*SecondaryQueuePtr=*/nullptr, CodeLoc);
381380
}
382381

@@ -395,9 +394,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
395394
std::enable_if_t<std::is_invocable_r_v<void, T, handler &>, event> submit(
396395
T CGF, queue &SecondaryQueue,
397396
const detail::code_location &CodeLoc = detail::code_location::current()) {
398-
return submit_with_event(
399-
sycl::ext::oneapi::experimental::empty_properties_t{}, CGF,
400-
&SecondaryQueue, CodeLoc);
397+
return submit_with_event<__SYCL_USE_FALLBACK_ASSERT>(
398+
sycl::ext::oneapi::experimental::empty_properties_t{},
399+
detail::type_erased_cgfo_ty{CGF}, &SecondaryQueue, CodeLoc);
401400
}
402401

403402
/// Prevents any commands submitted afterward to this queue from executing
@@ -2786,6 +2785,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
27862785

27872786
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
27882787
/// TODO: Unused. Remove these when ABI-break window is open.
2788+
/// Not using `type_erased_cgfo_ty` on purpose.
27892789
event submit_impl(std::function<void(handler &)> CGH,
27902790
const detail::code_location &CodeLoc);
27912791
event submit_impl(std::function<void(handler &)> CGH,
@@ -2815,16 +2815,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28152815
std::function<void(handler &)> CGH, queue secondQueue,
28162816
const detail::code_location &CodeLoc,
28172817
const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc);
2818+
2819+
// Old version when `std::function` was used in place of
2820+
// `std::function<void(handler &)>`.
2821+
event submit_with_event_impl(std::function<void(handler &)> CGH,
2822+
const detail::SubmissionInfo &SubmitInfo,
2823+
const detail::code_location &CodeLoc,
2824+
bool IsTopCodeLoc);
2825+
2826+
void submit_without_event_impl(std::function<void(handler &)> CGH,
2827+
const detail::SubmissionInfo &SubmitInfo,
2828+
const detail::code_location &CodeLoc,
2829+
bool IsTopCodeLoc);
28182830
#endif // __INTEL_PREVIEW_BREAKING_CHANGES
28192831

28202832
/// A template-free versions of submit.
2821-
event submit_with_event_impl(std::function<void(handler &)> CGH,
2833+
event submit_with_event_impl(const detail::type_erased_cgfo_ty &CGH,
28222834
const detail::SubmissionInfo &SubmitInfo,
28232835
const detail::code_location &CodeLoc,
28242836
bool IsTopCodeLoc);
28252837

28262838
/// A template-free version of submit_without_event.
2827-
void submit_without_event_impl(std::function<void(handler &)> CGH,
2839+
void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH,
28282840
const detail::SubmissionInfo &SubmitInfo,
28292841
const detail::code_location &CodeLoc,
28302842
bool IsTopCodeLoc);
@@ -2836,32 +2848,35 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28362848
/// \param CGF is a function object containing command group.
28372849
/// \param CodeLoc is the code location of the submit call (default argument)
28382850
/// \return a SYCL event object for the submitted command group.
2839-
template <typename T, typename PropertiesT>
2840-
std::enable_if_t<std::is_invocable_r_v<void, T, handler &>, event>
2841-
submit_with_event(
2842-
PropertiesT Props, T CGF, queue *SecondaryQueuePtr,
2851+
//
2852+
// UseFallBackAssert as template param vs `#if` in function body is necessary
2853+
// to prevent ODR-violation between TUs built with different fallback assert
2854+
// modes.
2855+
template <bool UseFallbackAssert, typename PropertiesT>
2856+
event submit_with_event(
2857+
PropertiesT Props, const detail::type_erased_cgfo_ty &CGF,
2858+
queue *SecondaryQueuePtr,
28432859
const detail::code_location &CodeLoc = detail::code_location::current()) {
28442860
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
28452861
detail::SubmissionInfo SI{};
28462862
ProcessSubmitProperties(Props, SI);
28472863
if (SecondaryQueuePtr)
28482864
SI.SecondaryQueue() = detail::getSyclObjImpl(*SecondaryQueuePtr);
2849-
#if __SYCL_USE_FALLBACK_ASSERT
2850-
SI.PostProcessorFunc() =
2851-
[this, &SecondaryQueuePtr,
2852-
&TlsCodeLocCapture](bool IsKernel, bool KernelUsesAssert, event &E) {
2853-
if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
2854-
KernelUsesAssert && !device_has(aspect::accelerator)) {
2855-
// __devicelib_assert_fail isn't supported by Device-side Runtime
2856-
// Linking against fallback impl of __devicelib_assert_fail is
2857-
// performed by program manager class
2858-
// Fallback assert isn't supported for FPGA
2859-
submitAssertCapture(*this, E, SecondaryQueuePtr,
2860-
TlsCodeLocCapture.query());
2861-
}
2862-
};
2863-
#endif // __SYCL_USE_FALLBACK_ASSERT
2864-
return submit_with_event_impl(std::move(CGF), SI, TlsCodeLocCapture.query(),
2865+
if constexpr (UseFallbackAssert)
2866+
SI.PostProcessorFunc() =
2867+
[this, &SecondaryQueuePtr,
2868+
&TlsCodeLocCapture](bool IsKernel, bool KernelUsesAssert, event &E) {
2869+
if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
2870+
KernelUsesAssert && !device_has(aspect::accelerator)) {
2871+
// __devicelib_assert_fail isn't supported by Device-side Runtime
2872+
// Linking against fallback impl of __devicelib_assert_fail is
2873+
// performed by program manager class
2874+
// Fallback assert isn't supported for FPGA
2875+
submitAssertCapture(*this, E, SecondaryQueuePtr,
2876+
TlsCodeLocCapture.query());
2877+
}
2878+
};
2879+
return submit_with_event_impl(CGF, SI, TlsCodeLocCapture.query(),
28652880
TlsCodeLocCapture.isToplevel());
28662881
}
28672882

@@ -2871,21 +2886,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28712886
/// \param Props is a property list with submission properties.
28722887
/// \param CGF is a function object containing command group.
28732888
/// \param CodeLoc is the code location of the submit call (default argument)
2874-
template <typename T, typename PropertiesT>
2875-
std::enable_if_t<std::is_invocable_r_v<void, T, handler &>, void>
2876-
submit_without_event(PropertiesT Props, T CGF,
2877-
const detail::code_location &CodeLoc) {
2878-
#if __SYCL_USE_FALLBACK_ASSERT
2879-
// If post-processing is needed, fall back to the regular submit.
2880-
// TODO: Revisit whether we can avoid this.
2881-
submit_with_event(Props, CGF, nullptr, CodeLoc);
2882-
#else
2883-
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2884-
detail::SubmissionInfo SI{};
2885-
ProcessSubmitProperties(Props, SI);
2886-
submit_without_event_impl(CGF, SI, TlsCodeLocCapture.query(),
2887-
TlsCodeLocCapture.isToplevel());
2888-
#endif // __SYCL_USE_FALLBACK_ASSERT
2889+
//
2890+
// UseFallBackAssert as template param vs `#if` in function body is necessary
2891+
// to prevent ODR-violation between TUs built with different fallback assert
2892+
// modes.
2893+
template <bool UseFallbackAssert, typename PropertiesT>
2894+
void submit_without_event(PropertiesT Props,
2895+
const detail::type_erased_cgfo_ty &CGF,
2896+
const detail::code_location &CodeLoc) {
2897+
if constexpr (UseFallbackAssert) {
2898+
// If post-processing is needed, fall back to the regular submit.
2899+
// TODO: Revisit whether we can avoid this.
2900+
submit_with_event<UseFallbackAssert>(Props, CGF, nullptr, CodeLoc);
2901+
} else {
2902+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2903+
detail::SubmissionInfo SI{};
2904+
ProcessSubmitProperties(Props, SI);
2905+
submit_without_event_impl(CGF, SI, TlsCodeLocCapture.query(),
2906+
TlsCodeLocCapture.isToplevel());
2907+
}
28892908
}
28902909

28912910
/// parallel_for_impl with a kernel represented as a lambda + range that
@@ -3114,10 +3133,10 @@ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
31143133
});
31153134
};
31163135

3117-
CopierEv = Self.submit_with_event(
3136+
CopierEv = Self.submit_with_event<true>(
31183137
sycl::ext::oneapi::experimental::empty_properties_t{}, CopierCGF,
31193138
SecondaryQueue, CodeLoc);
3120-
CheckerEv = Self.submit_with_event(
3139+
CheckerEv = Self.submit_with_event<true>(
31213140
sycl::ext::oneapi::experimental::empty_properties_t{}, CheckerCGF,
31223141
SecondaryQueue, CodeLoc);
31233142

sycl/source/detail/queue_impl.cpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ void queue_impl::addSharedEvent(const event &Event) {
349349
MEventsShared.push_back(Event);
350350
}
351351

352-
event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
352+
event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
353353
const std::shared_ptr<queue_impl> &Self,
354354
const std::shared_ptr<queue_impl> &PrimaryQueue,
355355
const std::shared_ptr<queue_impl> &SecondaryQueue,
@@ -402,10 +402,13 @@ event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
402402
// We don't want stream flushing to be blocking operation that is why submit
403403
// a host task to print stream buffer. It will fire up as soon as the kernel
404404
// finishes execution.
405-
event FlushEvent = submit_impl(
406-
[&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); },
407-
Self, PrimaryQueue, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc,
408-
IsTopCodeLoc, {});
405+
auto L = [&](handler &ServiceCGH) {
406+
Stream->generateFlushCommand(ServiceCGH);
407+
};
408+
detail::type_erased_cgfo_ty CGF{L};
409+
event FlushEvent =
410+
submit_impl(CGF, Self, PrimaryQueue, SecondaryQueue,
411+
/*CallerNeedsEvent*/ true, Loc, IsTopCodeLoc, {});
409412
EventImpl->attachEventToCompleteWeak(detail::getSyclObjImpl(FlushEvent));
410413
registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent));
411414
}
@@ -419,21 +422,19 @@ event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
419422
bool CallerNeedsEvent,
420423
HandlerFuncT HandlerFunc) {
421424
SubmissionInfo SI{};
425+
auto L = [&](handler &CGH) {
426+
CGH.depends_on(DepEvents);
427+
HandlerFunc(CGH);
428+
};
429+
detail::type_erased_cgfo_ty CGF{L};
430+
422431
if (!CallerNeedsEvent && supportsDiscardingPiEvents()) {
423-
submit_without_event(
424-
[&](handler &CGH) {
425-
CGH.depends_on(DepEvents);
426-
HandlerFunc(CGH);
427-
},
428-
Self, SI, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
432+
submit_without_event(CGF, Self, SI,
433+
/*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
429434
return createDiscardedEvent();
430435
}
431-
return submit_with_event(
432-
[&](handler &CGH) {
433-
CGH.depends_on(DepEvents);
434-
HandlerFunc(CGH);
435-
},
436-
Self, SI, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
436+
return submit_with_event(CGF, Self, SI,
437+
/*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
437438
}
438439

439440
template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>

sycl/source/detail/queue_impl.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ class queue_impl {
340340
/// \param StoreAdditionalInfo makes additional info be stored in event_impl
341341
/// \return a SYCL event object, which corresponds to the queue the command
342342
/// group is being enqueued on.
343-
event submit(const std::function<void(handler &)> &CGF,
343+
event submit(const detail::type_erased_cgfo_ty &CGF,
344344
const std::shared_ptr<queue_impl> &Self,
345345
const std::shared_ptr<queue_impl> &SecondQueue,
346346
const detail::code_location &Loc, bool IsTopCodeLoc,
@@ -362,7 +362,7 @@ class queue_impl {
362362
/// \param Loc is the code location of the submit call (default argument)
363363
/// \param StoreAdditionalInfo makes additional info be stored in event_impl
364364
/// \return a SYCL event object for the submitted command group.
365-
event submit_with_event(const std::function<void(handler &)> &CGF,
365+
event submit_with_event(const detail::type_erased_cgfo_ty &CGF,
366366
const std::shared_ptr<queue_impl> &Self,
367367
const SubmissionInfo &SubmitInfo,
368368
const detail::code_location &Loc, bool IsTopCodeLoc) {
@@ -387,7 +387,7 @@ class queue_impl {
387387
return discard_or_return(ResEvent);
388388
}
389389

390-
void submit_without_event(const std::function<void(handler &)> &CGF,
390+
void submit_without_event(const detail::type_erased_cgfo_ty &CGF,
391391
const std::shared_ptr<queue_impl> &Self,
392392
const SubmissionInfo &SubmitInfo,
393393
const detail::code_location &Loc,
@@ -855,7 +855,7 @@ class queue_impl {
855855
/// \param Loc is the code location of the submit call (default argument)
856856
/// \param SubmitInfo is additional optional information for the submission.
857857
/// \return a SYCL event representing submitted command group.
858-
event submit_impl(const std::function<void(handler &)> &CGF,
858+
event submit_impl(const detail::type_erased_cgfo_ty &CGF,
859859
const std::shared_ptr<queue_impl> &Self,
860860
const std::shared_ptr<queue_impl> &PrimaryQueue,
861861
const std::shared_ptr<queue_impl> &SecondaryQueue,

sycl/source/queue.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ event queue::submit_impl_and_postprocess(
255255
return impl->submit(CGH, impl, SecondQueue.impl, CodeLoc, IsTopCodeLoc,
256256
&PostProcess);
257257
}
258-
#endif // __INTEL_PREVIEW_BREAKING_CHANGES
259258

260259
event queue::submit_with_event_impl(std::function<void(handler &)> CGH,
261260
const detail::SubmissionInfo &SubmitInfo,
@@ -270,6 +269,21 @@ void queue::submit_without_event_impl(std::function<void(handler &)> CGH,
270269
bool IsTopCodeLoc) {
271270
impl->submit_without_event(CGH, impl, SubmitInfo, CodeLoc, IsTopCodeLoc);
272271
}
272+
#endif // __INTEL_PREVIEW_BREAKING_CHANGES
273+
274+
event queue::submit_with_event_impl(const detail::type_erased_cgfo_ty &CGH,
275+
const detail::SubmissionInfo &SubmitInfo,
276+
const detail::code_location &CodeLoc,
277+
bool IsTopCodeLoc) {
278+
return impl->submit_with_event(CGH, impl, SubmitInfo, CodeLoc, IsTopCodeLoc);
279+
}
280+
281+
void queue::submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH,
282+
const detail::SubmissionInfo &SubmitInfo,
283+
const detail::code_location &CodeLoc,
284+
bool IsTopCodeLoc) {
285+
impl->submit_without_event(CGH, impl, SubmitInfo, CodeLoc, IsTopCodeLoc);
286+
}
273287

274288
void queue::wait_proxy(const detail::code_location &CodeLoc) {
275289
impl->wait(CodeLoc);

sycl/test-e2e/Basic/submit_fn_ptr.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/detail/core.hpp>
5+
#include <sycl/usm.hpp>
6+
7+
int *p = nullptr;
8+
9+
void foo(sycl::handler &cgh) {
10+
auto *copy = p;
11+
cgh.single_task([=]() { *copy = 42; });
12+
}
13+
14+
int main() {
15+
sycl::queue q;
16+
p = sycl::malloc_shared<int>(1, q);
17+
*p = 0;
18+
q.submit(foo).wait();
19+
assert(*p == 42);
20+
sycl::free(p, q);
21+
return 0;
22+
}

0 commit comments

Comments
 (0)