@@ -217,19 +217,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
217
217
}
218
218
auto numGroups = groups.size ();
219
219
auto groupsPerThread = numGroups / numParallelThreads;
220
- auto remainder = numGroups % numParallelThreads;
221
- for (unsigned thread = 0 ; thread < numParallelThreads; thread++) {
222
- futures.emplace_back (
223
- tp.schedule_task ([groups, thread, groupsPerThread,
224
- &kernel = *kernel](size_t threadId) {
225
- for (unsigned i = 0 ; i < groupsPerThread; i++) {
226
- auto index = thread * groupsPerThread + i;
227
- groups[index ](threadId, kernel);
228
- }
229
- }));
220
+ if (groupsPerThread) {
221
+ for (unsigned thread = 0 ; thread < numParallelThreads; thread++) {
222
+ futures.emplace_back (
223
+ tp.schedule_task ([groups, thread, groupsPerThread,
224
+ &kernel = *kernel](size_t threadId) {
225
+ for (unsigned i = 0 ; i < groupsPerThread; i++) {
226
+ auto index = thread * groupsPerThread + i;
227
+ groups[index ](threadId, kernel);
228
+ }
229
+ }));
230
+ }
230
231
}
231
232
232
233
// schedule the remaining tasks
234
+ auto remainder = numGroups % numParallelThreads;
233
235
if (remainder ) {
234
236
futures.emplace_back (
235
237
tp.schedule_task ([groups, remainder ,
@@ -263,11 +265,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
263
265
return UR_RESULT_SUCCESS;
264
266
}
265
267
266
- ur_result_t withTimingEvent (ur_command_t command_type, ur_queue_handle_t hQueue,
267
- uint32_t numEventsInWaitList,
268
- const ur_event_handle_t *phEventWaitList,
269
- ur_event_handle_t *phEvent,
270
- const std::function<ur_result_t ()> &f) {
268
+ template <class T >
269
+ static inline ur_result_t
270
+ withTimingEvent (ur_command_t command_type, ur_queue_handle_t hQueue,
271
+ uint32_t numEventsInWaitList,
272
+ const ur_event_handle_t *phEventWaitList,
273
+ ur_event_handle_t *phEvent, T &&f) {
271
274
urEventWait (numEventsInWaitList, phEventWaitList);
272
275
ur_event_handle_t event = nullptr ;
273
276
if (phEvent) {
0 commit comments