Skip to content

Commit 3ee420c

Browse files
authored
[SYCL][NATIVECPU][UR] performance improvements in NativeCPU adapter (intel#17102)
Initial set of performance improvements (fewer allocations and thread launches) in the Native CPU adapter
1 parent 7d58fad commit 3ee420c

File tree

2 files changed

+28
-26
lines changed

2 files changed

+28
-26
lines changed

unified-runtime/source/adapters/native_cpu/enqueue.cpp

+18-15
Original file line numberDiff line numberDiff line change
@@ -217,19 +217,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
217217
}
218218
auto numGroups = groups.size();
219219
auto groupsPerThread = numGroups / numParallelThreads;
220-
auto remainder = numGroups % numParallelThreads;
221-
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
222-
futures.emplace_back(
223-
tp.schedule_task([groups, thread, groupsPerThread,
224-
&kernel = *kernel](size_t threadId) {
225-
for (unsigned i = 0; i < groupsPerThread; i++) {
226-
auto index = thread * groupsPerThread + i;
227-
groups[index](threadId, kernel);
228-
}
229-
}));
220+
if (groupsPerThread) {
221+
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
222+
futures.emplace_back(
223+
tp.schedule_task([groups, thread, groupsPerThread,
224+
&kernel = *kernel](size_t threadId) {
225+
for (unsigned i = 0; i < groupsPerThread; i++) {
226+
auto index = thread * groupsPerThread + i;
227+
groups[index](threadId, kernel);
228+
}
229+
}));
230+
}
230231
}
231232

232233
// schedule the remaining tasks
234+
auto remainder = numGroups % numParallelThreads;
233235
if (remainder) {
234236
futures.emplace_back(
235237
tp.schedule_task([groups, remainder,
@@ -263,11 +265,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
263265
return UR_RESULT_SUCCESS;
264266
}
265267

266-
ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
267-
uint32_t numEventsInWaitList,
268-
const ur_event_handle_t *phEventWaitList,
269-
ur_event_handle_t *phEvent,
270-
const std::function<ur_result_t()> &f) {
268+
template <class T>
269+
static inline ur_result_t
270+
withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
271+
uint32_t numEventsInWaitList,
272+
const ur_event_handle_t *phEventWaitList,
273+
ur_event_handle_t *phEvent, T &&f) {
271274
urEventWait(numEventsInWaitList, phEventWaitList);
272275
ur_event_handle_t event = nullptr;
273276
if (phEvent) {

unified-runtime/source/adapters/native_cpu/threadpool.hpp

+10-11
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#include <condition_variable>
1313
#include <cstdlib>
1414
#include <forward_list>
15-
#include <functional>
1615
#include <future>
1716
#include <iterator>
1817
#include <mutex>
@@ -24,7 +23,7 @@
2423

2524
namespace native_cpu {
2625

27-
using worker_task_t = std::function<void(size_t)>;
26+
using worker_task_t = std::packaged_task<void(size_t)>;
2827

2928
namespace detail {
3029

@@ -63,11 +62,11 @@ class worker_thread {
6362
m_isRunning.store(true, std::memory_order_release);
6463
}
6564

66-
inline void schedule(const worker_task_t &task) {
65+
inline void schedule(worker_task_t &&task) {
6766
{
6867
std::lock_guard<std::mutex> lock(m_workMutex);
6968
// Add the task to the queue
70-
m_tasks.push(task);
69+
m_tasks.emplace(std::move(task));
7170
++m_numTasks;
7271
}
7372
m_startWorkCondition.notify_one();
@@ -135,9 +134,9 @@ class simple_thread_pool {
135134
m_isRunning.store(false, std::memory_order_release);
136135
}
137136

138-
inline void schedule(const worker_task_t &task) {
137+
inline void schedule(worker_task_t &&task) {
139138
// Schedule the task on the best available worker thread
140-
this->best_worker().schedule(task);
139+
this->best_worker().schedule(std::move(task));
141140
}
142141

143142
inline bool is_running() const noexcept {
@@ -201,11 +200,11 @@ template <typename ThreadPoolT> class threadpool_interface {
201200

202201
threadpool_interface() : threadpool() {}
203202

204-
auto schedule_task(worker_task_t &&task) {
205-
auto workerTask = std::make_shared<std::packaged_task<void(size_t)>>(
206-
[task](auto &&PH1) { return task(std::forward<decltype(PH1)>(PH1)); });
207-
threadpool.schedule([=](size_t threadId) { (*workerTask)(threadId); });
208-
return workerTask->get_future();
203+
template <class T> std::future<void> schedule_task(T &&task) {
204+
auto workerTask = std::packaged_task<void(size_t)>(std::forward<T>(task));
205+
auto ret = workerTask.get_future();
206+
threadpool.schedule(std::move(workerTask));
207+
return ret;
209208
}
210209
};
211210

0 commit comments

Comments
 (0)