Commit 38e9478

Merge pull request #1478 from PietroGhg/pietro/vecz_threadpool

[NATIVECPU] Initial threadpool implementation for Native CPU

2 parents: b582fb8 + c594cdc

9 files changed, +452 -58 lines changed

source/adapters/native_cpu/device.cpp
Lines changed: 17 additions & 7 deletions

@@ -98,7 +98,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
     return ReturnValue(bool{false});
   case UR_DEVICE_INFO_MAX_COMPUTE_UNITS:
-    return ReturnValue(uint32_t{256});
+    return ReturnValue(static_cast<uint32_t>(hDevice->tp.num_threads()));
   case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES:
     return ReturnValue(uint32_t{0});
   case UR_DEVICE_INFO_SUPPORTED_PARTITIONS:
@@ -138,7 +138,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS:
     return ReturnValue(uint32_t{3});
   case UR_DEVICE_INFO_PARTITION_TYPE:
-    return ReturnValue(ur_device_partition_property_t{});
+    if (pPropSizeRet) {
+      *pPropSizeRet = 0;
+    }
+    return UR_RESULT_SUCCESS;
   case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION:
     return ReturnValue("");
   case UR_DEVICE_INFO_QUEUE_PROPERTIES:
@@ -158,15 +161,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT:
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE:
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF:
+    // TODO: How can we query vector width in a platform
+    // independent way?
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+    return ReturnValue(uint32_t{32});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+    return ReturnValue(uint32_t{16});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+    return ReturnValue(uint32_t{8});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+    return ReturnValue(uint32_t{4});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+    return ReturnValue(uint32_t{8});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return ReturnValue(uint32_t{4});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
-    return ReturnValue(uint32_t{1});
-
+    return ReturnValue(uint32_t{16});
   // Imported from level_zero
   case UR_DEVICE_INFO_USM_HOST_SUPPORT:
   case UR_DEVICE_INFO_USM_DEVICE_SUPPORT:
@@ -213,10 +223,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(uint64_t{0});
   case UR_DEVICE_INFO_GLOBAL_MEM_SIZE:
     // TODO : CHECK
-    return ReturnValue(uint64_t{0});
+    return ReturnValue(uint64_t{32768});
   case UR_DEVICE_INFO_LOCAL_MEM_SIZE:
     // TODO : CHECK
-    return ReturnValue(uint64_t{0});
+    return ReturnValue(uint64_t{32768});
   case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE:
     // TODO : CHECK
     return ReturnValue(uint64_t{0});
@@ -256,7 +266,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
     return ReturnValue(bool{0});
   case UR_DEVICE_INFO_ATOMIC_64:
-    return ReturnValue(bool{0});
+    return ReturnValue(bool{1});
   case UR_DEVICE_INFO_BFLOAT16:
     return ReturnValue(bool{0});
   case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
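
Note on the MAX_COMPUTE_UNITS change above: the adapter now reports its thread-pool size instead of a hard-coded 256. Below is a minimal caller-side sketch of reading that value through the standard urDeviceGetInfo entry point; the helper name queryComputeUnits and its error handling are illustrative, not part of this commit.

// Illustrative sketch, not part of this commit.
#include <cstdint>
#include "ur_api.h"

uint32_t queryComputeUnits(ur_device_handle_t hDevice) {
  uint32_t numCUs = 0;
  ur_result_t res = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_COMPUTE_UNITS,
                                    sizeof(numCUs), &numCUs,
                                    /*pPropSizeRet=*/nullptr);
  // With this patch, the Native CPU adapter returns tp.num_threads() here.
  return res == UR_RESULT_SUCCESS ? numCUs : 0;
}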

source/adapters/native_cpu/device.hpp
Lines changed: 2 additions & 0 deletions

@@ -10,9 +10,11 @@

 #pragma once

+#include "threadpool.hpp"
 #include <ur/ur.hpp>

 struct ur_device_handle_t_ {
+  native_cpu::threadpool_t tp;
   ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {}

   ur_platform_handle_t Platform;
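
The new threadpool.hpp is among the nine changed files but is not shown on this page. Judging only from how tp is used in this diff (num_threads() sizes the device, schedule_task() returns a waitable future and hands each task its worker's index), the interface it must expose looks roughly like the sketch below; this is an inference, and the real header may differ.

// Hypothetical interface, reconstructed from the call sites in this diff.
#include <cstddef>
#include <functional>
#include <future>

namespace native_cpu {
struct threadpool_t {
  // Number of worker threads; reported as UR_DEVICE_INFO_MAX_COMPUTE_UNITS.
  size_t num_threads() const;
  // Runs the task on a worker thread, passing that worker's index (the
  // threadId later consumed by handleLocalArgs); the returned future
  // completes when the task finishes.
  std::future<void> schedule_task(std::function<void(size_t)> task);
};
} // namespace native_cpu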

source/adapters/native_cpu/enqueue.cpp
Lines changed: 145 additions & 15 deletions

@@ -1,20 +1,22 @@
 //===----------- enqueue.cpp - NATIVE CPU Adapter -------------------------===//
 //
-// Copyright (C) 2023 Intel Corporation
-//
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
-// Exceptions. See LICENSE.TXT
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 #include <array>
+#include <cstddef>
 #include <cstdint>
+#include <vector>

 #include "ur_api.h"

 #include "common.hpp"
 #include "kernel.hpp"
 #include "memory.hpp"
+#include "queue.hpp"
+#include "threadpool.hpp"

 namespace native_cpu {
 struct NDRDescT {
@@ -37,9 +39,29 @@ struct NDRDescT {
       GlobalOffset[I] = 0;
     }
   }
+
+  void dump(std::ostream &os) const {
+    os << "GlobalSize: " << GlobalSize[0] << " " << GlobalSize[1] << " "
+       << GlobalSize[2] << "\n";
+    os << "LocalSize: " << LocalSize[0] << " " << LocalSize[1] << " "
+       << LocalSize[2] << "\n";
+    os << "GlobalOffset: " << GlobalOffset[0] << " " << GlobalOffset[1] << " "
+       << GlobalOffset[2] << "\n";
+  }
 };
 } // namespace native_cpu

+#ifdef NATIVECPU_USE_OCK
+static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
+                                         size_t itemsPerThread) {
+  native_cpu::state resized_state(
+      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
+      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
+      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+  return resized_state;
+}
+#endif
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -63,23 +85,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper event dep management
   native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
                            pLocalWorkSize);
-  hKernel->handleLocalArgs();
-
+  auto &tp = hQueue->device->tp;
+  const size_t numParallelThreads = tp.num_threads();
+  hKernel->updateMemPool(numParallelThreads);
+  std::vector<std::future<void>> futures;
+  std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
+  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
+  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
+  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
   native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-
-  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
-  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
-  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+#ifndef NATIVECPU_USE_OCK
+  hKernel->handleLocalArgs(1, 0);
   for (unsigned g2 = 0; g2 < numWG2; g2++) {
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {
-#ifdef NATIVECPU_USE_OCK
-        state.update(g0, g1, g2);
-        hKernel->_subhandler(hKernel->_args.data(), &state);
-#else
         for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
           for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
             for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
@@ -88,10 +110,118 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
            }
          }
        }
-#endif
      }
    }
  }
+#else
+  bool isLocalSizeOne =
+      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
+    // If the local size is one, we make the assumption that we are running a
+    // parallel_for over a sycl::range.
+    // Todo: we could add compiler checks and
+    // kernel properties for this (e.g. check that no barriers are called, no
+    // local memory args).
+
+    // Todo: this assumes that dim 0 is the best dimension over which we want to
+    // parallelize
+
+    // Since we also vectorize the kernel, and vectorization happens within the
+    // work group loop, it's better to have a large-ish local size. We can
+    // divide the global range by the number of threads, set that as the local
+    // size and peel everything else.
+
+    size_t new_num_work_groups_0 = numParallelThreads;
+    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;

+    for (unsigned g2 = 0; g2 < numWG2; g2++) {
+      for (unsigned g1 = 0; g1 < numWG1; g1++) {
+        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
+          futures.emplace_back(
+              tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread,
+                                hKernel, g0, g1, g2](size_t) {
+                native_cpu::state resized_state =
+                    getResizedState(ndr, itemsPerThread);
+                resized_state.update(g0, g1, g2);
+                hKernel->_subhandler(hKernel->_args.data(), &resized_state);
+              }));
+        }
+        // Peel the remaining work items. Since the local size is 1, we iterate
+        // over the work groups.
+        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
+             g0++) {
+          state.update(g0, g1, g2);
+          hKernel->_subhandler(hKernel->_args.data(), &state);
+        }
+      }
+    }
+
+  } else {
+    // We are running a parallel_for over an nd_range
+
+    if (numWG1 * numWG2 >= numParallelThreads) {
+      // Dimensions 1 and 2 have enough work, split them across the threadpool
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          futures.emplace_back(
+              tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
+                                numParallelThreads](size_t threadId) mutable {
+                for (unsigned g0 = 0; g0 < numWG0; g0++) {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                }
+              }));
+        }
+      }
+    } else {
+      // Split dimension 0 across the threadpool
+      // Here we try to create groups of workgroups in order to reduce
+      // synchronization overhead
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          for (unsigned g0 = 0; g0 < numWG0; g0++) {
+            groups.push_back(
+                [state, g0, g1, g2, numParallelThreads](
+                    size_t threadId, ur_kernel_handle_t_ kernel) mutable {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                });
+          }
+        }
+      }
+      auto numGroups = groups.size();
+      auto groupsPerThread = numGroups / numParallelThreads;
+      auto remainder = numGroups % numParallelThreads;
+      for (unsigned thread = 0; thread < numParallelThreads; thread++) {
+        futures.emplace_back(tp.schedule_task(
+            [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
+              for (unsigned i = 0; i < groupsPerThread; i++) {
+                auto index = thread * groupsPerThread + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
+
+      // schedule the remaining tasks
+      if (remainder) {
+        futures.emplace_back(
+            tp.schedule_task([&groups, remainder,
+                              scheduled = numParallelThreads * groupsPerThread,
+                              hKernel](size_t threadId) {
+              for (unsigned i = 0; i < remainder; i++) {
+                auto index = scheduled + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
+    }
+  }
+
+  for (auto &f : futures)
+    f.get();
+#endif // NATIVECPU_USE_OCK
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
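
Two pieces of integer arithmetic in the hunk above are worth making concrete. In the local-size-one path, the global range is split into numParallelThreads resized work groups of itemsPerThread items each, and whatever integer division leaves over is peeled inline on the enqueueing thread; in the group-batching path, numGroups work groups are dealt out as groupsPerThread per worker plus one remainder task. A small stand-alone sketch of both splits follows; the helper names and the numbers in the comments are illustrative only.

#include <cstddef>

// Mirrors the local-size-one split: with GlobalSize[0] == 1030 and an
// 8-thread pool, itemsPerThread == 1030 / 8 == 128, the eight scheduled
// tasks cover items [0, 1024), and g0 = 1024..1029 are peeled inline with
// the original (unresized) state.
size_t firstPeeledGroup(size_t globalSize0, size_t numThreads) {
  size_t itemsPerThread = globalSize0 / numThreads; // integer division
  return numThreads * itemsPerThread;               // start of the peel loop
}

// Mirrors the group-batching split: with 100 groups on 8 threads, each
// worker task runs 100 / 8 == 12 groups and one extra task runs the
// remaining 100 - 8 * 12 == 4.
size_t remainderGroups(size_t numGroups, size_t numThreads) {
  return numGroups % numThreads;
}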

source/adapters/native_cpu/kernel.hpp
Lines changed: 32 additions & 32 deletions

@@ -1,9 +1,7 @@
 //===--------------- kernel.hpp - Native CPU Adapter ----------------------===//
 //
-// Copyright (C) 2023 Intel Corporation
-//
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
-// Exceptions. See LICENSE.TXT
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -42,50 +40,52 @@ struct ur_kernel_handle_t_ : RefCounted {
   ur_kernel_handle_t_(const char *name, nativecpu_task_t subhandler)
       : _name{name}, _subhandler{std::move(subhandler)} {}

-  const char *_name;
-  nativecpu_task_t _subhandler;
-  std::vector<native_cpu::NativeCPUArgDesc> _args;
-  std::vector<local_arg_info_t> _localArgInfo;
-
-  // To be called before enqueing the kernel.
-  void handleLocalArgs() {
-    updateMemPool();
-    size_t offset = 0;
-    for (auto &entry : _localArgInfo) {
-      _args[entry.argIndex].MPtr =
-          reinterpret_cast<char *>(_localMemPool) + offset;
-      // update offset in the memory pool
-      // Todo: update this offset computation when we have work-group
-      // level parallelism.
-      offset += entry.argSize;
-    }
+  ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
+      : _name(other._name), _subhandler(other._subhandler), _args(other._args),
+        _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
+        _localMemPoolSize(other._localMemPoolSize) {
+    incrementReferenceCount();
   }

   ~ur_kernel_handle_t_() {
-    if (_localMemPool) {
+    if (decrementReferenceCount() == 0) {
       free(_localMemPool);
     }
   }

-private:
-  void updateMemPool() {
+  const char *_name;
+  nativecpu_task_t _subhandler;
+  std::vector<native_cpu::NativeCPUArgDesc> _args;
+  std::vector<local_arg_info_t> _localArgInfo;
+
+  // To be called before enqueueing the kernel.
+  void updateMemPool(size_t numParallelThreads) {
     // compute requested size.
-    // Todo: currently we execute only one work-group at a time, so for each
-    // local arg we can allocate just 1 * argSize local arg. When we implement
-    // work-group level parallelism we should allocate N * argSize where N is
-    // the number of work groups being executed in parallel (e.g. number of
-    // threads in the thread pool).
     size_t reqSize = 0;
     for (auto &entry : _localArgInfo) {
-      reqSize += entry.argSize;
+      reqSize += entry.argSize * numParallelThreads;
     }
     if (reqSize == 0 || reqSize == _localMemPoolSize) {
       return;
     }
     // realloc handles nullptr case
-    _localMemPool = realloc(_localMemPool, reqSize);
+    _localMemPool = (char *)realloc(_localMemPool, reqSize);
     _localMemPoolSize = reqSize;
   }
-  void *_localMemPool = nullptr;
+
+  // To be called before executing a work group
+  void handleLocalArgs(size_t numParallelThread, size_t threadId) {
+    // For each local argument we have size*numthreads
+    size_t offset = 0;
+    for (auto &entry : _localArgInfo) {
+      _args[entry.argIndex].MPtr =
+          _localMemPool + offset + (entry.argSize * threadId);
+      // update offset in the memory pool
+      offset += entry.argSize * numParallelThread;
+    }
+  }
+
+private:
+  char *_localMemPool = nullptr;
   size_t _localMemPoolSize = 0;
 };
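
The reworked memory pool gives every worker thread a private slice of each local argument: updateMemPool allocates argSize * numParallelThreads bytes per argument, and handleLocalArgs points thread threadId at its own slice. A stand-alone copy of that offset arithmetic, with illustrative numbers in the comments (the helper name is hypothetical):

#include <cstddef>
#include <vector>

// Byte offset of local argument argIndex for worker threadId, as computed
// by handleLocalArgs above. With argument sizes {64, 32}, 4 threads, and
// threadId == 2: arg 0 lives at 64 * 2 == 128, and arg 1 at
// 64 * 4 + 32 * 2 == 320, so no two threads ever alias the same buffer.
size_t localArgOffset(const std::vector<size_t> &argSizes, size_t argIndex,
                      size_t numThreads, size_t threadId) {
  size_t offset = 0;
  for (size_t i = 0; i < argIndex; ++i)
    offset += argSizes[i] * numThreads; // each arg spans argSize * numThreads
  return offset + argSizes[argIndex] * threadId; // this thread's slice
}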
