Skip to content

Commit 14d3701

Browse files
committed
Remove MaxBlockDimY and Z
Remove duplicate/redundant member vars
1 parent 97088aa commit 14d3701

File tree

3 files changed

+20
-35
lines changed

3 files changed

+20
-35
lines changed

source/adapters/cuda/device.hpp

Lines changed: 6 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,6 @@ struct ur_device_handle_t_ {
2727
size_t MaxWorkItemSizes[MaxWorkItemDimensions];
2828
size_t MaxWorkGroupSize{0};
2929
size_t MaxAllocSize{0};
30-
int MaxBlockDimY{0};
31-
int MaxBlockDimZ{0};
3230
int MaxRegsPerBlock{0};
3331
int MaxCapacityLocalMem{0};
3432
int MaxChosenLocalMem{0};
@@ -95,16 +93,13 @@ struct ur_device_handle_t_ {
9593

9694
uint64_t getElapsedTime(CUevent) const;
9795

98-
void getMaxWorkItemSizes(size_t RetSize,
99-
size_t *RetMaxWorkItemSizes) const noexcept {
100-
memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize);
101-
};
102-
103-
size_t getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; };
104-
105-
size_t getMaxBlockDimY() const noexcept { return MaxBlockDimY; };
96+
size_t getMaxWorkItemSizes(int index) const noexcept {
97+
return MaxWorkItemSizes[index];
98+
}
10699

107-
size_t getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; };
100+
size_t getMaxWorkGroupSize() const noexcept {
101+
return MaxWorkGroupSize;
102+
};
108103

109104
size_t getMaxRegsPerBlock() const noexcept { return MaxRegsPerBlock; };
110105

source/adapters/cuda/enqueue.cpp

Lines changed: 10 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,6 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
139139
// dimension.
140140
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
141141
const size_t *GlobalWorkSize, const uint32_t WorkDim,
142-
const size_t MaxThreadsPerBlock[3],
143142
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
144143
assert(ThreadsPerBlock != nullptr);
145144
assert(GlobalWorkSize != nullptr);
@@ -154,20 +153,21 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

157-
MaxBlockDim[1] = Device->getMaxBlockDimY();
158-
MaxBlockDim[2] = Device->getMaxBlockDimZ();
156+
MaxBlockDim[1] = Device->getMaxWorkItemSizes(1);
157+
MaxBlockDim[2] = Device->getMaxWorkItemSizes(2);
159158

160-
UR_CHECK_ERROR(
161-
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
162-
NULL, LocalSize, MaxThreadsPerBlock[0]));
159+
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
160+
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, LocalSize,
161+
Device->getMaxWorkItemSizes(0)));
163162

164163
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
165164
ThreadsPerBlock[1] =
166165
std::min(GlobalSizeNormalized[1],
167166
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
168167
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
169-
ThreadsPerBlock[0] = std::min(
170-
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
168+
ThreadsPerBlock[0] =
169+
std::min(Device->getMaxWorkItemSizes(0),
170+
std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
171171

172172
static auto IsPowerOf2 = [](size_t Value) -> bool {
173173
return Value && !(Value & (Value - 1));
@@ -213,7 +213,6 @@ setKernelParams(const ur_context_handle_t Context,
213213
size_t (&BlocksPerGrid)[3]) {
214214
ur_result_t Result = UR_RESULT_SUCCESS;
215215
size_t MaxWorkGroupSize = 0u;
216-
size_t MaxThreadsPerBlock[3] = {};
217216
bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr;
218217
uint32_t LocalSize = Kernel->getLocalSize();
219218

@@ -223,16 +222,14 @@ setKernelParams(const ur_context_handle_t Context,
223222
{
224223
size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
225224
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
226-
Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
227-
MaxThreadsPerBlock);
228225

229226
if (ProvidedLocalWorkGroupSize) {
230227
auto IsValid = [&](int Dim) {
231228
if (ReqdThreadsPerBlock[Dim] != 0 &&
232229
LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
233230
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
234231

235-
if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
232+
if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
236233
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
237234
// Checks that local work sizes are a divisor of the global work sizes
238235
// which includes that the local work sizes are neither larger than
@@ -261,7 +258,7 @@ setKernelParams(const ur_context_handle_t Context,
261258
}
262259
} else {
263260
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
264-
MaxThreadsPerBlock, Kernel, LocalSize);
261+
Kernel, LocalSize);
265262
}
266263
}
267264

source/adapters/cuda/kernel.cpp

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -68,14 +68,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
6868
case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
6969
size_t GlobalWorkSize[3] = {0, 0, 0};
7070

71-
int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0};
72-
UR_CHECK_ERROR(cuDeviceGetAttribute(
73-
&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get()));
74-
UR_CHECK_ERROR(cuDeviceGetAttribute(
75-
&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get()));
76-
UR_CHECK_ERROR(cuDeviceGetAttribute(
77-
&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get()));
78-
7971
int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0};
8072
UR_CHECK_ERROR(cuDeviceGetAttribute(
8173
&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get()));
@@ -84,9 +76,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
8476
UR_CHECK_ERROR(cuDeviceGetAttribute(
8577
&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get()));
8678

87-
GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX;
88-
GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY;
89-
GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ;
79+
GlobalWorkSize[0] = hDevice->getMaxWorkItemSizes(0) * MaxGridDimX;
80+
GlobalWorkSize[1] = hDevice->getMaxWorkItemSizes(1) * MaxGridDimY;
81+
GlobalWorkSize[2] = hDevice->getMaxWorkItemSizes(2) * MaxGridDimZ;
82+
9083
return ReturnValue(GlobalWorkSize, 3);
9184
}
9285
case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {

0 commit comments

Comments
 (0)