Skip to content

Commit b430b82

Browse files
committed
Move some shared code to ur/ur.hpp
Moves code to ur/ur.hpp that is used by both CUDA/HIP adapters. Perhaps there is some better place to put this. Also replaces the use of lambdas with free functions.
1 parent 96c44da commit b430b82

File tree

3 files changed

+43
-44
lines changed

3 files changed

+43
-44
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <cmath>
2020
#include <cuda.h>
21+
#include <ur/ur.hpp>
2122

2223
ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
2324
uint32_t NumEventsInWaitList,
@@ -144,8 +145,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
144145
assert(ThreadsPerBlock != nullptr);
145146
assert(GlobalWorkSize != nullptr);
146147
assert(Kernel != nullptr);
147-
int MinGrid, MaxBlockSize;
148-
size_t MaxBlockDim[3];
149148

150149
// The below assumes a three dimensional range but this is not guaranteed by
151150
// UR.
@@ -154,39 +153,31 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

156+
size_t MaxBlockDim[3];
157+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
157158
MaxBlockDim[1] = Device->getMaxBlockDimY();
158159
MaxBlockDim[2] = Device->getMaxBlockDimZ();
159160

161+
int MinGrid, MaxBlockSize;
160162
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
161163
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
162164
MaxThreadsPerBlock[0]));
163165

164-
// Helper lambda to make sure each x, y, z dim divide the global dimension.
165-
// Can optionally specify that we want the wg size to be a power of 2 in a
166-
// given dimension, which is useful for the X dim for performance reasons.
167-
static auto roundToHighestFactorOfGlobalSize =
168-
[](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
169-
bool MakePowerOfTwo) {
170-
auto IsPowerOf2 = [](size_t Value) -> bool {
171-
return Value && !(Value & (Value - 1));
172-
};
173-
while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
174-
(MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
175-
--ThreadsPerBlockInDim;
176-
};
177-
178166
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
179-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2],
180-
false);
167+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2]);
168+
181169
ThreadsPerBlock[1] =
182170
std::min(GlobalSizeNormalized[1],
183171
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
184-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1],
185-
false);
172+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1]);
173+
186174
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
187175
ThreadsPerBlock[0] = std::min(
188176
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
189-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0], true);
177+
// Make the X dim a power of 2
178+
do {
179+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0]);
180+
} while (!isPowerOf2(ThreadsPerBlock[0]));
190181
}
191182

192183
// Helper to verify out-of-registers case (exceeded block max registers).

source/adapters/hip/enqueue.cpp

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "memory.hpp"
1616
#include "queue.hpp"
1717

18+
#include <ur/ur.hpp>
19+
1820
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
1921

2022
namespace {
@@ -59,49 +61,38 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
5961
assert(ThreadsPerBlock != nullptr);
6062
assert(GlobalWorkSize != nullptr);
6163
assert(Kernel != nullptr);
62-
int MinGrid, MaxBlockSize;
63-
size_t MaxBlockDim[3];
6464

65-
// The below assumes a three dimensional range but this is not guaranteed by
66-
// UR.
65+
// FIXME: The below assumes a three dimensional range but this is not
66+
// guaranteed by UR.
6767
size_t GlobalSizeNormalized[3] = {1, 1, 1};
6868
for (uint32_t i = 0; i < WorkDim; i++) {
6969
GlobalSizeNormalized[i] = GlobalWorkSize[i];
7070
}
7171

72+
size_t MaxBlockDim[3];
7273
MaxBlockDim[1] = Device->getMaxBlockDimY();
7374
MaxBlockDim[2] = Device->getMaxBlockDimZ();
7475

76+
int MinGrid, MaxBlockSize;
7577
UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
7678
&MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
7779
MaxThreadsPerBlock[0]));
7880

79-
// Helper lambda to make sure each x, y, z dim divide the global dimension.
80-
// Can optionally specify that we want the wg size to be a power of 2 in a
81-
// given dimension, which is useful for the X dim for performance reasons.
82-
static auto roundToHighestFactorOfGlobalSize =
83-
[](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
84-
bool MakePowerOfTwo) {
85-
auto IsPowerOf2 = [](size_t Value) -> bool {
86-
return Value && !(Value & (Value - 1));
87-
};
88-
while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
89-
(MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
90-
--ThreadsPerBlockInDim;
91-
};
92-
9381
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
94-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2],
95-
false);
82+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2]);
83+
9684
ThreadsPerBlock[1] =
9785
std::min(GlobalSizeNormalized[1],
9886
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
99-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1],
100-
false);
87+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1]);
88+
10189
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
10290
ThreadsPerBlock[0] = std::min(
10391
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
104-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0], true);
92+
// Make the X dim a power of 2
93+
do {
94+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0]);
95+
} while (!IsPowerOf2(ThreadsPerBlock[0]));
10596
}
10697

10798
ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,

source/ur/ur.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,20 @@ template <typename T> class Result {
321321
private:
322322
std::variant<ur_result_t, T> value_or_err;
323323
};
324+
325+
// Helper to make sure each x, y, z dim divide the global dimension.
//
// Decrements ThreadsPerBlockInDim until it evenly divides GlobalWorkSizeInDim,
// i.e. rounds it down to the nearest factor of the global size. A value that
// already divides the global size (which is every non-zero value when the
// global size is 0) is left untouched.
//
// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension.
//                                A value of 0 is replaced by 1.
// In:     GlobalWorkSizeInDim  - The global size in some dimension
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                 const size_t GlobalWorkSizeInDim) {
  // A candidate of 0 (e.g. produced by an integer division in the caller)
  // would make the modulo below undefined behaviour. Fall back to 1, which
  // divides every global size.
  if (ThreadsPerBlockInDim == 0) {
    ThreadsPerBlockInDim = 1;
    return;
  }
  // Terminates at the latest when the candidate reaches 1.
  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
    --ThreadsPerBlockInDim;
  }
}
336+
337+
// Returns whether or not Value is a power of 2.
//
// In: Value - The integer to test. Zero and negative values are never
//             reported as powers of 2.
template <typename T> constexpr bool isPowerOf2(const T &Value) {
  // Reject non-positive values first: they are not powers of 2, and for the
  // most negative signed value `Value - 1` would overflow.
  // A positive power of 2 has exactly one bit set, so clearing its lowest set
  // bit with `Value & (Value - 1)` must leave zero.
  return Value > 0 && !(Value & (Value - 1));
}

0 commit comments

Comments
 (0)