// Stringification helper: STRINGIZE(x) expands x, then turns it into a string
// literal via STRINGIZE_IMPL (defined elsewhere in this file).
// NOTE: there must be NO space between the macro name and the parameter list,
// otherwise this becomes an object-like macro and expands incorrectly.
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32

#define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define CC_PASCAL     600
#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -293,8 +294,20 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
293
294
return x;
294
295
}
295
296
297
// Compatibility fallbacks for __hmax/__hmax2 on CUDA toolkits older than 11.7
// (CUDART_HMAX), where the native intrinsics are not reliably available.
// Compiled out on HIP/AMD builds, which provide their own implementations.
// Comparisons are done in float to avoid depending on half-precision compare
// intrinsics that may also be missing on old toolkits.
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
// Element-wise maximum of two half values.
static __device__ __forceinline__ half __hmax(const half a, const half b) {
    return __half2float(a) > __half2float(b) ? a : b;
}
// Lane-wise maximum of two half2 values (low and high halves independently).
static __device__ __forceinline__ half2 __hmax2(const half2 a, const half2 b) {
    half2 ret;
    // reinterpret_cast is needed because half2::x/y are stored as raw
    // __half_raw members on some toolkit versions.
    reinterpret_cast<half&>(ret.x) = __low2float(a)  > __low2float(b)  ? __low2half(a)  : __low2half(b);
    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
    return ret;
}
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
296
309
static __device__ __forceinline__ half2 warp_reduce_max (half2 x) {
297
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
310
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
298
311
#pragma unroll
299
312
for (int mask = 16 ; mask > 0 ; mask >>= 1 ) {
300
313
x = __hmax2 (x, __shfl_xor_sync (0xffffffff , x, mask, 32 ));
@@ -303,10 +316,10 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
303
316
#else
304
317
GGML_UNUSED (x);
305
318
NO_DEVICE_CODE;
306
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
319
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
307
320
}
308
321
309
- #if CUDART_VERSION < 12000
322
+ #if CUDART_VERSION < CUDART_HMASK
310
323
static __device__ __forceinline__ uint32_t __hgt2_mask (const half2 a, const half2 b) {
311
324
const uint32_t mask_low = 0x0000FFFF * (float ( __low2half (a)) > float ( __low2half (b)));
312
325
const uint32_t mask_high = 0xFFFF0000 * (float (__high2half (a)) > float (__high2half (b)));
0 commit comments