Skip to content

Commit ff90589

Browse files
authored
Merge pull request GPUOpen-LibrariesAndSDKs#189 from AvKhokhlov/reduction_primitivies
Added normalization, reduction and atomic premitieves
2 parents 3de8f3f + f407695 commit ff90589

File tree

4 files changed

+379
-85
lines changed

4 files changed

+379
-85
lines changed

CLW/CL/CLW.cl

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,21 @@ THE SOFTWARE.
2323
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
2424
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2525

26+
// --------------------- CONSTANTS ------------------------
27+
// add neutral elements
28+
__constant int neutral_add_int = 0;
29+
__constant float neutral_add_float = 0;
30+
__constant float3 neutral_add_float3 = (float3)(0.0, 0.0, 0.0);
31+
// max neutral elements
32+
__constant int neutral_max_int = INT_MIN;
33+
__constant float neutral_max_float = FLT_MIN;
34+
__constant float3 neutral_max_float3 = (float3)(FLT_MIN, FLT_MIN, FLT_MIN);
35+
// min neutral elements
36+
__constant int neutral_min_int = INT_MAX;
37+
__constant float neutral_min_float = FLT_MAX;
38+
__constant float3 neutral_min_float3 = (float3)(FLT_MAX, FLT_MAX, FLT_MAX);
39+
40+
__constant float epsilon = .00001f;
2641

2742
// --------------------- HELPERS ------------------------
2843
//#define INT_MAX 0x7FFFFFFF
@@ -1492,3 +1507,101 @@ __kernel void segmented_distribute_part_sum_int_nocut(
14921507
}
14931508
}
14941509
}
1510+
1511+
// --------------------- ATOMIC OPERTIONS ------------------------
1512+
1513+
#define DEFINE_ATOMIC(operation)\
1514+
__attribute__((always_inline)) void atomic_##operation##_float(volatile __global float* addr, float value)\
1515+
{\
1516+
union\
1517+
{\
1518+
unsigned int u32;\
1519+
float f32;\
1520+
} next, expected, current;\
1521+
current.f32 = *addr;\
1522+
do\
1523+
{\
1524+
expected.f32 = current.f32;\
1525+
next.f32 = operation(expected.f32, value);\
1526+
current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\
1527+
expected.u32, next.u32);\
1528+
} while (current.u32 != expected.u32);\
1529+
}
1530+
1531+
#define DEFINE_ATOMIC_FLOAT3(operation)\
1532+
__attribute__((always_inline)) void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
1533+
{\
1534+
volatile __global float* p = (volatile __global float*)addr;\
1535+
atomic_##operation##_float(p, value.x);\
1536+
atomic_##operation##_float(p + 1, value.y);\
1537+
atomic_##operation##_float(p + 2, value.z);\
1538+
}
1539+
1540+
__attribute__((always_inline)) void atomic_max_int(volatile __global int* addr, int value)
1541+
{
1542+
atomic_max(addr, value);
1543+
}
1544+
1545+
__attribute__((always_inline)) void atomic_min_int(volatile __global int* addr, int value)
1546+
{
1547+
atomic_min(addr, value);
1548+
}
1549+
1550+
// --------------------- REDUCTION ------------------------
1551+
1552+
#define DEFINE_REDUCTION(bin_op, type)\
1553+
__kernel void reduction_##bin_op##_##type(const __global type* buffer,\
1554+
int count,\
1555+
__local type* shared_mem,\
1556+
__global type* out,\
1557+
int /* in elements */ out_offset)\
1558+
{\
1559+
int global_id = get_global_id(0);\
1560+
int group_id = get_group_id(0);\
1561+
int local_id = get_local_id(0);\
1562+
int group_size = get_local_size(0);\
1563+
if (global_id < count)\
1564+
shared_mem[local_id] = buffer[global_id];\
1565+
else\
1566+
shared_mem[local_id] = neutral_##bin_op##_##type;\
1567+
barrier(CLK_LOCAL_MEM_FENCE);\
1568+
for (int i = group_size / 2; i > 0; i >>= 1)\
1569+
{\
1570+
if (local_id < i)\
1571+
shared_mem[local_id] = bin_op(shared_mem[local_id], shared_mem[local_id + i]);\
1572+
barrier(CLK_LOCAL_MEM_FENCE);\
1573+
}\
1574+
if (local_id == 0)\
1575+
atomic_##bin_op##_##type(out + out_offset, shared_mem[0]);\
1576+
}
1577+
1578+
// --------------------- NORMALIZATION ------------------------
1579+
1580+
#define DEFINE_BUFFER_NORMALIZATION(type)\
1581+
__kernel void buffer_normalization_##type(const __global type* input,\
1582+
__global type* output,\
1583+
int count,\
1584+
const __global type* storage)\
1585+
{\
1586+
type norm_coef = storage[0] - storage[1];\
1587+
int global_id = get_global_id(0);\
1588+
if (global_id < count)\
1589+
output[global_id] = (input[global_id] - storage[1]) / norm_coef;\
1590+
}
1591+
1592+
// Do not change the order
1593+
DEFINE_ATOMIC(min)
1594+
DEFINE_ATOMIC(max)
1595+
DEFINE_ATOMIC_FLOAT3(min)
1596+
DEFINE_ATOMIC_FLOAT3(max)
1597+
1598+
DEFINE_REDUCTION(min, int)
1599+
DEFINE_REDUCTION(min, float)
1600+
DEFINE_REDUCTION(min, float3)
1601+
DEFINE_REDUCTION(max, int)
1602+
DEFINE_REDUCTION(max, float)
1603+
DEFINE_REDUCTION(max, float3)
1604+
1605+
DEFINE_BUFFER_NORMALIZATION(int)
1606+
DEFINE_BUFFER_NORMALIZATION(float)
1607+
DEFINE_BUFFER_NORMALIZATION(float3)

0 commit comments

Comments
 (0)