@@ -23,6 +23,21 @@ THE SOFTWARE.
23
23
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
24
24
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
25
25
26
+ // --------------------- CONSTANTS ------------------------
27
+ // add neutral elements
28
+ __constant int neutral_add_int = 0 ;
29
+ __constant float neutral_add_float = 0 ;
30
+ __constant float3 neutral_add_float3 = (float3 )(0.0 , 0.0 , 0.0 );
31
+ // max neutral elements
32
+ __constant int neutral_max_int = INT_MIN ;
33
+ __constant float neutral_max_float = FLT_MIN ;
34
+ __constant float3 neutral_max_float3 = (float3 )(FLT_MIN , FLT_MIN , FLT_MIN );
35
+ // min neutral elements
36
+ __constant int neutral_min_int = INT_MAX ;
37
+ __constant float neutral_min_float = FLT_MAX ;
38
+ __constant float3 neutral_min_float3 = (float3 )(FLT_MAX , FLT_MAX , FLT_MAX );
39
+
40
+ __constant float epsilon = .00001f ;
26
41
27
42
// --------------------- HELPERS ------------------------
28
43
//#define INT_MAX 0x7FFFFFFF
@@ -1492,3 +1507,101 @@ __kernel void segmented_distribute_part_sum_int_nocut(
1492
1507
}
1493
1508
}
1494
1509
}
1510
+
1511
+ // --------------------- ATOMIC OPERTIONS ------------------------
1512
+
1513
+ #define DEFINE_ATOMIC (operation )\
1514
+ __attribute__((always_inline)) void atomic_##operation##_float(volatile __global float* addr, float value)\
1515
+ {\
1516
+ union\
1517
+ {\
1518
+ unsigned int u32;\
1519
+ float f32;\
1520
+ } next, expected, current;\
1521
+ current.f32 = *addr;\
1522
+ do\
1523
+ {\
1524
+ expected.f32 = current.f32;\
1525
+ next.f32 = operation(expected.f32, value);\
1526
+ current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\
1527
+ expected.u32, next.u32);\
1528
+ } while (current.u32 != expected.u32);\
1529
+ }
1530
+
1531
+ #define DEFINE_ATOMIC_FLOAT3 (operation )\
1532
+ __attribute__((always_inline)) void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
1533
+ {\
1534
+ volatile __global float* p = (volatile __global float*)addr;\
1535
+ atomic_##operation##_float(p, value.x);\
1536
+ atomic_##operation##_float(p + 1, value.y);\
1537
+ atomic_##operation##_float(p + 2, value.z);\
1538
+ }
1539
+
1540
+ __attribute__((always_inline )) void atomic_max_int (volatile __global int * addr , int value )
1541
+ {
1542
+ atomic_max (addr , value );
1543
+ }
1544
+
1545
+ __attribute__((always_inline )) void atomic_min_int (volatile __global int * addr , int value )
1546
+ {
1547
+ atomic_min (addr , value );
1548
+ }
1549
+
1550
+ // --------------------- REDUCTION ------------------------
1551
+
1552
+ #define DEFINE_REDUCTION (bin_op , type )\
1553
+ __kernel void reduction_##bin_op##_##type(const __global type* buffer,\
1554
+ int count,\
1555
+ __local type* shared_mem,\
1556
+ __global type* out,\
1557
+ int /* in elements */ out_offset )\
1558
+ {\
1559
+ int global_id = get_global_id(0);\
1560
+ int group_id = get_group_id(0);\
1561
+ int local_id = get_local_id(0);\
1562
+ int group_size = get_local_size(0);\
1563
+ if (global_id < count)\
1564
+ shared_mem[local_id] = buffer[global_id];\
1565
+ else\
1566
+ shared_mem[local_id] = neutral_##bin_op##_##type;\
1567
+ barrier(CLK_LOCAL_MEM_FENCE);\
1568
+ for (int i = group_size / 2; i > 0; i >>= 1)\
1569
+ {\
1570
+ if (local_id < i)\
1571
+ shared_mem[local_id] = bin_op(shared_mem[local_id], shared_mem[local_id + i]);\
1572
+ barrier(CLK_LOCAL_MEM_FENCE);\
1573
+ }\
1574
+ if (local_id == 0)\
1575
+ atomic_##bin_op##_##type(out + out_offset, shared_mem[0]);\
1576
+ }
1577
+
1578
+ // --------------------- NORMALIZATION ------------------------
1579
+
1580
+ #define DEFINE_BUFFER_NORMALIZATION (type )\
1581
+ __kernel void buffer_normalization_##type(const __global type* input,\
1582
+ __global type* output,\
1583
+ int count,\
1584
+ const __global type* storage)\
1585
+ {\
1586
+ type norm_coef = storage[0] - storage[1];\
1587
+ int global_id = get_global_id(0);\
1588
+ if (global_id < count)\
1589
+ output[global_id] = (input[global_id] - storage[1]) / norm_coef;\
1590
+ }
1591
+
1592
+ // Do not change the order
1593
+ DEFINE_ATOMIC (min )
1594
+ DEFINE_ATOMIC (max )
1595
+ DEFINE_ATOMIC_FLOAT3 (min )
1596
+ DEFINE_ATOMIC_FLOAT3 (max )
1597
+
1598
+ DEFINE_REDUCTION (min , int )
1599
+ DEFINE_REDUCTION (min , float )
1600
+ DEFINE_REDUCTION (min , float3 )
1601
+ DEFINE_REDUCTION (max , int )
1602
+ DEFINE_REDUCTION (max , float )
1603
+ DEFINE_REDUCTION (max , float3 )
1604
+
1605
+ DEFINE_BUFFER_NORMALIZATION (int )
1606
+ DEFINE_BUFFER_NORMALIZATION (float )
1607
+ DEFINE_BUFFER_NORMALIZATION (float3 )
0 commit comments