11
11
12
12
#ifdef cl_khr_fp16
13
13
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
14
- struct out_16 {
15
- short x , y , z , w ;
16
- };
17
14
#endif
18
15
19
16
#ifdef cl_khr_3d_image_writes
20
17
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
21
18
#endif
22
19
23
- struct out_32 {
24
- int x , y , z , w ;
25
- };
26
-
27
20
// CLC helpers
28
21
int __clc__sampler_extract_normalized_coords_prop (int ) __asm(
29
22
"__clc__sampler_extract_normalized_coords_prop" );
@@ -57,73 +50,79 @@ int __clc__sampled_image3d_unpack_sampler(__ocl_sampled_image3d_ro_t) __asm(
57
50
"__clc__sampled_image_unpack_sampler" );
58
51
59
52
// NVVM helpers
60
- struct out_16
61
- __nvvm_suld_1d_v4i16_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_trap_s" );
62
- struct out_16
53
+ #ifdef cl_khr_fp16
54
+ short4
55
+ __nvvm_suld_1d_v4i16_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_trap" );
56
+ short4
63
57
__nvvm_suld_2d_v4i16_trap_s (long , int ,
64
58
int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_trap" );
65
- struct out_16
59
+ short4
66
60
__nvvm_suld_3d_v4i16_trap_s (long , int , int ,
67
61
int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_trap" );
68
- struct out_32
69
- __nvvm_suld_1d_v4i32_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_trap" );
70
- struct out_32
71
- __nvvm_suld_2d_v4i32_trap_s (long , int ,
72
- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_trap" );
73
- struct out_32
74
- __nvvm_suld_3d_v4i32_trap_s (long , int , int ,
75
- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_trap" );
76
62
77
- struct out_16
63
+ short4
78
64
__nvvm_suld_1d_v4i16_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_clamp" );
79
- struct out_16
65
+ short4
80
66
__nvvm_suld_2d_v4i16_clamp_s (long , int ,
81
67
int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_clamp" );
82
- struct out_16
68
+ short4
83
69
__nvvm_suld_3d_v4i16_clamp_s (long , int , int ,
84
70
int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_clamp" );
85
- struct out_32
86
- __nvvm_suld_1d_v4i32_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
87
- struct out_32
88
- __nvvm_suld_2d_v4i32_clamp_s (long , int ,
89
- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
90
- struct out_32
91
- __nvvm_suld_3d_v4i32_clamp_s (long , int , int ,
92
- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
93
71
94
- struct out_16
72
+ short4
95
73
__nvvm_suld_1d_v4i16_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_zero" );
96
- struct out_16
74
+ short4
97
75
__nvvm_suld_2d_v4i16_zero_s (long , int ,
98
76
int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_zero" );
99
- struct out_16
77
+ short4
100
78
__nvvm_suld_3d_v4i16_zero_s (long , int , int ,
101
79
int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_zero" );
102
- struct out_32
103
- __nvvm_suld_1d_v4i32_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_zero" );
104
- struct out_32
105
- __nvvm_suld_2d_v4i32_zero_s (long , int ,
106
- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_zero" );
107
- struct out_32
108
- __nvvm_suld_3d_v4i32_zero_s (long , int , int ,
109
- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_zero" );
110
80
111
- struct out_16
81
+ short4
112
82
__nvvm_suld_1d_v4i16_clamp (read_only image1d_t ,
113
83
int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_clamp" );
114
- struct out_16
84
+ short4
115
85
__nvvm_suld_2d_v4i16_clamp (read_only image2d_t , int ,
116
86
int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_clamp" );
117
- struct out_16
87
+ short4
118
88
__nvvm_suld_3d_v4i16_clamp (read_only image3d_t , int , int ,
119
89
int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_clamp" );
120
- struct out_32
90
+ #endif
91
+
92
+ int4
93
+ __nvvm_suld_1d_v4i32_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_trap" );
94
+ int4
95
+ __nvvm_suld_2d_v4i32_trap_s (long , int ,
96
+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_trap" );
97
+ int4
98
+ __nvvm_suld_3d_v4i32_trap_s (long , int , int ,
99
+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_trap" );
100
+
101
+ int4
102
+ __nvvm_suld_1d_v4i32_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
103
+ int4
104
+ __nvvm_suld_2d_v4i32_clamp_s (long , int ,
105
+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
106
+ int4
107
+ __nvvm_suld_3d_v4i32_clamp_s (long , int , int ,
108
+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
109
+
110
+ int4
111
+ __nvvm_suld_1d_v4i32_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_zero" );
112
+ int4
113
+ __nvvm_suld_2d_v4i32_zero_s (long , int ,
114
+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_zero" );
115
+ int4
116
+ __nvvm_suld_3d_v4i32_zero_s (long , int , int ,
117
+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_zero" );
118
+
119
+ int4
121
120
__nvvm_suld_1d_v4i32_clamp (read_only image1d_t ,
122
121
int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
123
- struct out_32
122
+ int4
124
123
__nvvm_suld_2d_v4i32_clamp (read_only image2d_t , int ,
125
124
int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
126
- struct out_32
125
+ int4
127
126
__nvvm_suld_3d_v4i32_clamp (read_only image3d_t , int , int ,
128
127
int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
129
128
@@ -199,11 +198,8 @@ typedef float4 pixelf32;
199
198
typedef half fp16 ;
200
199
typedef float fp32 ;
201
200
202
- #define _DEFINE_OUT_TYPE (elem_t , elem_size ) \
203
- inline elem_t##4 out_##elem_t(struct out_##elem_size out) { \
204
- return (elem_t##4)(as_##elem_t(out.x), as_##elem_t(out.y), \
205
- as_##elem_t(out.z), as_##elem_t(out.w)); \
206
- }
201
// Reinterpret the raw integer vector produced by the __nvvm_suld_* helpers as
// the matching pixelf type, using the OpenCL as_half4 / as_float4 built-ins.
// This is a bit-for-bit reinterpretation, not a numeric value conversion.
+ pixelf16 as_pixelf16 (short4 v ) { return as_half4 (v ); }
202
+ pixelf32 as_pixelf32 (int4 v ) { return as_float4 (v ); }
207
203
208
204
#define _DEFINE_VEC4_CAST (from_t , to_t ) \
209
205
inline to_t##4 cast_##from_t##4_to_##to_t##4(from_t##4 from) { \
@@ -223,44 +219,30 @@ typedef float fp32;
223
219
return cast_##pixelf_base_t##_to_##to_t(from); \
224
220
}
225
221
226
- #define _DEFINE_OUT_PIXELF (pixelf_size , elem_t ) \
227
- inline pixelf##pixelf_size out_pixelf##pixelf_size( \
228
- struct out_##pixelf_size out) { \
229
- return (pixelf##pixelf_size)(as_##elem_t(out.x), as_##elem_t(out.y), \
230
- as_##elem_t(out.z), as_##elem_t(out.w)); \
231
- }
232
-
233
222
// Defines read_1d_<pixelf_size>_<cuda_address_mode>(image, x), which returns
// a pixelf<pixelf_size>. The generated body forwards to
// __nvvm_suld_1d_v4i<pixelf_size>_<cuda_address_mode>_s, passing x multiplied
// by the size of one 4-channel element, and turns the loaded value into the
// pixelf type.
// NOTE(review): this span is a rendered diff; '-' lines are the removed
// struct-based body and '+' lines are the vector-based replacement.
#define _DEFINE_READ_1D_PIXELF (pixelf_size , cuda_address_mode ) \
234
223
pixelf##pixelf_size read_1d_##pixelf_size##_##cuda_address_mode(long image, \
235
224
int x) { \
236
- struct out_ ##pixelf_size res = \
225
+ return as_pixelf ##pixelf_size( \
237
226
__nvvm_suld_1d_v4i##pixelf_size##_##cuda_address_mode##_s( \
238
- image, x * sizeof(struct out_##pixelf_size)); \
239
- return out_pixelf##pixelf_size(res); \
227
+ image, x * sizeof(pixelf##pixelf_size))); \
240
228
}
241
229
242
230
// Defines read_2d_<pixelf_size>_<cuda_address_mode>(image, x, y), which
// returns a pixelf<pixelf_size>. The generated body forwards to
// __nvvm_suld_2d_v4i<pixelf_size>_<cuda_address_mode>_s, passing x multiplied
// by the size of one 4-channel element and y unchanged, and turns the loaded
// value into the pixelf type.
// NOTE(review): this span is a rendered diff; '-' lines are the removed
// struct-based body and '+' lines are the vector-based replacement.
#define _DEFINE_READ_2D_PIXELF (pixelf_size , cuda_address_mode ) \
243
231
pixelf##pixelf_size read_2d_##pixelf_size##_##cuda_address_mode( \
244
232
long image, int x, int y) { \
245
- struct out_ ##pixelf_size res = \
233
+ return as_pixelf ##pixelf_size( \
246
234
__nvvm_suld_2d_v4i##pixelf_size##_##cuda_address_mode##_s( \
247
- image, x * sizeof(struct out_##pixelf_size), y); \
248
- return out_pixelf##pixelf_size(res); \
235
+ image, x * sizeof(pixelf##pixelf_size), y)); \
249
236
}
250
237
251
238
// Defines read_3d_<pixelf_size>_<cuda_address_mode>(image, x, y, z), which
// returns a pixelf<pixelf_size>. The generated body forwards to
// __nvvm_suld_3d_v4i<pixelf_size>_<cuda_address_mode>_s, passing x multiplied
// by the size of one 4-channel element and y, z unchanged, and turns the
// loaded value into the pixelf type.
// NOTE(review): this span is a rendered diff; '-' lines are the removed
// struct-based body and '+' lines are the vector-based replacement.
#define _DEFINE_READ_3D_PIXELF (pixelf_size , cuda_address_mode ) \
252
239
pixelf##pixelf_size read_3d_##pixelf_size##_##cuda_address_mode( \
253
240
long image, int x, int y, int z) { \
254
- struct out_ ##pixelf_size res = \
241
+ return as_pixelf ##pixelf_size( \
255
242
__nvvm_suld_3d_v4i##pixelf_size##_##cuda_address_mode##_s( \
256
- image, x * sizeof(struct out_##pixelf_size), y, z); \
257
- return out_pixelf##pixelf_size(res); \
243
+ image, x * sizeof(pixelf##pixelf_size), y, z)); \
258
244
}
259
245
260
- _DEFINE_OUT_TYPE (float , 32 )
261
- _DEFINE_OUT_TYPE (int , 32 )
262
- _DEFINE_OUT_TYPE (uint , 32 )
263
-
264
246
_DEFINE_VEC4_CAST (float , int )
265
247
_DEFINE_VEC4_CAST (int , float )
266
248
_DEFINE_VEC4_CAST (float , uint )
@@ -276,8 +258,6 @@ _DEFINE_CAST(pixelf32, float4)
276
258
_DEFINE_CAST (pixelf32 , pixelf32 )
277
259
_DEFINE_CAST (float4 , pixelf32 )
278
260
279
- _DEFINE_OUT_PIXELF (32 , float )
280
-
281
261
_DEFINE_PIXELF_CAST (32 , float4 , int4 )
282
262
_DEFINE_PIXELF_CAST (32 , float4 , uint4 )
283
263
@@ -298,8 +278,6 @@ _DEFINE_CAST(half4, half4)
298
278
_DEFINE_CAST (pixelf16 , half4 )
299
279
_DEFINE_CAST (pixelf16 , pixelf16 )
300
280
_DEFINE_CAST (half4 , pixelf16 )
301
- _DEFINE_OUT_TYPE (half , 16 )
302
- _DEFINE_OUT_PIXELF (16 , half )
303
281
_DEFINE_READ_1D_PIXELF (16 , trap )
304
282
_DEFINE_READ_2D_PIXELF (16 , trap )
305
283
_DEFINE_READ_3D_PIXELF (16 , trap )
@@ -311,11 +289,9 @@ _DEFINE_READ_2D_PIXELF(16, clamp)
311
289
_DEFINE_READ_3D_PIXELF (16 , clamp )
312
290
#endif
313
291
314
- #undef _DEFINE_OUT_TYPE
315
292
#undef _DEFINE_VEC4_CAST
316
293
#undef _DEFINE_VEC2_CAST
317
294
#undef _DEFINE_CAST
318
- #undef _DEFINE_OUT_PIXELF
319
295
#undef _DEFINE_READ_1D_PIXELF
320
296
#undef _DEFINE_READ_2D_PIXELF
321
297
#undef _DEFINE_READ_3D_PIXELF
@@ -327,15 +303,15 @@ _DEFINE_READ_3D_PIXELF(16, clamp)
327
303
_CLC_DEF \
328
304
elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image1d_roiET_T0_T1_( \
329
305
read_only image1d_t image, int x) { \
330
- return out_ ##elem_t( \
306
+ return as_ ##elem_t##4( \
331
307
__nvvm_suld_1d_v4i##elem_size##_clamp(image, x * sizeof(elem_t##4))); \
332
308
}
333
309
334
310
#define _CLC_DEFINE_IMAGE2D_READ_BUILTIN (elem_t , elem_t_mangled , elem_size ) \
335
311
_CLC_DEF \
336
312
elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image2d_roDv2_iET_T0_T1_( \
337
313
read_only image2d_t image, int2 coord) { \
338
- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_clamp( \
314
+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_clamp( \
339
315
image, coord.x * sizeof(elem_t##4), coord.y)); \
340
316
}
341
317
@@ -344,7 +320,7 @@ _DEFINE_READ_3D_PIXELF(16, clamp)
344
320
_CLC_DEF \
345
321
elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image3d_ro##coord_mangled##ET_T0_T1_( \
346
322
read_only image3d_t image, int4 coord) { \
347
- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_clamp( \
323
+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_clamp( \
348
324
image, coord.x * sizeof(elem_t##4), coord.y, coord.z)); \
349
325
}
350
326
@@ -463,7 +439,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
463
439
float coord, long image, int sampler) { \
464
440
if (is_nearest_filter_mode(sampler)) { \
465
441
int i = (int)__spirv_ocl_floor(coord); \
466
- return out_ ##elem_t( \
442
+ return as_ ##elem_t##4( \
467
443
__nvvm_suld_1d_v4i##elem_size##_##cuda_address_mode##_s( \
468
444
image, i * sizeof(elem_t##4))); \
469
445
} else { \
@@ -487,7 +463,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
487
463
if (is_nearest_filter_mode(sampler)) { \
488
464
int i = (int)__spirv_ocl_floor(coord.x); \
489
465
int j = (int)__spirv_ocl_floor(coord.y); \
490
- return out_ ##elem_t( \
466
+ return as_ ##elem_t##4( \
491
467
__nvvm_suld_2d_v4i##elem_size##_##cuda_address_mode##_s( \
492
468
image, i * sizeof(elem_t##4), j)); \
493
469
} else { \
@@ -520,7 +496,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
520
496
int i = (int)__spirv_ocl_floor(coord.x); \
521
497
int j = (int)__spirv_ocl_floor(coord.y); \
522
498
int k = (int)__spirv_ocl_floor(coord.z); \
523
- return out_ ##elem_t( \
499
+ return as_ ##elem_t##4( \
524
500
__nvvm_suld_3d_v4i##elem_size##_##cuda_address_mode##_s( \
525
501
image, i * sizeof(elem_t##4), j, k)); \
526
502
} else { \
@@ -570,7 +546,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
570
546
if (i > width - 1) { \
571
547
i = i - width; \
572
548
} \
573
- return out_ ##elem_t(__nvvm_suld_1d_v4i##elem_size##_trap_s( \
549
+ return as_ ##elem_t##4 (__nvvm_suld_1d_v4i##elem_size##_trap_s( \
574
550
image, i * sizeof(elem_t##4))); \
575
551
} else { \
576
552
int i0, i1; \
@@ -609,7 +585,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
609
585
if (j > height - 1) { \
610
586
j = j - height; \
611
587
} \
612
- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_trap_s( \
588
+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_trap_s( \
613
589
image, i * sizeof(elem_t##4), j)); \
614
590
} else { \
615
591
int i0, i1, j0, j1; \
@@ -666,7 +642,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
666
642
if (k > depth - 1) { \
667
643
k = k - depth; \
668
644
} \
669
- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_trap_s( \
645
+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_trap_s( \
670
646
image, i * sizeof(elem_t##4), j, k)); \
671
647
} else { \
672
648
int i0, i1, j0, j1, k0, k1; \
@@ -735,7 +711,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
735
711
int i = (int)__spirv_ocl_floor(u); \
736
712
i = __spirv_ocl_s_min(i, width - 1); \
737
713
\
738
- return out_ ##elem_t(__nvvm_suld_1d_v4i##elem_size##_trap_s( \
714
+ return as_ ##elem_t##4 (__nvvm_suld_1d_v4i##elem_size##_trap_s( \
739
715
image, i * sizeof(elem_t##4))); \
740
716
} else { \
741
717
int i0, i1; \
@@ -771,7 +747,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
771
747
int j = (int)__spirv_ocl_floor(v); \
772
748
j = __spirv_ocl_s_min(j, height - 1); \
773
749
\
774
- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_trap_s( \
750
+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_trap_s( \
775
751
image, i * sizeof(elem_t##4), j)); \
776
752
} else { \
777
753
int i0, i1, j0, j1; \
@@ -821,7 +797,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
821
797
int k = (int)__spirv_ocl_floor(w); \
822
798
k = __spirv_ocl_s_min(k, depth - 1); \
823
799
\
824
- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_trap_s( \
800
+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_trap_s( \
825
801
image, i * sizeof(elem_t##4), j, k)); \
826
802
} else { \
827
803
int i0, i1, j0, j1, k0, k1; \
@@ -913,8 +889,7 @@ _DEFINE_SAMPLED_LOADS(half, 16)
913
889
/* Sampling algorithms are implemented assuming an \
914
890
* unnormalized floating point coordinate as input. Need to transform as \
915
891
* appropriate. */ \
916
- sampling_coord_t sampling_coord = \
917
- cast_ ##input_coord_t ##_to_##sampling_coord_t(input_coord); \
892
+ sampling_coord_t sampling_coord = as_ ##sampling_coord_t (input_coord); \
918
893
if (is_normalized_coords(sampler)) { \
919
894
sampling_coord = unnormalized_coord_##dims##d(sampling_coord, image); \
920
895
} \
0 commit comments