intel · bader · Jun 7, 2019 · May 31, 2019 · keryell · Jun 3, 2019
@@ -15,6 +15,20 @@ namespace cl {
 namespace sycl {
 namespace detail {
 
+namespace half_impl {
+
+class half;
+// The half type is _Float16 on the device and a manually implemented class on
+// the host. The integration header is generated by the device compiler, so it
+// spells half as _Float16 whenever half appears in a kernel name's template
+// parameters. To avoid host compilation errors, the macro below maps _Float16
+// back to the host half class (macros are not scoped by this namespace).
+#ifndef __SYCL_DEVICE_ONLY__
+#define _Float16 cl::sycl::detail::half_impl::half
+#endif
+
+} // namespace half_impl
+
 // kernel parameter kinds
 enum class kernel_param_kind_t {
   kind_accessor,

@@ -16,10 +16,6 @@
 template <typename T, int N> class sycl_subgr;
 
 using namespace cl::sycl;
-// TODO remove this workaround when integration header will support correct
-// half generation
-struct wa_half;
-typedef half aligned_half __attribute__((aligned(16)));
 
 template <typename T, int N> void check(queue &Queue) {
   const int G = 1024, L = 64;
@@ -34,12 +30,10 @@ template <typename T, int N> void check(queue &Queue) {
         acc[i] += 0.1; // Check that floating point types are not casted to int
       }
     }
-    using TT = typename std::conditional<std::is_same<T, aligned_half>::value,
-                                         wa_half, T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<TT, N>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         if (SG.get_group_id().get(0) % N == 0) {
           size_t WGSGoffset =
@@ -103,12 +97,10 @@ template <typename T> void check(queue &Queue) {
       }
     }
 
-    using TT = typename std::conditional<std::is_same<T, aligned_half>::value,
-                                         wa_half, T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<TT, 0>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         if (NdItem.get_global_id(0) == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
@@ -180,6 +172,7 @@ int main() {
     check<aligned_short, 4>(Queue);
     check<aligned_short, 8>(Queue);
     if (Queue.get_device().has_extension("cl_khr_fp16")) {
+      typedef half aligned_half __attribute__((aligned(16)));
       check<aligned_half>(Queue);
       check<aligned_half, 1>(Queue);
       check<aligned_half, 2>(Queue);

@@ -33,8 +33,6 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
     buffer<vec<T, N>> buf_down(G);
     buffer<vec<T, N>> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
-    using TT = typename std::conditional<std::is_same<T, half>::value, wa_half,
-                                         T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
       auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
@@ -48,7 +46,7 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
-      cgh.parallel_for<sycl_subgr<TT, N>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
@@ -141,8 +139,6 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
     buffer<T> buf_down(G);
     buffer<T> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
-    using TT = typename std::conditional<std::is_same<T, half>::value, wa_half,
-                                         T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
       auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
@@ -156,7 +152,7 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
-      cgh.parallel_for<sycl_subgr<TT, 0>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);