From ea29d71ba0afd6e56dfe02f660572e710fcd6111 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Mon, 26 Apr 2021 08:34:24 +0300
Subject: [PATCH 1/7] [SYCL] SG load/store for vec3 and vec16

---
 SYCL/SubGroup/helper.hpp     | 11 +++++++++++
 SYCL/SubGroup/load_store.cpp | 26 +++++++++++++++++++-------
 2 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp
index 964fb742bc..175a0abad8 100644
--- a/SYCL/SubGroup/helper.hpp
+++ b/SYCL/SubGroup/helper.hpp
@@ -36,6 +36,17 @@ template <typename T2> struct utils<T2, 2> {
            std::to_string((T2)v.s1()) + " )";
   }
 };
+template <typename T2> struct utils<T2, 3> {
+  static T2 add_vec(const vec<T2, 3> &v) { return v.s0() + v.s1() + v.s2(); }
+  static bool cmp_vec(const vec<T2, 3> &v, const vec<T2, 3> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2();
+  }
+  static std::string stringify_vec(const vec<T2, 2> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) +
+           " )";
+  }
+};
 template <typename T2> struct utils<T2, 4> {
   static T2 add_vec(const vec<T2, 4> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3();
diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index ffd9bb21c7..aaf6f4cc42 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -2,7 +2,8 @@
 // #2252 Disable until all variants of built-ins are available in OpenCL CPU
 // runtime for every supported ISA
 //
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel \
+//               -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -23,15 +24,10 @@ template <typename T, int N> class sycl_subgr;
 using namespace cl::sycl;
 
 template <typename T, int N> void check(queue &Queue) {
-  const int G = 1024, L = 128;
+  const int G = 1024, L = 1024;
 
-  // Pad arrays based on sub-group size to ensure no out-of-bounds accesses
-  // Workaround for info::device::sub_group_sizes support on some devices
-  size_t max_sg_size = 128;
-#if 0
   auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
   size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
-#endif
 
   try {
     nd_range<1> NdRange(G, L);
@@ -181,20 +177,26 @@ int main() {
     check<aligned_int>(Queue);
     check<aligned_int, 1>(Queue);
     check<aligned_int, 2>(Queue);
+    check<aligned_int, 3>(Queue);
     check<aligned_int, 4>(Queue);
     check<aligned_int, 8>(Queue);
+    check<aligned_int, 16>(Queue);
     typedef unsigned int aligned_uint __attribute__((aligned(16)));
     check<aligned_uint>(Queue);
     check<aligned_uint, 1>(Queue);
     check<aligned_uint, 2>(Queue);
+    check<aligned_uint, 3>(Queue);
     check<aligned_uint, 4>(Queue);
     check<aligned_uint, 8>(Queue);
+    check<aligned_uint, 16>(Queue);
     typedef float aligned_float __attribute__((aligned(16)));
     check<aligned_float>(Queue);
     check<aligned_float, 1>(Queue);
     check<aligned_float, 2>(Queue);
+    check<aligned_float, 3>(Queue);
     check<aligned_float, 4>(Queue);
     check<aligned_float, 8>(Queue);
+    check<aligned_float, 16>(Queue);
   }
   if (Queue.get_device().has_extension("cl_intel_subgroups_short") ||
       PlatformName.find("CUDA") != std::string::npos) {
@@ -202,16 +204,20 @@ int main() {
     check<aligned_short>(Queue);
     check<aligned_short, 1>(Queue);
     check<aligned_short, 2>(Queue);
+    check<aligned_short, 3>(Queue);
     check<aligned_short, 4>(Queue);
     check<aligned_short, 8>(Queue);
+    check<aligned_short, 16>(Queue);
     if (Queue.get_device().has_extension("cl_khr_fp16") ||
         PlatformName.find("CUDA") != std::string::npos) {
       typedef half aligned_half __attribute__((aligned(16)));
       check<aligned_half>(Queue);
       check<aligned_half, 1>(Queue);
       check<aligned_half, 2>(Queue);
+      check<aligned_half, 3>(Queue);
       check<aligned_half, 4>(Queue);
       check<aligned_half, 8>(Queue);
+      check<aligned_half, 16>(Queue);
     }
   }
   if (Queue.get_device().has_extension("cl_intel_subgroups_long") ||
@@ -220,20 +226,26 @@ int main() {
     check<aligned_long>(Queue);
     check<aligned_long, 1>(Queue);
     check<aligned_long, 2>(Queue);
+    check<aligned_long, 3>(Queue);
     check<aligned_long, 4>(Queue);
     check<aligned_long, 8>(Queue);
+    check<aligned_long, 16>(Queue);
     typedef unsigned long aligned_ulong __attribute__((aligned(16)));
     check<aligned_ulong>(Queue);
     check<aligned_ulong, 1>(Queue);
     check<aligned_ulong, 2>(Queue);
+    check<aligned_ulong, 3>(Queue);
     check<aligned_ulong, 4>(Queue);
     check<aligned_ulong, 8>(Queue);
+    check<aligned_ulong, 16>(Queue);
     typedef double aligned_double __attribute__((aligned(16)));
     check<aligned_double>(Queue);
     check<aligned_double, 1>(Queue);
     check<aligned_double, 2>(Queue);
+    check<aligned_double, 3>(Queue);
     check<aligned_double, 4>(Queue);
     check<aligned_double, 8>(Queue);
+    check<aligned_double, 16>(Queue);
   }
   std::cout << "Test passed." << std::endl;
   return 0;

From 45e126151ae3b7369382c293dfd89635122d2ec5 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Mon, 26 Apr 2021 09:02:20 +0300
Subject: [PATCH 2/7] Disable device-code-split which is not supported by CUDA
 BE

---
 SYCL/SubGroup/load_store.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index aaf6f4cc42..f5a3de2e61 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -2,8 +2,7 @@
 // #2252 Disable until all variants of built-ins are available in OpenCL CPU
 // runtime for every supported ISA
 //
-// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel \
-//               -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out

From 86b83d54d519996951f05757b442e3513e2e5e6c Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Mon, 26 Apr 2021 10:37:28 +0300
Subject: [PATCH 3/7] Fix failure due to limitations on GPU

---
 SYCL/SubGroup/helper.hpp     |  8 +++---
 SYCL/SubGroup/load_store.cpp | 49 +++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp
index 175a0abad8..1ef035388b 100644
--- a/SYCL/SubGroup/helper.hpp
+++ b/SYCL/SubGroup/helper.hpp
@@ -109,7 +109,7 @@ template <typename T2> struct utils<T2, 16> {
 
 template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
   if (std::is_floating_point<T>::value) {
-    if (std::fabs(val - ref) > 0.01) {
+    if (std::fabs(val - ref) > 0.02) {
       std::cout << "Unexpected result for " << name << ": " << (double)val
                 << " expected value: " << (double)ref << std::endl;
       exit(1);
@@ -126,8 +126,8 @@ template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
 template <typename T>
 void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
                        const char *name) {
-  if (std::fabs(val.real() - ref.real()) > 0.01 ||
-      std::fabs(val.imag() - ref.imag()) > 0.01) {
+  if (std::fabs(val.real() - ref.real()) > 0.02 ||
+      std::fabs(val.imag() - ref.imag()) > 0.02) {
     std::cout << "Unexpected result for " << name << ": " << val
               << " expected value: " << ref << std::endl;
     exit(1);
@@ -145,7 +145,7 @@ template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
 template <> void exit_if_not_equal(half val, half ref, const char *name) {
   int16_t cmp_val = reinterpret_cast<int16_t &>(val);
   int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
-  if (std::abs(cmp_val - cmp_ref) > 1) {
+  if (std::abs(cmp_val - cmp_ref) > 2) {
     std::cout << "Unexpected result for " << name << ": " << (float)val
               << " expected value: " << (float)ref << std::endl;
     exit(1);
diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index f5a3de2e61..679032bd9f 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -22,8 +22,13 @@ template <typename T, int N> class sycl_subgr;
 
 using namespace cl::sycl;
 
+/* Maximum local group size for the dimention on GPU is 256. At the same time
+ * vector load/store operations require max_sg_size * vec_len elements in local
+ * group to work. Limit sub-group size to 8 for 16-element vectors to let them
+ * work on GPU.
+ */
 template <typename T, int N> void check(queue &Queue) {
-  const int G = 1024, L = 1024;
+  const int G = 1024, L = 256;
 
   auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
   size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
@@ -44,25 +49,29 @@ template <typename T, int N> void check(queue &Queue) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
           {L + max_sg_size * N}, cgh);
-      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
-        ONEAPI::sub_group SG = NdItem.get_sub_group();
-        if (SG.get_group_id().get(0) % N == 0) {
-          size_t SGOffset =
-              SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
-          size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
-          multi_ptr<T, access::address_space::global_space> mp(
-              &acc[WGSGoffset]);
-          multi_ptr<T, access::address_space::local_space> MPL(
-              &LocalMem[SGOffset]);
-          // Add all values in read block
-          vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
-          SG.store<N, T>(MPL, v);
-          vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
-          SG.store<N, T>(mp, t);
-        }
-        if (NdItem.get_global_id(0) == 0)
-          sgsizeacc[0] = SG.get_max_local_range()[0];
-      });
+      cgh.parallel_for<sycl_subgr<T, N>>(
+          NdRange, [=
+      ](nd_item<1> NdItem)[[intel::reqd_sub_group_size(N == 16 ? 8 : 16)]] {
+            ONEAPI::sub_group SG = NdItem.get_sub_group();
+            auto SGid = SG.get_group_id().get(0);
+            /* Avoid overlapping data ranges inside and between local groups */
+            if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] < L) {
+              size_t SGOffset =
+                  SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
+              size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
+              multi_ptr<T, access::address_space::global_space> mp(
+                  &acc[WGSGoffset]);
+              multi_ptr<T, access::address_space::local_space> MPL(
+                  &LocalMem[SGOffset]);
+              // Add all values in read block
+              vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
+              SG.store<N, T>(MPL, v);
+              vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
+              SG.store<N, T>(mp, t);
+            }
+            if (NdItem.get_global_id(0) == 0)
+              sgsizeacc[0] = SG.get_local_range()[0];
+          });
     });
     auto acc = syclbuf.template get_access<access::mode::read_write>();
     auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();

From b4bba1e3b755c430e9cfb2979eaf939d3f2a1477 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Mon, 26 Apr 2021 12:35:24 +0300
Subject: [PATCH 4/7] Align data ranges to avoid limitting max_sg_size

---
 SYCL/SubGroup/load_store.cpp | 51 ++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index 679032bd9f..a377de5d88 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -22,11 +22,6 @@ template <typename T, int N> class sycl_subgr;
 
 using namespace cl::sycl;
 
-/* Maximum local group size for the dimention on GPU is 256. At the same time
- * vector load/store operations require max_sg_size * vec_len elements in local
- * group to work. Limit sub-group size to 8 for 16-element vectors to let them
- * work on GPU.
- */
 template <typename T, int N> void check(queue &Queue) {
   const int G = 1024, L = 256;
 
@@ -49,29 +44,27 @@ template <typename T, int N> void check(queue &Queue) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
           {L + max_sg_size * N}, cgh);
-      cgh.parallel_for<sycl_subgr<T, N>>(
-          NdRange, [=
-      ](nd_item<1> NdItem)[[intel::reqd_sub_group_size(N == 16 ? 8 : 16)]] {
-            ONEAPI::sub_group SG = NdItem.get_sub_group();
-            auto SGid = SG.get_group_id().get(0);
-            /* Avoid overlapping data ranges inside and between local groups */
-            if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] < L) {
-              size_t SGOffset =
-                  SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
-              size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
-              multi_ptr<T, access::address_space::global_space> mp(
-                  &acc[WGSGoffset]);
-              multi_ptr<T, access::address_space::local_space> MPL(
-                  &LocalMem[SGOffset]);
-              // Add all values in read block
-              vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
-              SG.store<N, T>(MPL, v);
-              vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
-              SG.store<N, T>(mp, t);
-            }
-            if (NdItem.get_global_id(0) == 0)
-              sgsizeacc[0] = SG.get_local_range()[0];
-          });
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
+        ONEAPI::sub_group SG = NdItem.get_sub_group();
+        auto SGid = SG.get_group_id().get(0);
+        /* Avoid overlapping data ranges inside and between local groups */
+        if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] <= L) {
+          size_t SGOffset =
+              SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
+          size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
+          multi_ptr<T, access::address_space::global_space> mp(
+              &acc[WGSGoffset]);
+          multi_ptr<T, access::address_space::local_space> MPL(
+              &LocalMem[SGOffset]);
+          // Add all values in read block
+          vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
+          SG.store<N, T>(MPL, v);
+          vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
+          SG.store<N, T>(mp, t);
+        }
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_local_range()[0];
+      });
     });
     auto acc = syclbuf.template get_access<access::mode::read_write>();
     auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
@@ -95,7 +88,7 @@ template <typename T, int N> void check(queue &Queue) {
         ref *= N;
       }
       /* There is no defined out-of-range behavior for these functions. */
-      if ((SGid + N) * sg_size < L) {
+      if ((SGid + N) * sg_size <= L) {
         std::string s("Vector<");
         s += std::string(typeid(ref).name()) + std::string(",") +
              std::to_string(N) + std::string(">[") + std::to_string(j) +

From 3cbb081a5749be1b25e8b26c03cd0491eccfc4a5 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Mon, 26 Apr 2021 14:16:49 +0300
Subject: [PATCH 5/7] Apply review comments and skip execution on CPU device
 only instead of skipping the whole test when CPU device is available on host
 machine

---
 SYCL/SubGroup/load_store.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index a377de5d88..00e4e77e2c 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -1,10 +1,8 @@
-// UNSUPPORTED: cpu
-// #2252 Disable until all variants of built-ins are available in OpenCL CPU
-// runtime for every supported ISA
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+// RUNx %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 //
@@ -47,10 +45,10 @@ template <typename T, int N> void check(queue &Queue) {
       cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         ONEAPI::sub_group SG = NdItem.get_sub_group();
         auto SGid = SG.get_group_id().get(0);
+        auto SGsize = SG.get_max_local_range().get(0);
         /* Avoid overlapping data ranges inside and between local groups */
-        if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] <= L) {
-          size_t SGOffset =
-              SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
+        if (SGid % N == 0 && (SGid + N) * SGsize <= L) {
+          size_t SGOffset = SGid * SGsize;
           size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
           multi_ptr<T, access::address_space::global_space> mp(
               &acc[WGSGoffset]);
@@ -63,7 +61,7 @@ template <typename T, int N> void check(queue &Queue) {
           SG.store<N, T>(mp, t);
         }
         if (NdItem.get_global_id(0) == 0)
-          sgsizeacc[0] = SG.get_local_range()[0];
+          sgsizeacc[0] = SGsize;
       });
     });
     auto acc = syclbuf.template get_access<access::mode::read_write>();

From 0c34ec87129a24837f4a37f5e462b5f054c09550 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Wed, 5 May 2021 07:45:57 +0300
Subject: [PATCH 6/7] Use bitwise comparison for floating point type

---
 SYCL/SubGroup/helper.hpp     | 21 +++++++++++----------
 SYCL/SubGroup/load_store.cpp |  9 ++++-----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp
index 1ef035388b..1c85cf11a5 100644
--- a/SYCL/SubGroup/helper.hpp
+++ b/SYCL/SubGroup/helper.hpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include <CL/sycl.hpp>
+#include <bitset>
 #include <cmath>
 #include <complex>
 #include <iostream>
@@ -109,9 +110,12 @@ template <typename T2> struct utils<T2, 16> {
 
 template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
   if (std::is_floating_point<T>::value) {
-    if (std::fabs(val - ref) > 0.02) {
-      std::cout << "Unexpected result for " << name << ": " << (double)val
-                << " expected value: " << (double)ref << std::endl;
+    auto cmp_val = std::bitset<CHAR_BIT * sizeof(T)>(val);
+    auto cmp_ref = std::bitset<CHAR_BIT * sizeof(T)>(ref);
+    if (cmp_val != cmp_ref) {
+      std::cout << "Unexpected result for " << name << ": " << val << "("
+                << cmp_val << ") expected value: " << ref << "(" << cmp_ref
+                << ")" << std::endl;
       exit(1);
     }
   } else {
@@ -126,12 +130,9 @@ template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
 template <typename T>
 void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
                        const char *name) {
-  if (std::fabs(val.real() - ref.real()) > 0.02 ||
-      std::fabs(val.imag() - ref.imag()) > 0.02) {
-    std::cout << "Unexpected result for " << name << ": " << val
-              << " expected value: " << ref << std::endl;
-    exit(1);
-  }
+  std::string Name{name};
+  exit_if_not_equal(val.real(), ref.real(), (Name+".real()").c_str());
+  exit_if_not_equal(val.imag(), ref.imag(), (Name+".imag()").c_str());
 }
 
 template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
@@ -145,7 +146,7 @@ template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
 template <> void exit_if_not_equal(half val, half ref, const char *name) {
   int16_t cmp_val = reinterpret_cast<int16_t &>(val);
   int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
-  if (std::abs(cmp_val - cmp_ref) > 2) {
+  if (std::abs(cmp_val - cmp_ref) > 1) {
     std::cout << "Unexpected result for " << name << ": " << (float)val
               << " expected value: " << (float)ref << std::endl;
     exit(1);
diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp
index 00e4e77e2c..f3409ef00c 100644
--- a/SYCL/SubGroup/load_store.cpp
+++ b/SYCL/SubGroup/load_store.cpp
@@ -21,7 +21,7 @@ template <typename T, int N> class sycl_subgr;
 using namespace cl::sycl;
 
 template <typename T, int N> void check(queue &Queue) {
-  const int G = 1024, L = 256;
+  const int G = 512, L = 256;
 
   auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
   size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
@@ -34,7 +34,7 @@ template <typename T, int N> void check(queue &Queue) {
       auto acc = syclbuf.template get_access<access::mode::read_write>();
       for (int i = 0; i < G; i++) {
         acc[i] = i;
-        acc[i] += 0.1; // Check that floating point types are not casted to int
+        acc[i] += 0.25; // Check that floating point types are not casted to int
       }
     }
     Queue.submit([&](handler &cgh) {
@@ -55,7 +55,7 @@ template <typename T, int N> void check(queue &Queue) {
           multi_ptr<T, access::address_space::local_space> MPL(
               &LocalMem[SGOffset]);
           // Add all values in read block
-          vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
+          vec<T, N> v(SG.load<N, T>(mp));
           SG.store<N, T>(MPL, v);
           vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
           SG.store<N, T>(mp, t);
@@ -81,9 +81,8 @@ template <typename T, int N> void check(queue &Queue) {
         ref = acc[j - (SGid % N) * sg_size];
       } else {
         for (int i = 0; i < N; i++) {
-          ref += (T)(j + i * sg_size) + 0.1;
+          ref += (T)(j + i * sg_size) + 0.25;
         }
-        ref *= N;
       }
       /* There is no defined out-of-range behavior for these functions. */
       if ((SGid + N) * sg_size <= L) {

From afcd3fd987a30aed64e8f12d74c8a170fa43811a Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Wed, 5 May 2021 16:03:42 +0300
Subject: [PATCH 7/7] Fix clang-format

---
 SYCL/SubGroup/helper.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp
index 1c85cf11a5..712537db3f 100644
--- a/SYCL/SubGroup/helper.hpp
+++ b/SYCL/SubGroup/helper.hpp
@@ -131,8 +131,8 @@ template <typename T>
 void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
                        const char *name) {
   std::string Name{name};
-  exit_if_not_equal(val.real(), ref.real(), (Name+".real()").c_str());
-  exit_if_not_equal(val.imag(), ref.imag(), (Name+".imag()").c_str());
+  exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str());
+  exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str());
 }
 
 template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {