From ea29d71ba0afd6e56dfe02f660572e710fcd6111 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 08:34:24 +0300 Subject: [PATCH 1/7] [SYCL] SG load/store for vec3 and vec16 --- SYCL/SubGroup/helper.hpp | 11 +++++++++++ SYCL/SubGroup/load_store.cpp | 26 +++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp index 964fb742bc..175a0abad8 100644 --- a/SYCL/SubGroup/helper.hpp +++ b/SYCL/SubGroup/helper.hpp @@ -36,6 +36,17 @@ template struct utils { std::to_string((T2)v.s1()) + " )"; } }; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1() + v.s2(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) + + " )"; + } +}; template struct utils { static T2 add_vec(const vec &v) { return v.s0() + v.s1() + v.s2() + v.s3(); diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index ffd9bb21c7..aaf6f4cc42 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -2,7 +2,8 @@ // #2252 Disable until all variants of built-ins are available in OpenCL CPU // runtime for every supported ISA // -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel \ +// -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -23,15 +24,10 @@ template class sycl_subgr; using namespace cl::sycl; template void check(queue &Queue) { - const int G = 1024, L = 128; + const int G = 1024, L = 1024; - // Pad arrays based on sub-group size to ensure no out-of-bounds accesses - // Workaround for info::device::sub_group_sizes support on some devices - size_t max_sg_size = 128; -#if 0 auto sg_sizes = Queue.get_device().get_info(); size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); -#endif try { nd_range<1> NdRange(G, L); @@ -181,20 +177,26 @@ int main() { check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); typedef unsigned int aligned_uint __attribute__((aligned(16))); check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); typedef float aligned_float __attribute__((aligned(16))); check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); } if (Queue.get_device().has_extension("cl_intel_subgroups_short") || PlatformName.find("CUDA") != std::string::npos) { @@ -202,16 +204,20 @@ int main() { check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); if (Queue.get_device().has_extension("cl_khr_fp16") || PlatformName.find("CUDA") != std::string::npos) { typedef half aligned_half __attribute__((aligned(16))); check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); } } if (Queue.get_device().has_extension("cl_intel_subgroups_long") || @@ -220,20 +226,26 @@ int main() { check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); typedef unsigned long aligned_ulong __attribute__((aligned(16))); check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); typedef double aligned_double __attribute__((aligned(16))); check(Queue); check(Queue); check(Queue); + check(Queue); check(Queue); check(Queue); + check(Queue); } std::cout << "Test passed." << std::endl; return 0; From 45e126151ae3b7369382c293dfd89635122d2ec5 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 09:02:20 +0300 Subject: [PATCH 2/7] Disable device-code-split which is not supported by CUDA BE --- SYCL/SubGroup/load_store.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index aaf6f4cc42..f5a3de2e61 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -2,8 +2,7 @@ // #2252 Disable until all variants of built-ins are available in OpenCL CPU // runtime for every supported ISA // -// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel \ -// -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out From 86b83d54d519996951f05757b442e3513e2e5e6c Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 10:37:28 +0300 Subject: [PATCH 3/7] Fix failure due to limitations on GPU --- SYCL/SubGroup/helper.hpp | 8 +++--- SYCL/SubGroup/load_store.cpp | 49 +++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp index 175a0abad8..1ef035388b 100644 --- a/SYCL/SubGroup/helper.hpp +++ b/SYCL/SubGroup/helper.hpp @@ -109,7 +109,7 @@ template struct utils { template void exit_if_not_equal(T val, T ref, const char *name) { if (std::is_floating_point::value) { - if (std::fabs(val - ref) > 0.01) { + if (std::fabs(val - ref) > 0.02) { std::cout << "Unexpected result for " << name << ": " << (double)val << " expected value: " << (double)ref << std::endl; exit(1); @@ -126,8 +126,8 @@ template void exit_if_not_equal(T val, T ref, const char *name) { template void exit_if_not_equal(std::complex val, std::complex ref, const char *name) { - if (std::fabs(val.real() - ref.real()) > 0.01 || - std::fabs(val.imag() - ref.imag()) > 0.01) { + if (std::fabs(val.real() - ref.real()) > 0.02 || + std::fabs(val.imag() - ref.imag()) > 0.02) { std::cout << "Unexpected result for " << name << ": " << val << " expected value: " << ref << std::endl; exit(1); @@ -145,7 +145,7 @@ template void exit_if_not_equal(T *val, T *ref, const char *name) { template <> void exit_if_not_equal(half val, half ref, const char *name) { int16_t cmp_val = reinterpret_cast(val); int16_t cmp_ref = reinterpret_cast(ref); - if (std::abs(cmp_val - cmp_ref) > 1) { + if (std::abs(cmp_val - cmp_ref) > 2) { std::cout << "Unexpected result for " << name << ": " << (float)val << " expected value: " << (float)ref << std::endl; exit(1); diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index f5a3de2e61..679032bd9f 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -22,8 +22,13 @@ template class sycl_subgr; using namespace cl::sycl; +/* Maximum local group size for the dimention on GPU is 256. At the same time + * vector load/store operations require max_sg_size * vec_len elements in local + * group to work. Limit sub-group size to 8 for 16-element vectors to let them + * work on GPU. + */ template void check(queue &Queue) { - const int G = 1024, L = 1024; + const int G = 1024, L = 256; auto sg_sizes = Queue.get_device().get_info(); size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); @@ -44,25 +49,29 @@ template void check(queue &Queue) { auto sgsizeacc = sgsizebuf.get_access(cgh); accessor LocalMem( {L + max_sg_size * N}, cgh); - cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { - ONEAPI::sub_group SG = NdItem.get_sub_group(); - if (SG.get_group_id().get(0) % N == 0) { - size_t SGOffset = - SG.get_group_id().get(0) * SG.get_max_local_range().get(0); - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp( - &acc[WGSGoffset]); - multi_ptr MPL( - &LocalMem[SGOffset]); - // Add all values in read block - vec v(utils::add_vec(SG.load(mp))); - SG.store(MPL, v); - vec t(utils::add_vec(SG.load(MPL))); - SG.store(mp, t); - } - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - }); + cgh.parallel_for>( + NdRange, [= + ](nd_item<1> NdItem)[[intel::reqd_sub_group_size(N == 16 ? 8 : 16)]] { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + auto SGid = SG.get_group_id().get(0); + /* Avoid overlapping data ranges inside and between local groups */ + if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] < L) { + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp( + &acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + // Add all values in read block + vec v(utils::add_vec(SG.load(mp))); + SG.store(MPL, v); + vec t(utils::add_vec(SG.load(MPL))); + SG.store(mp, t); + } + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_local_range()[0]; + }); }); auto acc = syclbuf.template get_access(); auto sgsizeacc = sgsizebuf.get_access(); From b4bba1e3b755c430e9cfb2979eaf939d3f2a1477 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 12:35:24 +0300 Subject: [PATCH 4/7] Align data ranges to avoid limitting max_sg_size --- SYCL/SubGroup/load_store.cpp | 51 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index 679032bd9f..a377de5d88 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -22,11 +22,6 @@ template class sycl_subgr; using namespace cl::sycl; -/* Maximum local group size for the dimention on GPU is 256. At the same time - * vector load/store operations require max_sg_size * vec_len elements in local - * group to work. Limit sub-group size to 8 for 16-element vectors to let them - * work on GPU. - */ template void check(queue &Queue) { const int G = 1024, L = 256; @@ -49,29 +44,27 @@ template void check(queue &Queue) { auto sgsizeacc = sgsizebuf.get_access(cgh); accessor LocalMem( {L + max_sg_size * N}, cgh); - cgh.parallel_for>( - NdRange, [= - ](nd_item<1> NdItem)[[intel::reqd_sub_group_size(N == 16 ? 8 : 16)]] { - ONEAPI::sub_group SG = NdItem.get_sub_group(); - auto SGid = SG.get_group_id().get(0); - /* Avoid overlapping data ranges inside and between local groups */ - if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] < L) { - size_t SGOffset = - SG.get_group_id().get(0) * SG.get_max_local_range().get(0); - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp( - &acc[WGSGoffset]); - multi_ptr MPL( - &LocalMem[SGOffset]); - // Add all values in read block - vec v(utils::add_vec(SG.load(mp))); - SG.store(MPL, v); - vec t(utils::add_vec(SG.load(MPL))); - SG.store(mp, t); - } - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_local_range()[0]; - }); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + auto SGid = SG.get_group_id().get(0); + /* Avoid overlapping data ranges inside and between local groups */ + if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] <= L) { + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp( + &acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + // Add all values in read block + vec v(utils::add_vec(SG.load(mp))); + SG.store(MPL, v); + vec t(utils::add_vec(SG.load(MPL))); + SG.store(mp, t); + } + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_local_range()[0]; + }); }); auto acc = syclbuf.template get_access(); auto sgsizeacc = sgsizebuf.get_access(); @@ -95,7 +88,7 @@ template void check(queue &Queue) { ref *= N; } /* There is no defined out-of-range behavior for these functions. */ - if ((SGid + N) * sg_size < L) { + if ((SGid + N) * sg_size <= L) { std::string s("Vector<"); s += std::string(typeid(ref).name()) + std::string(",") + std::to_string(N) + std::string(">[") + std::to_string(j) + From 3cbb081a5749be1b25e8b26c03cd0491eccfc4a5 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 14:16:49 +0300 Subject: [PATCH 5/7] Apply review comments and skip execution on CPU device only instead of skipping the whole test when CPU device is available on host machine --- SYCL/SubGroup/load_store.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index a377de5d88..00e4e77e2c 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -1,10 +1,8 @@ -// UNSUPPORTED: cpu -// #2252 Disable until all variants of built-ins are available in OpenCL CPU -// runtime for every supported ISA -// // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out +// #2252 Disable until all variants of built-ins are available in OpenCL CPU +// runtime for every supported ISA +// RUNx %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out // @@ -47,10 +45,10 @@ template void check(queue &Queue) { cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { ONEAPI::sub_group SG = NdItem.get_sub_group(); auto SGid = SG.get_group_id().get(0); + auto SGsize = SG.get_max_local_range().get(0); /* Avoid overlapping data ranges inside and between local groups */ - if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] <= L) { - size_t SGOffset = - SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + if (SGid % N == 0 && (SGid + N) * SGsize <= L) { + size_t SGOffset = SGid * SGsize; size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; multi_ptr mp( &acc[WGSGoffset]); @@ -63,7 +61,7 @@ template void check(queue &Queue) { SG.store(mp, t); } if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_local_range()[0]; + sgsizeacc[0] = SGsize; }); }); auto acc = syclbuf.template get_access(); From 0c34ec87129a24837f4a37f5e462b5f054c09550 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Wed, 5 May 2021 07:45:57 +0300 Subject: [PATCH 6/7] Use bitwise comparison for floating point type --- SYCL/SubGroup/helper.hpp | 21 +++++++++++---------- SYCL/SubGroup/load_store.cpp | 9 ++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp index 1ef035388b..1c85cf11a5 100644 --- a/SYCL/SubGroup/helper.hpp +++ b/SYCL/SubGroup/helper.hpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -109,9 +110,12 @@ template struct utils { template void exit_if_not_equal(T val, T ref, const char *name) { if (std::is_floating_point::value) { - if (std::fabs(val - ref) > 0.02) { - std::cout << "Unexpected result for " << name << ": " << (double)val - << " expected value: " << (double)ref << std::endl; + auto cmp_val = std::bitset(val); + auto cmp_ref = std::bitset(ref); + if (cmp_val != cmp_ref) { + std::cout << "Unexpected result for " << name << ": " << val << "(" + << cmp_val << ") expected value: " << ref << "(" << cmp_ref + << ")" << std::endl; exit(1); } } else { @@ -126,12 +130,9 @@ template void exit_if_not_equal(T val, T ref, const char *name) { template void exit_if_not_equal(std::complex val, std::complex ref, const char *name) { - if (std::fabs(val.real() - ref.real()) > 0.02 || - std::fabs(val.imag() - ref.imag()) > 0.02) { - std::cout << "Unexpected result for " << name << ": " << val - << " expected value: " << ref << std::endl; - exit(1); - } + std::string Name{name}; + exit_if_not_equal(val.real(), ref.real(), (Name+".real()").c_str()); + exit_if_not_equal(val.imag(), ref.imag(), (Name+".imag()").c_str()); } template void exit_if_not_equal(T *val, T *ref, const char *name) { @@ -145,7 +146,7 @@ template void exit_if_not_equal(T *val, T *ref, const char *name) { template <> void exit_if_not_equal(half val, half ref, const char *name) { int16_t cmp_val = reinterpret_cast(val); int16_t cmp_ref = reinterpret_cast(ref); - if (std::abs(cmp_val - cmp_ref) > 2) { + if (std::abs(cmp_val - cmp_ref) > 1) { std::cout << "Unexpected result for " << name << ": " << (float)val << " expected value: " << (float)ref << std::endl; exit(1); diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index 00e4e77e2c..f3409ef00c 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -21,7 +21,7 @@ template class sycl_subgr; using namespace cl::sycl; template void check(queue &Queue) { - const int G = 1024, L = 256; + const int G = 512, L = 256; auto sg_sizes = Queue.get_device().get_info(); size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); @@ -34,7 +34,7 @@ template void check(queue &Queue) { auto acc = syclbuf.template get_access(); for (int i = 0; i < G; i++) { acc[i] = i; - acc[i] += 0.1; // Check that floating point types are not casted to int + acc[i] += 0.25; // Check that floating point types are not casted to int } } Queue.submit([&](handler &cgh) { @@ -55,7 +55,7 @@ template void check(queue &Queue) { multi_ptr MPL( &LocalMem[SGOffset]); // Add all values in read block - vec v(utils::add_vec(SG.load(mp))); + vec v(SG.load(mp)); SG.store(MPL, v); vec t(utils::add_vec(SG.load(MPL))); SG.store(mp, t); @@ -81,9 +81,8 @@ template void check(queue &Queue) { ref = acc[j - (SGid % N) * sg_size]; } else { for (int i = 0; i < N; i++) { - ref += (T)(j + i * sg_size) + 0.1; + ref += (T)(j + i * sg_size) + 0.25; } - ref *= N; } /* There is no defined out-of-range behavior for these functions. */ if ((SGid + N) * sg_size <= L) { From afcd3fd987a30aed64e8f12d74c8a170fa43811a Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Wed, 5 May 2021 16:03:42 +0300 Subject: [PATCH 7/7] Fix clang-format --- SYCL/SubGroup/helper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp index 1c85cf11a5..712537db3f 100644 --- a/SYCL/SubGroup/helper.hpp +++ b/SYCL/SubGroup/helper.hpp @@ -131,8 +131,8 @@ template void exit_if_not_equal(std::complex val, std::complex ref, const char *name) { std::string Name{name}; - exit_if_not_equal(val.real(), ref.real(), (Name+".real()").c_str()); - exit_if_not_equal(val.imag(), ref.imag(), (Name+".imag()").c_str()); + exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str()); + exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str()); } template void exit_if_not_equal(T *val, T *ref, const char *name) {