[SYCL] Enable 2 reduction tests for ACC and level_zero (intel/llvm-test-suite#410)

v-klochkov · web-flow · commit 5e5d564a63fb · 2021-08-20T13:32:30.000+03:00
This patch makes minor fixes to the tests to increase their stability.
It also changes the input data initialization to make the test checks
more meaningful and useful.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/SYCL/Reduction/reduction_nd_range_scalar.hpp b/SYCL/Reduction/reduction_nd_range_scalar.hpp
@@ -14,11 +14,21 @@ int test(queue &Q, T Identity, T Init, BinaryOperation BOp,
          const nd_range<Dims> &Range) {
   printTestLabel<T, BinaryOperation>(IsSYCL2020, Range);
 
-  // Skip the test for such big arrays now.
-  constexpr size_t TwoGB = 2LL * 1024 * 1024 * 1024;
+  // Passing data close to 4 GB in size to a device is a known problem: such
+  // data breaks the execution pretty badly.
+  // Some of the test cases calling this function verify the correctness of
+  // reduction with a global range bigger than the maximal work-group size
+  // for the device. The maximal WG size may be very big, e.g. it is
+  // 67108864 for the ACC emulator. Multiplying that by some factor
+  // (to exceed the max WG size) and then by the element size may exceed
+  // the safe size of data passed to the device.
+  // Set the limit to 1 GB for now, and just skip the test if it is exceeded.
+  constexpr size_t OneGB = 1LL * 1024 * 1024 * 1024;
   range<Dims> GlobalRange = Range.get_global_range();
-  if (GlobalRange.size() > TwoGB)
+  if (GlobalRange.size() * sizeof(T) > OneGB) {
+    std::cout << " SKIPPED due to too big data size" << std::endl;
     return 0;
+  }
 
   buffer<T, Dims> InBuf(GlobalRange);
   buffer<T, 1> OutBuf(1);
diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp
@@ -1,10 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-
-// TODO: accelerator may not suport atomics required by the current
-// implementation. Enable testing when implementation is fixed.
-// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 #include "reduction_range_scalar.hpp"
 
@@ -30,15 +27,15 @@ int main() {
 
   // Fast-reduce and Fast-atomics. Try various range types/sizes.
   tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1);
-  tests<class A2, int64_t>(Q, 0, 99, std::plus<>{}, 7);
-  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, 64);
   tests<class A4, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
   tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
 
   // Try various types & ranges.
-  tests<class B1, int>(Q, ~0, 99, std::bit_and<>{}, 7);
-  tests<class B2, int>(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize);
-  tests<class B3, int>(Q, 0, 0xff99, std::bit_or<>{}, 3);
+  tests<class B1, int>(Q, ~0, 0xfefefefe, std::bit_and<>{}, 7);
+  tests<class B2, int>(Q, 0, 0xfedcff99, std::bit_xor<>{}, MaxWGSize);
+  tests<class B3, int>(Q, 0, 0xfedcff99, std::bit_or<>{}, 3);
   tests<class B4, short>(Q, 1, 2, std::multiplies<>{}, 7);
   tests<class B5, int>(Q, (std::numeric_limits<int>::max)(), -99,
                        ext::oneapi::minimum<>{}, MaxWGSize * 2);
diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp
@@ -1,14 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-
-// TODO: test disabled due to Jenkins testing failure on unrelated commit
-// Sporadic failure
-// UNSUPPORTED: linux && level_zero
-
-// TODO: accelerator may not suport atomics required by the current
-// implementation. Enable testing when implementation is fixed.
-// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 // This test performs basic checks of parallel_for(range<1>, reduction, func)
 // with reductions initialized with 1-dimensional read_write accessor
@@ -34,15 +27,15 @@ int main() {
 
   // Fast-reduce and Fast-atomics. Try various range types/sizes.
   tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1);
-  tests<class A2, int64_t>(Q, 0, 99, std::plus<>{}, 7);
-  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, 64);
   tests<class A4, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
   tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
 
   // Try various types & ranges.
   tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
   tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
-  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 3);
   tests<class B4, uint64_t>(Q, 1, 2, std::multiplies<>{}, 16);
   tests<class B5, float>(Q, 1, 3, std::multiplies<>{}, 11);
   tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp
@@ -14,6 +14,15 @@ void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
     if (std::is_same_v<BinaryOperation, std::multiplies<T>> ||
         std::is_same_v<BinaryOperation, std::multiplies<>>)
       In[I] = 1.1 + (((I % 11) == 0) ? 1 : 0);
+    else if (std::is_same_v<BinaryOperation, std::bit_and<T>> ||
+             std::is_same_v<BinaryOperation, std::bit_and<>>)
+      In[I] = (I + 1) | 0x10203040;
+    else if (std::is_same_v<BinaryOperation, sycl::minimum<T>> ||
+             std::is_same_v<BinaryOperation, sycl::minimum<>>)
+      In[I] = Range[0] - I;
+    else if (std::is_same_v<BinaryOperation, sycl::maximum<T>> ||
+             std::is_same_v<BinaryOperation, sycl::maximum<>>)
+      In[I] = I;
     else
       In[I] = ((I + 1) % 5) + 1.1;
     ExpectedOut = BOp(ExpectedOut, In[I]);
@@ -32,6 +41,15 @@ void initInputData(buffer<T, 2> &InBuf, T &ExpectedOut, T Identity,
       if (std::is_same_v<BinaryOperation, std::multiplies<T>> ||
           std::is_same_v<BinaryOperation, std::multiplies<>>)
         In[J][I] = 1.1 + ((((I + J * 3) % 11) == 0) ? 1 : 0);
+      else if (std::is_same_v<BinaryOperation, std::bit_and<T>> ||
+               std::is_same_v<BinaryOperation, std::bit_and<>>)
+        In[J][I] = (I + J + 1) | 0x10203040;
+      else if (std::is_same_v<BinaryOperation, sycl::minimum<T>> ||
+               std::is_same_v<BinaryOperation, sycl::minimum<>>)
+        In[J][I] = Range[0] + Range[1] - I - J;
+      else if (std::is_same_v<BinaryOperation, sycl::maximum<T>> ||
+               std::is_same_v<BinaryOperation, sycl::maximum<>>)
+        In[J][I] = I + J;
       else
         In[J][I] = ((I + 1 + J) % 5) + 1.1;
       ExpectedOut = BOp(ExpectedOut, In[J][I]);
@@ -52,6 +70,15 @@ void initInputData(buffer<T, 3> &InBuf, T &ExpectedOut, T Identity,
         if (std::is_same_v<BinaryOperation, std::multiplies<T>> ||
             std::is_same_v<BinaryOperation, std::multiplies<>>)
           In[K][J][I] = 1.1 + ((((I + J * 3 + K) % 11) == 0) ? 1 : 0);
+        else if (std::is_same_v<BinaryOperation, std::bit_and<T>> ||
+                 std::is_same_v<BinaryOperation, std::bit_and<>>)
+          In[K][J][I] = (I + J + K + 1) | 0x10203040;
+        else if (std::is_same_v<BinaryOperation, sycl::minimum<T>> ||
+                 std::is_same_v<BinaryOperation, sycl::minimum<>>)
+          In[K][J][I] = Range[0] + Range[1] + Range[2] - I - J - K;
+        else if (std::is_same_v<BinaryOperation, sycl::maximum<T>> ||
+                 std::is_same_v<BinaryOperation, sycl::maximum<>>)
+          In[K][J][I] = I + J + K;
         else
           In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1;
         ExpectedOut = BOp(ExpectedOut, In[K][J][I]);