aelovikov-intel
diff --git a/‎SYCL/KernelFusion/cancel_fusion.cpp
Lines changed: 2 additions & 0 deletions b/‎SYCL/KernelFusion/cancel_fusion.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/‎SYCL/KernelFusion/event_wait_cancel.cpp
Lines changed: 77 additions & 0 deletions b/‎SYCL/KernelFusion/event_wait_cancel.cpp
Lines changed: 77 additions & 0 deletions
diff --git a/‎SYCL/KernelFusion/fusion_event_test_common.h
Lines changed: 8 additions & 0 deletions b/‎SYCL/KernelFusion/fusion_event_test_common.h
Lines changed: 8 additions & 0 deletions
diff --git a/‎SYCL/KernelFusion/sync_acc_mem_op.cpp
Lines changed: 84 additions & 0 deletions b/‎SYCL/KernelFusion/sync_acc_mem_op.cpp
Lines changed: 84 additions & 0 deletions
diff --git a/‎SYCL/KernelFusion/sync_buffer_destruction.cpp
Lines changed: 76 additions & 0 deletions b/‎SYCL/KernelFusion/sync_buffer_destruction.cpp
Lines changed: 76 additions & 0 deletions
diff --git a/‎SYCL/KernelFusion/sync_event_wait.cpp
Lines changed: 75 additions & 0 deletions b/‎SYCL/KernelFusion/sync_event_wait.cpp
Lines changed: 75 additions & 0 deletions
@@ -1,4 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // UNSUPPORTED: cuda || hip
 
 // Test cancel fusion
 
@@ -0,0 +1,77 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+
+// Test validity of events after cancel_fusion.
+
+#include "fusion_event_test_common.h"
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q);
+  int *out = sycl::malloc_shared<int>(dataSize, q);
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  auto kernel1 = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class KernelOne>(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  auto kernel2 = q.submit([&](handler &cgh) {
+    cgh.depends_on(kernel1);
+    cgh.parallel_for<class KernelTwo>(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  fw.cancel_fusion();
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  kernel1.wait();
+  assert(isEventComplete(kernel1) && "Event should be complete");
+  // The event returned by submit while in fusion mode depends on both
+  // individual kernels to be executed.
+  assert(kernel1.get_wait_list().size() == 2);
+
+  kernel2.wait();
+  assert(isEventComplete(kernel2) && "Event should be complete");
+  // The event returned by submit while in fusion mode depends on both
+  // individual kernels to be executed.
+  assert(kernel2.get_wait_list().size() == 2);
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1, q);
+  sycl::free(in2, q);
+  sycl::free(in3, q);
+  sycl::free(tmp, q);
+  sycl::free(out, q);
+
+  return 0;
+}
@@ -0,0 +1,8 @@
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+static bool isEventComplete(sycl::event &ev) {
+  return ev.get_info<info::event::command_execution_status>() ==
+         info::event_command_status::complete;
+}
@@ -0,0 +1,84 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on an explicit memory operation on an accessor
+// happening before complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+  int dst[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+    dst[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This explicit copy operation has an overlapping requirement with one of
+    // the kernels and therefore requires synchronization. This should lead to
+    // cancellation of the fusion.
+    auto copyEvt = q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.copy(accTmp, dst);
+    });
+
+    copyEvt.wait();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(dst[i] == (5 * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
@@ -0,0 +1,76 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on buffer destruction happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    {
+      buffer<int> bIn3{in3, range{dataSize}};
+
+      fw.start_fusion();
+
+      assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+      q.submit([&](handler &cgh) {
+        auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+        auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+        auto accTmp = bTmp.get_access(cgh);
+        cgh.parallel_for<class KernelOne>(
+            dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+      });
+
+      q.submit([&](handler &cgh) {
+        auto accTmp = bTmp.get_access(cgh);
+        auto accIn3 = bIn3.get_access(cgh);
+        auto accOut = bOut.get_access(cgh);
+        cgh.parallel_for<class KernelTwo>(
+            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+      });
+      // Buffer bIn3, which is accessed by one of the kernels in the fusion list
+      // goes out scope, causing a blocking wait for one of the kernels in the
+      // fusion list. This should lead to cancellation of the fusion.
+    }
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
@@ -0,0 +1,75 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on event::wait() happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    auto kernel1Ev = q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This event::wait() causes a blocking wait for one of the kernels in the
+    // fusion list. This should lead to cancellation of the fusion.
+    kernel1Ev.wait();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested