[SYCL][Fusion] Test caching of fused kernels (#1551)

sommerlukas · web-flow · commit b3b471b3b463 · 2023-02-07T09:59:06.000Z
Check that JIT compilation for kernel fusion is or is not repeated, depending on whether a newly submitted sequence of kernels is equivalent to a previous sequence. Next to the sequence of kernels, other invocation information, e.g., the user-specified internalization properties play a role to assess equivalence with a previous compilation. Different scenarios are tested by the test added in this PR. Implementation: intel/llvm#8051 Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
diff --git a/SYCL/KernelFusion/jit_caching.cpp b/SYCL/KernelFusion/jit_caching.cpp
@@ -0,0 +1,142 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test caching for JIT fused kernels. Also test for debug messages being
+// printed when SYCL_RT_WARNING_LEVEL=1.
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr size_t dataSize = 512;
+
+enum class Internalization { None, Local, Private };
+
+void performFusion(queue &q, Internalization internalize, range<1> globalSize,
+                   int beta, int gamma, bool insertBarriers = false) {
+  int alpha = 1;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+  {
+    buffer<int> bIn1{in1, globalSize};
+    buffer<int> bIn2{in2, globalSize};
+    buffer<int> bIn3{in3, globalSize};
+    buffer<int> bTmp{tmp, globalSize};
+    buffer<int> bOut{out, globalSize};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      property_list properties{};
+      if (internalize == Internalization::Private) {
+        properties = {
+            sycl::ext::codeplay::experimental::property::promote_private{}};
+      } else if (internalize == Internalization::Local) {
+        properties = {
+            sycl::ext::codeplay::experimental::property::promote_local{}};
+      }
+      accessor<int> accTmp = bTmp.get_access(cgh, properties);
+      cgh.parallel_for<class KernelOne>(globalSize, [=](id<1> i) {
+        accTmp[i] = accIn1[i] + accIn2[i] * alpha;
+      });
+    });
+
+    q.submit([&](handler &cgh) {
+      property_list properties{};
+      if (internalize == Internalization::Private) {
+        properties = {
+            sycl::ext::codeplay::experimental::property::promote_private{}};
+      } else if (internalize == Internalization::Local) {
+        properties = {
+            sycl::ext::codeplay::experimental::property::promote_local{}};
+      }
+      accessor<int> accTmp = bTmp.get_access(cgh, properties);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(globalSize, [=](id<1> i) {
+        accOut[i] = accTmp[i] * accIn3[i] * beta * gamma;
+      });
+    });
+
+    if (insertBarriers) {
+      fw.complete_fusion();
+    } else {
+      fw.complete_fusion(
+          {ext::codeplay::experimental::property::no_barriers{}});
+    }
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  size_t numErrors = 0;
+  size_t numInternalized = 0;
+  for (size_t i = 0; i < dataSize; ++i) {
+    if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) {
+      ++numErrors;
+    }
+    if (tmp[i] == -1) {
+      ++numInternalized;
+    }
+  }
+  if (numErrors) {
+    std::cout << "COMPUTATION ERROR\n";
+  }
+  if ((internalize == Internalization::None) && numInternalized) {
+    std::cout << "WRONG INTERNALIZATION\n";
+  }
+}
+
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  // Initial invocation
+  performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
+  // CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
+
+  // Identical invocation, should lead to JIT cache hit.
+  performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
+  // CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel
+  // CHECK-NEXT: INFO: Re-using existing device binary for fused kernel
+
+  // Invocation with a different beta. Because beta was identical to alpha so
+  // far, this should lead to a cache miss.
+  performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1);
+  // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
+
+  // Invocation with barrier insertion should lead to a cache miss.
+  performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1,
+                /* insertBarriers */ true);
+  // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
+
+  // Invocation with different internalization target should lead to a cache
+  // miss.
+  performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1);
+  // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
+
+  // Invocation with a different gamma should lead to a cache miss because gamma
+  // participates in constant propagation.
+  performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2);
+  // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
+
+  return 0;
+}