[SYCL][CUDA] Select only NVPTX64 device binaries (#1223)

fwyzard · web-flow · commit bcefeb3e3a5e · 2020-03-03T14:48:38.000+03:00
Add the binary target identifier "nvptx64" for NVIDIA PTX devices.

Search through the available binary images and select the first one for
the PI_DEVICE_BINARY_TARGET_NVPTX64 ("nvptx64") target.

Return PI_INVALID_BINARY if no "nvptx64" image is available.

Add a LIT test to check that both backends (PI_OPENCL, PI_CUDA) work
irrespective of the order of the -fsycl-targets=... arguments.

Signed-off-by: Andrea Bocci &lt;andrea.bocci@cern.ch&gt;
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
@@ -449,6 +449,9 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4;
 #define PI_DEVICE_BINARY_TARGET_SPIRV64_GEN "spir64_gen"
 #define PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA "spir64_fpga"
 
+/// PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device
+#define PI_DEVICE_BINARY_TARGET_NVPTX64 "nvptx64"
+
 /// This struct is a record of the device binary information. If the Kind field
 /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec
 /// field can still be specific and denote e.g. FPGA target. It must match the
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
@@ -687,10 +687,10 @@ pi_result cuda_piDevicePartition(
   return {};
 }
 
-pi_result cuda_piextDeviceSelectBinary(
-    pi_device device, // TODO: does this need to be context?
-    pi_device_binary *binaries, pi_uint32 num_binaries,
-    pi_device_binary *selected_binary) {
+pi_result cuda_piextDeviceSelectBinary(pi_device device,
+                                       pi_device_binary *binaries,
+                                       pi_uint32 num_binaries,
+                                       pi_device_binary *selected_binary) {
   if (!binaries) {
     cl::sycl::detail::pi::die("No list of device images provided");
   }
@@ -700,8 +700,19 @@ pi_result cuda_piextDeviceSelectBinary(
   if (!selected_binary) {
     cl::sycl::detail::pi::die("No storage for device binary provided");
   }
-  *selected_binary = binaries[0];
-  return PI_SUCCESS;
+
+  // Look for an image for the NVPTX64 target, and return the first one that is
+  // found
+  for (pi_uint32 i = 0; i < num_binaries; i++) {
+    if (strcmp(binaries[i]->DeviceTargetSpec,
+               PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
+      *selected_binary = binaries[i];
+      return PI_SUCCESS;
+    }
+  }
+
+  // No image can be loaded for the given device
+  return PI_INVALID_BINARY;
 }
 
 pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
diff --git a/sycl/test/plugins/sycl-targets-order.cpp b/sycl/test/plugins/sycl-targets-order.cpp
@@ -0,0 +1,70 @@
+// RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown-sycldevice,nvptx64-unknown-unknown-sycldevice %s -o %t-spir64-nvptx64.out
+// RUN: env SYCL_BE=PI_OPENCL %t-spir64-nvptx64.out
+// RUN: env SYCL_BE=PI_CUDA   %t-spir64-nvptx64.out
+// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-unknown-unknown-sycldevice,spir64-unknown-unknown-sycldevice %s -o %t-nvptx64-spir64.out
+// RUN: env SYCL_BE=PI_OPENCL %t-nvptx64-spir64.out
+// RUN: env SYCL_BE=PI_CUDA   %t-nvptx64-spir64.out
+
+// REQUIRES: opencl, cuda
+
+//==------- sycl-targets-order.cpp - SYCL -fsycl-targets order test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+int main(int argc, char **argv) {
+
+  // select the default SYCL device
+  cl::sycl::device device{cl::sycl::default_selector{}};
+  std::cout << "Running on SYCL device " << device.get_info<cl::sycl::info::device::name>()
+            << ", driver version " << device.get_info<cl::sycl::info::device::driver_version>()
+            << std::endl;
+
+  // create a queue
+  cl::sycl::queue queue{device};
+
+  // create a buffer of 4 ints to be used inside the kernel code
+  cl::sycl::buffer<unsigned int, 1> buffer(4);
+
+  // size of the index space for the kernel
+  cl::sycl::range<1> NumOfWorkItems{buffer.get_count()};
+
+  // submit a command group(work) to the queue
+  queue.submit([&](cl::sycl::handler &cgh) {
+    // get write only access to the buffer on a device
+    auto accessor = buffer.get_access<cl::sycl::access::mode::write>(cgh);
+    // executing the kernel
+    cgh.parallel_for<class FillBuffer>(
+        NumOfWorkItems, [=](cl::sycl::id<1> WIid) {
+          // fill the buffer with indexes
+          accessor[WIid] = WIid.get(0);
+        });
+  });
+
+  // get read-only access to the buffer on the host
+  // introduce an implicit barrier waiting for queue to complete the work
+  const auto host_accessor = buffer.get_access<cl::sycl::access::mode::read>();
+
+  // check the results
+  bool mismatch = false;
+  for (unsigned int i = 0; i < buffer.get_count(); ++i) {
+    if (host_accessor[i] != i) {
+      std::cout << "The result is incorrect for element: " << i
+                << " , expected: " << i << " , got: " << host_accessor[i]
+                << std::endl;
+      mismatch = true;
+    }
+  }
+
+  if (not mismatch) {
+    std::cout << "The results are correct!" << std::endl;
+  }
+
+  return mismatch;
+}