[SYCL][CUDA] Fix context scope in kernel launch (#4606)

npmiller · web-flow · commit 0d3cc99c19a6 · 2021-09-21T15:17:31.000+03:00
The `guessLocalWorkSize` function uses the CUDA API, so it needs an
active context. Previously there was no active `ScopedContext` when it
was called, which could cause issues.
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
@@ -2577,61 +2577,62 @@ pi_result cuda_piEnqueueKernelLaunch(
   size_t maxThreadsPerBlock[3] = {};
   bool providedLocalWorkGroupSize = (local_work_size != nullptr);
   pi_uint32 local_size = kernel->get_local_size();
+  pi_result retError = PI_SUCCESS;
 
-  {
-    size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
-    maxWorkGroupSize = command_queue->device_->get_max_work_group_size();
-    command_queue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock),
-                                                    maxThreadsPerBlock);
-
-    if (providedLocalWorkGroupSize) {
-      auto isValid = [&](int dim) {
-        if (reqdThreadsPerBlock[dim] != 0 &&
-            local_work_size[dim] != reqdThreadsPerBlock[dim])
-          return PI_INVALID_WORK_GROUP_SIZE;
-
-        if (local_work_size[dim] > maxThreadsPerBlock[dim])
-          return PI_INVALID_WORK_ITEM_SIZE;
-        // Checks that local work sizes are a divisor of the global work sizes
-        // which includes that the local work sizes are neither larger than the
-        // global work sizes and not 0.
-        if (0u == local_work_size[dim])
-          return PI_INVALID_WORK_GROUP_SIZE;
-        if (0u != (global_work_size[dim] % local_work_size[dim]))
-          return PI_INVALID_WORK_GROUP_SIZE;
-        threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
-        return PI_SUCCESS;
-      };
-
-      for (size_t dim = 0; dim < work_dim; dim++) {
-        auto err = isValid(dim);
-        if (err != PI_SUCCESS)
-          return err;
+  try {
+    // Set the active context here as guessLocalWorkSize needs an active context
+    ScopedContext active(command_queue->get_context());
+    {
+      size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
+      maxWorkGroupSize = command_queue->device_->get_max_work_group_size();
+      command_queue->device_->get_max_work_item_sizes(
+          sizeof(maxThreadsPerBlock), maxThreadsPerBlock);
+
+      if (providedLocalWorkGroupSize) {
+        auto isValid = [&](int dim) {
+          if (reqdThreadsPerBlock[dim] != 0 &&
+              local_work_size[dim] != reqdThreadsPerBlock[dim])
+            return PI_INVALID_WORK_GROUP_SIZE;
+
+          if (local_work_size[dim] > maxThreadsPerBlock[dim])
+            return PI_INVALID_WORK_ITEM_SIZE;
+          // Checks that local work sizes are a divisor of the global work sizes
+          // which includes that the local work sizes are neither larger than
+          // the global work sizes and not 0.
+          if (0u == local_work_size[dim])
+            return PI_INVALID_WORK_GROUP_SIZE;
+          if (0u != (global_work_size[dim] % local_work_size[dim]))
+            return PI_INVALID_WORK_GROUP_SIZE;
+          threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
+          return PI_SUCCESS;
+        };
+
+        for (size_t dim = 0; dim < work_dim; dim++) {
+          auto err = isValid(dim);
+          if (err != PI_SUCCESS)
+            return err;
+        }
+      } else {
+        guessLocalWorkSize(threadsPerBlock, global_work_size,
+                           maxThreadsPerBlock, kernel, local_size);
       }
-    } else {
-      guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock,
-                         kernel, local_size);
     }
-  }
 
-  if (maxWorkGroupSize <
-      size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
-    return PI_INVALID_WORK_GROUP_SIZE;
-  }
+    if (maxWorkGroupSize <
+        size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
+      return PI_INVALID_WORK_GROUP_SIZE;
+    }
 
-  int blocksPerGrid[3] = {1, 1, 1};
+    int blocksPerGrid[3] = {1, 1, 1};
 
-  for (size_t i = 0; i < work_dim; i++) {
-    blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
-  }
+    for (size_t i = 0; i < work_dim; i++) {
+      blocksPerGrid[i] =
+          static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
+          threadsPerBlock[i];
+    }
 
-  pi_result retError = PI_SUCCESS;
-  std::unique_ptr<_pi_event> retImplEv{nullptr};
+    std::unique_ptr<_pi_event> retImplEv{nullptr};
 
-  try {
-    ScopedContext active(command_queue->get_context());
     CUstream cuStream = command_queue->get();
     CUfunction cuFunc = kernel->get();