Skip to content

[UR] Fix some tests that are broken when run with multiple cuda devices available. #17216

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 0 additions & 31 deletions unified-runtime/test/adapters/cuda/context_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,37 +43,6 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) {
callContextFromOtherThread.join();
}

// Creating and using a UR context must make its backing CUcontext the
// thread's active CUDA context, and the native handle we expose must be
// exactly that context.
TEST_P(cudaUrContextCreateTest, ActiveContext) {
  // Create a UR context for the single device under test.
  uur::raii::Context ctx = nullptr;
  ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, ctx.ptr()));
  ASSERT_NE(ctx, nullptr);

  // A queue created on that context must report the same context back.
  ur_queue_properties_t props{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0};
  uur::raii::Queue q = nullptr;
  ASSERT_SUCCESS(urQueueCreate(ctx, device, &props, q.ptr()));
  ASSERT_NE(q, nullptr);
  ASSERT_EQ(ctx, q->getContext());

  // Allocating a buffer forces the adapter to activate the context on
  // this thread.
  uur::raii::Mem buf = nullptr;
  ASSERT_SUCCESS(
      urMemBufferCreate(ctx, UR_MEM_FLAG_READ_WRITE, 1024, nullptr, buf.ptr()));
  ASSERT_NE(buf, nullptr);

  // The thread's current CUDA context should now be non-null...
  CUcontext current = nullptr;
  ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
  ASSERT_NE(current, nullptr);

  // ...and identical to the CUcontext behind our UR context's native handle.
  ur_native_handle_t nativeCtx = 0;
  ASSERT_SUCCESS(urContextGetNativeHandle(ctx, &nativeCtx));
  ASSERT_NE(reinterpret_cast<CUcontext>(nativeCtx), nullptr);
  ASSERT_EQ(current, reinterpret_cast<CUcontext>(nativeCtx));
}

TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) {
// start by setting up a CUDA context on the thread
CUcontext original;
Expand Down
5 changes: 3 additions & 2 deletions unified-runtime/test/adapters/cuda/memory_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ TEST_P(cudaMemoryTest, urMemBufferNoActiveContext) {
constexpr size_t memSize = 1024u;

CUcontext current = nullptr;
do {
ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
while (current != nullptr) {
CUcontext oldContext = nullptr;
ASSERT_SUCCESS_CUDA(cuCtxPopCurrent(&oldContext));
ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
} while (current != nullptr);
}

uur::raii::Mem mem;
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
Expand Down
11 changes: 10 additions & 1 deletion unified-runtime/test/conformance/enqueue/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,15 @@ struct urMultiQueueMultiDeviceTestWithParam
urContextCreate(devices.size(), devices.data(), nullptr, &context));

// Duplicate our devices until we hit the minimum size specified.
auto srcDevices = devices;
std::vector<ur_device_handle_t> srcDevices;
// If the test actually only wants one device duplicated a bunch of times
// we take devices[0] and discard any other devices that were discovered.
if (trueMultiDevice) {
srcDevices = devices;
} else {
srcDevices.push_back(devices[0]);
devices.clear();
}
while (devices.size() < minDevices) {
devices.insert(devices.end(), srcDevices.begin(), srcDevices.end());
}
Expand All @@ -224,6 +232,7 @@ struct urMultiQueueMultiDeviceTestWithParam

ur_context_handle_t context;
std::vector<ur_queue_handle_t> queues;
bool trueMultiDevice = true;
};

} // namespace uur
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueKernelLaunchMultiDeviceTest);
// TODO: rewrite this test, right now it only works for a single queue
// (the context is only created for one device)
TEST_P(urEnqueueKernelLaunchMultiDeviceTest, KernelLaunchReadDifferentQueues) {
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::LevelZero{}, uur::LevelZeroV2{});

uur::KernelLaunchHelper helper =
uur::KernelLaunchHelper{platform, context, kernel, queues[0]};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,14 @@ struct urEnqueueKernelLaunchIncrementTest

using Param = uur::BoolTestParam;

using urMultiQueueLaunchMemcpyTest<numOps, Param>::context;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::queues;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::devices;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::kernels;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::SharedMem;

void SetUp() override {
// We actually need a single device used multiple times for this test, as
// opposed to utilizing all available devices for the platform.
this->trueMultiDevice = false;
UUR_RETURN_ON_FATAL_FAILURE(
urMultiQueueLaunchMemcpyTest<numOps, Param>::
SetUp()); // Use single device, duplicated numOps times
Expand Down Expand Up @@ -344,9 +345,28 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
}
}

using urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest =
urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
std::tuple<uur::BoolTestParam, uur::BoolTestParam>>;
// Fixture for launching increment kernels concurrently from multiple threads.
// Parameterized on two booleans: whether enqueues are chained with events
// (useEvents) and whether each thread gets its own queue/device
// (queuePerThread).
struct urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest
    : urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
          std::tuple<uur::BoolTestParam, uur::BoolTestParam>> {
  using Param = std::tuple<uur::BoolTestParam, uur::BoolTestParam>;

  // Re-export base-class state so the TEST_P body can use it unqualified.
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::devices;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::queues;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::kernels;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
      Param>::SharedMem;

  void SetUp() override {
    // Decode the test parameters first: the base SetUp reads
    // trueMultiDevice, so it must be set before delegating.
    useEvents = std::get<0>(getParam()).value;
    queuePerThread = std::get<1>(getParam()).value;
    // With !queuePerThread this becomes a test on a single device
    this->trueMultiDevice = queuePerThread;
    urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::SetUp();
  }

  // Cached parameter values; read by the TEST_P body (outside this struct).
  bool useEvents;
  bool queuePerThread;
};

UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest,
Expand All @@ -356,11 +376,7 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
printParams<urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest>);

// Enqueue kernelLaunch concurrently from multiple threads
// With !queuePerThread this becomes a test on a single device
TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
auto useEvents = std::get<0>(getParam()).value;
auto queuePerThread = std::get<1>(getParam()).value;

if (!queuePerThread) {
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
}
Expand All @@ -371,11 +387,11 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
static constexpr size_t numOpsPerThread = 6;

for (size_t i = 0; i < numThreads; i++) {
threads.emplace_back([this, i, queuePerThread, useEvents]() {
threads.emplace_back([this, i]() {
constexpr size_t global_offset = 0;
constexpr size_t n_dimensions = 1;

auto queue = queuePerThread ? queues[i] : queues.back();
auto queue = this->queuePerThread ? queues[i] : queues.back();
auto kernel = kernels[i];
auto sharedPtr = SharedMem[i];

Expand All @@ -385,7 +401,7 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
ur_event_handle_t *lastEvent = nullptr;
ur_event_handle_t *signalEvent = nullptr;

if (useEvents) {
if (this->useEvents) {
waitNum = j > 0 ? 1 : 0;
lastEvent = j > 0 ? Events[j - 1].ptr() : nullptr;
signalEvent = Events[j].ptr();
Expand Down
Loading