[UR] Fix some tests that are broken when run with multiple CUDA devices available. (#17216)

aarongreig · web-flow · commit a7774f2a74c5 · 2025-03-04T12:52:58.000Z
Also removes a test and adds known failures where appropriate (typically
where the test only runs when multiple devices are available so the skip
doesn't affect behaviour of single-device runs).

The test removed is `cudaUrContextCreateTest.ActiveContext`. This test
seems to be testing the assumption that a `urQueueCreate` followed by
`urMemBufferCreate` will set the active CUDA context to the one
associated with the UR context passed to those calls. Neither of these
calls sets the active context; this may have changed at some point, as the
test dates back to a PI unit test. The test currently passes as long as
only one device is available because a previous `urDeviceGetInfo` sets
the active context to the one associated with that device, which is
inevitably the same as the one associated with the UR context used in
the test. Since the test is based on a faulty assumption about the
adapter, I think we can just delete it.
diff --git a/unified-runtime/test/adapters/cuda/context_tests.cpp b/unified-runtime/test/adapters/cuda/context_tests.cpp
@@ -43,37 +43,6 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) {
   callContextFromOtherThread.join();
 }
 
-TEST_P(cudaUrContextCreateTest, ActiveContext) {
-  uur::raii::Context context = nullptr;
-  ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, context.ptr()));
-  ASSERT_NE(context, nullptr);
-
-  uur::raii::Queue queue = nullptr;
-  ur_queue_properties_t queue_props{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr,
-                                    0};
-  ASSERT_SUCCESS(urQueueCreate(context, device, &queue_props, queue.ptr()));
-  ASSERT_NE(queue, nullptr);
-
-  // check that the queue has the correct context
-  ASSERT_EQ(context, queue->getContext());
-
-  // create a buffer
-  uur::raii::Mem buffer = nullptr;
-  ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024,
-                                   nullptr, buffer.ptr()));
-  ASSERT_NE(buffer, nullptr);
-
-  // check that the context is now the active CUDA context
-  CUcontext cudaCtx = nullptr;
-  ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&cudaCtx));
-  ASSERT_NE(cudaCtx, nullptr);
-
-  ur_native_handle_t native_context = 0;
-  ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context));
-  ASSERT_NE(reinterpret_cast<CUcontext>(native_context), nullptr);
-  ASSERT_EQ(cudaCtx, reinterpret_cast<CUcontext>(native_context));
-}
-
 TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) {
   // start by setting up a CUDA context on the thread
   CUcontext original;
diff --git a/unified-runtime/test/adapters/cuda/memory_tests.cpp b/unified-runtime/test/adapters/cuda/memory_tests.cpp
@@ -14,11 +14,12 @@ TEST_P(cudaMemoryTest, urMemBufferNoActiveContext) {
   constexpr size_t memSize = 1024u;
 
   CUcontext current = nullptr;
-  do {
+  ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+  while (current != nullptr) {
     CUcontext oldContext = nullptr;
     ASSERT_SUCCESS_CUDA(cuCtxPopCurrent(&oldContext));
     ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
-  } while (current != nullptr);
+  }
 
   uur::raii::Mem mem;
   ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
diff --git a/unified-runtime/test/conformance/enqueue/helpers.h b/unified-runtime/test/conformance/enqueue/helpers.h
@@ -203,7 +203,15 @@ struct urMultiQueueMultiDeviceTestWithParam
         urContextCreate(devices.size(), devices.data(), nullptr, &context));
 
     // Duplicate our devices until we hit the minimum size specified.
-    auto srcDevices = devices;
+    std::vector<ur_device_handle_t> srcDevices;
+    // If the test actually only wants one device duplicated a bunch of times
+    // we take devices[0] and discard any other devices that were discovered.
+    if (trueMultiDevice) {
+      srcDevices = devices;
+    } else {
+      srcDevices.push_back(devices[0]);
+      devices.clear();
+    }
     while (devices.size() < minDevices) {
       devices.insert(devices.end(), srcDevices.begin(), srcDevices.end());
     }
@@ -224,6 +232,7 @@ struct urMultiQueueMultiDeviceTestWithParam
 
   ur_context_handle_t context;
   std::vector<ur_queue_handle_t> queues;
+  bool trueMultiDevice = true;
 };
 
 } // namespace uur
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -565,7 +565,7 @@ UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueKernelLaunchMultiDeviceTest);
 // TODO: rewrite this test, right now it only works for a single queue
 // (the context is only created for one device)
 TEST_P(urEnqueueKernelLaunchMultiDeviceTest, KernelLaunchReadDifferentQueues) {
-  UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
+  UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::LevelZero{}, uur::LevelZeroV2{});
 
   uur::KernelLaunchHelper helper =
       uur::KernelLaunchHelper{platform, context, kernel, queues[0]};
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
@@ -155,13 +155,14 @@ struct urEnqueueKernelLaunchIncrementTest
 
   using Param = uur::BoolTestParam;
 
-  using urMultiQueueLaunchMemcpyTest<numOps, Param>::context;
   using urMultiQueueLaunchMemcpyTest<numOps, Param>::queues;
-  using urMultiQueueLaunchMemcpyTest<numOps, Param>::devices;
   using urMultiQueueLaunchMemcpyTest<numOps, Param>::kernels;
   using urMultiQueueLaunchMemcpyTest<numOps, Param>::SharedMem;
 
   void SetUp() override {
+    // We actually need a single device used multiple times for this test, as
+    // opposed to utilizing all available devices for the platform.
+    this->trueMultiDevice = false;
     UUR_RETURN_ON_FATAL_FAILURE(
         urMultiQueueLaunchMemcpyTest<numOps, Param>::
             SetUp()); // Use single device, duplicated numOps times
@@ -344,9 +345,28 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
   }
 }
 
-using urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest =
-    urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
-        std::tuple<uur::BoolTestParam, uur::BoolTestParam>>;
+struct urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest
+    : urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
+          std::tuple<uur::BoolTestParam, uur::BoolTestParam>> {
+  using Param = std::tuple<uur::BoolTestParam, uur::BoolTestParam>;
+
+  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::devices;
+  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::queues;
+  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::kernels;
+  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
+      Param>::SharedMem;
+
+  void SetUp() override {
+    useEvents = std::get<0>(getParam()).value;
+    queuePerThread = std::get<1>(getParam()).value;
+    // !queuePerThread: all threads share queues.back(), so use a single device
+    this->trueMultiDevice = queuePerThread;
+    urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::SetUp();
+  }
+
+  bool useEvents;
+  bool queuePerThread;
+};
 
 UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
     urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest,
@@ -356,11 +376,7 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
     printParams<urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest>);
 
 // Enqueue kernelLaunch concurrently from multiple threads
-// With !queuePerThread this becomes a test on a single device
 TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
-  auto useEvents = std::get<0>(getParam()).value;
-  auto queuePerThread = std::get<1>(getParam()).value;
-
   if (!queuePerThread) {
     UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
   }
@@ -371,11 +387,11 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
   static constexpr size_t numOpsPerThread = 6;
 
   for (size_t i = 0; i < numThreads; i++) {
-    threads.emplace_back([this, i, queuePerThread, useEvents]() {
+    threads.emplace_back([this, i]() {
       constexpr size_t global_offset = 0;
       constexpr size_t n_dimensions = 1;
 
-      auto queue = queuePerThread ? queues[i] : queues.back();
+      auto queue = this->queuePerThread ? queues[i] : queues.back();
       auto kernel = kernels[i];
       auto sharedPtr = SharedMem[i];
 
@@ -385,7 +401,7 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
         ur_event_handle_t *lastEvent = nullptr;
         ur_event_handle_t *signalEvent = nullptr;
 
-        if (useEvents) {
+        if (this->useEvents) {
           waitNum = j > 0 ? 1 : 0;
           lastEvent = j > 0 ? Events[j - 1].ptr() : nullptr;
           signalEvent = Events[j].ptr();