Skip to content

[UR] Fix some tests that are broken when run with multiple cuda devices available. #17216

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 0 additions & 31 deletions unified-runtime/test/adapters/cuda/context_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,37 +43,6 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) {
callContextFromOtherThread.join();
}

// Creating and using a UR context must make its backing CUcontext the
// thread's active CUDA context, and the native handle we expose must be
// exactly that context.
TEST_P(cudaUrContextCreateTest, ActiveContext) {
  // Create a UR context for the single device under test.
  uur::raii::Context ctx = nullptr;
  ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, ctx.ptr()));
  ASSERT_NE(ctx, nullptr);

  // A queue created on that context must report the same context back.
  ur_queue_properties_t props{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0};
  uur::raii::Queue q = nullptr;
  ASSERT_SUCCESS(urQueueCreate(ctx, device, &props, q.ptr()));
  ASSERT_NE(q, nullptr);
  ASSERT_EQ(ctx, q->getContext());

  // Allocating a buffer forces the adapter to activate the context on
  // this thread.
  uur::raii::Mem buf = nullptr;
  ASSERT_SUCCESS(
      urMemBufferCreate(ctx, UR_MEM_FLAG_READ_WRITE, 1024, nullptr, buf.ptr()));
  ASSERT_NE(buf, nullptr);

  // The thread's current CUDA context should now be non-null...
  CUcontext current = nullptr;
  ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
  ASSERT_NE(current, nullptr);

  // ...and identical to the CUcontext behind our UR context's native handle.
  ur_native_handle_t nativeCtx = 0;
  ASSERT_SUCCESS(urContextGetNativeHandle(ctx, &nativeCtx));
  ASSERT_NE(reinterpret_cast<CUcontext>(nativeCtx), nullptr);
  ASSERT_EQ(current, reinterpret_cast<CUcontext>(nativeCtx));
}

TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) {
// start by setting up a CUDA context on the thread
CUcontext original;
Expand Down
5 changes: 3 additions & 2 deletions unified-runtime/test/adapters/cuda/memory_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ TEST_P(cudaMemoryTest, urMemBufferNoActiveContext) {
constexpr size_t memSize = 1024u;

CUcontext current = nullptr;
do {
ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
while (current != nullptr) {
CUcontext oldContext = nullptr;
ASSERT_SUCCESS_CUDA(cuCtxPopCurrent(&oldContext));
ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
} while (current != nullptr);
}

uur::raii::Mem mem;
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
Expand Down
11 changes: 10 additions & 1 deletion unified-runtime/test/conformance/enqueue/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,15 @@ struct urMultiQueueMultiDeviceTestWithParam
urContextCreate(devices.size(), devices.data(), nullptr, &context));

// Duplicate our devices until we hit the minimum size specified.
auto srcDevices = devices;
std::vector<ur_device_handle_t> srcDevices;
// If the test actually only wants one device duplicated a bunch of times
// we take devices[0] and discard any other devices that were discovered.
if (trueMultiDevice) {
srcDevices = devices;
} else {
srcDevices.push_back(devices[0]);
devices.clear();
}
while (devices.size() < minDevices) {
devices.insert(devices.end(), srcDevices.begin(), srcDevices.end());
}
Expand All @@ -224,6 +232,7 @@ struct urMultiQueueMultiDeviceTestWithParam

ur_context_handle_t context;
std::vector<ur_queue_handle_t> queues;
bool trueMultiDevice = true;
};

} // namespace uur
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueKernelLaunchMultiDeviceTest);
// TODO: rewrite this test, right now it only works for a single queue
// (the context is only created for one device)
TEST_P(urEnqueueKernelLaunchMultiDeviceTest, KernelLaunchReadDifferentQueues) {
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::LevelZero{}, uur::LevelZeroV2{});

uur::KernelLaunchHelper helper =
uur::KernelLaunchHelper{platform, context, kernel, queues[0]};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,14 @@ struct urEnqueueKernelLaunchIncrementTest

using Param = uur::BoolTestParam;

using urMultiQueueLaunchMemcpyTest<numOps, Param>::context;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::queues;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::devices;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::kernels;
using urMultiQueueLaunchMemcpyTest<numOps, Param>::SharedMem;

void SetUp() override {
// We actually need a single device used multiple times for this test, as
// opposed to utilizing all available devices for the platform.
this->trueMultiDevice = false;
UUR_RETURN_ON_FATAL_FAILURE(
urMultiQueueLaunchMemcpyTest<numOps, Param>::
SetUp()); // Use single device, duplicated numOps times
Expand Down Expand Up @@ -344,9 +345,28 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
}
}

using urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest =
urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
std::tuple<uur::BoolTestParam, uur::BoolTestParam>>;
// Fixture for launching increment kernels concurrently from multiple threads.
// Parameterized on two booleans: whether enqueues are chained with events
// (useEvents) and whether each thread gets its own queue/device
// (queuePerThread).
struct urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest
    : urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
          std::tuple<uur::BoolTestParam, uur::BoolTestParam>> {
  using Param = std::tuple<uur::BoolTestParam, uur::BoolTestParam>;

  // Re-export base-class state so the TEST_P body can use it unqualified.
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::devices;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::queues;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::kernels;
  using urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<
      Param>::SharedMem;

  void SetUp() override {
    // Decode the test parameters first: the base SetUp reads
    // trueMultiDevice, so it must be set before delegating.
    useEvents = std::get<0>(getParam()).value;
    queuePerThread = std::get<1>(getParam()).value;
    // With !queuePerThread this becomes a test on a single device
    this->trueMultiDevice = queuePerThread;
    urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam<Param>::SetUp();
  }

  // Cached parameter values; read by the TEST_P body (outside this struct).
  bool useEvents;
  bool queuePerThread;
};

UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest,
Expand All @@ -356,11 +376,7 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
printParams<urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest>);

// Enqueue kernelLaunch concurrently from multiple threads
// With !queuePerThread this becomes a test on a single device
TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
auto useEvents = std::get<0>(getParam()).value;
auto queuePerThread = std::get<1>(getParam()).value;

if (!queuePerThread) {
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{});
}
Expand All @@ -371,11 +387,11 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
static constexpr size_t numOpsPerThread = 6;

for (size_t i = 0; i < numThreads; i++) {
threads.emplace_back([this, i, queuePerThread, useEvents]() {
threads.emplace_back([this, i]() {
constexpr size_t global_offset = 0;
constexpr size_t n_dimensions = 1;

auto queue = queuePerThread ? queues[i] : queues.back();
auto queue = this->queuePerThread ? queues[i] : queues.back();
auto kernel = kernels[i];
auto sharedPtr = SharedMem[i];

Expand All @@ -385,7 +401,7 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) {
ur_event_handle_t *lastEvent = nullptr;
ur_event_handle_t *signalEvent = nullptr;

if (useEvents) {
if (this->useEvents) {
waitNum = j > 0 ? 1 : 0;
lastEvent = j > 0 ? Events[j - 1].ptr() : nullptr;
signalEvent = Events[j].ptr();
Expand Down
Loading