Skip to content

[SYCL] Clear cache in case of PI_ERROR_OUT_OF_HOST_MEMORY #14119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sycl/source/detail/kernel_program_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ class KernelProgramCache {
} catch (const exception &Ex) {
BuildResult->Error.Msg = Ex.what();
BuildResult->Error.Code = Ex.get_cl_code();
if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES) {
if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES ||
BuildResult->Error.Code == PI_ERROR_OUT_OF_HOST_MEMORY) {
reset();
BuildResult->updateAndNotify(BuildState::BS_Initial);
continue;
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1223,7 +1223,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
nullptr, &LinkedProg);
};
sycl::detail::pi::PiResult Error = doLink();
if (Error == PI_ERROR_OUT_OF_RESOURCES) {
if (Error == PI_ERROR_OUT_OF_RESOURCES ||
Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
Context->getKernelProgramCache().reset();
Error = doLink();
}
Expand Down Expand Up @@ -2118,7 +2119,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
/*user_data=*/nullptr, &LinkedProg);
};
sycl::detail::pi::PiResult Error = doLink();
if (Error == PI_ERROR_OUT_OF_RESOURCES) {
if (Error == PI_ERROR_OUT_OF_RESOURCES ||
Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
ContextImpl->getKernelProgramCache().reset();
Error = doLink();
}
Expand Down
131 changes: 130 additions & 1 deletion sycl/unittests/kernel-and-program/OutOfResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ static sycl::unittest::PiImageArray<2> ImgArray{Img};

static int nProgramCreate = 0;
static volatile bool outOfResourcesToggle = false;
static volatile bool outOfHostMemoryToggle = false;

static pi_result redefinedProgramCreate(pi_context context, const void *il,
size_t length,
Expand All @@ -80,6 +81,17 @@ static pi_result redefinedProgramCreate(pi_context context, const void *il,
return PI_SUCCESS;
}

static pi_result
redefinedProgramCreateOutOfHostMemory(pi_context context, const void *il,
size_t length, pi_program *res_program) {
++nProgramCreate;
if (outOfHostMemoryToggle) {
outOfHostMemoryToggle = false;
return PI_ERROR_OUT_OF_HOST_MEMORY;
}
return PI_SUCCESS;
}

TEST(OutOfResourcesTest, piProgramCreate) {
sycl::unittest::PiMock Mock;
Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
Expand Down Expand Up @@ -141,6 +153,70 @@ TEST(OutOfResourcesTest, piProgramCreate) {
}
}

TEST(OutOfHostMemoryTest, piProgramCreate) {
// Reset to zero.
nProgramCreate = 0;

sycl::unittest::PiMock Mock;
Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
redefinedProgramCreateOutOfHostMemory);

sycl::platform Plt{Mock.getPlatform()};
sycl::context Ctx{Plt};
auto CtxImpl = detail::getSyclObjImpl(Ctx);
queue q(Ctx, default_selector_v);

int runningTotal = 0;
// Cache is empty, so one piProgramCreate call.
q.single_task<class OutOfResourcesKernel1>([] {});
EXPECT_EQ(nProgramCreate, runningTotal += 1);

// Now, we make the next piProgramCreate call fail with
// PI_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch this,
// clear the cache, and retry the piProgramCreate.
outOfHostMemoryToggle = true;
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_FALSE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal += 2);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
}

// The next piProgramCreate call will fail with
// PI_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in
// the cache, so we expect no new piProgramCreate calls.
outOfHostMemoryToggle = true;
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_TRUE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal);

// OutOfResourcesKernel1 is not in the cache, so we have to
// build it. From what we set before, this call will fail,
// the cache will clear out, and will try again.
q.single_task<class OutOfResourcesKernel1>([] {});
EXPECT_FALSE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal += 2);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
}

// Finally, OutOfResourcesKernel1 will be in the cache, but
// OutOfResourceKenel2 will not, so one more piProgramCreate.
// Toggle is not set, so this should succeed.
q.single_task<class OutOfResourcesKernel1>([] {});
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_EQ(nProgramCreate, runningTotal += 1);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
}
}

static int nProgramLink = 0;

static pi_result
Expand All @@ -158,6 +234,20 @@ redefinedProgramLink(pi_context context, pi_uint32 num_devices,
return PI_SUCCESS;
}

static pi_result redefinedProgramLinkOutOfHostMemory(
pi_context context, pi_uint32 num_devices, const pi_device *device_list,
const char *options, pi_uint32 num_input_programs,
const pi_program *input_programs,
void (*pfn_notify)(pi_program program, void *user_data), void *user_data,
pi_program *ret_program) {
++nProgramLink;
if (outOfHostMemoryToggle) {
outOfHostMemoryToggle = false;
return PI_ERROR_OUT_OF_HOST_MEMORY;
}
return PI_SUCCESS;
}

TEST(OutOfResourcesTest, piProgramLink) {
sycl::unittest::PiMock Mock;
Mock.redefineBefore<detail::PiApiKind::piProgramLink>(redefinedProgramLink);
Expand Down Expand Up @@ -191,4 +281,43 @@ TEST(OutOfResourcesTest, piProgramLink) {
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
}
}
}

TEST(OutOfHostMemoryTest, piProgramLink) {
// Reset to zero.
nProgramLink = 0;

sycl::unittest::PiMock Mock;
Mock.redefineBefore<detail::PiApiKind::piProgramLink>(
redefinedProgramLinkOutOfHostMemory);

sycl::platform Plt{Mock.getPlatform()};
sycl::context Ctx{Plt};
auto CtxImpl = detail::getSyclObjImpl(Ctx);
queue q(Ctx, default_selector_v);
// Put some programs in the cache
q.single_task<class OutOfResourcesKernel1>([] {});
q.single_task<class OutOfResourcesKernel2>([] {});
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
}

auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
sycl::bundle_state::object>(Ctx);
auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
sycl::bundle_state::object>(Ctx);
outOfHostMemoryToggle = true;
EXPECT_EQ(nProgramLink, 0);
auto b3 = sycl::link({b1, b2});
EXPECT_FALSE(outOfHostMemoryToggle);
// one restart due to out of resources, one link per each of b1 and b2.
EXPECT_EQ(nProgramLink, 3);
// no programs should be in the cache due to out of resources.
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
}
}
Loading