Skip to content

Commit 10b5c96

Browse files
committed
[EXP][Command-buffer] OpenCL kernel command update
Implement the kernel command update API for command-buffers, as defined by oneapi-src#1089, in the OpenCL adapter. This depends on device support for the [cl_khr_command_buffer_mutable_dispatch](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#cl_khr_command_buffer_mutable_dispatch) extension.
1 parent 92e154b commit 10b5c96

10 files changed

+358
-27
lines changed

source/adapters/opencl/command_buffer.cpp

Lines changed: 221 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
1515
ur_context_handle_t hContext, ur_device_handle_t hDevice,
16-
[[maybe_unused]] const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
16+
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
1717
ur_exp_command_buffer_handle_t *phCommandBuffer) {
1818

1919
ur_queue_handle_t Queue = nullptr;
@@ -29,13 +29,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
2929
if (!clCreateCommandBufferKHR || Res != CL_SUCCESS)
3030
return UR_RESULT_ERROR_INVALID_OPERATION;
3131

32+
bool IsUpdatable =
33+
pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false;
34+
35+
bool SupportsUpdate = false;
36+
cl_device_id CLDevice = cl_adapter::cast<cl_device_id>(hDevice);
37+
CL_RETURN_ON_FAILURE(
38+
deviceSupportsURCommandBufferKernelUpdate(CLDevice, SupportsUpdate));
39+
40+
bool Updatable = IsUpdatable && SupportsUpdate;
41+
42+
cl_command_buffer_properties_khr Properties[3] = {
43+
CL_COMMAND_BUFFER_FLAGS_KHR,
44+
Updatable ? CL_COMMAND_BUFFER_MUTABLE_KHR : 0u, 0};
3245
auto CLCommandBuffer = clCreateCommandBufferKHR(
33-
1, cl_adapter::cast<cl_command_queue *>(&Queue), nullptr, &Res);
46+
1, cl_adapter::cast<cl_command_queue *>(&Queue), Properties, &Res);
3447
CL_RETURN_ON_FAILURE_AND_SET_NULL(Res, phCommandBuffer);
3548

3649
try {
3750
auto URCommandBuffer = std::make_unique<ur_exp_command_buffer_handle_t_>(
38-
Queue, hContext, CLCommandBuffer);
51+
Queue, hContext, CLCommandBuffer, Updatable);
3952
*phCommandBuffer = URCommandBuffer.release();
4053
} catch (...) {
4154
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
@@ -95,6 +108,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
95108

96109
CL_RETURN_ON_FAILURE(
97110
clFinalizeCommandBufferKHR(hCommandBuffer->CLCommandBuffer));
111+
hCommandBuffer->Finalized = true;
98112
return UR_RESULT_SUCCESS;
99113
}
100114

@@ -105,7 +119,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
105119
uint32_t numSyncPointsInWaitList,
106120
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
107121
ur_exp_command_buffer_sync_point_t *pSyncPoint,
108-
ur_exp_command_buffer_command_handle_t *) {
122+
ur_exp_command_buffer_command_handle_t *phCommandHandle) {
109123

110124
cl_context CLContext = cl_adapter::cast<cl_context>(hCommandBuffer->hContext);
111125
cl_ext::clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr;
@@ -117,11 +131,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
117131
if (!clCommandNDRangeKernelKHR || Res != CL_SUCCESS)
118132
return UR_RESULT_ERROR_INVALID_OPERATION;
119133

134+
cl_mutable_command_khr CommandHandle = nullptr;
135+
cl_mutable_command_khr *OutCommandHandle =
136+
hCommandBuffer->Updatable ? &CommandHandle : nullptr;
137+
138+
cl_ndrange_kernel_command_properties_khr UpdateProperties[] = {
139+
CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
140+
CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR |
141+
CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR |
142+
CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR |
143+
CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR,
144+
0};
145+
146+
cl_ndrange_kernel_command_properties_khr *Properties =
147+
hCommandBuffer->Updatable ? UpdateProperties : nullptr;
120148
CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR(
121-
hCommandBuffer->CLCommandBuffer, nullptr, nullptr,
149+
hCommandBuffer->CLCommandBuffer, nullptr, Properties,
122150
cl_adapter::cast<cl_kernel>(hKernel), workDim, pGlobalWorkOffset,
123151
pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList,
124-
pSyncPointWaitList, pSyncPoint, nullptr));
152+
pSyncPointWaitList, pSyncPoint, OutCommandHandle));
153+
154+
try {
155+
auto URCommandHandle =
156+
std::make_unique<ur_exp_command_buffer_command_handle_t_>(
157+
hCommandBuffer, CommandHandle, workDim);
158+
*phCommandHandle = URCommandHandle.release();
159+
hCommandBuffer->CommandHandles.push_back(*phCommandHandle);
160+
} catch (...) {
161+
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
162+
}
125163

126164
return UR_RESULT_SUCCESS;
127165
}
@@ -360,19 +398,180 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
360398

361399
/// Takes an additional reference on a command-buffer command handle.
///
/// @param hCommand Command handle whose reference count is incremented.
/// @return UR_RESULT_SUCCESS unconditionally.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp(
    [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand) {
  // Only the side effect is wanted; the updated count is discarded.
  (void)hCommand->incrementReferenceCount();
  return UR_RESULT_SUCCESS;
}
365404

366405
/// Drops one reference from a command-buffer command handle.
///
/// @param hCommand Command handle whose reference count is decremented.
/// @return UR_RESULT_SUCCESS unconditionally.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
    [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand) {
  const uint32_t RemainingRefs = hCommand->decrementReferenceCount();
  if (RemainingRefs == 0) {
    // TODO: nothing is released yet when the count reaches zero.
  }
  return UR_RESULT_SUCCESS;
}
370412

371413
/// Updates the configuration of a kernel command recorded in a command-buffer.
///
/// Translates the UR update descriptor into a cl_mutable_dispatch_config_khr
/// and applies it with clUpdateMutableCommandsKHR.
///
/// @param hCommand Handle of the kernel command to update.
/// @param pUpdateKernelLaunch New exec-info, argument and ND-range values.
/// @return UR_RESULT_SUCCESS on success;
///         UR_RESULT_ERROR_INVALID_OPERATION if the extension entry point
///         cannot be resolved, or if the command-buffer is not both finalized
///         and updatable;
///         UR_RESULT_ERROR_INVALID_ENUMERATION for an unrecognised exec-info
///         property;
///         UR_RESULT_ERROR_UNSUPPORTED_FEATURE if a work dimension different
///         from the recorded one is requested;
///         otherwise whatever CL_RETURN_ON_FAILURE yields when
///         clUpdateMutableCommandsKHR fails.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
    ur_exp_command_buffer_command_handle_t hCommand,
    const ur_exp_command_buffer_update_kernel_launch_desc_t
        *pUpdateKernelLaunch) {

  ur_exp_command_buffer_handle_t hCommandBuffer = hCommand->hCommandBuffer;
  cl_context CLContext = cl_adapter::cast<cl_context>(hCommandBuffer->hContext);
  cl_ext::clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
  cl_int Res =
      cl_ext::getExtFuncFromContext<decltype(clUpdateMutableCommandsKHR)>(
          CLContext, cl_ext::ExtFuncPtrCache->clUpdateMutableCommandsKHRCache,
          cl_ext::UpdateMutableCommandsName, &clUpdateMutableCommandsKHR);

  if (!clUpdateMutableCommandsKHR || Res != CL_SUCCESS)
    return UR_RESULT_ERROR_INVALID_OPERATION;

  if (!hCommandBuffer->Finalized || !hCommandBuffer->Updatable)
    return UR_RESULT_ERROR_INVALID_OPERATION;

  // Declared at function scope on purpose: CLExecInfos stores the address of
  // this value and it is only read by clUpdateMutableCommandsKHR at the end of
  // the function, so it must outlive the loop below.
  cl_bool TrueVal = CL_TRUE;

  // Find the CL execution info to update
  const uint32_t NumExecInfos = pUpdateKernelLaunch->numNewExecInfos;
  const ur_exp_command_buffer_update_exec_info_desc_t *ExecInfoList =
      pUpdateKernelLaunch->pNewExecInfoList;
  std::vector<cl_mutable_dispatch_exec_info_khr> CLExecInfos;
  for (uint32_t i = 0; i < NumExecInfos; i++) {
    const ur_exp_command_buffer_update_exec_info_desc_t &URExecInfo =
        ExecInfoList[i];

    if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS) {
      // One UR property fans out to the three Intel indirect-access flags.
      cl_mutable_dispatch_exec_info_khr CLExecInfo{};
      CLExecInfo.param_value_size = sizeof(cl_bool);
      CLExecInfo.param_value = &TrueVal;
      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);

      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);

      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);
    } else if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_PTRS) {
      cl_mutable_dispatch_exec_info_khr CLExecInfo{};
      CLExecInfo.param_value_size = URExecInfo.propSize;
      CLExecInfo.param_value = URExecInfo.pNewExecInfo;
      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL;
      CLExecInfos.push_back(CLExecInfo);
    } else if (URExecInfo.propName != UR_KERNEL_EXEC_INFO_CACHE_CONFIG) {
      // UR_KERNEL_EXEC_INFO_CACHE_CONFIG is accepted but not forwarded.
      return UR_RESULT_ERROR_INVALID_ENUMERATION;
    }
  }

  // Find the CL USM pointer arguments to the kernel
  // WARNING - This relies on USM and SVM using the same implementation,
  // which is not guaranteed.
  // See https://github.com/KhronosGroup/OpenCL-Docs/issues/843
  const uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs;
  const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList =
      pUpdateKernelLaunch->pNewPointerArgList;
  // Elements are value-initialised, so arg_size stays zero.
  std::vector<cl_mutable_dispatch_arg_khr> CLUSMArgs(NumPointerArgs);
  for (uint32_t i = 0; i < NumPointerArgs; i++) {
    const ur_exp_command_buffer_update_pointer_arg_desc_t &URPointerArg =
        ArgPointerList[i];
    cl_mutable_dispatch_arg_khr &USMArg = CLUSMArgs[i];
    USMArg.arg_index = URPointerArg.argIndex;
    // pNewPointerArg points at the USM pointer; pass the pointer value itself.
    USMArg.arg_value =
        *static_cast<void *const *>(URPointerArg.pNewPointerArg);
  }

  const uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs;
  const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList =
      pUpdateKernelLaunch->pNewMemObjArgList;
  const uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs;
  const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList =
      pUpdateKernelLaunch->pNewValueArgList;

  // Memory-object and by-value arguments share one CL argument list.
  std::vector<cl_mutable_dispatch_arg_khr> CLArgs;
  CLArgs.reserve(static_cast<size_t>(NumMemobjArgs) + NumValueArgs);
  for (uint32_t i = 0; i < NumMemobjArgs; i++) {
    const ur_exp_command_buffer_update_memobj_arg_desc_t &URMemObjArg =
        ArgMemobjList[i];
    cl_mutable_dispatch_arg_khr CLArg{
        URMemObjArg.argIndex, // arg_index
        sizeof(cl_mem),       // arg_size
        cl_adapter::cast<const cl_mem *>(
            &URMemObjArg.hNewMemObjArg) // arg_value
    };

    CLArgs.push_back(CLArg);
  }

  for (uint32_t i = 0; i < NumValueArgs; i++) {
    const ur_exp_command_buffer_update_value_arg_desc_t &URValueArg =
        ArgValueList[i];
    cl_mutable_dispatch_arg_khr CLArg{
        URValueArg.argIndex,    // arg_index
        URValueArg.argSize,     // arg_size
        URValueArg.pNewValueArg // arg_value
    };
    CLArgs.push_back(CLArg);
  }

  const cl_uint NewWorkDim = pUpdateKernelLaunch->newWorkDim;
  const cl_uint CLWorkDim = hCommand->WorkDim;
  if (NewWorkDim != 0 && NewWorkDim != CLWorkDim) {
    // Limitation of the cl_khr_command_buffer_mutable_dispatch specification
    // that it is an error to change the ND-Range size.
    // https://github.com/KhronosGroup/OpenCL-Docs/issues/1057
    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  }

  // Copies CLWorkDim entries from a caller-provided ND-range array into Dst,
  // zero-padding the remainder. Dst is sized to at least three elements before
  // anything is written, so the padding can never run past the allocation.
  // A null Src leaves Dst empty, meaning "field not updated".
  const auto CopyNDRange = [CLWorkDim](const size_t *Src,
                                       std::vector<size_t> &Dst) {
    if (!Src)
      return;
    const size_t NumElements = CLWorkDim < 3 ? size_t{3} : size_t{CLWorkDim};
    Dst.assign(NumElements, size_t{0});
    std::memcpy(Dst.data(), Src, sizeof(size_t) * CLWorkDim);
  };

  // An empty vector's data() is not guaranteed to be null, so map "not
  // updated" to nullptr explicitly.
  const auto DataOrNull = [](const std::vector<size_t> &Vec) -> const size_t * {
    return Vec.empty() ? nullptr : Vec.data();
  };

  std::vector<size_t> CLGlobalWorkOffset, CLGlobalWorkSize, CLLocalWorkSize;
  CopyNDRange(pUpdateKernelLaunch->pNewGlobalWorkOffset, CLGlobalWorkOffset);
  CopyNDRange(pUpdateKernelLaunch->pNewGlobalWorkSize, CLGlobalWorkSize);
  CopyNDRange(pUpdateKernelLaunch->pNewLocalWorkSize, CLLocalWorkSize);

  cl_mutable_command_khr command =
      cl_adapter::cast<cl_mutable_command_khr>(hCommand->CLMutableCommand);
  cl_mutable_dispatch_config_khr dispatch_config = {
      CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
      nullptr,
      command,
      static_cast<cl_uint>(CLArgs.size()),      // num_args
      static_cast<cl_uint>(CLUSMArgs.size()),   // num_svm_args
      static_cast<cl_uint>(CLExecInfos.size()), // num_exec_infos
      CLWorkDim,                                // work_dim
      CLArgs.data(),                            // arg_list
      CLUSMArgs.data(),                         // arg_svm_list
      CLExecInfos.data(),                       // exec_info_list
      DataOrNull(CLGlobalWorkOffset),           // global_work_offset
      DataOrNull(CLGlobalWorkSize),             // global_work_size
      DataOrNull(CLLocalWorkSize),              // local_work_size
  };
  cl_mutable_base_config_khr config = {
      CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1, &dispatch_config};
  CL_RETURN_ON_FAILURE(
      clUpdateMutableCommandsKHR(hCommandBuffer->CLCommandBuffer, &config));

  return UR_RESULT_SUCCESS;
}
377576

378577
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp(
@@ -415,9 +614,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp(
415614
}
416615

417616
/// Queries a property of a command-buffer command handle.
///
/// @param hCommand Command handle being queried.
/// @param propName Property to return; only
///        UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT is handled.
/// @param propSize Forwarded to UrReturnHelper.
/// @param pPropValue Forwarded to UrReturnHelper.
/// @param pPropSizeRet Forwarded to UrReturnHelper.
/// @return The UrReturnHelper result for a handled property, otherwise
///         UR_RESULT_ERROR_INVALID_ENUMERATION.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp(
    ur_exp_command_buffer_command_handle_t hCommand,
    ur_exp_command_buffer_command_info_t propName, size_t propSize,
    void *pPropValue, size_t *pPropSizeRet) {
  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);

  if (propName == UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT) {
    return ReturnValue(hCommand->getReferenceCount());
  }

  // Unhandled query: trap when assertions are enabled, otherwise report the
  // property as an invalid enumeration.
  assert(!"Command-buffer command info request not implemented");
  return UR_RESULT_ERROR_INVALID_ENUMERATION;
}

source/adapters/opencl/command_buffer.hpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,37 @@
1111
#include <CL/cl_ext.h>
1212
#include <ur/ur.hpp>
1313

14+
// Handle to a kernel command.
15+
struct ur_exp_command_buffer_command_handle_t_ {
16+
ur_exp_command_buffer_handle_t hCommandBuffer;
17+
cl_mutable_command_khr CLMutableCommand;
18+
cl_uint WorkDim;
19+
std::atomic_uint32_t RefCount;
20+
21+
ur_exp_command_buffer_command_handle_t_(
22+
ur_exp_command_buffer_handle_t hCommandBuffer,
23+
cl_mutable_command_khr CLMutableCommand, cl_uint WorkDim)
24+
: hCommandBuffer(hCommandBuffer), CLMutableCommand(CLMutableCommand),
25+
WorkDim(WorkDim), RefCount{0} {}
26+
27+
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
28+
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
29+
uint32_t getReferenceCount() const noexcept { return RefCount; }
30+
};
31+
1432
struct ur_exp_command_buffer_handle_t_ {
1533
ur_queue_handle_t hInternalQueue;
1634
ur_context_handle_t hContext;
1735
cl_command_buffer_khr CLCommandBuffer;
36+
bool Updatable;
37+
bool Finalized;
38+
std::vector<ur_exp_command_buffer_command_handle_t> CommandHandles;
1839

1940
ur_exp_command_buffer_handle_t_(ur_queue_handle_t hQueue,
2041
ur_context_handle_t hContext,
21-
cl_command_buffer_khr CLCommandBuffer)
42+
cl_command_buffer_khr CLCommandBuffer,
43+
bool Updatable)
2244
: hInternalQueue(hQueue), hContext(hContext),
23-
CLCommandBuffer(CLCommandBuffer) {}
45+
CLCommandBuffer(CLCommandBuffer), Updatable(Updatable),
46+
Finalized(false) {}
2447
};

source/adapters/opencl/common.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,33 @@ ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle) {
101101
*NativeHandle = reinterpret_cast<ur_native_handle_t>(URObj);
102102
return UR_RESULT_SUCCESS;
103103
}
104+
105+
/// Reports whether a device supports the kernel-command updates this adapter
/// relies on.
///
/// The device qualifies when it lists the
/// cl_khr_command_buffer_mutable_dispatch extension and its mutable-dispatch
/// capabilities include arguments, global offset, global size, local size and
/// exec info.
///
/// @param Dev Device to query.
/// @param[out] Result Set to true only when every required capability is
///             present; false otherwise, including when a query fails.
/// @return CL_SUCCESS when the queries succeed, otherwise the error returned
///         by the failing clGetDeviceInfo call.
cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev,
                                                 bool &Result) {
  // Give the caller a definite answer on every return path.
  Result = false;

  // Errors are returned as raw OpenCL status codes: this function's return
  // type is cl_int, and the caller applies its own CL error handling to it.
  size_t ExtSize = 0;
  if (const cl_int Err =
          clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &ExtSize);
      Err != CL_SUCCESS) {
    return Err;
  }

  std::string ExtStr(ExtSize, '\0');
  if (const cl_int Err = clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, ExtSize,
                                         ExtStr.data(), nullptr);
      Err != CL_SUCCESS) {
    return Err;
  }

  if (ExtStr.find("cl_khr_command_buffer_mutable_dispatch") ==
      std::string::npos) {
    return CL_SUCCESS;
  }

  cl_mutable_dispatch_fields_khr MutableCapabilities = 0;
  if (const cl_int Err = clGetDeviceInfo(
          Dev, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
          sizeof(MutableCapabilities), &MutableCapabilities, nullptr);
      Err != CL_SUCCESS) {
    return Err;
  }

  const cl_mutable_dispatch_fields_khr RequiredCaps =
      CL_MUTABLE_DISPATCH_ARGUMENTS_KHR |
      CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR |
      CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR |
      CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
  Result = (MutableCapabilities & RequiredCaps) == RequiredCaps;
  return CL_SUCCESS;
}

0 commit comments

Comments
 (0)