Skip to content

Commit a836c87

Browse files
[SYCL][L0] Enable round-robin submissions to multiple compute CCS (intel#5657)
1 parent 5468371 commit a836c87

File tree

2 files changed

+75
-52
lines changed

2 files changed

+75
-52
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

100755 → 100644
Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,26 @@ class ReturnHelper {
342342

343343
} // anonymous namespace
344344

345+
// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE selects which compute engines may
// receive compute commands. The value computed here is the inclusive
// {lower, upper} range of allowed indices in the compute command group:
//   - a non-negative integer N: all compute commands are submitted to the
//     command-queue with index N, i.e. the range is {N, N};
//   - a negative integer, or the variable being unset: all available compute
//     engines may be used, i.e. the range is {0, INT_MAX}.
// A value from which no integer can be parsed (e.g. "" or "abc") is treated
// like "unset" instead of silently pinning all work to engine 0. A value
// larger than INT_MAX is saturated to INT_MAX.
//
// The environment variable is read once, when this static is initialized.
static const std::pair<int, int> getRangeOfAllowedComputeEngines = [] {
  const std::pair<int, int> AllEngines(0, INT_MAX);

  const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
  // If the environment variable is not set, all available compute engines
  // can be used.
  if (!EnvVar)
    return AllEngines;

  // std::atoi cannot report errors: it returns 0 when nothing can be parsed
  // and its behavior is undefined when the value is out of range.
  // std::strtol reports both cases: End stays at EnvVar when no conversion
  // was performed, and the result saturates on overflow.
  char *End = nullptr;
  const long EnvVarValue = std::strtol(EnvVar, &End, 10);
  if (End == EnvVar)
    return AllEngines;

  // Negative values explicitly request all engines.
  if (EnvVarValue < 0)
    return AllEngines;

  // Saturate instead of truncating so that an oversized index remains an
  // oversized index after narrowing to int.
  const int Index =
      EnvVarValue > INT_MAX ? INT_MAX : static_cast<int>(EnvVarValue);
  return std::pair<int, int>(Index, Index);
}();
364+
345365
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or
346366
// a pair of integer values of the form "lower_index:upper_index".
347367
// Here, the indices point to copy engines in a list of all available copy
@@ -678,11 +698,6 @@ pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
678698
return PI_ERROR_UNKNOWN;
679699
}
680700

681-
// The index for a root or a sub-device is always 0.
682-
// TODO: we want to start submitting to multiple queues in the
683-
// compute group for more parallelism.
684-
QueueGroup[queue_group_info_t::Compute].ZeIndex = 0;
685-
686701
if (CopyEngineRequested) {
687702
for (uint32_t i = 0; i < numQueueGroups; i++) {
688703
if (((QueueGroupProperties[i].flags &
@@ -1025,19 +1040,40 @@ static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] {
10251040
return ZeCommandListBatchConfig(IsCopy{true});
10261041
}();
10271042

1028-
_pi_queue::_pi_queue(ze_command_queue_handle_t Queue,
1043+
_pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
10291044
std::vector<ze_command_queue_handle_t> &CopyQueues,
10301045
pi_context Context, pi_device Device,
10311046
bool OwnZeCommandQueue,
10321047
pi_queue_properties PiQueueProperties)
10331048
: Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
10341049
Properties(PiQueueProperties) {
10351050

1036-
// Compute group has currently single CCS only.
1037-
ComputeQueueGroup.ZeQueues.push_back(Queue);
1038-
ComputeQueueGroup.LowerIndex = 0;
1039-
ComputeQueueGroup.UpperIndex = 0;
1040-
ComputeQueueGroup.NextIndex = 0;
1051+
// Compute group initialization.
1052+
// First, see if the queue's device allows for round-robin or it is
1053+
// fixed to one particular compute CCS (it is so for sub-sub-devices).
1054+
auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
1055+
if (ComputeQueueGroupInfo.ZeIndex >= 0) {
1056+
ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
1057+
ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
1058+
ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
1059+
} else {
1060+
ComputeQueueGroup.LowerIndex = 0;
1061+
ComputeQueueGroup.UpperIndex = INT_MAX;
1062+
ComputeQueueGroup.NextIndex = 0;
1063+
}
1064+
1065+
uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines.first;
1066+
uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines.second;
1067+
FilterUpperIndex =
1068+
std::min((size_t)FilterUpperIndex, ComputeQueues.size() - 1);
1069+
if (FilterLowerIndex <= FilterUpperIndex) {
1070+
ComputeQueueGroup.ZeQueues = ComputeQueues;
1071+
ComputeQueueGroup.LowerIndex = FilterLowerIndex;
1072+
ComputeQueueGroup.UpperIndex = FilterUpperIndex;
1073+
ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
1074+
} else {
1075+
die("No compute queue available.");
1076+
}
10411077

10421078
// Copy group initialization.
10431079
if (getRangeOfAllowedCopyEngines.first < 0 ||
@@ -1447,6 +1483,12 @@ _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
14471483

14481484
ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
14491485
ZeCommandQueueDesc.index = ZeCommandQueueIndex;
1486+
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1487+
1488+
zePrint("[getZeQueue]: create queue ordinal = %d, index = %d "
1489+
"(round robin in [%d, %d])\n",
1490+
ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1491+
UpperIndex);
14501492

14511493
auto ZeResult = ZE_CALL_NOCHECK(
14521494
zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
@@ -3012,59 +3054,38 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
30123054
PI_QUEUE_ON_DEVICE_DEFAULT)),
30133055
PI_INVALID_VALUE);
30143056

3015-
ze_device_handle_t ZeDevice;
3016-
ze_command_queue_handle_t ZeComputeCommandQueue;
30173057
PI_ASSERT(Context, PI_INVALID_CONTEXT);
3058+
PI_ASSERT(Queue, PI_INVALID_QUEUE);
3059+
PI_ASSERT(Device, PI_INVALID_DEVICE);
30183060

30193061
if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
30203062
Context->Devices.end()) {
30213063
return PI_INVALID_DEVICE;
30223064
}
30233065

3024-
PI_ASSERT(Device, PI_INVALID_DEVICE);
3066+
// Create placeholder queues in the compute queue group.
3067+
// Actual L0 queues will be created at first use.
3068+
std::vector<ze_command_queue_handle_t> ZeComputeCommandQueues(
3069+
Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues,
3070+
nullptr);
30253071

3026-
ZeDevice = Device->ZeDevice;
3027-
auto &ComputeQueueGroup =
3028-
Device->QueueGroup[_pi_device::queue_group_info_t::Compute];
3029-
ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
3030-
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
3031-
ZeCommandQueueDesc.ordinal = ComputeQueueGroup.ZeOrdinal;
3032-
ZeCommandQueueDesc.index = ComputeQueueGroup.ZeIndex;
3033-
// TODO: add round-robin through compute CCS.
3034-
if (ComputeQueueGroup.ZeIndex == -1)
3035-
ZeCommandQueueDesc.index = 0;
3036-
3037-
ZE_CALL(zeCommandQueueCreate,
3038-
(Context->ZeContext, ZeDevice,
3039-
&ZeCommandQueueDesc, // TODO: translate properties
3040-
&ZeComputeCommandQueue));
3041-
3042-
std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues;
3043-
3044-
// Create a placeholder in ZeCopyCommandQueues for a queue that will be used
3045-
// to submit commands to main copy engine. This queue is initially NULL and
3046-
// will be replaced by the Ze Command Queue which gets created just before its
3047-
// first use.
3048-
ze_command_queue_handle_t ZeMainCopyCommandQueue = nullptr;
3072+
// Create placeholder queues in the copy queue group (main and link
3073+
// native groups are combined into one group).
3074+
// Actual L0 queues will be created at first use.
3075+
size_t NumCopyGroups = 0;
30493076
if (Device->hasMainCopyEngine()) {
3050-
ZeCopyCommandQueues.push_back(ZeMainCopyCommandQueue);
3077+
NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy]
3078+
.ZeProperties.numQueues;
30513079
}
3052-
3053-
// Create additional 'placeholder queues' to link copy engines and push them
3054-
// into ZeCopyCommandQueues.
30553080
if (Device->hasLinkCopyEngine()) {
3056-
auto ZeNumLinkCopyQueues =
3057-
Device->QueueGroup[_pi_device::queue_group_info_t::LinkCopy]
3058-
.ZeProperties.numQueues;
3059-
for (uint32_t i = 0; i < ZeNumLinkCopyQueues; ++i) {
3060-
ze_command_queue_handle_t ZeLinkCopyCommandQueue = nullptr;
3061-
ZeCopyCommandQueues.push_back(ZeLinkCopyCommandQueue);
3062-
}
3081+
NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy]
3082+
.ZeProperties.numQueues;
30633083
}
3064-
PI_ASSERT(Queue, PI_INVALID_QUEUE);
3084+
std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues(NumCopyGroups,
3085+
nullptr);
30653086

30663087
try {
3067-
*Queue = new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context,
3088+
*Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context,
30683089
Device, true, Properties);
30693090
} catch (const std::bad_alloc &) {
30703091
return PI_OUT_OF_HOST_MEMORY;
@@ -3278,6 +3299,8 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
32783299
PI_ASSERT(Queue, PI_INVALID_QUEUE);
32793300

32803301
auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
3302+
// Assume this is the "0" index queue in the compute command-group.
3303+
std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
32813304

32823305
// Attach the queue to the "0" device.
32833306
// TODO: see if we need to let user choose the device.
@@ -3287,7 +3310,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
32873310
// all commands to the "ZeQueue".
32883311
std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
32893312
*Queue =
3290-
new _pi_queue(ZeQueue, ZeroCopyQueues, Context, Device, OwnNativeHandle);
3313+
new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device, OwnNativeHandle);
32913314
return PI_SUCCESS;
32923315
}
32933316

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -643,7 +643,7 @@ struct _pi_context : _pi_object {
643643
};
644644

645645
struct _pi_queue : _pi_object {
646-
_pi_queue(ze_command_queue_handle_t Queue,
646+
_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
647647
std::vector<ze_command_queue_handle_t> &CopyQueues,
648648
pi_context Context, pi_device Device, bool OwnZeCommandQueue,
649649
pi_queue_properties Properties = 0);

0 commit comments

Comments
 (0)