@@ -342,6 +342,26 @@ class ReturnHelper {
342
342
343
343
} // anonymous namespace
344
344
345
// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0), in
// which case all compute commands will be submitted to the command-queue
// with the given index in the compute command group. If it is instead set
// to a negative value (or is unset), all available compute engines may be
// used.
//
// The result is an inclusive {lower, upper} pair of engine indices:
//   {N, N}       - only the engine with index N is allowed;
//   {0, INT_MAX} - no restriction.
//
// A value that does not start with an integer, or that exceeds INT_MAX, is
// treated as "no restriction" rather than silently pinning all work to
// engine 0 (which is what a plain atoi() would do for unparsable text).
// Trailing characters after the leading integer are ignored.
static const std::pair<int, int> getRangeOfAllowedComputeEngines = [] {
  const std::pair<int, int> AllEngines(0, INT_MAX);

  // The name must match the documented variable exactly; any stray
  // whitespace inside the literal would make the lookup always fail.
  const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
  if (!EnvVar)
    return AllEngines;

  // strtol reports where parsing stopped, which lets us distinguish a real
  // "0" from text that contains no number at all.
  char *ParseEnd = nullptr;
  const long EnvVarValue = std::strtol(EnvVar, &ParseEnd, 10);
  if (ParseEnd == EnvVar)
    return AllEngines;

  // Negative means "use all engines"; too-large values cannot name a real
  // engine index, so they are handled the same way.
  if (EnvVarValue < 0 || EnvVarValue > INT_MAX)
    return AllEngines;

  const int EngineIndex = static_cast<int>(EnvVarValue);
  return std::pair<int, int>(EngineIndex, EngineIndex);
}();
364
+
345
365
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or
346
366
// a pair of integer values of the form "lower_index:upper_index".
347
367
// Here, the indices point to copy engines in a list of all available copy
@@ -678,11 +698,6 @@ pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
678
698
return PI_ERROR_UNKNOWN;
679
699
}
680
700
681
- // The index for a root or a sub-device is always 0.
682
- // TODO: we want to start submitting to multiple queues in the
683
- // compute group for more parallelism.
684
- QueueGroup[queue_group_info_t ::Compute].ZeIndex = 0 ;
685
-
686
701
if (CopyEngineRequested) {
687
702
for (uint32_t i = 0 ; i < numQueueGroups; i++) {
688
703
if (((QueueGroupProperties[i].flags &
@@ -1025,19 +1040,40 @@ static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] {
1025
1040
return ZeCommandListBatchConfig (IsCopy{true });
1026
1041
}();
1027
1042
1028
- _pi_queue::_pi_queue (ze_command_queue_handle_t Queue ,
1043
+ _pi_queue::_pi_queue (std::vector< ze_command_queue_handle_t > &ComputeQueues ,
1029
1044
std::vector<ze_command_queue_handle_t > &CopyQueues,
1030
1045
pi_context Context, pi_device Device,
1031
1046
bool OwnZeCommandQueue,
1032
1047
pi_queue_properties PiQueueProperties)
1033
1048
: Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
1034
1049
Properties (PiQueueProperties) {
1035
1050
1036
- // Compute group has currently single CCS only.
1037
- ComputeQueueGroup.ZeQueues .push_back (Queue);
1038
- ComputeQueueGroup.LowerIndex = 0 ;
1039
- ComputeQueueGroup.UpperIndex = 0 ;
1040
- ComputeQueueGroup.NextIndex = 0 ;
1051
+ // Compute group initialization.
1052
+ // First, see if the queue's device allows for round-robin or it is
1053
+ // fixed to one particular compute CCS (it is so for sub-sub-devices).
1054
+ auto &ComputeQueueGroupInfo = Device->QueueGroup [queue_type::Compute];
1055
+ if (ComputeQueueGroupInfo.ZeIndex >= 0 ) {
1056
+ ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex ;
1057
+ ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex ;
1058
+ ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex ;
1059
+ } else {
1060
+ ComputeQueueGroup.LowerIndex = 0 ;
1061
+ ComputeQueueGroup.UpperIndex = INT_MAX;
1062
+ ComputeQueueGroup.NextIndex = 0 ;
1063
+ }
1064
+
1065
+ uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines.first ;
1066
+ uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines.second ;
1067
+ FilterUpperIndex =
1068
+ std::min ((size_t )FilterUpperIndex, ComputeQueues.size () - 1 );
1069
+ if (FilterLowerIndex <= FilterUpperIndex) {
1070
+ ComputeQueueGroup.ZeQueues = ComputeQueues;
1071
+ ComputeQueueGroup.LowerIndex = FilterLowerIndex;
1072
+ ComputeQueueGroup.UpperIndex = FilterUpperIndex;
1073
+ ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex ;
1074
+ } else {
1075
+ die (" No compute queue available." );
1076
+ }
1041
1077
1042
1078
// Copy group initialization.
1043
1079
if (getRangeOfAllowedCopyEngines.first < 0 ||
@@ -1447,6 +1483,12 @@ _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
1447
1483
1448
1484
ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
1449
1485
ZeCommandQueueDesc.index = ZeCommandQueueIndex;
1486
+ ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1487
+
1488
+ zePrint (" [getZeQueue]: create queue ordinal = %d, index = %d "
1489
+ " (round robin in [%d, %d])\n " ,
1490
+ ZeCommandQueueDesc.ordinal , ZeCommandQueueDesc.index , LowerIndex,
1491
+ UpperIndex);
1450
1492
1451
1493
auto ZeResult = ZE_CALL_NOCHECK (
1452
1494
zeCommandQueueCreate, (Queue->Context ->ZeContext , Queue->Device ->ZeDevice ,
@@ -3012,59 +3054,38 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
3012
3054
PI_QUEUE_ON_DEVICE_DEFAULT)),
3013
3055
PI_INVALID_VALUE);
3014
3056
3015
- ze_device_handle_t ZeDevice;
3016
- ze_command_queue_handle_t ZeComputeCommandQueue;
3017
3057
PI_ASSERT (Context, PI_INVALID_CONTEXT);
3058
+ PI_ASSERT (Queue, PI_INVALID_QUEUE);
3059
+ PI_ASSERT (Device, PI_INVALID_DEVICE);
3018
3060
3019
3061
if (std::find (Context->Devices .begin (), Context->Devices .end (), Device) ==
3020
3062
Context->Devices .end ()) {
3021
3063
return PI_INVALID_DEVICE;
3022
3064
}
3023
3065
3024
- PI_ASSERT (Device, PI_INVALID_DEVICE);
3066
+ // Create placeholder queues in the compute queue group.
3067
+ // Actual L0 queues will be created at first use.
3068
+ std::vector<ze_command_queue_handle_t > ZeComputeCommandQueues (
3069
+ Device->QueueGroup [_pi_queue::queue_type::Compute].ZeProperties .numQueues ,
3070
+ nullptr );
3025
3071
3026
- ZeDevice = Device->ZeDevice ;
3027
- auto &ComputeQueueGroup =
3028
- Device->QueueGroup [_pi_device::queue_group_info_t ::Compute];
3029
- ZeStruct<ze_command_queue_desc_t > ZeCommandQueueDesc;
3030
- ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
3031
- ZeCommandQueueDesc.ordinal = ComputeQueueGroup.ZeOrdinal ;
3032
- ZeCommandQueueDesc.index = ComputeQueueGroup.ZeIndex ;
3033
- // TODO: add round-robin through compute CCS.
3034
- if (ComputeQueueGroup.ZeIndex == -1 )
3035
- ZeCommandQueueDesc.index = 0 ;
3036
-
3037
- ZE_CALL (zeCommandQueueCreate,
3038
- (Context->ZeContext , ZeDevice,
3039
- &ZeCommandQueueDesc, // TODO: translate properties
3040
- &ZeComputeCommandQueue));
3041
-
3042
- std::vector<ze_command_queue_handle_t > ZeCopyCommandQueues;
3043
-
3044
- // Create a placeholder in ZeCopyCommandQueues for a queue that will be used
3045
- // to submit commands to main copy engine. This queue is initially NULL and
3046
- // will be replaced by the Ze Command Queue which gets created just before its
3047
- // first use.
3048
- ze_command_queue_handle_t ZeMainCopyCommandQueue = nullptr ;
3072
+ // Create placeholder queues in the copy queue group (main and link
3073
+ // native groups are combined into one group).
3074
+ // Actual L0 queues will be created at first use.
3075
+ size_t NumCopyGroups = 0 ;
3049
3076
if (Device->hasMainCopyEngine ()) {
3050
- ZeCopyCommandQueues.push_back (ZeMainCopyCommandQueue);
3077
+ NumCopyGroups += Device->QueueGroup [_pi_queue::queue_type::MainCopy]
3078
+ .ZeProperties .numQueues ;
3051
3079
}
3052
-
3053
- // Create additional 'placeholder queues' to link copy engines and push them
3054
- // into ZeCopyCommandQueues.
3055
3080
if (Device->hasLinkCopyEngine ()) {
3056
- auto ZeNumLinkCopyQueues =
3057
- Device->QueueGroup [_pi_device::queue_group_info_t ::LinkCopy]
3058
- .ZeProperties .numQueues ;
3059
- for (uint32_t i = 0 ; i < ZeNumLinkCopyQueues; ++i) {
3060
- ze_command_queue_handle_t ZeLinkCopyCommandQueue = nullptr ;
3061
- ZeCopyCommandQueues.push_back (ZeLinkCopyCommandQueue);
3062
- }
3081
+ NumCopyGroups += Device->QueueGroup [_pi_queue::queue_type::LinkCopy]
3082
+ .ZeProperties .numQueues ;
3063
3083
}
3064
- PI_ASSERT (Queue, PI_INVALID_QUEUE);
3084
+ std::vector<ze_command_queue_handle_t > ZeCopyCommandQueues (NumCopyGroups,
3085
+ nullptr );
3065
3086
3066
3087
try {
3067
- *Queue = new _pi_queue (ZeComputeCommandQueue , ZeCopyCommandQueues, Context,
3088
+ *Queue = new _pi_queue (ZeComputeCommandQueues , ZeCopyCommandQueues, Context,
3068
3089
Device, true , Properties);
3069
3090
} catch (const std::bad_alloc &) {
3070
3091
return PI_OUT_OF_HOST_MEMORY;
@@ -3278,6 +3299,8 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
3278
3299
PI_ASSERT (Queue, PI_INVALID_QUEUE);
3279
3300
3280
3301
auto ZeQueue = pi_cast<ze_command_queue_handle_t >(NativeHandle);
3302
+ // Assume this is the "0" index queue in the compute command-group.
3303
+ std::vector<ze_command_queue_handle_t > ZeQueues{ZeQueue};
3281
3304
3282
3305
// Attach the queue to the "0" device.
3283
3306
// TODO: see if we need to let user choose the device.
@@ -3287,7 +3310,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
3287
3310
// all commands to the "ZeQueue".
3288
3311
std::vector<ze_command_queue_handle_t > ZeroCopyQueues;
3289
3312
*Queue =
3290
- new _pi_queue (ZeQueue , ZeroCopyQueues, Context, Device, OwnNativeHandle);
3313
+ new _pi_queue (ZeQueues , ZeroCopyQueues, Context, Device, OwnNativeHandle);
3291
3314
return PI_SUCCESS;
3292
3315
}
3293
3316
0 commit comments