Skip to content

Commit 348147f

Browse files
authored
Merge pull request #2192 from igchor/enqueues
[L0 v2] Implement (most of) the remaining enqueues functions
2 parents 8ddea15 + c467afa commit 348147f

26 files changed

+1037
-521
lines changed

source/adapters/level_zero/helpers/memory_helpers.cpp

+40
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,43 @@ bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
3131
}
3232
return false;
3333
}
34+
35+
// Translates a UR rectangular-copy description into the region and pitch
// values bundled in ze_region_params.
//
// Origins and extents are converted to 32-bit values with ur_cast. A row
// pitch that is zero after narrowing to 32 bits is replaced by the region
// width; a slice pitch of zero is replaced by the 32-bit product of the
// region height and the effective row pitch.
ze_region_params ur2zeRegionParams(ur_rect_offset_t SrcOrigin,
                                   ur_rect_offset_t DstOrigin,
                                   ur_rect_region_t Region, size_t SrcRowPitch,
                                   size_t DstRowPitch, size_t SrcSlicePitch,
                                   size_t DstSlicePitch) {
  // The extents are shared by the source and the destination region.
  const uint32_t Width = ur_cast<uint32_t>(Region.width);
  const uint32_t Height = ur_cast<uint32_t>(Region.height);
  const uint32_t Depth = ur_cast<uint32_t>(Region.depth);

  // Narrows a caller-supplied row pitch to 32 bits; zero selects the width.
  const auto EffectiveRowPitch = [Width](size_t RowPitch) -> uint32_t {
    const uint32_t Narrowed = static_cast<uint32_t>(RowPitch);
    return Narrowed == 0 ? Width : Narrowed;
  };

  const uint32_t SrcPitch = EffectiveRowPitch(SrcRowPitch);
  const uint32_t DstPitch = EffectiveRowPitch(DstRowPitch);

  // Default slice pitches are computed in 32-bit arithmetic.
  if (SrcSlicePitch == 0)
    SrcSlicePitch = Height * SrcPitch;
  if (DstSlicePitch == 0)
    DstSlicePitch = Height * DstPitch;

  const ze_copy_region_t ZeSrcRegion = {ur_cast<uint32_t>(SrcOrigin.x),
                                        ur_cast<uint32_t>(SrcOrigin.y),
                                        ur_cast<uint32_t>(SrcOrigin.z),
                                        Width,
                                        Height,
                                        Depth};
  const ze_copy_region_t ZeDstRegion = {ur_cast<uint32_t>(DstOrigin.x),
                                        ur_cast<uint32_t>(DstOrigin.y),
                                        ur_cast<uint32_t>(DstOrigin.z),
                                        Width,
                                        Height,
                                        Depth};

  return ze_region_params{ZeDstRegion, DstPitch, DstSlicePitch,
                          ZeSrcRegion, SrcPitch, SrcSlicePitch};
}

source/adapters/level_zero/helpers/memory_helpers.hpp

+16
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,19 @@ bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
2121
ze_context_handle_t hContext, void *ptr, size_t size);
2222

2323
ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr);
24+
25+
struct ze_region_params {
26+
const ze_copy_region_t dstRegion;
27+
size_t dstPitch;
28+
size_t dstSlicePitch;
29+
const ze_copy_region_t srcRegion;
30+
size_t srcPitch;
31+
size_t srcSlicePitch;
32+
};
33+
34+
// Convert UR region parameters for zeCommandListAppendMemoryCopyRegion
35+
ze_region_params ur2zeRegionParams(ur_rect_offset_t SrcOrigin,
36+
ur_rect_offset_t DstOrigin,
37+
ur_rect_region_t Region, size_t SrcRowPitch,
38+
size_t DstRowPitch, size_t SrcSlicePitch,
39+
size_t DstSlicePitch);

source/adapters/level_zero/memory.cpp

+5-32
Original file line numberDiff line numberDiff line change
@@ -154,40 +154,13 @@ ur_result_t enqueueMemCopyRectHelper(
154154
ur_cast<std::uintptr_t>(ZeEvent));
155155
printZeEventList(WaitList);
156156

157-
uint32_t SrcOriginX = ur_cast<uint32_t>(SrcOrigin.x);
158-
uint32_t SrcOriginY = ur_cast<uint32_t>(SrcOrigin.y);
159-
uint32_t SrcOriginZ = ur_cast<uint32_t>(SrcOrigin.z);
160-
161-
uint32_t SrcPitch = SrcRowPitch;
162-
if (SrcPitch == 0)
163-
SrcPitch = ur_cast<uint32_t>(Region.width);
164-
165-
if (SrcSlicePitch == 0)
166-
SrcSlicePitch = ur_cast<uint32_t>(Region.height) * SrcPitch;
167-
168-
uint32_t DstOriginX = ur_cast<uint32_t>(DstOrigin.x);
169-
uint32_t DstOriginY = ur_cast<uint32_t>(DstOrigin.y);
170-
uint32_t DstOriginZ = ur_cast<uint32_t>(DstOrigin.z);
171-
172-
uint32_t DstPitch = DstRowPitch;
173-
if (DstPitch == 0)
174-
DstPitch = ur_cast<uint32_t>(Region.width);
175-
176-
if (DstSlicePitch == 0)
177-
DstSlicePitch = ur_cast<uint32_t>(Region.height) * DstPitch;
178-
179-
uint32_t Width = ur_cast<uint32_t>(Region.width);
180-
uint32_t Height = ur_cast<uint32_t>(Region.height);
181-
uint32_t Depth = ur_cast<uint32_t>(Region.depth);
182-
183-
const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ,
184-
Width, Height, Depth};
185-
const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ,
186-
Width, Height, Depth};
157+
auto ZeParams = ur2zeRegionParams(SrcOrigin, DstOrigin, Region, SrcRowPitch,
158+
DstRowPitch, SrcSlicePitch, DstSlicePitch);
187159

188160
ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion,
189-
(ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch,
190-
SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, ZeEvent,
161+
(ZeCommandList, DstBuffer, &ZeParams.dstRegion, ZeParams.dstPitch,
162+
ZeParams.dstSlicePitch, SrcBuffer, &ZeParams.srcRegion,
163+
ZeParams.srcPitch, ZeParams.srcSlicePitch, ZeEvent,
191164
WaitList.Length, WaitList.ZeEventList));
192165

193166
logger::debug("calling zeCommandListAppendMemoryCopyRegion()");

source/adapters/level_zero/v2/api.cpp

-8
Original file line numberDiff line numberDiff line change
@@ -225,14 +225,6 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel,
225225
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
226226
}
227227

228-
ur_result_t urEventGetProfilingInfo(ur_event_handle_t hEvent,
229-
ur_profiling_info_t propName,
230-
size_t propSize, void *pPropValue,
231-
size_t *pPropSizeRet) {
232-
logger::error("{} function not implemented!", __FUNCTION__);
233-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
234-
}
235-
236228
ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent,
237229
ur_native_handle_t *phNativeEvent) {
238230
logger::error("{} function not implemented!", __FUNCTION__);

source/adapters/level_zero/v2/context.cpp

+3-4
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,12 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
4949
bool ownZeContext)
5050
: commandListCache(hContext),
5151
eventPoolCache(phDevices[0]->Platform->getNumDevices(),
52-
[context = this,
53-
platform = phDevices[0]->Platform](DeviceId deviceId) {
52+
[context = this, platform = phDevices[0]->Platform](
53+
DeviceId deviceId, v2::event_flags_t flags) {
5454
auto device = platform->getDeviceById(deviceId);
5555
// TODO: just use per-context id?
5656
return std::make_unique<v2::provider_normal>(
57-
context, device, v2::EVENT_COUNTER,
58-
v2::QUEUE_IMMEDIATE);
57+
context, device, v2::QUEUE_IMMEDIATE, flags);
5958
}),
6059
hContext(hContext, ownZeContext),
6160
hDevices(phDevices, phDevices + numDevices),

source/adapters/level_zero/v2/event.cpp

+167-5
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,20 @@
1414
#include "event_pool.hpp"
1515
#include "event_provider.hpp"
1616

17-
ur_event_handle_t_::ur_event_handle_t_(v2::event_allocation eventAllocation,
18-
v2::event_pool *pool)
19-
: type(eventAllocation.type), zeEvent(std::move(eventAllocation.borrow)),
20-
pool(pool) {}
17+
#include "../ur_interface_loader.hpp"
18+
19+
// Wraps a borrowed Level Zero event that belongs to `pool`.
//
// All timestamp fields start at zero. The timer resolution and timestamp mask
// are read once from the device of the pool's provider and cached as
// constants on the handle.
ur_event_handle_t_::ur_event_handle_t_(
    v2::raii::cache_borrowed_event eventAllocation, v2::event_pool *pool)
    : zeEvent(std::move(eventAllocation)), pool(pool),
      adjustedEventStartTimestamp(0), recordEventEndTimestamp(0),
      adjustedEventEndTimestamp(0),
      zeTimerResolution(
          pool->getProvider()->device()->ZeDeviceProperties->timerResolution),
      timestampMaxValue(pool->getProvider()->device()->getTimestampMask()) {}
2126

2227
void ur_event_handle_t_::reset() {
2328
// consider make an abstraction for regular/counter based
2429
// events if there's more of this type of conditions
25-
if (type == v2::event_type::EVENT_REGULAR) {
30+
if (pool->getFlags() & v2::EVENT_FLAGS_COUNTER) {
2631
zeEventHostReset(zeEvent.get());
2732
}
2833
}
@@ -40,11 +45,90 @@ ur_result_t ur_event_handle_t_::release() {
4045
if (!RefCount.decrementAndTest())
4146
return UR_RESULT_SUCCESS;
4247

48+
if (isTimestamped() && adjustedEventEndTimestamp == 0) {
49+
// L0 will write end timestamp to this event some time in the future,
50+
// so we can't release it yet.
51+
// TODO: delay releasing until the end timestamp is written.
52+
return UR_RESULT_SUCCESS;
53+
}
54+
4355
pool->free(this);
4456

4557
return UR_RESULT_SUCCESS;
4658
}
4759

60+
bool ur_event_handle_t_::isTimestamped() const {
61+
// If we are recording, the start time of the event will be non-zero.
62+
return adjustedEventStartTimestamp != 0;
63+
}
64+
65+
bool ur_event_handle_t_::isProfilingEnabled() const {
66+
return pool->getFlags() & v2::EVENT_FLAGS_PROFILING_ENABLED;
67+
}
68+
69+
// Returns the device of the event provider behind the owning pool.
ur_device_handle_t ur_event_handle_t_::getDevice() const {
  auto *eventProvider = pool->getProvider();
  return eventProvider->device();
}
72+
73+
uint64_t ur_event_handle_t_::getEventStartTimestmap() const {
74+
return adjustedEventStartTimestamp;
75+
}
76+
77+
// Converts a raw end timestamp into an adjusted one.
//
// The raw value is masked with `timestampMaxValue` (valid bits) and scaled by
// `timerResolution`. If the result is smaller than `adjustedStartTimestamp`,
// a single wrap-around of the hardware counter (which is narrower than 64
// bits) is assumed and `timestampMaxValue * timerResolution` is added.
// Multiple wrap-arounds are not detected, so the reported time is wrong in
// that case.
static uint64_t adjustEndEventTimestamp(uint64_t adjustedStartTimestamp,
                                        uint64_t endTimestamp,
                                        uint64_t timestampMaxValue,
                                        uint64_t timerResolution) {
  const uint64_t validBits = endTimestamp & timestampMaxValue;
  const uint64_t scaledEnd = validBits * timerResolution;

  const bool wrappedAround = scaledEnd < adjustedStartTimestamp;
  return wrappedAround ? scaledEnd + timestampMaxValue * timerResolution
                       : scaledEnd;
}
94+
95+
uint64_t ur_event_handle_t_::getEventEndTimestamp() {
96+
std::scoped_lock<ur_shared_mutex> lock(this->Mutex);
97+
98+
// If adjustedEventEndTimestamp on the event is non-zero it means it has
99+
// collected the result of the queue already. In that case it has been
100+
// adjusted and is ready for immediate return.
101+
if (adjustedEventEndTimestamp)
102+
return adjustedEventEndTimestamp;
103+
104+
// If the result is 0, we have not yet gotten results back and so we just
105+
// return it.
106+
if (recordEventEndTimestamp == 0)
107+
return recordEventEndTimestamp;
108+
109+
// Now that we have the result, there is no need to keep it in the queue
110+
// anymore, so we cache it on the event and evict the record from the
111+
// queue.
112+
adjustedEventEndTimestamp =
113+
adjustEndEventTimestamp(getEventStartTimestmap(), recordEventEndTimestamp,
114+
timestampMaxValue, zeTimerResolution);
115+
return adjustedEventEndTimestamp;
116+
}
117+
118+
void ur_event_handle_t_::recordStartTimestamp() {
119+
uint64_t deviceStartTimestamp = 0;
120+
UR_CALL_THROWS(ur::level_zero::urDeviceGetGlobalTimestamps(
121+
getDevice(), &deviceStartTimestamp, nullptr));
122+
123+
std::scoped_lock<ur_shared_mutex> lock(this->Mutex);
124+
125+
adjustedEventStartTimestamp = deviceStartTimestamp;
126+
}
127+
128+
uint64_t *ur_event_handle_t_::getEventEndTimestampPtr() {
129+
return &recordEventEndTimestamp;
130+
}
131+
48132
namespace ur::level_zero {
49133
// Forwards to the event's retain() and returns its result.
ur_result_t urEventRetain(ur_event_handle_t hEvent) {
  const ur_result_t result = hEvent->retain();
  return result;
}
50134

@@ -88,4 +172,82 @@ ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName,
88172

89173
return UR_RESULT_SUCCESS;
90174
}
175+
176+
// Returns profiling timestamps for an event.
//
// The event must either come from a pool with profiling enabled or have a
// recorded start timestamp; otherwise
// UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE is returned.
//
// Timestamped events answer from the values stored on the event handle:
// QUEUED/SUBMIT report the recorded start timestamp and START/END/COMPLETE
// report the (adjusted) end timestamp. All other events query the kernel
// timestamps of the underlying Level Zero event; QUEUED/SUBMIT report 0 on
// that path. An unknown property yields UR_RESULT_ERROR_INVALID_VALUE.
ur_result_t urEventGetProfilingInfo(
    ur_event_handle_t hEvent, ///< [in] handle of the event object
    ur_profiling_info_t
        propName, ///< [in] the name of the profiling property to query
    size_t
        propValueSize, ///< [in] size in bytes of the profiling property value
    void *pPropValue,  ///< [out][optional] value of the profiling property
    size_t *pPropValueSizeRet ///< [out][optional] pointer to the actual size in
                              ///< bytes returned in propValue
) {
  // The event must either have profiling enabled or be recording timestamps.
  bool isTimestampedEvent = hEvent->isTimestamped();
  if (!hEvent->isProfilingEnabled() && !isTimestampedEvent) {
    return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
  }

  UrReturnHelper returnValue(propValueSize, pPropValue, pPropValueSizeRet);

  // For timestamped events we have the timestamps ready directly on the event
  // handle, so we short-circuit the return.
  if (isTimestampedEvent) {
    uint64_t contextStartTime = hEvent->getEventStartTimestmap();
    switch (propName) {
    case UR_PROFILING_INFO_COMMAND_QUEUED:
    case UR_PROFILING_INFO_COMMAND_SUBMIT:
      return returnValue(contextStartTime);
    case UR_PROFILING_INFO_COMMAND_END:
    case UR_PROFILING_INFO_COMMAND_START:
    case UR_PROFILING_INFO_COMMAND_COMPLETE: {
      return returnValue(hEvent->getEventEndTimestamp());
    }
    default:
      logger::error("urEventGetProfilingInfo: not supported ParamName");
      return UR_RESULT_ERROR_INVALID_VALUE;
    }
  }

  ze_kernel_timestamp_result_t tsResult;

  auto zeTimerResolution =
      hEvent->getDevice()->ZeDeviceProperties->timerResolution;
  auto timestampMaxValue = hEvent->getDevice()->getTimestampMask();

  switch (propName) {
  case UR_PROFILING_INFO_COMMAND_START: {
    ZE2UR_CALL(zeEventQueryKernelTimestamp, (hEvent->getZeEvent(), &tsResult));
    uint64_t contextStartTime =
        (tsResult.global.kernelStart & timestampMaxValue) * zeTimerResolution;
    return returnValue(contextStartTime);
  }
  case UR_PROFILING_INFO_COMMAND_END:
  case UR_PROFILING_INFO_COMMAND_COMPLETE: {
    ZE2UR_CALL(zeEventQueryKernelTimestamp, (hEvent->getZeEvent(), &tsResult));

    // adjustEndEventTimestamp compares the end time *after* scaling it by the
    // timer resolution, so the start time has to be scaled the same way.
    // Passing raw ticks would hide a wrap-around whenever the resolution is
    // greater than one.
    uint64_t contextStartTime =
        (tsResult.global.kernelStart & timestampMaxValue) * zeTimerResolution;

    auto adjustedEndTime =
        adjustEndEventTimestamp(contextStartTime, tsResult.global.kernelEnd,
                                timestampMaxValue, zeTimerResolution);
    return returnValue(adjustedEndTime);
  }
  case UR_PROFILING_INFO_COMMAND_QUEUED:
  case UR_PROFILING_INFO_COMMAND_SUBMIT:
    // Note: No users for this case
    // The "command_submit" time is implemented by recording submission
    // timestamp with a call to urDeviceGetGlobalTimestamps before command
    // enqueue.
    //
    return returnValue(uint64_t{0});
  default:
    logger::error("urEventGetProfilingInfo: not supported ParamName");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }
}
91253
} // namespace ur::level_zero

source/adapters/level_zero/v2/event.hpp

+24-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class event_pool;
2424

2525
struct ur_event_handle_t_ : _ur_object {
2626
public:
27-
ur_event_handle_t_(v2::event_allocation eventAllocation,
27+
ur_event_handle_t_(v2::raii::cache_borrowed_event eventAllocation,
2828
v2::event_pool *pool);
2929

3030
void reset();
@@ -33,8 +33,30 @@ struct ur_event_handle_t_ : _ur_object {
3333
ur_result_t retain();
3434
ur_result_t release();
3535

36+
// Tells if this event was created as a timestamp event, allowing profiling
37+
// info even if profiling is not enabled.
38+
bool isTimestamped() const;
39+
40+
// Tells if this event comes from a pool that has profiling enabled.
41+
bool isProfilingEnabled() const;
42+
43+
// Device associated with this event
44+
ur_device_handle_t getDevice() const;
45+
46+
void recordStartTimestamp();
47+
uint64_t *getEventEndTimestampPtr();
48+
49+
uint64_t getEventStartTimestmap() const;
50+
uint64_t getEventEndTimestamp();
51+
3652
private:
37-
v2::event_type type;
3853
v2::raii::cache_borrowed_event zeEvent;
3954
v2::event_pool *pool;
55+
56+
uint64_t adjustedEventStartTimestamp;
57+
uint64_t recordEventEndTimestamp;
58+
uint64_t adjustedEventEndTimestamp;
59+
60+
const uint64_t zeTimerResolution;
61+
const uint64_t timestampMaxValue;
4062
};

source/adapters/level_zero/v2/event_pool.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ void event_pool::free(ur_event_handle_t_ *event) {
4848
event->RefCount.increment();
4949
}
5050

51-
event_provider *event_pool::getProvider() { return provider.get(); }
51+
// Returns a non-owning pointer to the event provider held by this pool.
event_provider *event_pool::getProvider() const {
  event_provider *rawProvider = provider.get();
  return rawProvider;
}
52+
53+
// Returns the event flags reported by this pool's provider.
event_flags_t event_pool::getFlags() const {
  const event_flags_t providerFlags = provider->eventFlags();
  return providerFlags;
}
5256

5357
} // namespace v2

0 commit comments

Comments
 (0)