@@ -363,7 +363,7 @@ pi_uint64 _pi_event::get_end_time() const {
363
363
364
364
pi_result _pi_event::record () {
365
365
366
- if (is_recorded ()) {
366
+ if (is_recorded () || ! is_started () ) {
367
367
return PI_INVALID_EVENT;
368
368
}
369
369
@@ -2074,7 +2074,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2074
2074
size_t size, void *ptr,
2075
2075
pi_uint32 num_events_in_wait_list,
2076
2076
const pi_event *event_wait_list,
2077
- pi_event *retEvent ) {
2077
+ pi_event *event ) {
2078
2078
2079
2079
assert (buffer != nullptr );
2080
2080
assert (command_queue != nullptr );
@@ -2089,7 +2089,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2089
2089
retErr = cuda_piEnqueueEventsWait (command_queue, num_events_in_wait_list,
2090
2090
event_wait_list, nullptr );
2091
2091
2092
- if (retEvent ) {
2092
+ if (event ) {
2093
2093
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
2094
2094
PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue));
2095
2095
retImplEv->start ();
@@ -2098,16 +2098,16 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2098
2098
retErr =
2099
2099
PI_CHECK_ERROR (cuMemcpyDtoHAsync (ptr, devPtr + offset, size, cuStream));
2100
2100
2101
- if (retEvent ) {
2101
+ if (event ) {
2102
2102
retErr = retImplEv->record ();
2103
2103
}
2104
2104
2105
2105
if (blocking_read) {
2106
2106
retErr = PI_CHECK_ERROR (cuStreamSynchronize (cuStream));
2107
2107
}
2108
2108
2109
- if (retEvent ) {
2110
- *retEvent = retImplEv.release ();
2109
+ if (event ) {
2110
+ *event = retImplEv.release ();
2111
2111
}
2112
2112
2113
2113
} catch (pi_result err) {
@@ -3381,7 +3381,7 @@ pi_result cuda_piEnqueueMemBufferReadRect(
3381
3381
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
3382
3382
size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
3383
3383
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
3384
- pi_event *retEvent ) {
3384
+ pi_event *event ) {
3385
3385
3386
3386
assert (buffer != nullptr );
3387
3387
assert (command_queue != nullptr );
@@ -3397,9 +3397,9 @@ pi_result cuda_piEnqueueMemBufferReadRect(
3397
3397
retErr = cuda_piEnqueueEventsWait (command_queue, num_events_in_wait_list,
3398
3398
event_wait_list, nullptr );
3399
3399
3400
- if (retEvent ) {
3400
+ if (event ) {
3401
3401
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
3402
- PI_COMMAND_TYPE_MEM_BUFFER_READ , command_queue));
3402
+ PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT , command_queue));
3403
3403
retImplEv->start ();
3404
3404
}
3405
3405
@@ -3408,16 +3408,16 @@ pi_result cuda_piEnqueueMemBufferReadRect(
3408
3408
buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
3409
3409
host_offset, host_row_pitch, host_slice_pitch);
3410
3410
3411
- if (retEvent ) {
3411
+ if (event ) {
3412
3412
retErr = retImplEv->record ();
3413
3413
}
3414
3414
3415
3415
if (blocking_read) {
3416
3416
retErr = PI_CHECK_ERROR (cuStreamSynchronize (cuStream));
3417
3417
}
3418
3418
3419
- if (retEvent ) {
3420
- *retEvent = retImplEv.release ();
3419
+ if (event ) {
3420
+ *event = retImplEv.release ();
3421
3421
}
3422
3422
3423
3423
} catch (pi_result err) {
@@ -3432,7 +3432,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
3432
3432
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
3433
3433
size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
3434
3434
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
3435
- pi_event *retEvent ) {
3435
+ pi_event *event ) {
3436
3436
3437
3437
assert (buffer != nullptr );
3438
3438
assert (command_queue != nullptr );
@@ -3448,9 +3448,9 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
3448
3448
retErr = cuda_piEnqueueEventsWait (command_queue, num_events_in_wait_list,
3449
3449
event_wait_list, nullptr );
3450
3450
3451
- if (retEvent ) {
3451
+ if (event ) {
3452
3452
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
3453
- PI_COMMAND_TYPE_MEM_BUFFER_WRITE , command_queue));
3453
+ PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT , command_queue));
3454
3454
retImplEv->start ();
3455
3455
}
3456
3456
@@ -3459,16 +3459,16 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
3459
3459
host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
3460
3460
buffer_row_pitch, buffer_slice_pitch);
3461
3461
3462
- if (retEvent ) {
3462
+ if (event ) {
3463
3463
retErr = retImplEv->record ();
3464
3464
}
3465
3465
3466
3466
if (blocking_write) {
3467
3467
retErr = PI_CHECK_ERROR (cuStreamSynchronize (cuStream));
3468
3468
}
3469
3469
3470
- if (retEvent ) {
3471
- *retEvent = retImplEv.release ();
3470
+ if (event ) {
3471
+ *event = retImplEv.release ();
3472
3472
}
3473
3473
3474
3474
} catch (pi_result err) {
@@ -3487,6 +3487,8 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
3487
3487
return PI_INVALID_QUEUE;
3488
3488
}
3489
3489
3490
+ std::unique_ptr<_pi_event> retImplEv{nullptr };
3491
+
3490
3492
try {
3491
3493
ScopedContext active (command_queue->get_context ());
3492
3494
@@ -3497,17 +3499,21 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
3497
3499
3498
3500
pi_result result;
3499
3501
3502
+ if (event) {
3503
+ retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
3504
+ PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue));
3505
+ result = retImplEv->start ();
3506
+ }
3507
+
3500
3508
auto stream = command_queue->get ();
3501
3509
auto src = src_buffer->mem_ .buffer_mem_ .get () + src_offset;
3502
3510
auto dst = dst_buffer->mem_ .buffer_mem_ .get () + dst_offset;
3503
3511
3504
3512
result = PI_CHECK_ERROR (cuMemcpyDtoDAsync (dst, src, size, stream));
3505
3513
3506
3514
if (event) {
3507
- auto new_event = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_COPY,
3508
- command_queue);
3509
- new_event->record ();
3510
- *event = new_event;
3515
+ result = retImplEv->record ();
3516
+ *event = retImplEv.release ();
3511
3517
}
3512
3518
3513
3519
return result;
@@ -3543,7 +3549,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect(
3543
3549
3544
3550
if (event) {
3545
3551
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
3546
- PI_COMMAND_TYPE_MEM_BUFFER_COPY , command_queue));
3552
+ PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT , command_queue));
3547
3553
retImplEv->start ();
3548
3554
}
3549
3555
@@ -3586,6 +3592,8 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
3586
3592
(void )pattern_is_valid;
3587
3593
(void )pattern_size_is_valid;
3588
3594
3595
+ std::unique_ptr<_pi_event> retImplEv{nullptr };
3596
+
3589
3597
try {
3590
3598
ScopedContext active (command_queue->get_context ());
3591
3599
@@ -3596,6 +3604,12 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
3596
3604
3597
3605
pi_result result;
3598
3606
3607
+ if (event) {
3608
+ retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native (
3609
+ PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue));
3610
+ result = retImplEv->start ();
3611
+ }
3612
+
3599
3613
auto dstDevice = buffer->mem_ .buffer_mem_ .get () + offset;
3600
3614
auto stream = command_queue->get ();
3601
3615
auto N = size / pattern_size;
@@ -3646,10 +3660,8 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
3646
3660
}
3647
3661
3648
3662
if (event) {
3649
- auto new_event = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_FILL,
3650
- command_queue);
3651
- new_event->record ();
3652
- *event = new_event;
3663
+ result = retImplEv->record ();
3664
+ *event = retImplEv.release ();
3653
3665
}
3654
3666
3655
3667
return result;
@@ -3971,7 +3983,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
3971
3983
size_t size,
3972
3984
pi_uint32 num_events_in_wait_list,
3973
3985
const pi_event *event_wait_list,
3974
- pi_event *retEvent , void **ret_map) {
3986
+ pi_event *event , void **ret_map) {
3975
3987
3976
3988
assert (ret_map != nullptr );
3977
3989
assert (command_queue != nullptr );
@@ -3993,15 +4005,16 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
3993
4005
if ((map_flags & CL_MAP_READ) || (map_flags & CL_MAP_WRITE)) {
3994
4006
ret_err = cuda_piEnqueueMemBufferRead (
3995
4007
command_queue, buffer, blocking_map, offset, size, hostPtr,
3996
- num_events_in_wait_list, event_wait_list, retEvent );
4008
+ num_events_in_wait_list, event_wait_list, event );
3997
4009
} else {
3998
- if (retEvent ) {
4010
+ if (event ) {
3999
4011
try {
4000
4012
ScopedContext active (command_queue->get_context ());
4001
4013
4002
- *retEvent = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_MAP,
4003
- command_queue);
4004
- (*retEvent)->record ();
4014
+ *event = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_MAP,
4015
+ command_queue);
4016
+ (*event)->start ();
4017
+ (*event)->record ();
4005
4018
} catch (pi_result error) {
4006
4019
ret_err = error;
4007
4020
}
@@ -4018,7 +4031,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4018
4031
void *mapped_ptr,
4019
4032
pi_uint32 num_events_in_wait_list,
4020
4033
const pi_event *event_wait_list,
4021
- pi_event *retEvent ) {
4034
+ pi_event *event ) {
4022
4035
pi_result ret_err = PI_SUCCESS;
4023
4036
4024
4037
assert (command_queue != nullptr );
@@ -4034,15 +4047,16 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4034
4047
command_queue, memobj, true ,
4035
4048
memobj->mem_ .buffer_mem_ .get_map_offset (mapped_ptr),
4036
4049
memobj->mem_ .buffer_mem_ .get_size (), mapped_ptr,
4037
- num_events_in_wait_list, event_wait_list, retEvent );
4050
+ num_events_in_wait_list, event_wait_list, event );
4038
4051
} else {
4039
- if (retEvent ) {
4052
+ if (event ) {
4040
4053
try {
4041
4054
ScopedContext active (command_queue->get_context ());
4042
4055
4043
- *retEvent = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_UNMAP,
4044
- command_queue);
4045
- (*retEvent)->record ();
4056
+ *event = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_UNMAP,
4057
+ command_queue);
4058
+ (*event)->start ();
4059
+ (*event)->record ();
4046
4060
} catch (pi_result error) {
4047
4061
ret_err = error;
4048
4062
}
@@ -4155,7 +4169,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
4155
4169
events_waitlist, nullptr );
4156
4170
if (event) {
4157
4171
event_ptr = std::unique_ptr<_pi_event>(
4158
- _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_COPY , queue));
4172
+ _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_FILL , queue));
4159
4173
event_ptr->start ();
4160
4174
}
4161
4175
result = PI_CHECK_ERROR (cuMemsetD8Async (
0 commit comments