Skip to content

Commit 17cb38d

Browse files
author
iclsrc
committed
Merge from 'sycl' to 'sycl-web' (#8)
2 parents 808ff62 + efac3c2 commit 17cb38d

File tree

9 files changed

+468
-252
lines changed

9 files changed

+468
-252
lines changed

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 195 additions & 187 deletions
Large diffs are not rendered by default.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// RUN: %clang_cc1 -fsycl -fsycl-is-device -fsyntax-only -verify %s
2+
// expected-no-diagnostics
3+
4+
// The kernel_single_task call is emitted as an OpenCL kernel function. The call
5+
// to getFullyQualifiedType caused a 2nd instantiation of zip_iterator<b,b> (the
6+
// first instantiation is on line 28 during phase 1).
7+
// Then, the call to 'foo' in 'main' causes 'foo' to be instantiated. The best
8+
// match for zip_iterator<j,d> is the one instantiated by the
9+
// kernel_single_task, which is now different from the one made in phase 1.
10+
// 'is_same<zip_iterator<b,b>' is instantiated during phase 1 at line 28, and
11+
// 'zip_iterator<b,b>' is instantiated by getFullyQualifiedName.
12+
// So 'is_same<zip_iterator<b,b>, zip_iterator<j,d>>::value' return false
13+
// even though zip_iterator<b,b> and zip_iterator<j,d> have the same type
14+
// 'zip_iterator<b,b>'.
15+
16+
struct b {};
17+
18+
template <typename T, typename U>
19+
struct is_same { static const bool value = false; };
20+
template <typename T>
21+
struct is_same<T, T> { static const bool value = true; };
22+
23+
template <typename... Ts>
24+
struct zip_iterator {};
25+
26+
template <class j, class d>
27+
void foo(j e, d k) {
28+
static_assert(is_same<zip_iterator<b, b>, zip_iterator<j, d>>::value, "device_iterator");
29+
}
30+
31+
template <typename name, typename Func>
32+
__attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) {
33+
kernelFunc();
34+
}
35+
36+
int main() {
37+
kernel_single_task<zip_iterator<b, b>>([] {});
38+
foo(b{}, b{});
39+
return 0;
40+
}

sycl/doc/extensions/QueueShortcuts/QueueShortcuts.adoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,6 @@ Note: These simplifications do not depend on queue order properties. They apply
1919
include::queue.hpp[]
2020
----
2121

22+
Overloads 4-12 shall support generic lambda as the kernel argument. For overloads 4-9 the generic argument is the `item` with the same dimensions that `range` argument has. For overloads 10-12 the generic argument is the `nd_item` with the same dimensions that `nd_range` argument has.
23+
24+
Overloads 4-6 shall support number or `braced-init-list` as the `range` argument.
Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,50 @@
11
class queue {
22
public:
3-
...
3+
// ...
44
template <typename KernelName, typename KernelType>
5-
event single_task(KernelType KernelFunc);
5+
event single_task(KernelType KernelFunc); // (1)
66

77
template <typename KernelName, typename KernelType>
8-
event single_task(event DepEvent, KernelType KernelFunc);
8+
event single_task(event DepEvent, KernelType KernelFunc); // (2)
99

1010
template <typename KernelName, typename KernelType>
1111
event single_task(const vector_class<event> &DepEvents,
12-
KernelType KernelFunc);
12+
KernelType KernelFunc); // (3)
1313

1414
template <typename KernelName, typename KernelType, int Dims>
15-
event parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc);
15+
event parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc); // (4)
1616

1717
template <typename KernelName, typename KernelType, int Dims>
1818
event parallel_for(range<Dims> NumWorkItems, event DepEvent,
19-
KernelType KernelFunc);
19+
KernelType KernelFunc); // (5)
2020

2121
template <typename KernelName, typename KernelType, int Dims>
2222
event parallel_for(range<Dims> NumWorkItems,
2323
const vector_class<event> &DepEvents,
24-
KernelType KernelFunc);
24+
KernelType KernelFunc); // (6)
2525

2626
template <typename KernelName, typename KernelType, int Dims>
2727
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
28-
KernelType KernelFunc);
28+
KernelType KernelFunc); // (7)
2929

3030
template <typename KernelName, typename KernelType, int Dims>
3131
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
32-
event DepEvent, KernelType KernelFunc);
32+
event DepEvent, KernelType KernelFunc); // (8)
3333

3434
template <typename KernelName, typename KernelType, int Dims>
3535
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
3636
const vector_class<event> &DepEvents,
37-
KernelType KernelFunc);
37+
KernelType KernelFunc); // (9)
3838

3939
template <typename KernelName, typename KernelType, int Dims>
40-
event parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc);
40+
event parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc); // (10)
4141

4242
template <typename KernelName, typename KernelType, int Dims>
4343
event parallel_for(nd_range<Dims> ExecutionRange, event DepEvent,
44-
KernelType KernelFunc);
44+
KernelType KernelFunc); // (11)
4545

4646
template <typename KernelName, typename KernelType, int Dims>
4747
event parallel_for(nd_range<Dims> ExecutionRange,
4848
const vector_class<event> &DepEvents,
49-
KernelType KernelFunc);
49+
KernelType KernelFunc); // (12)
5050
};

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 54 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ pi_uint64 _pi_event::get_end_time() const {
363363

364364
pi_result _pi_event::record() {
365365

366-
if (is_recorded()) {
366+
if (is_recorded() || !is_started()) {
367367
return PI_INVALID_EVENT;
368368
}
369369

@@ -2074,7 +2074,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
20742074
size_t size, void *ptr,
20752075
pi_uint32 num_events_in_wait_list,
20762076
const pi_event *event_wait_list,
2077-
pi_event *retEvent) {
2077+
pi_event *event) {
20782078

20792079
assert(buffer != nullptr);
20802080
assert(command_queue != nullptr);
@@ -2089,7 +2089,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
20892089
retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
20902090
event_wait_list, nullptr);
20912091

2092-
if (retEvent) {
2092+
if (event) {
20932093
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
20942094
PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue));
20952095
retImplEv->start();
@@ -2098,16 +2098,16 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
20982098
retErr =
20992099
PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream));
21002100

2101-
if (retEvent) {
2101+
if (event) {
21022102
retErr = retImplEv->record();
21032103
}
21042104

21052105
if (blocking_read) {
21062106
retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
21072107
}
21082108

2109-
if (retEvent) {
2110-
*retEvent = retImplEv.release();
2109+
if (event) {
2110+
*event = retImplEv.release();
21112111
}
21122112

21132113
} catch (pi_result err) {
@@ -3381,7 +3381,7 @@ pi_result cuda_piEnqueueMemBufferReadRect(
33813381
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
33823382
size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
33833383
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
3384-
pi_event *retEvent) {
3384+
pi_event *event) {
33853385

33863386
assert(buffer != nullptr);
33873387
assert(command_queue != nullptr);
@@ -3397,9 +3397,9 @@ pi_result cuda_piEnqueueMemBufferReadRect(
33973397
retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
33983398
event_wait_list, nullptr);
33993399

3400-
if (retEvent) {
3400+
if (event) {
34013401
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
3402-
PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue));
3402+
PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue));
34033403
retImplEv->start();
34043404
}
34053405

@@ -3408,16 +3408,16 @@ pi_result cuda_piEnqueueMemBufferReadRect(
34083408
buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
34093409
host_offset, host_row_pitch, host_slice_pitch);
34103410

3411-
if (retEvent) {
3411+
if (event) {
34123412
retErr = retImplEv->record();
34133413
}
34143414

34153415
if (blocking_read) {
34163416
retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
34173417
}
34183418

3419-
if (retEvent) {
3420-
*retEvent = retImplEv.release();
3419+
if (event) {
3420+
*event = retImplEv.release();
34213421
}
34223422

34233423
} catch (pi_result err) {
@@ -3432,7 +3432,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
34323432
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
34333433
size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
34343434
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
3435-
pi_event *retEvent) {
3435+
pi_event *event) {
34363436

34373437
assert(buffer != nullptr);
34383438
assert(command_queue != nullptr);
@@ -3448,9 +3448,9 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
34483448
retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
34493449
event_wait_list, nullptr);
34503450

3451-
if (retEvent) {
3451+
if (event) {
34523452
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
3453-
PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue));
3453+
PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue));
34543454
retImplEv->start();
34553455
}
34563456

@@ -3459,16 +3459,16 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
34593459
host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
34603460
buffer_row_pitch, buffer_slice_pitch);
34613461

3462-
if (retEvent) {
3462+
if (event) {
34633463
retErr = retImplEv->record();
34643464
}
34653465

34663466
if (blocking_write) {
34673467
retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
34683468
}
34693469

3470-
if (retEvent) {
3471-
*retEvent = retImplEv.release();
3470+
if (event) {
3471+
*event = retImplEv.release();
34723472
}
34733473

34743474
} catch (pi_result err) {
@@ -3487,6 +3487,8 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
34873487
return PI_INVALID_QUEUE;
34883488
}
34893489

3490+
std::unique_ptr<_pi_event> retImplEv{nullptr};
3491+
34903492
try {
34913493
ScopedContext active(command_queue->get_context());
34923494

@@ -3497,17 +3499,21 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
34973499

34983500
pi_result result;
34993501

3502+
if (event) {
3503+
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
3504+
PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue));
3505+
result = retImplEv->start();
3506+
}
3507+
35003508
auto stream = command_queue->get();
35013509
auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
35023510
auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
35033511

35043512
result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
35053513

35063514
if (event) {
3507-
auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY,
3508-
command_queue);
3509-
new_event->record();
3510-
*event = new_event;
3515+
result = retImplEv->record();
3516+
*event = retImplEv.release();
35113517
}
35123518

35133519
return result;
@@ -3543,7 +3549,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect(
35433549

35443550
if (event) {
35453551
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
3546-
PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue));
3552+
PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue));
35473553
retImplEv->start();
35483554
}
35493555

@@ -3586,6 +3592,8 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
35863592
(void)pattern_is_valid;
35873593
(void)pattern_size_is_valid;
35883594

3595+
std::unique_ptr<_pi_event> retImplEv{nullptr};
3596+
35893597
try {
35903598
ScopedContext active(command_queue->get_context());
35913599

@@ -3596,6 +3604,12 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
35963604

35973605
pi_result result;
35983606

3607+
if (event) {
3608+
retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
3609+
PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue));
3610+
result = retImplEv->start();
3611+
}
3612+
35993613
auto dstDevice = buffer->mem_.buffer_mem_.get() + offset;
36003614
auto stream = command_queue->get();
36013615
auto N = size / pattern_size;
@@ -3646,10 +3660,8 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
36463660
}
36473661

36483662
if (event) {
3649-
auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL,
3650-
command_queue);
3651-
new_event->record();
3652-
*event = new_event;
3663+
result = retImplEv->record();
3664+
*event = retImplEv.release();
36533665
}
36543666

36553667
return result;
@@ -3971,7 +3983,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
39713983
size_t size,
39723984
pi_uint32 num_events_in_wait_list,
39733985
const pi_event *event_wait_list,
3974-
pi_event *retEvent, void **ret_map) {
3986+
pi_event *event, void **ret_map) {
39753987

39763988
assert(ret_map != nullptr);
39773989
assert(command_queue != nullptr);
@@ -3993,15 +4005,16 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
39934005
if ((map_flags & CL_MAP_READ) || (map_flags & CL_MAP_WRITE)) {
39944006
ret_err = cuda_piEnqueueMemBufferRead(
39954007
command_queue, buffer, blocking_map, offset, size, hostPtr,
3996-
num_events_in_wait_list, event_wait_list, retEvent);
4008+
num_events_in_wait_list, event_wait_list, event);
39974009
} else {
3998-
if (retEvent) {
4010+
if (event) {
39994011
try {
40004012
ScopedContext active(command_queue->get_context());
40014013

4002-
*retEvent = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP,
4003-
command_queue);
4004-
(*retEvent)->record();
4014+
*event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP,
4015+
command_queue);
4016+
(*event)->start();
4017+
(*event)->record();
40054018
} catch (pi_result error) {
40064019
ret_err = error;
40074020
}
@@ -4018,7 +4031,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
40184031
void *mapped_ptr,
40194032
pi_uint32 num_events_in_wait_list,
40204033
const pi_event *event_wait_list,
4021-
pi_event *retEvent) {
4034+
pi_event *event) {
40224035
pi_result ret_err = PI_SUCCESS;
40234036

40244037
assert(command_queue != nullptr);
@@ -4034,15 +4047,16 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
40344047
command_queue, memobj, true,
40354048
memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
40364049
memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4037-
num_events_in_wait_list, event_wait_list, retEvent);
4050+
num_events_in_wait_list, event_wait_list, event);
40384051
} else {
4039-
if (retEvent) {
4052+
if (event) {
40404053
try {
40414054
ScopedContext active(command_queue->get_context());
40424055

4043-
*retEvent = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP,
4044-
command_queue);
4045-
(*retEvent)->record();
4056+
*event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP,
4057+
command_queue);
4058+
(*event)->start();
4059+
(*event)->record();
40464060
} catch (pi_result error) {
40474061
ret_err = error;
40484062
}
@@ -4155,7 +4169,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
41554169
events_waitlist, nullptr);
41564170
if (event) {
41574171
event_ptr = std::unique_ptr<_pi_event>(
4158-
_pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue));
4172+
_pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue));
41594173
event_ptr->start();
41604174
}
41614175
result = PI_CHECK_ERROR(cuMemsetD8Async(

sycl/source/detail/scheduler/commands.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ class Command {
219219
bool MIsBlockable = false;
220220
/// Counts the number of memory objects this command is a leaf for.
221221
unsigned MLeafCounter = 0;
222+
/// Used for marking the node as visited during graph traversal.
223+
bool MVisited = false;
222224

223225
enum class BlockReason : int { HostAccessor = 0, HostTask };
224226

0 commit comments

Comments
 (0)