Skip to content

Commit 319cf0e

Browse files
authored
[SYCL] Add Experimental Range Rounding (#12690)
This commit adds new clang command line option -fsycl-exp-range-rounding. Experimental range rounding maps all 1, 2 or 3 dim range kernels to range rounded kernels which can be rounded in all dims. `SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=x:y:z` can be used for exp range rounding, where the `y` param will be a factor of the rounded range in all dims
1 parent d6340b6 commit 319cf0e

File tree

10 files changed

+232
-0
lines changed

10 files changed

+232
-0
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ LANGOPT(
303303
ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2,
304304
SYCLRangeRoundingPreference::On,
305305
"Preference for SYCL parallel_for range rounding")
306+
LANGOPT(SYCLExperimentalRangeRounding, 1, 0, "Use experimental parallel for range rounding")
306307
LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
307308
"SYCL integration header")
308309
LANGOPT(SYCLAllowVirtualFunctions, 1, 0,

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4012,6 +4012,10 @@ def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">,
40124012
Alias<fsycl_range_rounding_EQ>, AliasArgs<["disable"]>,
40134013
HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">,
40144014
Flags<[Deprecated]>;
4015+
def fsycl_exp_range_rounding : Flag<["-"], "fsycl-exp-range-rounding">,
4016+
Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>,
4017+
HelpText<"Use experimental range rounding.">,
4018+
MarshallingInfoFlag<LangOpts<"SYCLExperimentalRangeRounding">>;
40154019
def fno_sycl_use_footer : Flag<["-"], "fno-sycl-use-footer">, Visibility<[ClangOption, CLOption, DXCOption]>,
40164020
HelpText<"Disable usage of the integration footer during SYCL enabled "
40174021
"compilations.">;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5430,6 +5430,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
54305430
if (Arg *A = Args.getLastArg(options::OPT_fsycl_range_rounding_EQ))
54315431
A->render(Args, CmdArgs);
54325432

5433+
if (Arg *A = Args.getLastArg(options::OPT_fsycl_exp_range_rounding))
5434+
A->render(Args, CmdArgs);
5435+
54335436
// Add the Unique ID prefix
54345437
StringRef UniqueID = D.getSYCLUniqueID(Input.getBaseInput());
54355438
if (!UniqueID.empty())

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,12 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
589589
default:
590590
break;
591591
}
592+
593+
// Set __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ macro for
594+
// both host and device compilations if -fsycl-exp-range-rounding
595+
// flag is used.
596+
if (LangOpts.SYCLExperimentalRangeRounding)
597+
Builder.defineMacro("__SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__");
592598
}
593599

594600
if (LangOpts.DeclareSPIRVBuiltins) {

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5187,6 +5187,12 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
51875187
break;
51885188
}
51895189

5190+
if (S.getLangOpts().SYCLExperimentalRangeRounding) {
5191+
O << "#ifndef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ \n";
5192+
O << "#define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1\n";
5193+
O << "#endif //__SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__\n\n";
5194+
}
5195+
51905196
if (SpecConsts.size() > 0) {
51915197
O << "// Forward declarations of templated spec constant types:\n";
51925198
for (const auto &SC : SpecConsts)

clang/test/Preprocessor/predefined-macros.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,34 @@
339339
// CHECK-FORCE-RANGE: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1
340340
// CHECK-FORCE-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1
341341

342+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \
343+
// RUN: -triple spir64-unknown-unknown -fsycl-exp-range-rounding -o - \
344+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE
345+
346+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \
347+
// RUN: -triple spir64_fpga-unknown-unknown -o - \
348+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE
349+
350+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-exp-range-rounding \
351+
// RUN: -triple spir64_fpga-unknown-unknown -o - \
352+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE
353+
354+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \
355+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE
356+
357+
// RUN: %clang_cc1 %s -E -dM -o - \
358+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE
359+
360+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-host \
361+
// RUN: -triple x86_64-unknown-linux-gnu -fsycl-exp-range-rounding -o - \
362+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE
363+
364+
// RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \
365+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE
366+
367+
// CHECK-EXP-RANGE: #define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1
368+
// CHECK-EXP-NO-RANGE-NOT: #define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1
369+
342370
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
343371
// RUN: -fgpu-default-stream=per-thread \
344372
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH

sycl/doc/design/ParallelForRangeRounding.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,35 @@ the range rounded kernel, `-fsycl-range-rounding=force`. The user can also tell
4747
the SYCL implementation to only produce the unrounded kernel using the flag
4848
`-fsycl-range-rounding=disable`. By default both kernels will be generated,
4949
which is equivalent to `-fsycl-range-rounding=on`.
50+
51+
## Experimental Range Rounding
52+
53+
Experimental range rounding will perform rounding in all dimensions.
54+
Experimental range rounding can be set using the `-fsycl-exp-range-rounding`
55+
flag.
56+
57+
Some oddly shaped ranges and how they might round:
58+
59+
```
60+
{43} -> {64}
61+
{43, 79} -> {64, 96}
62+
{43, 79, 7} -> {64, 96, 8}
63+
```
64+
65+
The user can specify the factor that they want the rounded range to be a
66+
multiple of in all dimensions using the
67+
`SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` environment variable. When
68+
experimental range rounding is used, only the middle value in
69+
`SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` is used.
70+
If `SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` is set to `1:256:1`, the rounded
71+
range will divide `256` in all dimensions
72+
73+
```
74+
{43} -> {256}
75+
{43, 257} -> {256, 512}
76+
{43, 257, 7} -> {256, 512, 256}
77+
```
78+
79+
`-fsycl-range-rounding=disable` will override `-fsycl-exp-range-rounding`. If
80+
both are used in conjunction then no range rounding will happen.
81+

sycl/include/sycl/handler.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,29 @@ class __SYCL_EXPORT handler {
12411241
DidAdjust = true;
12421242
};
12431243

1244+
#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
1245+
size_t GoodExpFactor = 1;
1246+
switch (Dims) {
1247+
case 1:
1248+
GoodExpFactor = 32; // Make global range multiple of {32}
1249+
break;
1250+
case 2:
1251+
GoodExpFactor = 16; // Make global range multiple of {16, 16}
1252+
break;
1253+
case 3:
1254+
GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
1255+
break;
1256+
}
1257+
1258+
// Check if rounding parameters have been set through environment:
1259+
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1260+
this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
1261+
1262+
for (auto i = 0; i < Dims; ++i)
1263+
if (UserRange[i] % GoodExpFactor) {
1264+
Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
1265+
}
1266+
#else
12441267
// Perform range rounding if there are sufficient work-items to
12451268
// need rounding and the user-specified range is not a multiple of
12461269
// a "good" value.
@@ -1251,6 +1274,7 @@ class __SYCL_EXPORT handler {
12511274
// will yield a rounded-up value for the total range.
12521275
Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
12531276
}
1277+
#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
12541278
#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
12551279
// If we are forcing range rounding kernels to be used, we always want the
12561280
// rounded range kernel to be generated, even if rounding isn't needed

sycl/test-e2e/Basic/parallel_for_range_roundup.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
// RUN: %{build} -fsycl-range-rounding=force -o %t.out
66
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT
77

8+
// RUN: %{build} -fsycl-exp-range-rounding -o %t.out
9+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-EXP
10+
11+
// RUN: %{build} -fsycl-range-rounding=force -fsycl-exp-range-rounding -o %t.out
12+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-EXP
13+
//
814
// These tests test 3 things:
915
//
1016
// 1. The user range is the same as the in kernel range (using BufRange) as
@@ -243,3 +249,59 @@ int main() {
243249
// CHECK-DEFAULT-NEXT: Counter = 540672
244250
// CHECK-DEFAULT-NEXT: Correct kernel indexes used
245251
// CHECK-DEFAULT-NEXT: Counter = 540672
252+
253+
// CHECK-EXP: parallel_for range adjusted at dim 0 from 1500 to 1504
254+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
255+
// CHECK-EXP-NEXT: Counter = 1500
256+
// CHECK-EXP-NEXT: Correct kernel indexes used
257+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
258+
// CHECK-EXP-NEXT: Counter = 1500
259+
// CHECK-EXP-NEXT: Correct kernel indexes used
260+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
261+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
262+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
263+
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
264+
// CHECK-EXP-NEXT: Counter = 49500
265+
// CHECK-EXP-NEXT: Correct kernel indexes used
266+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
267+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
268+
// CHECK-EXP-NEXT: Counter = 49500
269+
// CHECK-EXP-NEXT: Correct kernel indexes used
270+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
271+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
272+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
273+
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
274+
// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64
275+
// CHECK-EXP-NEXT: Counter = 3168000
276+
// CHECK-EXP-NEXT: Correct kernel indexes used
277+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
278+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
279+
// CHECK-EXP-NEXT: Counter = 3168000
280+
// CHECK-EXP-NEXT: Correct kernel indexes used
281+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
282+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
283+
// CHECK-EXP-NEXT: Counter = 3168000
284+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
285+
// CHECK-EXP-NEXT: Counter = 256
286+
// CHECK-EXP-NEXT: Correct kernel indexes used
287+
// CHECK-EXP-NEXT: Counter = 256
288+
// CHECK-EXP-NEXT: Correct kernel indexes used
289+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
290+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
291+
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
292+
// CHECK-EXP-NEXT: Counter = 8448
293+
// CHECK-EXP-NEXT: Correct kernel indexes used
294+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
295+
// CHECK-EXP-NEXT: Counter = 8448
296+
// CHECK-EXP-NEXT: Correct kernel indexes used
297+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
298+
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
299+
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
300+
// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64
301+
// CHECK-EXP-NEXT: Counter = 540672
302+
// CHECK-EXP-NEXT: Correct kernel indexes used
303+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
304+
// CHECK-EXP-NEXT: Counter = 540672
305+
// CHECK-EXP-NEXT: Correct kernel indexes used
306+
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
307+
// CHECK-EXP-NEXT: Counter = 540672
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// RUN: echo "Running parallel_for benchmark without range rounding"
2+
// RUN: %{build} -fsycl-range-rounding=disable -o %t.out
3+
// RUN: %{run} %t.out
4+
5+
// RUN: echo "Running parallel_for benchmark with normal range rounding"
6+
// RUN: %{build} -fsycl-range-rounding=force -o %t.out
7+
// RUN: %{run} %t.out
8+
9+
// RUN: echo "Running parallel_for benchmark with experimental range rounding"
10+
// RUN: %{build} -fsycl-exp-range-rounding -fsycl-range-rounding=force -o %t.out
11+
// RUN: %{run} %t.out
12+
13+
#include <chrono>
14+
#include <iostream>
15+
#include <sycl/sycl.hpp>
16+
17+
class FillData;
18+
class Compute;
19+
20+
int main() {
21+
constexpr static size_t width{788};
22+
constexpr static size_t height{1888};
23+
constexpr static size_t N{width * height};
24+
constexpr static size_t iterations{1000};
25+
26+
sycl::queue q{};
27+
float *A{sycl::malloc_device<float>(N, q)};
28+
float *B{sycl::malloc_device<float>(N, q)};
29+
float *C{sycl::malloc_device<float>(N, q)};
30+
31+
q.submit([&](sycl::handler &cgh) {
32+
cgh.parallel_for<FillData>(
33+
sycl::range<2>{height, width}, [=](sycl::id<2> id) {
34+
unsigned int row{static_cast<unsigned int>(id[0])};
35+
unsigned int col{static_cast<unsigned int>(id[1])};
36+
unsigned int ix{row * static_cast<unsigned int>(width) + col};
37+
A[ix] = id[0];
38+
B[ix] = id[1];
39+
});
40+
}).wait_and_throw();
41+
42+
auto start{std::chrono::steady_clock::now()};
43+
for (size_t i{0}; i < iterations; ++i) {
44+
q.submit([&](sycl::handler &cgh) {
45+
cgh.parallel_for<Compute>(
46+
sycl::range<2>{height, width}, [=](sycl::id<2> id) {
47+
unsigned int row{static_cast<unsigned int>(id[0])};
48+
unsigned int col{static_cast<unsigned int>(id[1])};
49+
unsigned int ix{row * static_cast<unsigned int>(width) + col};
50+
if (ix >= static_cast<unsigned int>(N)) {
51+
return;
52+
}
53+
if (A[ix] > B[ix]) {
54+
C[ix] = A[ix];
55+
} else {
56+
C[ix] = B[ix];
57+
}
58+
});
59+
}).wait_and_throw();
60+
}
61+
auto end{std::chrono::steady_clock::now()};
62+
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
63+
start)
64+
.count()
65+
<< " ms" << std::endl;
66+
}

0 commit comments

Comments
 (0)