Skip to content

Commit 988d8cd

Browse files
authored
[SYCL][Experimental] Reduce the set of optimizations for SYCL device (#1550)
This patch limits the set of optimizations, aiming to reduce the size of the generated device module. Optimizations are currently disabled by default as they cause multiple sorts of issues. Some of the issues are addressed within this patch, but not all of them. Optimizations can be enabled with the `-fsycl-enable-optimizations` front-end option (or the `-Xclang -fsycl-enable-optimizations` driver option). Signed-off-by: Alexey Bader <[email protected]>
1 parent f117aa4 commit 988d8cd

File tree

9 files changed

+119
-40
lines changed

9 files changed

+119
-40
lines changed

clang/lib/Basic/Targets/SPIR.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo {
135135
PointerWidth = PointerAlign = 32;
136136
SizeType = TargetInfo::UnsignedInt;
137137
PtrDiffType = IntPtrType = TargetInfo::SignedInt;
138-
resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
139-
"v96:128-v192:256-v256:256-v512:512-v1024:1024");
138+
resetDataLayout(
139+
"e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
140+
"v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64");
140141
}
141142

142143
void getTargetDefines(const LangOptions &Opts,
@@ -151,8 +152,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo {
151152
SizeType = TargetInfo::UnsignedLong;
152153
PtrDiffType = IntPtrType = TargetInfo::SignedLong;
153154

154-
resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
155-
"v96:128-v192:256-v256:256-v512:512-v1024:1024");
155+
resetDataLayout(
156+
"e-i64:64-v16:16-v24:32-v32:32-v48:64-"
157+
"v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64");
156158
}
157159

158160
void getTargetDefines(const LangOptions &Opts,

clang/lib/CodeGen/BackendUtil.cpp

+49-19
Original file line numberDiff line numberDiff line change
@@ -599,19 +599,51 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
599599
CodeGenOpts.PrepareForThinLTO));
600600
}
601601

602-
PMBuilder.OptLevel = CodeGenOpts.OptimizationLevel;
603-
PMBuilder.SizeLevel = CodeGenOpts.OptimizeSize;
604-
PMBuilder.SLPVectorize = CodeGenOpts.VectorizeSLP;
605-
PMBuilder.LoopVectorize = CodeGenOpts.VectorizeLoop;
606-
607-
PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;
608-
// Loop interleaving in the loop vectorizer has historically been set to be
609-
// enabled when loop unrolling is enabled.
610-
PMBuilder.LoopsInterleaved = CodeGenOpts.UnrollLoops;
611-
PMBuilder.MergeFunctions = CodeGenOpts.MergeFunctions;
612-
PMBuilder.PrepareForThinLTO = CodeGenOpts.PrepareForThinLTO;
613-
PMBuilder.PrepareForLTO = CodeGenOpts.PrepareForLTO;
614-
PMBuilder.RerollLoops = CodeGenOpts.RerollLoops;
602+
// FIXME: This code is a workaround for a number of problems with optimized
603+
// SYCL code for the SPIR target. This change tries to balance between doing
604+
// too few and too many optimizations. The current approach is to disable as
605+
// much as possible just to keep the compiler functional. Eventually we can
606+
// consider allowing -On option to configure the optimization set for the FE
607+
// device compiler as well, but before that we must fix all the functional and
608+
// performance issues caused by LLVM transformations.
609+
// E.g. LLVM optimizations make use of llvm intrinsics, instructions, data
610+
// types, etc., which are not supported by the SPIR-V translator (current
611+
// "back-end" for SYCL device compiler).
612+
// NOTE: We use "normal" inliner (i.e. from O2/O3), but limit the rest of
613+
// optimization pipeline. Inliner is a must for enabling size reduction
614+
// optimizations.
615+
if (LangOpts.SYCLIsDevice && TargetTriple.isSPIR()) {
616+
PMBuilder.OptLevel = 1;
617+
PMBuilder.SizeLevel = 2;
618+
PMBuilder.SLPVectorize = false;
619+
PMBuilder.LoopVectorize = false;
620+
PMBuilder.DivergentTarget = true;
621+
PMBuilder.DisableGVNLoadPRE = true;
622+
PMBuilder.ForgetAllSCEVInLoopUnroll = true;
623+
624+
PMBuilder.DisableUnrollLoops = true;
625+
// Loop interleaving in the loop vectorizer has historically been set to be
626+
// enabled when loop unrolling is enabled.
627+
PMBuilder.LoopsInterleaved = false;
628+
PMBuilder.MergeFunctions = false;
629+
PMBuilder.PrepareForThinLTO = false;
630+
PMBuilder.PrepareForLTO = false;
631+
PMBuilder.RerollLoops = false;
632+
} else {
633+
PMBuilder.OptLevel = CodeGenOpts.OptimizationLevel;
634+
PMBuilder.SizeLevel = CodeGenOpts.OptimizeSize;
635+
PMBuilder.SLPVectorize = CodeGenOpts.VectorizeSLP;
636+
PMBuilder.LoopVectorize = CodeGenOpts.VectorizeLoop;
637+
638+
PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;
639+
// Loop interleaving in the loop vectorizer has historically been set to be
640+
// enabled when loop unrolling is enabled.
641+
PMBuilder.LoopsInterleaved = CodeGenOpts.UnrollLoops;
642+
PMBuilder.MergeFunctions = CodeGenOpts.MergeFunctions;
643+
PMBuilder.PrepareForThinLTO = CodeGenOpts.PrepareForThinLTO;
644+
PMBuilder.PrepareForLTO = CodeGenOpts.PrepareForLTO;
645+
PMBuilder.RerollLoops = CodeGenOpts.RerollLoops;
646+
}
615647

616648
MPM.add(new TargetLibraryInfoWrapperPass(*TLII));
617649

@@ -865,14 +897,15 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
865897

866898
std::unique_ptr<llvm::ToolOutputFile> ThinLinkOS, DwoOS;
867899

900+
// Clean-up SYCL device code if LLVM passes are disabled
901+
if (LangOpts.SYCLIsDevice && CodeGenOpts.DisableLLVMPasses)
902+
PerModulePasses.add(createDeadCodeEliminationPass());
903+
868904
switch (Action) {
869905
case Backend_EmitNothing:
870906
break;
871907

872908
case Backend_EmitBC:
873-
if (LangOpts.SYCLIsDevice) {
874-
PerModulePasses.add(createDeadCodeEliminationPass());
875-
}
876909
if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) {
877910
if (!CodeGenOpts.ThinLinkBitcodeFile.empty()) {
878911
ThinLinkOS = openOutputFile(CodeGenOpts.ThinLinkBitcodeFile);
@@ -1346,9 +1379,6 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
13461379
break;
13471380

13481381
case Backend_EmitBC:
1349-
if (LangOpts.SYCLIsDevice) {
1350-
CodeGenPasses.add(createDeadCodeEliminationPass());
1351-
}
13521382
if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) {
13531383
if (!CodeGenOpts.ThinLinkBitcodeFile.empty()) {
13541384
ThinLinkOS = openOutputFile(CodeGenOpts.ThinLinkBitcodeFile);

clang/test/CodeGen/target-data.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -237,11 +237,11 @@
237237

238238
// RUN: %clang_cc1 -triple spir-unknown -o - -emit-llvm %s | \
239239
// RUN: FileCheck %s -check-prefix=SPIR
240-
// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
240+
// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
241241

242242
// RUN: %clang_cc1 -triple spir64-unknown -o - -emit-llvm %s | \
243243
// RUN: FileCheck %s -check-prefix=SPIR64
244-
// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
244+
// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
245245

246246
// RUN: %clang_cc1 -triple bpfel -o - -emit-llvm %s | \
247247
// RUN: FileCheck %s -check-prefix=BPFEL

clang/test/CodeGenOpenCL/convergent.cl

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ void test_unroll() {
121121

122122
// The new PM produces a slightly different IR for the loop from the legacy PM,
123123
// but the test still checks that the loop is not unrolled.
124-
// CHECK-LEGACY: br i1 %{{.+}}, label %[[for_body]], label %[[for_cond_cleanup]]
124+
// CHECK-LEGACY: br i1 %{{.+}}, label %[[for_cond_cleanup]], label %[[for_body]]
125125
// CHECK-NEW: br i1 %{{.+}}, label %[[for_body_crit_edge:.+]], label %[[for_cond_cleanup]]
126126
// CHECK-NEW: [[for_body_crit_edge]]:
127127

clang/test/CodeGenSYCL/address-space-swap.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %clang -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
1+
// RUN: %clang -fsycl-device-only -S -Xclang -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s
22
#include <algorithm>
33

44
void test() {

clang/test/CodeGenSYCL/debug-info-srcpos-kernel.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %clang -fsycl-device-only %s -S -I %S/Inputs -emit-llvm -g -o - | FileCheck %s
1+
// RUN: %clang -fsycl-device-only %s -S -emit-llvm -O0 -I %S/Inputs -g -o - | FileCheck %s
22
//
33
// Verify the SYCL kernel routine is marked artificial and has no source
44
// correlation.

llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp

+13-7
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,12 @@ class SPIRVLowerMemmove : public ModulePass,
7575
report_fatal_error("llvm.memmove of non-constant length not supported",
7676
false);
7777
auto *Length = cast<ConstantInt>(I.getLength());
78-
if (isa<BitCastInst>(Src))
79-
// The source could be bit-cast from another type,
80-
// need the original type for the allocation of the temporary variable
81-
SrcTy = cast<BitCastInst>(Src)->getOperand(0)->getType();
78+
auto *S = Src;
79+
// The source could be bit-cast or addrspacecast from another type,
80+
// need the original type for the allocation of the temporary variable
81+
while (isa<BitCastInst>(S) || isa<AddrSpaceCastInst>(S))
82+
S = cast<CastInst>(S)->getOperand(0);
83+
SrcTy = S->getType();
8284
MaybeAlign Align = I.getSourceAlign();
8385
auto Volatile = I.isVolatile();
8486
Value *NumElements = nullptr;
@@ -87,9 +89,13 @@ class SPIRVLowerMemmove : public ModulePass,
8789
NumElements = Builder.getInt32(SrcTy->getArrayNumElements());
8890
ElementsCount = SrcTy->getArrayNumElements();
8991
}
90-
if (Mod->getDataLayout().getTypeSizeInBits(SrcTy->getPointerElementType()) *
91-
ElementsCount !=
92-
Length->getZExtValue() * 8)
92+
if (((ElementsCount > 1) && (Mod->getDataLayout().getTypeSizeInBits(
93+
SrcTy->getPointerElementType()) *
94+
ElementsCount !=
95+
Length->getZExtValue() * 8)) ||
96+
((ElementsCount == 1) &&
97+
(Mod->getDataLayout().getTypeSizeInBits(
98+
SrcTy->getPointerElementType()) < Length->getZExtValue() * 8)))
9399
report_fatal_error("Size of the memcpy should match the allocated memory",
94100
false);
95101

llvm-spirv/lib/SPIRV/SPIRVWriter.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,10 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
13101310
if (CallInst *CI = dyn_cast<CallInst>(V))
13111311
return mapValue(V, transCallInst(CI, BB));
13121312

1313+
// FIXME: this is not valid translation of freeze instruction
1314+
if (FreezeInst *FI = dyn_cast<FreezeInst>(V))
1315+
return mapValue(V, transValue(FI->getOperand(0), BB));
1316+
13131317
llvm_unreachable("Not implemented");
13141318
return nullptr;
13151319
}
@@ -1825,6 +1829,7 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
18251829
case Intrinsic::invariant_start:
18261830
case Intrinsic::invariant_end:
18271831
case Intrinsic::dbg_label:
1832+
case Intrinsic::assume:
18281833
return nullptr;
18291834
default:
18301835
if (SPIRVAllowUnknownIntrinsics)

llvm-spirv/test/transcoding/llvm.memmove.ll

+41-5
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,46 @@
1717
; CHECK-SPIRV: Bitcast [[i8Ty]] [[tmp3:[0-9]+]] [[mem]]
1818
; CHECK-SPIRV: LifetimeStop [[tmp3]] [[size]]
1919

20+
; CHECK-SPIRV: GenericCastToPtr {{[0-9]+}} [[out:[0-9]+]]
21+
; CHECK-SPIRV: Variable {{[0-9]+}} [[mem:[0-9]+]] 7
22+
; CHECK-SPIRV: Bitcast [[i8Ty:[0-9]+]] [[tmp0:[0-9]+]] [[mem]]
23+
; CHECK-SPIRV: LifetimeStart [[tmp0]] [[size:[0-9]+]]
24+
; CHECK-SPIRV: Bitcast [[i8Ty]] [[tmp1:[0-9]+]] [[mem]]
25+
; CHECK-SPIRV: CopyMemorySized [[tmp1]] {{[0-9]+}} {{[0-9]+}}
26+
; CHECK-SPIRV: Bitcast [[i8Ty]] [[tmp2:[0-9]+]] [[mem]]
27+
; CHECK-SPIRV: CopyMemorySized [[out]] [[tmp2]] {{[0-9]+}}
28+
; CHECK-SPIRV: Bitcast [[i8Ty]] [[tmp3:[0-9]+]] [[mem]]
29+
; CHECK-SPIRV: LifetimeStop [[tmp3]] [[size]]
30+
2031
; CHECK-LLVM-NOT: llvm.memmove
2132

33+
; CHECK-LLVM-LABEL: @test_struct
2234
; CHECK-LLVM: [[local:%[0-9]+]] = alloca %struct.SomeStruct
2335
; CHECK-LLVM: [[tmp1:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type:i[0-9]+\*]]
2436
; CHECK-LLVM: call void @llvm.lifetime.start.p0i8({{i[0-9]+}} {{-?[0-9]+}}, [[type]] [[tmp1]])
2537
; CHECK-LLVM: [[tmp2:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
26-
; CHECK-LLVM: call void @llvm.memcpy
27-
; CHECK-LLVM: ([[type]] align 64 [[tmp2]],
28-
; CHECK-LLVM: {{i[0-9]+}} [[size:[0-9]+]]
38+
; CHECK-LLVM: call void @llvm.memcpy.p0i8.p1i8.i32
39+
; CHECK-LLVM-SAME: ([[type]] align 64 [[tmp2]],
40+
; CHECK-LLVM-SAME: {{i[0-9]+}} [[size:[0-9]+]]
2941
; CHECK-LLVM: [[tmp3:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
30-
; CHECK-LLVM: call void @llvm.memcpy
31-
; CHECK-LLVM: , [[type]] align 64 [[tmp3]], {{i[0-9]+}} [[size]]
42+
; CHECK-LLVM: call void @llvm.memcpy.p1i8.p0i8.i32
43+
; CHECK-LLVM-SAME: , [[type]] align 64 [[tmp3]], {{i[0-9]+}} [[size]]
44+
; CHECK-LLVM: [[tmp4:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
45+
; CHECK-LLVM: call void @llvm.lifetime.end.p0i8({{i[0-9]+}} {{-?[0-9]+}}, [[type]] [[tmp4]])
46+
47+
; CHECK-LLVM-LABEL: @copy_struct
48+
; CHECK-LLVM: [[out:%[0-9]+]] = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(1)*
49+
; CHECK-LLVM: [[local:%[0-9]+]] = alloca %struct.SomeStruct
50+
; CHECK-LLVM: [[tmp1:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type:i[0-9]+\*]]
51+
; CHECK-LLVM: call void @llvm.lifetime.start.p0i8({{i[0-9]+}} {{-?[0-9]+}}, [[type]] [[tmp1]])
52+
; CHECK-LLVM: [[tmp2:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
53+
; CHECK-LLVM: call void @llvm.memcpy.p0i8.p1i8.i32
54+
; CHECK-LLVM-SAME: ([[type]] align 64 [[tmp2]],
55+
; CHECK-LLVM-SAME: {{i[0-9]+}} [[size:[0-9]+]]
56+
; CHECK-LLVM: [[tmp3:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
57+
; CHECK-LLVM: call void @llvm.memcpy.p1i8.p0i8.i32
58+
; CHECK-LLVM-SAME: align 64 [[out]]
59+
; CHECK-LLVM-SAME: , [[type]] align 64 [[tmp3]], {{i[0-9]+}} [[size]]
3260
; CHECK-LLVM: [[tmp4:%[0-9]+]] = bitcast %struct.SomeStruct* [[local]] to [[type]]
3361
; CHECK-LLVM: call void @llvm.lifetime.end.p0i8({{i[0-9]+}} {{-?[0-9]+}}, [[type]] [[tmp4]])
3462

@@ -45,6 +73,14 @@ define spir_kernel void @test_struct(%struct.SomeStruct addrspace(1)* nocapture
4573
ret void
4674
}
4775

76+
define spir_func void @copy_struct(%struct.SomeStruct addrspace(1)* nocapture readonly %in, %struct.SomeStruct addrspace(4)* nocapture %out) {
77+
%1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)*
78+
%2 = bitcast %struct.SomeStruct addrspace(4)* %out to i8 addrspace(4)*
79+
%3 = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(1)*
80+
call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %3, i8 addrspace(1)* align 64 %1, i32 68, i1 false)
81+
ret void
82+
}
83+
4884
; Function Attrs: nounwind
4985
declare void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #1
5086

0 commit comments

Comments
 (0)