Skip to content

Commit c6c2ecd

Browse files
authored
[SYCL] Materialize shadow local variables for byval arguments before use (#2200)
Currently shadow local variables for byval arguments are used directly. In some cases this leads to invalid casts between address spaces. Materialize these local variables before using to fix this problem. Signed-off-by: Artur Gainullin <[email protected]>
1 parent 115c1a0 commit c6c2ecd

File tree

5 files changed

+74
-31
lines changed

5 files changed

+74
-31
lines changed

llvm/lib/SYCLLowerIR/LowerWGScope.cpp

+10-26
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
685685

686686
// Go through "byval" parameters which are passed as AS(0) pointers
687687
// and: (1) create local shadows for them (2) and initialize them from the
688-
// leader's copy and (3) replace usages with pointer to the shadow
688+
// leader's copy and (3) materialize the value in the local variable before use
689689
//
690690
// Do the same for 'this' pointer which points to PFWG lamda object which is
691691
// allocated in the caller. Caller is a kernel function which is generated by
@@ -707,7 +707,7 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
707707
BasicBlock *LeaderBB = EntryBB->splitBasicBlock(SplitPoint, "leader");
708708
BasicBlock *MergeBB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "merge");
709709

710-
// 1) rewire the above basic blocks so that LeaderBB is executed only for the
710+
// Rewire the above basic blocks so that LeaderBB is executed only for the
711711
// leader workitem
712712
guardBlockWithIsLeaderCheck(EntryBB, LeaderBB, MergeBB,
713713
EntryBB->back().getDebugLoc(), TT);
@@ -719,50 +719,34 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
719719
IRBuilder<> Builder(Ctx);
720720
Builder.SetInsertPoint(&LeaderBB->front());
721721

722-
// 2) create the shared copy - "shadow" - for current arg
722+
// Create the shared copy - "shadow" - for current arg
723723
GlobalVariable *Shadow = nullptr;
724-
Value *RepVal = nullptr;
725724
if (Arg.hasByValAttr()) {
726725
assert(Arg.getType()->getPointerAddressSpace() ==
727726
asUInt(spirv::AddrSpace::Private));
728727
T = Arg.getParamByValType();
729728
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
730-
RepVal = Shadow;
731-
if (TT.isNVPTX()) {
732-
// For NVPTX target address space inference for kernel arguments and
733-
// allocas is happening in the backend (NVPTXLowerArgs and
734-
// NVPTXLowerAlloca passes). After the frontend these pointers are in
735-
// LLVM default address space 0 which is the generic address space for
736-
// NVPTX target.
737-
assert(Arg.getType()->getPointerAddressSpace() == 0);
738-
739-
// Cast a pointer in the shared address space to the generic address
740-
// space.
741-
RepVal = ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow,
742-
Arg.getType());
743-
}
744729
}
745730
// Process 'this' pointer which points to PFWG lambda object
746731
else if (Arg.getArgNo() == 0) {
747732
PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
748733
assert(PtrT && "Expected this pointer as the first argument");
749734
T = PtrT->getElementType();
750735
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
751-
RepVal =
752-
Builder.CreatePointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
753736
}
754737

755-
if (!Shadow || !RepVal)
738+
if (!Shadow)
756739
continue;
757740

758-
// 3) replace argument with shadow in all uses
759-
for (auto *U : Arg.users())
760-
U->replaceUsesOfWith(&Arg, RepVal);
761-
762741
copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
763742
true /*private->shadow*/);
743+
// Materialize the value in the local variable before use
744+
Builder.SetInsertPoint(&MergeBB->front());
745+
copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
746+
false /*shadow->private*/);
764747
}
765-
// 5) make sure workers use up-to-date shared values written by the leader
748+
// Insert barrier to make sure workers use up-to-date shared values written by
749+
// the leader
766750
spirv::genWGBarrier(MergeBB->front(), TT);
767751
}
768752

llvm/test/SYCLLowerIR/byval_arg.ll

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !w
1919
; CHECK-NEXT: br label [[MERGE]]
2020
; CHECK: merge:
2121
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
22+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.baz* [[ARG1]] to i8*
23+
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* [[TMP3]], i8 addrspace(3)* align 8 bitcast (%struct.baz addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i64 8, i1 false)
2224
; CHECK-NEXT: ret void
2325
;
2426
ret void
+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -LowerWGScope -S | FileCheck %s
3+
4+
; Test to check that shadow local variable for byval argument is
5+
; materialized before use. Otherwise invalid cast between address
6+
; spaces is generated.
7+
8+
%struct.widget = type { %struct.baz, %struct.baz, %struct.baz, %struct.spam }
9+
%struct.baz = type { %struct.snork }
10+
%struct.snork = type { [1 x i64] }
11+
%struct.spam = type { %struct.snork }
12+
13+
14+
declare dso_local spir_func void @zot(i8*)
15+
16+
; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.widget undef, align 16
17+
18+
; Function Attrs: inlinehint norecurse
19+
define dso_local spir_func void @wombat(%struct.widget* byval(%struct.widget) align 8 %arg) align 2 !work_group_scope !1 {
20+
; CHECK-LABEL: @wombat(
21+
; CHECK-NEXT: bb:
22+
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
23+
; CHECK-NEXT: [[CMPZ1:%.*]] = icmp eq i64 [[TMP0]], 0
24+
; CHECK-NEXT: br i1 [[CMPZ1]], label [[LEADER:%.*]], label [[MERGE:%.*]]
25+
; CHECK: leader:
26+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.widget* [[ARG:%.*]] to i8*
27+
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.widget addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 32, i1 false)
28+
; CHECK-NEXT: br label [[MERGE]]
29+
; CHECK: merge:
30+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
31+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.widget* [[ARG]] to i8*
32+
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP2]], i8 addrspace(3)* align 16 bitcast (%struct.widget addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i64 32, i1 false)
33+
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
34+
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
35+
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
36+
; CHECK: wg_leader:
37+
; CHECK-NEXT: [[TMP:%.*]] = bitcast %struct.widget* [[ARG]] to i8*
38+
; CHECK-NEXT: call void @zot(i8* [[TMP]])
39+
; CHECK-NEXT: br label [[WG_CF]]
40+
; CHECK: wg_cf:
41+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
42+
; CHECK-NEXT: ret void
43+
;
44+
bb:
45+
%tmp = bitcast %struct.widget* %arg to i8*
46+
call void @zot(i8* %tmp)
47+
ret void
48+
}
49+
50+
!1 = !{}

llvm/test/SYCLLowerIR/cast_shadow.ll

+4-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ target triple = "nvptx64-nvidia-cuda-sycldevice"
1515
define internal void @wobble(%struct.baz* %arg, %struct.spam* byval(%struct.spam) %arg1) !work_group_scope !0 {
1616
; CHECK: [[TMP10:%.*]] = bitcast %struct.spam* [[ARG1:%.*]] to i8*
1717
; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* [[TMP10]], i64 32, i1 false)
18-
; CHECK: call void @widget(%struct.spam* addrspacecast (%struct.spam addrspace(3)* @[[SHADOW]] to %struct.spam*), %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
18+
; CHECK: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
19+
; CHECK: [[TMP11:%.*]] = bitcast %struct.spam* %arg1 to i8*
20+
; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* [[TMP11:%.*]], i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8
21+
; CHECK: call void @widget(%struct.spam* %arg1, %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
1922
;
2023
bb:
2124
%tmp = alloca %struct.baz*

llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll

+8-4
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,17 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
3535
; CHECK-NEXT: br label [[MERGE]]
3636
; CHECK: merge:
3737
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
38-
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
39-
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
38+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.zot* [[ARG1]] to i8*
39+
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i64 96, i1 false)
40+
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
41+
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i64 1, i1 false)
42+
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
43+
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
4044
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
4145
; CHECK: wg_leader:
42-
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* addrspacecast (%struct.bar addrspace(3)* @[[PFWG_SHADOW]] to [[STRUCT_BAR]] addrspace(4)*), [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
46+
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[ARG]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
4347
; CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
44-
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)*
48+
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.zot* [[ARG1]] to [[STRUCT_ZOT:%.*]] addrspace(4)*
4549
; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
4650
; CHECK-NEXT: br label [[WG_CF]]
4751
; CHECK: wg_cf:

0 commit comments

Comments
 (0)