Skip to content

[SYCL] Share PFWG lambda object through shared memory #1455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 71 additions & 36 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,20 +375,29 @@ using LocalsSet = SmallPtrSet<AllocaInst *, 4>;
static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
IRBuilder<> &Builder, bool Loc2Shadow) {
Type *T = nullptr;
int LocAlignN = 0;
MaybeAlign LocAlign(0);

if (const auto *AI = dyn_cast<AllocaInst>(L)) {
T = AI->getAllocatedType();
LocAlignN = AI->getAlignment();
LocAlign = MaybeAlign(AI->getAlignment());
} else {
T = cast<Argument>(L)->getParamByValType();
LocAlignN = cast<Argument>(L)->getParamAlignment();
if (cast<Argument>(L)->hasByValAttr()) {
T = cast<Argument>(L)->getParamByValType();
LocAlign = MaybeAlign(cast<Argument>(L)->getParamAlignment());
} else {
Type *Ty = cast<Argument>(L)->getType();
Module &M = *Shadow->getParent();
LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
MaybeAlign(cast<Argument>(L)->getParamAlignment()), Ty);
auto PtrTy = dyn_cast<PointerType>(cast<Argument>(L)->getType());
assert(PtrTy && "Expected pointer type");
T = PtrTy->getElementType();
}
}

if (T->isAggregateType()) {
// TODO: we should use methods which directly return MaybeAlign once such
// are added to LLVM for AllocaInst and GlobalVariable
auto LocAlign = MaybeAlign(LocAlignN);
auto ShdAlign = MaybeAlign(Shadow->getAlignment());
Module &M = *Shadow->getParent();
auto SizeVal = M.getDataLayout().getTypeStoreSize(T);
Expand Down Expand Up @@ -679,10 +688,25 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
// Go through "byval" parameters which are passed as AS(0) pointers
// and: (1) create local shadows for them (2) and initialize them from the
// leader's copy and (3) replace usages with pointer to the shadow
static void shareByValParams(Function &F, const Triple &TT) {
// split
//
// Do the same for the 'this' pointer, which points to the PFWG lambda object
// allocated in the caller. The caller is a kernel function generated by the
// SYCL frontend. The kernel function allocates the PFWG lambda object and
// initializes the captured objects (like accessors) using the kernel's
// arguments. After initialization, the kernel calls the PFWG function (the
// operator() of the PFWG object). The PFWG object captures all objects by
// value, and all uses of these values (except initialization from kernel
// arguments) can only occur within the scope of the PFWG function, which is
// why copying the PFWG object back is not needed.
static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
// Skip alloca instructions before splitting. Alloca instructions must be at
// the beginning of the function, otherwise they are considered dynamic,
// which can cause problems with inlining.
BasicBlock *EntryBB = &F.getEntryBlock();
BasicBlock *LeaderBB = EntryBB->splitBasicBlock(&EntryBB->front(), "leader");
Instruction *SplitPoint = &*EntryBB->begin();
for (; SplitPoint->getOpcode() == Instruction::Alloca;
SplitPoint = SplitPoint->getNextNode())
;
BasicBlock *LeaderBB = EntryBB->splitBasicBlock(SplitPoint, "leader");
BasicBlock *MergeBB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "merge");

// 1) rewire the above basic blocks so that LeaderBB is executed only for the
Expand All @@ -692,38 +716,48 @@ static void shareByValParams(Function &F, const Triple &TT) {
Instruction &At = LeaderBB->back();

for (auto &Arg : F.args()) {
if (!Arg.hasByValAttr())
continue;
assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));
Type *T = Arg.getParamByValType();

// 2) create the shared copy - "shadow" - for current byval arg
GlobalVariable *Shadow =
spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
Type *T;
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());

// 3) replace argument with shadow in all uses
Value *RepVal = Shadow;
if (TT.isNVPTX()) {
// For NVPTX target address space inference for kernel arguments and
// allocas is happening in the backend (NVPTXLowerArgs and
// NVPTXLowerAlloca passes). After the frontend these pointers are in LLVM
// default address space 0 which is the generic address space for NVPTX
// target.
assert(Arg.getType()->getPointerAddressSpace() == 0);

// Cast a pointer in the shared address space to the generic address
// space.
// 2) create the shared copy - "shadow" - for current arg
GlobalVariable *Shadow;
Value *RepVal;
if (Arg.hasByValAttr()) {
assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));
T = Arg.getParamByValType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
RepVal = Shadow;
if (TT.isNVPTX()) {
// For NVPTX target address space inference for kernel arguments and
// allocas is happening in the backend (NVPTXLowerArgs and
// NVPTXLowerAlloca passes). After the frontend these pointers are in
// LLVM default address space 0 which is the generic address space for
// NVPTX target.
assert(Arg.getType()->getPointerAddressSpace() == 0);

// Cast a pointer in the shared address space to the generic address
// space.
RepVal = ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow,
Arg.getType());
}
}
// Process 'this' pointer which points to PFWG lambda object
else if (Arg.getArgNo() == 0) {
PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
assert(PtrT && "Expected this pointer as the first argument");
T = PtrT->getElementType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
RepVal =
ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
Builder.CreatePointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
}

// 3) replace argument with shadow in all uses
for (auto *U : Arg.users())
U->replaceUsesOfWith(&Arg, RepVal);

// 4) fill the shadow from the argument for the leader WI only
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());
copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
true /*private->shadow*/);
}
Expand Down Expand Up @@ -832,8 +866,9 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, const llvm::Triple &TT,
for (auto *PFWICall : PFWICalls)
fixupPrivateMemoryPFWILambdaCaptures(PFWICall);

// Finally, create shadows for and replace usages of byval pointer params
shareByValParams(F, TT);
// Finally, create shadows for and replace usages of byval pointer params and
// PFWG lambda object ('this' pointer).
sharePFWGPrivateObjects(F, TT);

#ifndef NDEBUG
if (HaveChanges && Debug > 0)
Expand Down
42 changes: 23 additions & 19 deletions llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,51 +13,55 @@
%struct.foo = type { %struct.barney }
%struct.foo.0 = type { i8 }

; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
; CHECK: @[[GROUP_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.zot addrspace(4)*
; CHECK: @[[PFWG_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0
; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar
; CHECK: @[[GROUP_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.zot

define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.zot* byval(%struct.zot) align 8 %arg1) align 2 !work_group_scope !0 {
; CHECK-LABEL: @wibble(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.zot* [[ARG1:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 96, i1 false)
; CHECK-NEXT: [[ARG_CAST:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i8 addrspace(4)* align 8 [[ARG_CAST]], i64 1, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
; CHECK-NEXT: [[ID:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[ID]], 0
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* addrspacecast (%struct.bar addrspace(3)* @[[PFWG_SHADOW]] to [[STRUCT_BAR]] addrspace(4)*), [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)*
; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP3]], 0
; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
; CHECK: TestMat:
; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP4]], i64 1, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP5]], i64 1, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]]
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
; CHECK-NEXT: br label [[LEADERMAT]]
; CHECK: LeaderMat:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP5]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
; CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP6]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
; CHECK-NEXT: call spir_func void @bar(%struct.zot addrspace(4)* [[WG_VAL_TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP2]])
; CHECK-NEXT: ret void
;
Expand Down