Skip to content

[SYCL] Refactor SYCL kernel object handling in hierarchical parallelism #6212

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jun 15, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 47 additions & 38 deletions clang/lib/Sema/SemaSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2505,38 +2505,44 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
return CompoundStmt::Create(SemaRef.getASTContext(), BodyStmts, {}, {});
}

void markParallelWorkItemCalls() {
if (getKernelInvocationKind(KernelCallerFunc) ==
InvokeParallelForWorkGroup) {
// Fetch the kernel object and the associated call operator
// (of either the lambda or the function object).
CXXRecordDecl *KernelObj =
GetSYCLKernelObjectType(KernelCallerFunc)->getAsCXXRecordDecl();
CXXMethodDecl *WGLambdaFn = nullptr;
if (KernelObj->isLambda())
WGLambdaFn = KernelObj->getLambdaCallOperator();
else
WGLambdaFn = getOperatorParens(KernelObj);
assert(WGLambdaFn && "non callable object is passed as kernel obj");
// Mark the function that it "works" in a work group scope:
// NOTE: In case of parallel_for_work_item the marker call itself is
// marked with work item scope attribute, here the '()' operator of the
// object passed as parameter is marked. This is an optimization -
// there are a lot of locals created at parallel_for_work_group
// scope before calling the lambda - it is more efficient to have
// all of them in the private address space rather then sharing via
// the local AS. See parallel_for_work_group implementation in the
// SYCL headers.
if (!WGLambdaFn->hasAttr<SYCLScopeAttr>()) {
WGLambdaFn->addAttr(SYCLScopeAttr::CreateImplicit(
SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));
// Search and mark parallel_for_work_item calls:
MarkWIScopeFnVisitor MarkWIScope(SemaRef.getASTContext());
MarkWIScope.TraverseDecl(WGLambdaFn);
// Now mark local variables declared in the PFWG lambda with work group
// scope attribute
addScopeAttrToLocalVars(*WGLambdaFn);
}
void annotateHierarchicalParallelismAPICalls() {
// Is this a hierarchical parallelism kernel invocation?
if (getKernelInvocationKind(KernelCallerFunc) != InvokeParallelForWorkGroup)
return;

// Mark kernel object with work-group scope attribute to avoid work-item
// scope memory allocation.
KernelObjClone->addAttr(SYCLScopeAttr::CreateImplicit(
SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));

// Fetch the kernel object and the associated call operator
// (of either the lambda or the function object).
CXXRecordDecl *KernelObj =
GetSYCLKernelObjectType(KernelCallerFunc)->getAsCXXRecordDecl();
CXXMethodDecl *WGLambdaFn = nullptr;
if (KernelObj->isLambda())
WGLambdaFn = KernelObj->getLambdaCallOperator();
else
WGLambdaFn = getOperatorParens(KernelObj);
assert(WGLambdaFn && "non callable object is passed as kernel obj");
// Mark the function that it "works" in a work group scope:
// NOTE: In case of parallel_for_work_item the marker call itself is
// marked with work item scope attribute, here the '()' operator of the
// object passed as parameter is marked. This is an optimization -
// there are a lot of locals created at parallel_for_work_group
// scope before calling the lambda - it is more efficient to have
// all of them in the private address space rather then sharing via
// the local AS. See parallel_for_work_group implementation in the
// SYCL headers.
if (!WGLambdaFn->hasAttr<SYCLScopeAttr>()) {
WGLambdaFn->addAttr(SYCLScopeAttr::CreateImplicit(
SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));
// Search and mark parallel_for_work_item calls:
MarkWIScopeFnVisitor MarkWIScope(SemaRef.getASTContext());
MarkWIScope.TraverseDecl(WGLambdaFn);
// Now mark local variables declared in the PFWG lambda with work group
// scope attribute
addScopeAttrToLocalVars(*WGLambdaFn);
}
}

Expand Down Expand Up @@ -2763,14 +2769,16 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
}

static VarDecl *createKernelObjClone(ASTContext &Ctx, DeclContext *DC,
const CXXRecordDecl *KernelObj) {
const CXXRecordDecl *KernelObj,
FunctionDecl *KernelCallerFunc) {
TypeSourceInfo *TSInfo =
KernelObj->isLambda() ? KernelObj->getLambdaTypeInfo() : nullptr;
auto Type = QualType(KernelObj->getTypeForDecl(), 0);
if (KernelObj->isLambda())
Type->getAsRecordDecl()->setAnonymousStructOrUnion(true);
VarDecl *VD = VarDecl::Create(
Ctx, DC, KernelObj->getLocation(), KernelObj->getLocation(),
KernelObj->getIdentifier(), QualType(KernelObj->getTypeForDecl(), 0),
TSInfo, SC_None);

KernelObj->getIdentifier(), Type, TSInfo, SC_None);
return VD;
}

Expand Down Expand Up @@ -2846,12 +2854,13 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
FunctionDecl *KernelCallerFunc)
: SyclKernelFieldHandler(S), DeclCreator(DC),
KernelObjClone(createKernelObjClone(S.getASTContext(),
DC.getKernelDecl(), KernelObj)),
DC.getKernelDecl(), KernelObj,
KernelCallerFunc)),
VarEntity(InitializedEntity::InitializeVariable(KernelObjClone)),
KernelObj(KernelObj), KernelCallerFunc(KernelCallerFunc),
KernelCallerSrcLoc(KernelCallerFunc->getLocation()) {
CollectionInitExprs.push_back(createInitListExpr(KernelObj));
markParallelWorkItemCalls();
annotateHierarchicalParallelismAPICalls();

Stmt *DS = new (S.Context) DeclStmt(DeclGroupRef(KernelObjClone),
KernelCallerSrcLoc, KernelCallerSrcLoc);
Expand Down
1 change: 1 addition & 0 deletions clang/test/SemaSYCL/kernel-handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ int main() {
// NONATIVESUPPORT-NEXT: InitListExpr
// NONATIVESUPPORT-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
// NONATIVESUPPORT-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} '_arg_a' 'int'
// NONATIVESUPPORT-NEXT: SYCLScopeAttr {{.*}} Implicit WorkGroup

// Check declaration and initialization of kernel handler local clone using default constructor
// NONATIVESUPPORT-NEXT: DeclStmt
Expand Down
73 changes: 30 additions & 43 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,19 @@
// (1) - materialization of a PFWI object
// (2) - "fixup" of the private variable address.
//
// TODO: add support for the case when there are other functions between
// parallel_for_work_group and parallel_for_work_item in the call stack.
// For example:
//
// void foo(sycl::group<1> group, ...) {
// group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
// }
// ...
// cgh.parallel_for_work_group<class kernel>(
// range<1>(...), range<1>(...), [=](group<1> g) {
// foo(g, ...);
// });
//
// TODO The approach employed by this pass generates lots of barriers and data
// copying between private and local memory, which might not be efficient. There
// are optimization opportunities listed below. Also other approaches can be
Expand Down Expand Up @@ -385,16 +398,8 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
LocAlign = MaybeAlign(AI->getAlignment());
} else {
auto Arg = cast<Argument>(L);
if (Arg->hasByValAttr()) {
T = Arg->getParamByValType();
LocAlign = MaybeAlign(Arg->getParamAlignment());
} else {
Type *Ty = Arg->getType();
Module &M = *Shadow->getParent();
LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
MaybeAlign(Arg->getParamAlignment()), Ty);
T = Arg->getType()->getPointerElementType();
}
T = Arg->getParamByValType();
LocAlign = MaybeAlign(Arg->getParamAlignment());
}

assert(T && "Unexpected type");
Expand Down Expand Up @@ -698,16 +703,7 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
// Go through "byval" parameters which are passed as AS(0) pointers
// and: (1) create local shadows for them (2) and initialize them from the
// leader's copy and (3) materialize the value in the local variable before use
//
// Do the same for 'this' pointer which points to PFWG lamda object which is
// allocated in the caller. Caller is a kernel function which is generated by
// SYCL frontend. Kernel function allocates PFWG lambda object and initalizes
// captured objects (like accessors) using arguments of the kernel. After
// intialization kernel calls PFWG function (which is the operator() of the PFWG
// object). PFWG object captures all objects by value and all uses (except
// initialization from kernel arguments) of this values can only be in scope of
// PFWG function that is why copy back of PFWG object is not needed.
static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
static void shareByValParams(Function &F, const Triple &TT) {
// Skip alloca instructions and split. Alloca instructions must be in the
// beginning of the function otherwise they are considered as dynamic which
// can cause the problems with inlining.
Expand All @@ -726,29 +722,20 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
Instruction &At = LeaderBB->back();

for (auto &Arg : F.args()) {
Type *T;
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());
if (!Arg.hasByValAttr())
Copy link
Contributor

@kbobrovs kbobrovs Jun 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: we skip "this" because it is allocated in the proper AS by the FE, correct? Comment would be helpful for the reader.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I just reverted the changes from #1455 and tried to re-implement it by fixing address space in clang instead.
Do you want me to comment that this points to the object in local address space, so we don't need a shadow copy for that argument?

continue;

assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));

// Create the shared copy - "shadow" - for current arg
GlobalVariable *Shadow = nullptr;
if (Arg.hasByValAttr()) {
assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));
T = Arg.getParamByValType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
}
// Process 'this' pointer which points to PFWG lambda object
else if (Arg.getArgNo() == 0) {
PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
assert(PtrT && "Expected this pointer as the first argument");
T = PtrT->getPointerElementType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
}
Type *T = Arg.getParamByValType();
GlobalVariable *Shadow =
spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");

if (!Shadow)
continue;
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());

copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
true /*private->shadow*/);
Expand All @@ -766,6 +753,7 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
FunctionAnalysisManager &FAM) {
if (!F.getMetadata(WG_SCOPE_MD))
return PreservedAnalyses::all();
LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
// Ranges of "side effect" instructions
SmallVector<InstrRange, 16> Ranges;
Expand Down Expand Up @@ -866,9 +854,8 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
for (auto *PFWICall : PFWICalls)
fixupPrivateMemoryPFWILambdaCaptures(PFWICall);

// Finally, create shadows for and replace usages of byval pointer params and
// PFWG lambda object ('this' pointer).
sharePFWGPrivateObjects(F, TT);
// Finally, create shadows for and replace usages of byval pointer params.
shareByValParams(F, TT);

#ifndef NDEBUG
if (HaveChanges && Debug > 0)
Expand Down
Loading