intel · bader · Jun 15, 2022 · May 26, 2022 · May 29, 2022 · May 29, 2022
@@ -2505,38 +2505,44 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
     return CompoundStmt::Create(SemaRef.getASTContext(), BodyStmts, {}, {});
   }
 
-  void markParallelWorkItemCalls() {
-    if (getKernelInvocationKind(KernelCallerFunc) ==
-        InvokeParallelForWorkGroup) {
-      // Fetch the kernel object and the associated call operator
-      // (of either the lambda or the function object).
-      CXXRecordDecl *KernelObj =
-          GetSYCLKernelObjectType(KernelCallerFunc)->getAsCXXRecordDecl();
-      CXXMethodDecl *WGLambdaFn = nullptr;
-      if (KernelObj->isLambda())
-        WGLambdaFn = KernelObj->getLambdaCallOperator();
-      else
-        WGLambdaFn = getOperatorParens(KernelObj);
-      assert(WGLambdaFn && "non callable object is passed as kernel obj");
-      // Mark the function that it "works" in a work group scope:
-      // NOTE: In case of parallel_for_work_item the marker call itself is
-      // marked with work item scope attribute, here  the '()' operator of the
-      // object passed as parameter is marked. This is an optimization -
-      // there are a lot of locals created at parallel_for_work_group
-      // scope before calling the lambda - it is more efficient to have
-      // all of them in the private address space rather then sharing via
-      // the local AS. See parallel_for_work_group implementation in the
-      // SYCL headers.
-      if (!WGLambdaFn->hasAttr<SYCLScopeAttr>()) {
-        WGLambdaFn->addAttr(SYCLScopeAttr::CreateImplicit(
-            SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));
-        // Search and mark parallel_for_work_item calls:
-        MarkWIScopeFnVisitor MarkWIScope(SemaRef.getASTContext());
-        MarkWIScope.TraverseDecl(WGLambdaFn);
-        // Now mark local variables declared in the PFWG lambda with work group
-        // scope attribute
-        addScopeAttrToLocalVars(*WGLambdaFn);
-      }
+  void annotateHierarchicalParallelismAPICalls() {
+    // Is this a hierarchical parallelism kernel invocation?
+    if (getKernelInvocationKind(KernelCallerFunc) != InvokeParallelForWorkGroup)
+      return;
+
+    // Mark kernel object with work-group scope attribute to avoid work-item
+    // scope memory allocation.
+    KernelObjClone->addAttr(SYCLScopeAttr::CreateImplicit(
+        SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));
+
+    // Fetch the kernel object and the associated call operator
+    // (of either the lambda or the function object).
+    CXXRecordDecl *KernelObj =
+        GetSYCLKernelObjectType(KernelCallerFunc)->getAsCXXRecordDecl();
+    CXXMethodDecl *WGLambdaFn = nullptr;
+    if (KernelObj->isLambda())
+      WGLambdaFn = KernelObj->getLambdaCallOperator();
+    else
+      WGLambdaFn = getOperatorParens(KernelObj);
+    assert(WGLambdaFn && "non callable object is passed as kernel obj");
+    // Mark the function that it "works" in a work group scope:
+    // NOTE: In case of parallel_for_work_item the marker call itself is
+    // marked with work item scope attribute, here  the '()' operator of the
+    // object passed as parameter is marked. This is an optimization -
+    // there are a lot of locals created at parallel_for_work_group
+    // scope before calling the lambda - it is more efficient to have
+    // all of them in the private address space rather then sharing via
+    // the local AS. See parallel_for_work_group implementation in the
+    // SYCL headers.
+    if (!WGLambdaFn->hasAttr<SYCLScopeAttr>()) {
+      WGLambdaFn->addAttr(SYCLScopeAttr::CreateImplicit(
+          SemaRef.getASTContext(), SYCLScopeAttr::Level::WorkGroup));
+      // Search and mark parallel_for_work_item calls:
+      MarkWIScopeFnVisitor MarkWIScope(SemaRef.getASTContext());
+      MarkWIScope.TraverseDecl(WGLambdaFn);
+      // Now mark local variables declared in the PFWG lambda with work group
+      // scope attribute
+      addScopeAttrToLocalVars(*WGLambdaFn);
     }
   }
 
@@ -2763,14 +2769,16 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
   }
 
   static VarDecl *createKernelObjClone(ASTContext &Ctx, DeclContext *DC,
-                                       const CXXRecordDecl *KernelObj) {
+                                       const CXXRecordDecl *KernelObj,
+                                       FunctionDecl *KernelCallerFunc) {
     TypeSourceInfo *TSInfo =
         KernelObj->isLambda() ? KernelObj->getLambdaTypeInfo() : nullptr;
+    auto Type = QualType(KernelObj->getTypeForDecl(), 0);
+    if (KernelObj->isLambda())
+      Type->getAsRecordDecl()->setAnonymousStructOrUnion(true);
     VarDecl *VD = VarDecl::Create(
         Ctx, DC, KernelObj->getLocation(), KernelObj->getLocation(),
-        KernelObj->getIdentifier(), QualType(KernelObj->getTypeForDecl(), 0),
-        TSInfo, SC_None);
-
+        KernelObj->getIdentifier(), Type, TSInfo, SC_None);
     return VD;
   }
 
@@ -2846,12 +2854,13 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
                         FunctionDecl *KernelCallerFunc)
       : SyclKernelFieldHandler(S), DeclCreator(DC),
         KernelObjClone(createKernelObjClone(S.getASTContext(),
-                                            DC.getKernelDecl(), KernelObj)),
+                                            DC.getKernelDecl(), KernelObj,
+                                            KernelCallerFunc)),
         VarEntity(InitializedEntity::InitializeVariable(KernelObjClone)),
         KernelObj(KernelObj), KernelCallerFunc(KernelCallerFunc),
         KernelCallerSrcLoc(KernelCallerFunc->getLocation()) {
     CollectionInitExprs.push_back(createInitListExpr(KernelObj));
-    markParallelWorkItemCalls();
+    annotateHierarchicalParallelismAPICalls();
 
     Stmt *DS = new (S.Context) DeclStmt(DeclGroupRef(KernelObjClone),
                                         KernelCallerSrcLoc, KernelCallerSrcLoc);

@@ -75,6 +75,7 @@ int main() {
 // NONATIVESUPPORT-NEXT: InitListExpr
 // NONATIVESUPPORT-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
 // NONATIVESUPPORT-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} '_arg_a' 'int'
+// NONATIVESUPPORT-NEXT: SYCLScopeAttr {{.*}} Implicit WorkGroup
 
 // Check declaration and initialization of kernel handler local clone using default constructor
 // NONATIVESUPPORT-NEXT: DeclStmt

@@ -65,6 +65,19 @@
 // (1) - materialization of a PFWI object
 // (2) - "fixup" of the private variable address.
 //
+// TODO: add support for the case when there are other functions between
+// parallel_for_work_group and parallel_for_work_item in the call stack.
+// For example:
+//
+// void foo(sycl::group<1> group, ...) {
+//   group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
+// }
+// ...
+//   cgh.parallel_for_work_group<class kernel>(
+//     range<1>(...), range<1>(...), [=](group<1> g) {
+//       foo(g, ...);
+//     });
+//
 // TODO The approach employed by this pass generates lots of barriers and data
 // copying between private and local memory, which might not be efficient. There
 // are optimization opportunities listed below. Also other approaches can be
@@ -385,16 +398,8 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
     LocAlign = MaybeAlign(AI->getAlignment());
   } else {
     auto Arg = cast<Argument>(L);
-    if (Arg->hasByValAttr()) {
-      T = Arg->getParamByValType();
-      LocAlign = MaybeAlign(Arg->getParamAlignment());
-    } else {
-      Type *Ty = Arg->getType();
-      Module &M = *Shadow->getParent();
-      LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
-          MaybeAlign(Arg->getParamAlignment()), Ty);
-      T = Arg->getType()->getPointerElementType();
-    }
+    T = Arg->getParamByValType();
+    LocAlign = MaybeAlign(Arg->getParamAlignment());
   }
 
   assert(T && "Unexpected type");
@@ -698,16 +703,7 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
 // Go through "byval" parameters which are passed as AS(0) pointers
 // and: (1) create local shadows for them (2) and initialize them from the
 // leader's copy and (3) materialize the value in the local variable before use
-//
-// Do the same for 'this' pointer which points to PFWG lamda object which is
-// allocated in the caller. Caller is a kernel function which is generated by
-// SYCL frontend. Kernel function allocates PFWG lambda object and initalizes
-// captured objects (like accessors) using arguments of the kernel. After
-// intialization kernel calls PFWG function (which is the operator() of the PFWG
-// object). PFWG object captures all objects by value and all uses (except
-// initialization from kernel arguments) of this values can only be in scope of
-// PFWG function that is why copy back of PFWG object is not needed.
-static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
+static void shareByValParams(Function &F, const Triple &TT) {
   // Skip alloca instructions and split. Alloca instructions must be in the
   // beginning of the function otherwise they are considered as dynamic which
   // can cause the problems with inlining.
@@ -726,29 +722,20 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
   Instruction &At = LeaderBB->back();
 
   for (auto &Arg : F.args()) {
-    Type *T;
-    LLVMContext &Ctx = At.getContext();
-    IRBuilder<> Builder(Ctx);
-    Builder.SetInsertPoint(&LeaderBB->front());
+    if (!Arg.hasByValAttr())
+      continue;
+
+    assert(Arg.getType()->getPointerAddressSpace() ==
+           asUInt(spirv::AddrSpace::Private));
 
     // Create the shared copy - "shadow" - for current arg
-    GlobalVariable *Shadow = nullptr;
-    if (Arg.hasByValAttr()) {
-      assert(Arg.getType()->getPointerAddressSpace() ==
-             asUInt(spirv::AddrSpace::Private));
-      T = Arg.getParamByValType();
-      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
-    }
-    // Process 'this' pointer which points to PFWG lambda object
-    else if (Arg.getArgNo() == 0) {
-      PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
-      assert(PtrT && "Expected this pointer as the first argument");
-      T = PtrT->getPointerElementType();
-      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
-    }
+    Type *T = Arg.getParamByValType();
+    GlobalVariable *Shadow =
+        spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
 
-    if (!Shadow)
-      continue;
+    LLVMContext &Ctx = At.getContext();
+    IRBuilder<> Builder(Ctx);
+    Builder.SetInsertPoint(&LeaderBB->front());
 
     copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
                                 true /*private->shadow*/);
@@ -766,6 +753,7 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
   if (!F.getMetadata(WG_SCOPE_MD))
     return PreservedAnalyses::all();
+  LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
   const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
   // Ranges of "side effect" instructions
   SmallVector<InstrRange, 16> Ranges;
@@ -866,9 +854,8 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
   for (auto *PFWICall : PFWICalls)
     fixupPrivateMemoryPFWILambdaCaptures(PFWICall);
 
-  // Finally, create shadows for and replace usages of byval pointer params and
-  // PFWG lambda object ('this' pointer).
-  sharePFWGPrivateObjects(F, TT);
+  // Finally, create shadows for and replace usages of byval pointer params.
+  shareByValParams(F, TT);
 
 #ifndef NDEBUG
   if (HaveChanges && Debug > 0)