@@ -2505,38 +2505,44 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
2505
2505
return CompoundStmt::Create (SemaRef.getASTContext (), BodyStmts, {}, {});
2506
2506
}
2507
2507
2508
- void markParallelWorkItemCalls () {
2509
- if (getKernelInvocationKind (KernelCallerFunc) ==
2510
- InvokeParallelForWorkGroup) {
2511
- // Fetch the kernel object and the associated call operator
2512
- // (of either the lambda or the function object).
2513
- CXXRecordDecl *KernelObj =
2514
- GetSYCLKernelObjectType (KernelCallerFunc)->getAsCXXRecordDecl ();
2515
- CXXMethodDecl *WGLambdaFn = nullptr ;
2516
- if (KernelObj->isLambda ())
2517
- WGLambdaFn = KernelObj->getLambdaCallOperator ();
2518
- else
2519
- WGLambdaFn = getOperatorParens (KernelObj);
2520
- assert (WGLambdaFn && " non callable object is passed as kernel obj" );
2521
- // Mark the function that it "works" in a work group scope:
2522
- // NOTE: In case of parallel_for_work_item the marker call itself is
2523
- // marked with work item scope attribute, here the '()' operator of the
2524
- // object passed as parameter is marked. This is an optimization -
2525
- // there are a lot of locals created at parallel_for_work_group
2526
- // scope before calling the lambda - it is more efficient to have
2527
- // all of them in the private address space rather then sharing via
2528
- // the local AS. See parallel_for_work_group implementation in the
2529
- // SYCL headers.
2530
- if (!WGLambdaFn->hasAttr <SYCLScopeAttr>()) {
2531
- WGLambdaFn->addAttr (SYCLScopeAttr::CreateImplicit (
2532
- SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2533
- // Search and mark parallel_for_work_item calls:
2534
- MarkWIScopeFnVisitor MarkWIScope (SemaRef.getASTContext ());
2535
- MarkWIScope.TraverseDecl (WGLambdaFn);
2536
- // Now mark local variables declared in the PFWG lambda with work group
2537
- // scope attribute
2538
- addScopeAttrToLocalVars (*WGLambdaFn);
2539
- }
2508
+ void annotateHierarchicalParallelismAPICalls () {
2509
+ // Is this a hierarchical parallelism kernel invocation?
2510
+ if (getKernelInvocationKind (KernelCallerFunc) != InvokeParallelForWorkGroup)
2511
+ return ;
2512
+
2513
+ // Mark kernel object with work-group scope attribute to avoid work-item
2514
+ // scope memory allocation.
2515
+ KernelObjClone->addAttr (SYCLScopeAttr::CreateImplicit (
2516
+ SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2517
+
2518
+ // Fetch the kernel object and the associated call operator
2519
+ // (of either the lambda or the function object).
2520
+ CXXRecordDecl *KernelObj =
2521
+ GetSYCLKernelObjectType (KernelCallerFunc)->getAsCXXRecordDecl ();
2522
+ CXXMethodDecl *WGLambdaFn = nullptr ;
2523
+ if (KernelObj->isLambda ())
2524
+ WGLambdaFn = KernelObj->getLambdaCallOperator ();
2525
+ else
2526
+ WGLambdaFn = getOperatorParens (KernelObj);
2527
+ assert (WGLambdaFn && " non callable object is passed as kernel obj" );
2528
+ // Mark the function as one that "works" in a work group scope:
2529
+ // NOTE: In case of parallel_for_work_item the marker call itself is
2530
+ // marked with work item scope attribute, here the '()' operator of the
2531
+ // object passed as parameter is marked. This is an optimization -
2532
+ // there are a lot of locals created at parallel_for_work_group
2533
+ // scope before calling the lambda - it is more efficient to have
2534
+ // all of them in the private address space rather than sharing via
2535
+ // the local AS. See parallel_for_work_group implementation in the
2536
+ // SYCL headers.
2537
+ if (!WGLambdaFn->hasAttr <SYCLScopeAttr>()) {
2538
+ WGLambdaFn->addAttr (SYCLScopeAttr::CreateImplicit (
2539
+ SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2540
+ // Search and mark parallel_for_work_item calls:
2541
+ MarkWIScopeFnVisitor MarkWIScope (SemaRef.getASTContext ());
2542
+ MarkWIScope.TraverseDecl (WGLambdaFn);
2543
+ // Now mark local variables declared in the PFWG lambda with work group
2544
+ // scope attribute
2545
+ addScopeAttrToLocalVars (*WGLambdaFn);
2540
2546
}
2541
2547
}
2542
2548
@@ -2768,13 +2774,11 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
2768
2774
TypeSourceInfo *TSInfo =
2769
2775
KernelObj->isLambda () ? KernelObj->getLambdaTypeInfo () : nullptr ;
2770
2776
auto Type = QualType (KernelObj->getTypeForDecl (), 0 );
2771
- Type->getAsRecordDecl ()->setAnonymousStructOrUnion (true );
2777
+ if (KernelObj->isLambda ())
2778
+ Type->getAsRecordDecl ()->setAnonymousStructOrUnion (true );
2772
2779
VarDecl *VD = VarDecl::Create (
2773
2780
Ctx, DC, KernelObj->getLocation (), KernelObj->getLocation (),
2774
2781
KernelObj->getIdentifier (), Type, TSInfo, SC_None);
2775
- if (getKernelInvocationKind (KernelCallerFunc) == InvokeParallelForWorkGroup)
2776
- VD->addAttr (
2777
- SYCLScopeAttr::CreateImplicit (Ctx, SYCLScopeAttr::Level::WorkGroup));
2778
2782
return VD;
2779
2783
}
2780
2784
@@ -2856,7 +2860,7 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
2856
2860
KernelObj(KernelObj), KernelCallerFunc(KernelCallerFunc),
2857
2861
KernelCallerSrcLoc(KernelCallerFunc->getLocation ()) {
2858
2862
CollectionInitExprs.push_back (createInitListExpr (KernelObj));
2859
- markParallelWorkItemCalls ();
2863
+ annotateHierarchicalParallelismAPICalls ();
2860
2864
2861
2865
Stmt *DS = new (S.Context ) DeclStmt (DeclGroupRef (KernelObjClone),
2862
2866
KernelCallerSrcLoc, KernelCallerSrcLoc);
0 commit comments