From a23f9adcbaaf7be5ee02b8e5a9c59d05ebd13710 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Wed, 28 Oct 2020 17:41:43 -0700 Subject: [PATCH 01/24] [SYCL] Parallel-for range rounding-up for improved group size selection by GPU driver. Signed-off-by: rdeodhar --- clang/lib/Sema/SemaSYCL.cpp | 16 ++++++++++ sycl/include/CL/sycl/handler.hpp | 53 ++++++++++++++++++++++++++------ sycl/include/CL/sycl/id.hpp | 2 ++ sycl/include/CL/sycl/item.hpp | 2 ++ sycl/include/CL/sycl/range.hpp | 3 ++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index d08f4bc9ae614..86984ec8a993a 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -510,6 +510,22 @@ class MarkDeviceFunction : public RecursiveASTVisitor { FunctionDecl *FD = WorkList.back().first; FunctionDecl *ParentFD = WorkList.back().second; + // To implement rounding-up of a parallel-for range (Jira 20239) + // a kernel call is modified like this: + // auto Wrapper = [=](TransformedArgType Arg) { + // if (Arg[0] >= NumWorkItems[0]) + // return; + // Arg.set_allowed_range(NumWorkItems); + // KernelFunc(Arg); + // }; + // + // This transformation leads to a condition where a kernel body + // function becomes callable from a new kernel body function. + // Hence this test. + if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) { + KernelBody = FD; + } + if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) { assert(!KernelBody && "inconsistent call graph - only one kernel body " "function can be called"); diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 890fa1041293d..d86285cf7cd8f 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -728,23 +728,58 @@ class __SYCL_EXPORT handler { void parallel_for_lambda_impl(range NumWorkItems, KernelType KernelFunc) { throwIfActionIsCreated(); - using NameT = - typename detail::get_kernel_name_t::name; using LambdaArgType = sycl::detail::lambda_arg_type>; + + // If 1D kernel argument is an integral type, convert it to sycl::item<1> using TransformedArgType = typename std::conditional::value && Dims == 1, item, LambdaArgType>::type; + using NameT = + typename detail::get_kernel_name_t::name; + constexpr size_t GoodLocalSizeX = 32; + std::string KName = typeid(NameT *).name(); + bool DisableRounding = + KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos; + if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) { + // Not a multiple + size_t NewValX = + ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * + GoodLocalSizeX; + if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr) + std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to " + << NewValX << " *****\n"; + auto Wrapper = [=](TransformedArgType Arg) { + if (Arg[0] >= NumWorkItems[0]) + return; + Arg.set_allowed_range(NumWorkItems); + KernelFunc(Arg); + }; + + using NameWT = NameT *; + range AdjustedRange = NumWorkItems; + AdjustedRange.set_range(NewValX); #ifdef __SYCL_DEVICE_ONLY__ - (void)NumWorkItems; - kernel_parallel_for(KernelFunc); + kernel_parallel_for(Wrapper); #else - detail::checkValueRange(NumWorkItems); - MNDRDesc.set(std::move(NumWorkItems)); - StoreLambda( - std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + detail::checkValueRange(AdjustedRange); + MNDRDesc.set(std::move(AdjustedRange)); + StoreLambda( + std::move(Wrapper)); + MCGType = detail::CG::KERNEL; #endif + } else { +#ifdef __SYCL_DEVICE_ONLY__ + (void)NumWorkItems; + kernel_parallel_for(KernelFunc); +#else + detail::checkValueRange(NumWorkItems); + MNDRDesc.set(std::move(NumWorkItems)); + StoreLambda( + std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } } /// Defines and invokes a SYCL kernel function for the specified range. diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp index 16d176b8b698d..79f35aca19c40 100644 --- a/sycl/include/CL/sycl/id.hpp +++ b/sycl/include/CL/sycl/id.hpp @@ -94,6 +94,8 @@ template class id : public detail::array { return result; } + void set_allowed_range(range rnwi) { (void)rnwi[0]; } + #ifndef __SYCL_DISABLE_ID_TO_INT_CONV__ /* Template operator is not allowed because it disables further type * conversion. For example, the next code will not work in case of template diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp index 9d9a879815294..a5409794dfdcf 100644 --- a/sycl/include/CL/sycl/item.hpp +++ b/sycl/include/CL/sycl/item.hpp @@ -104,6 +104,8 @@ template class item { bool operator!=(const item &rhs) const { return rhs.MImpl != MImpl; } + void set_allowed_range(const range rnwi) { MImpl.MExtent = rnwi; } + protected: template item(detail::enable_if_t> &extent, diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp index 2745a05667a31..8a92bd2c32c4d 100644 --- a/sycl/include/CL/sycl/range.hpp +++ b/sycl/include/CL/sycl/range.hpp @@ -62,6 +62,9 @@ template class range : public detail::array { return size; } + // Adjust the first dim of the range + void set_range(const size_t dim0) { this->common_array[0] = dim0; } + range(const range &rhs) = default; range(range &&rhs) = default; range &operator=(const range &rhs) = default; From 73f50bd487a5305a0a3e34117ada09e96958fa42 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Tue, 3 Nov 2020 17:47:57 -0800 Subject: [PATCH 02/24] Correction to wrapper kernel name. --- sycl/include/CL/sycl/handler.hpp | 69 ++++++++++++++------------------ 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index d86285cf7cd8f..35233bc24d4aa 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -120,6 +120,14 @@ template struct get_kernel_name_t { using name = Type; }; +// Used when parallel_for range is rounded-up. +template class __pf_wrapper_kernel; + +template struct get_kernel_wrapper_name_t { + using name = __pf_wrapper_kernel< + typename get_kernel_name_t::name>; +}; + template struct check_fn_signature { static_assert(std::integral_constant::value, "Second template parameter is required to be of function type"); @@ -738,48 +746,31 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; constexpr size_t GoodLocalSizeX = 32; - std::string KName = typeid(NameT *).name(); - bool DisableRounding = - KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos; - if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) { - // Not a multiple - size_t NewValX = - ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * - GoodLocalSizeX; - if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr) - std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to " - << NewValX << " *****\n"; - auto Wrapper = [=](TransformedArgType Arg) { - if (Arg[0] >= NumWorkItems[0]) - return; - Arg.set_allowed_range(NumWorkItems); - KernelFunc(Arg); - }; - - using NameWT = NameT *; - range AdjustedRange = NumWorkItems; - AdjustedRange.set_range(NewValX); -#ifdef __SYCL_DEVICE_ONLY__ - kernel_parallel_for(Wrapper); -#else - detail::checkValueRange(AdjustedRange); - MNDRDesc.set(std::move(AdjustedRange)); - StoreLambda( - std::move(Wrapper)); - MCGType = detail::CG::KERNEL; -#endif - } else { + // Round-up just the first dimension of the range. + // Even for multi-dimensional ranges the total number of work items will + // be a multiple of the rounding factor since any multiple of a rounded-up + // first dimension will also be a rounded-up value. + size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * + GoodLocalSizeX; + auto Wrapper = [=](TransformedArgType Arg) { + if (Arg[0] >= NumWorkItems[0]) + return; + Arg.set_allowed_range(NumWorkItems); + KernelFunc(Arg); + }; + + using NameWT = typename detail::get_kernel_wrapper_name_t::name; + range AdjustedRange = NumWorkItems; + AdjustedRange.set_range(NewValX); #ifdef __SYCL_DEVICE_ONLY__ - (void)NumWorkItems; - kernel_parallel_for(KernelFunc); + kernel_parallel_for(Wrapper); #else - detail::checkValueRange(NumWorkItems); - MNDRDesc.set(std::move(NumWorkItems)); - StoreLambda( - std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + detail::checkValueRange(AdjustedRange); + MNDRDesc.set(std::move(AdjustedRange)); + StoreLambda( + std::move(Wrapper)); + MCGType = detail::CG::KERNEL; #endif - } } /// Defines and invokes a SYCL kernel function for the specified range. From 2aad33d0bd2ee75b8ba93541717aa481de9d683c Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Wed, 4 Nov 2020 18:31:25 -0800 Subject: [PATCH 03/24] Test correction to backend interoperability interface. --- sycl/include/CL/sycl/handler.hpp | 13 +++++++------ sycl/test/basic_tests/handler/handler_set_args.cpp | 8 ++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 35233bc24d4aa..5986c37115f93 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -745,13 +745,17 @@ class __SYCL_EXPORT handler { item, LambdaArgType>::type; using NameT = typename detail::get_kernel_name_t::name; - constexpr size_t GoodLocalSizeX = 32; + // Round-up just the first dimension of the range. // Even for multi-dimensional ranges the total number of work items will // be a multiple of the rounding factor since any multiple of a rounded-up // first dimension will also be a rounded-up value. + constexpr size_t GoodLocalSizeX = 32; + range AdjustedRange = NumWorkItems; size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * GoodLocalSizeX; + AdjustedRange.set_range(NewValX); + auto Wrapper = [=](TransformedArgType Arg) { if (Arg[0] >= NumWorkItems[0]) return; @@ -759,15 +763,12 @@ class __SYCL_EXPORT handler { KernelFunc(Arg); }; - using NameWT = typename detail::get_kernel_wrapper_name_t::name; - range AdjustedRange = NumWorkItems; - AdjustedRange.set_range(NewValX); #ifdef __SYCL_DEVICE_ONLY__ - kernel_parallel_for(Wrapper); + kernel_parallel_for(Wrapper); #else detail::checkValueRange(AdjustedRange); MNDRDesc.set(std::move(AdjustedRange)); - StoreLambda( + StoreLambda( std::move(Wrapper)); MCGType = detail::CG::KERNEL; #endif diff --git a/sycl/test/basic_tests/handler/handler_set_args.cpp b/sycl/test/basic_tests/handler/handler_set_args.cpp index 6e15ecc6fa836..3bb984ad831eb 100644 --- a/sycl/test/basic_tests/handler/handler_set_args.cpp +++ b/sycl/test/basic_tests/handler/handler_set_args.cpp @@ -115,7 +115,7 @@ int main() { getPrebuiltKernel>(Queue); checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); + cgh.set_args(Range, acc); cgh.parallel_for(Range, preBuiltKernel); }); } @@ -135,7 +135,7 @@ int main() { getPrebuiltKernel>(Queue); checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); + cgh.set_args(Range, acc); cgh.parallel_for(Range, preBuiltKernel); }); } @@ -174,7 +174,7 @@ int main() { getPrebuiltKernel>(Queue); checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); + cgh.set_args(Range, acc); cgh.parallel_for( preBuiltKernel, Range, [=](cl::sycl::id<1> id) { acc[0] = 10; }); }); @@ -197,7 +197,7 @@ int main() { getPrebuiltKernel>(Queue); checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); + cgh.set_args(Range, acc); cgh.parallel_for( preBuiltKernel, Range, [=](cl::sycl::item<1> item) { acc[0] = 10; }); }); From ac6bf289e679815f1ceac496cbac09e95aacbbb2 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Thu, 5 Nov 2020 16:09:43 -0800 Subject: [PATCH 04/24] Environment var control to disable optimization; correction to one test. --- sycl/include/CL/sycl/handler.hpp | 28 ++++++++++++++++++- .../free_function_queries.cpp | 5 +++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 5986c37115f93..b05cbc32b470b 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -732,6 +732,8 @@ class __SYCL_EXPORT handler { /// /// \param NumWorkItems is a range defining indexing space. /// \param KernelFunc is a SYCL kernel function. +#ifndef __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ +#define __SYCL_PFWG_ROUNDED_SIZE__ 32 template void parallel_for_lambda_impl(range NumWorkItems, KernelType KernelFunc) { @@ -750,7 +752,7 @@ class __SYCL_EXPORT handler { // Even for multi-dimensional ranges the total number of work items will // be a multiple of the rounding factor since any multiple of a rounded-up // first dimension will also be a rounded-up value. - constexpr size_t GoodLocalSizeX = 32; + constexpr size_t GoodLocalSizeX = __SYCL_PFWG_ROUNDED_SIZE__; range AdjustedRange = NumWorkItems; size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * GoodLocalSizeX; @@ -773,6 +775,30 @@ class __SYCL_EXPORT handler { MCGType = detail::CG::KERNEL; #endif } +#else // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ + template + void parallel_for_lambda_impl(range NumWorkItems, + KernelType KernelFunc) { + throwIfActionIsCreated(); + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = + typename std::conditional::value && + Dims == 1, + item, LambdaArgType>::type; +#ifdef __SYCL_DEVICE_ONLY__ + (void)NumWorkItems; + kernel_parallel_for(KernelFunc); +#else + detail::checkValueRange(NumWorkItems); + MNDRDesc.set(std::move(NumWorkItems)); + StoreLambda( + std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } +#endif // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ /// Defines and invokes a SYCL kernel function for the specified range. /// diff --git a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp index c410261d06e03..a6f81c98e7a92 100644 --- a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp +++ b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp @@ -47,7 +47,8 @@ int main() { auto that_item = sycl::this_item<1>(); results_acc[1] = that_item.get_id() == i; - results_acc[2] = that_item.get_range() == sycl::range<1>(n); + results_acc[2] = that_item.get_range() == + sycl::range<1>(__SYCL_PFWG_ROUNDED_SIZE__); acc[i]++; }); }); @@ -81,6 +82,8 @@ int main() { auto that_id = sycl::this_id<1>(); results_acc[0] = i.get_id() == that_id; auto that_item = sycl::this_item<1>(); + i.set_allowed_range(__SYCL_PFWG_ROUNDED_SIZE__); + that_item.set_allowed_range(__SYCL_PFWG_ROUNDED_SIZE__); results_acc[1] = i == that_item; acc[i]++; }); From da1a3ab2ffa49f25a06896d6346ca24eb69c9ece Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Mon, 16 Nov 2020 13:59:08 -0800 Subject: [PATCH 05/24] Fixes for set_args and this_item usage, and for review comments. --- clang/include/clang/Sema/Sema.h | 6 ++ clang/lib/Sema/SemaSYCL.cpp | 97 ++++++++++++++++++- sycl/include/CL/sycl/detail/kernel_desc.hpp | 2 + sycl/include/CL/sycl/handler.hpp | 102 +++++++++++--------- sycl/include/CL/sycl/id.hpp | 6 +- sycl/include/CL/sycl/item.hpp | 5 +- sycl/include/CL/sycl/range.hpp | 11 ++- 7 files changed, 171 insertions(+), 58 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index b1efcf73cf584..4bb5a3716b3c9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -345,6 +345,9 @@ class SYCLIntegrationHeader { /// Registers a specialization constant to emit info for it into the header. void addSpecConstant(StringRef IDName, QualType IDType); + /// Notes that this_item is called within the kernel. + void setCallsThisItem(bool B); + private: // Kernel actual parameter descriptor. struct KernelParamDesc { @@ -378,6 +381,9 @@ class SYCLIntegrationHeader { /// Descriptor of kernel actual parameters. SmallVector Params; + // Whether kernel calls this_item() + bool CallsThisItem; + KernelDesc() = default; }; diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 71e862eb21ab7..033725e47ef61 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -510,7 +510,7 @@ class MarkDeviceFunction : public RecursiveASTVisitor { FunctionDecl *FD = WorkList.back().first; FunctionDecl *ParentFD = WorkList.back().second; - // To implement rounding-up of a parallel-for range (Jira 20239) + // To implement rounding-up of a parallel-for range // a kernel call is modified like this: // auto Wrapper = [=](TransformedArgType Arg) { // if (Arg[0] >= NumWorkItems[0]) @@ -643,6 +643,40 @@ class FindPFWGLambdaFnVisitor const CXXRecordDecl *LambdaObjTy; }; +// Searches for a call to PF lambda function and captures it. +class FindPFLambdaFnVisitor + : public RecursiveASTVisitor { +public: + // LambdaObjTy - lambda type of the PF lambda object + FindPFLambdaFnVisitor(const CXXRecordDecl* LambdaObjTy) + : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {} + + bool VisitCallExpr(CallExpr* Call) { + auto* M = dyn_cast(Call->getDirectCallee()); + if (!M || (M->getOverloadedOperator() != OO_Call)) + return true; + const int NumPFLambdaArgs = 2; // range and lambda obj + if (Call->getNumArgs() != NumPFLambdaArgs) + return true; + QualType Range = Call->getArg(1)->getType(); + if (!Util::isSyclType(Range, "id", true /*Tmpl*/) && + !Util::isSyclType(Range, "item", true /*Tmpl*/)) + return true; + if (Call->getArg(0)->getType()->getAsCXXRecordDecl() != LambdaObjTy) + return true; + LambdaFn = M; // call to PF lambda found - record the lambda + return false; // ... and stop searching + } + + // Returns the captured lambda function or nullptr; + CXXMethodDecl* getLambdaFn() const { return LambdaFn; } + +private: + CXXMethodDecl* LambdaFn; + const CXXRecordDecl* LambdaObjTy; +}; + + class MarkWIScopeFnVisitor : public RecursiveASTVisitor { public: MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {} @@ -2655,13 +2689,61 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { return !SemaRef.getASTContext().hasSameType(FD->getType(), Ty); } + // Sets a flag if the kernel is a parallel_for that calls the + // free function API "this_item". + void setThisItemIsCalled(const CXXRecordDecl *KernelObj, + FunctionDecl *KernelFunc) { + if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor) + return; + + FindPFLambdaFnVisitor V(KernelObj); + V.TraverseStmt(KernelFunc->getBody()); + CXXMethodDecl *WGLambdaFn = V.getLambdaFn(); + if (!WGLambdaFn) + return; + + // The call graph for this translation unit. + CallGraph SYCLCG; + SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl()); + typedef std::pair ChildParentPair; + llvm::SmallPtrSet Visited; + llvm::SmallVector WorkList; + WorkList.push_back({WGLambdaFn, nullptr}); + + while (!WorkList.empty()) { + FunctionDecl *FD = WorkList.back().first; + WorkList.pop_back(); + if (!Visited.insert(FD).second) + continue; // We've already seen this Decl + + if (FD->isFunctionOrMethod() && FD->getIdentifier() && + !FD->getName().empty() && "this_item" == FD->getName()) { + Header.setCallsThisItem(true); + return; + } + + CallGraphNode *N = SYCLCG.getNode(FD); + if (!N) + continue; + + for (const CallGraphNode *CI : *N) { + if (auto *Callee = dyn_cast(CI->getDecl())) { + Callee = Callee->getMostRecentDecl(); + if (!Visited.count(Callee)) + WorkList.push_back({Callee, FD}); + } + } + } + } + public: static constexpr const bool VisitInsideSimpleContainers = false; SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H, const CXXRecordDecl *KernelObj, QualType NameType, - StringRef Name, StringRef StableName) + StringRef Name, StringRef StableName, FunctionDecl* KernelFunc) : SyclKernelFieldHandler(S), Header(H) { Header.startKernel(Name, NameType, StableName, KernelObj->getLocation()); + setThisItemIsCalled(KernelObj, KernelFunc); } bool handleSyclAccessorType(const CXXRecordDecl *RD, @@ -3085,7 +3167,7 @@ void Sema::ConstructOpenCLKernel(FunctionDecl *KernelCallerFunc, SyclKernelIntHeaderCreator int_header( *this, getSyclIntegrationHeader(), KernelObj, calculateKernelNameType(Context, KernelCallerFunc), KernelName, - StableName); + StableName, KernelCallerFunc); KernelObjVisitor Visitor{*this}; Visitor.VisitRecordBases(KernelObj, kernel_decl, kernel_body, int_header); @@ -3793,6 +3875,9 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "getParamDesc(unsigned i) {\n"; O << " return kernel_signatures[i+" << CurStart << "];\n"; O << " }\n"; + O << " __SYCL_DLL_LOCAL\n"; + O << " static constexpr bool callsThisItem() { return "; + O << K.CallsThisItem << "; }\n"; O << "};\n"; CurStart += N; } @@ -3849,6 +3934,12 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) { SpecConsts.emplace_back(std::make_pair(IDType, IDName.str())); } +void SYCLIntegrationHeader::setCallsThisItem(bool B) { + auto *K = getCurKernelDesc(); + assert(K && "no kernels"); + K->CallsThisItem = B; +} + SYCLIntegrationHeader::SYCLIntegrationHeader(DiagnosticsEngine &_Diag, bool _UnnamedLambdaSupport, Sema &_S) diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp index a7f73d9d19f70..a5153c1fa4da0 100644 --- a/sycl/include/CL/sycl/detail/kernel_desc.hpp +++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp @@ -56,6 +56,7 @@ template struct KernelInfo { return Dummy; } static constexpr const char *getName() { return ""; } + static bool callsThisItem() { return false; } }; #else template struct KernelInfoData { @@ -65,6 +66,7 @@ template struct KernelInfoData { return Dummy; } static constexpr const char *getName() { return ""; } + static bool callsThisItem() { return false; } }; // C++14 like index_sequence and make_index_sequence diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index b05cbc32b470b..f5d6993398665 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -121,10 +121,10 @@ template struct get_kernel_name_t { }; // Used when parallel_for range is rounded-up. -template class __pf_wrapper_kernel; +template class __pf_kernel_wrapper; template struct get_kernel_wrapper_name_t { - using name = __pf_wrapper_kernel< + using name = __pf_kernel_wrapper< typename get_kernel_name_t::name>; }; @@ -732,8 +732,6 @@ class __SYCL_EXPORT handler { /// /// \param NumWorkItems is a range defining indexing space. /// \param KernelFunc is a SYCL kernel function. -#ifndef __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ -#define __SYCL_PFWG_ROUNDED_SIZE__ 32 template void parallel_for_lambda_impl(range NumWorkItems, KernelType KernelFunc) { @@ -748,57 +746,65 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; - // Round-up just the first dimension of the range. - // Even for multi-dimensional ranges the total number of work items will - // be a multiple of the rounding factor since any multiple of a rounded-up - // first dimension will also be a rounded-up value. - constexpr size_t GoodLocalSizeX = __SYCL_PFWG_ROUNDED_SIZE__; - range AdjustedRange = NumWorkItems; - size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * - GoodLocalSizeX; - AdjustedRange.set_range(NewValX); - - auto Wrapper = [=](TransformedArgType Arg) { - if (Arg[0] >= NumWorkItems[0]) - return; - Arg.set_allowed_range(NumWorkItems); - KernelFunc(Arg); - }; - + // A reasonable choice for rounding up the range is 32. + constexpr size_t GoodLocalSizeX = 32; + + // Disable the rounding-up optimizations under these conditions: + // 1. The env var SYCL_OPT_PFWGS_DISABLE is set + // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name. + // 3. The kernel is created and invoked without an integration header entry. + // 4. The API "this_item" is used inside the kernel. + // 5. The range is already a multiple of the rounding factor. + + // Get the kernal name to check condition 3. + std::string KName = typeid(NameT *).name(); + using KI = detail::KernelInfo; + bool DisableRounding = + (getenv("SYCL_OPT_PFWGS_DISABLE") != nullptr) || + (KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos) || + (KI::getName() == nullptr || KI::getName()[0] == '\0') || + (KI::callsThisItem()); + + if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) { + // Range is not a multiple and rounding-up is allowed + size_t NewValX = + ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * + GoodLocalSizeX; + using NameWT = typename detail::get_kernel_wrapper_name_t::name; + if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr) + std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to " + << NewValX << " *****\n"; + auto Wrapper = [=](TransformedArgType Arg) { + if (Arg[0] >= NumWorkItems[0]) + return; + Arg.set_allowed_range(NumWorkItems); + KernelFunc(Arg); + }; + + range AdjustedRange = NumWorkItems; + AdjustedRange.set_range(NewValX); #ifdef __SYCL_DEVICE_ONLY__ - kernel_parallel_for(Wrapper); + kernel_parallel_for(Wrapper); #else - detail::checkValueRange(AdjustedRange); - MNDRDesc.set(std::move(AdjustedRange)); - StoreLambda( - std::move(Wrapper)); - MCGType = detail::CG::KERNEL; + detail::checkValueRange(AdjustedRange); + MNDRDesc.set(std::move(AdjustedRange)); + StoreLambda( + std::move(Wrapper)); + MCGType = detail::CG::KERNEL; #endif - } -#else // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ - template - void parallel_for_lambda_impl(range NumWorkItems, - KernelType KernelFunc) { - throwIfActionIsCreated(); - using NameT = - typename detail::get_kernel_name_t::name; - using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = - typename std::conditional::value && - Dims == 1, - item, LambdaArgType>::type; + } else { #ifdef __SYCL_DEVICE_ONLY__ - (void)NumWorkItems; - kernel_parallel_for(KernelFunc); + (void)NumWorkItems; + kernel_parallel_for(KernelFunc); #else - detail::checkValueRange(NumWorkItems); - MNDRDesc.set(std::move(NumWorkItems)); - StoreLambda( - std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + detail::checkValueRange(NumWorkItems); + MNDRDesc.set(std::move(NumWorkItems)); + StoreLambda( + std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; #endif + } } -#endif // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__ /// Defines and invokes a SYCL kernel function for the specified range. /// diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp index 79f35aca19c40..151657aa661e8 100644 --- a/sycl/include/CL/sycl/id.hpp +++ b/sycl/include/CL/sycl/id.hpp @@ -94,8 +94,6 @@ template class id : public detail::array { return result; } - void set_allowed_range(range rnwi) { (void)rnwi[0]; } - #ifndef __SYCL_DISABLE_ID_TO_INT_CONV__ /* Template operator is not allowed because it disables further type * conversion. For example, the next code will not work in case of template @@ -241,6 +239,10 @@ template class id : public detail::array { __SYCL_GEN_OPT(^=) #undef __SYCL_GEN_OPT + +private: + friend class handler; + void set_allowed_range(range rnwi) { (void)rnwi[0]; } }; namespace detail { diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp index a5409794dfdcf..a8aa9c8ef09f5 100644 --- a/sycl/include/CL/sycl/item.hpp +++ b/sycl/include/CL/sycl/item.hpp @@ -104,8 +104,6 @@ template class item { bool operator!=(const item &rhs) const { return rhs.MImpl != MImpl; } - void set_allowed_range(const range rnwi) { MImpl.MExtent = rnwi; } - protected: template item(detail::enable_if_t> &extent, @@ -120,6 +118,9 @@ template class item { friend class detail::Builder; private: + friend class handler; + void set_allowed_range(const range rnwi) { MImpl.MExtent = rnwi; } + detail::ItemBase MImpl; }; diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp index 8a92bd2c32c4d..3f08a67e7c207 100644 --- a/sycl/include/CL/sycl/range.hpp +++ b/sycl/include/CL/sycl/range.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -62,9 +63,6 @@ template class range : public detail::array { return size; } - // Adjust the first dim of the range - void set_range(const size_t dim0) { this->common_array[0] = dim0; } - range(const range &rhs) = default; range(range &&rhs) = default; range &operator=(const range &rhs) = default; @@ -144,6 +142,13 @@ template class range : public detail::array { __SYCL_GEN_OPT(^=) #undef __SYCL_GEN_OPT + +private: + friend class handler; + friend class detail::Builder; + + // Adjust the first dim of the range + void set_range(const size_t dim0) { this->common_array[0] = dim0; } }; #ifdef __cpp_deduction_guides From eaacd8a0faba0e5d6175e1de667efccad6ab43d3 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Mon, 16 Nov 2020 15:19:28 -0800 Subject: [PATCH 06/24] Formatting changes. --- clang/lib/Sema/SemaSYCL.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 69d53b0074c58..2f7bb6f7669fd 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -659,14 +659,14 @@ class FindPFWGLambdaFnVisitor // Searches for a call to PF lambda function and captures it. class FindPFLambdaFnVisitor - : public RecursiveASTVisitor { + : public RecursiveASTVisitor { public: // LambdaObjTy - lambda type of the PF lambda object - FindPFLambdaFnVisitor(const CXXRecordDecl* LambdaObjTy) - : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {} + FindPFLambdaFnVisitor(const CXXRecordDecl *LambdaObjTy) + : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {} - bool VisitCallExpr(CallExpr* Call) { - auto* M = dyn_cast(Call->getDirectCallee()); + bool VisitCallExpr(CallExpr *Call) { + auto *M = dyn_cast(Call->getDirectCallee()); if (!M || (M->getOverloadedOperator() != OO_Call)) return true; const int NumPFLambdaArgs = 2; // range and lambda obj @@ -683,11 +683,11 @@ class FindPFLambdaFnVisitor } // Returns the captured lambda function or nullptr; - CXXMethodDecl* getLambdaFn() const { return LambdaFn; } + CXXMethodDecl *getLambdaFn() const { return LambdaFn; } private: - CXXMethodDecl* LambdaFn; - const CXXRecordDecl* LambdaObjTy; + CXXMethodDecl *LambdaFn; + const CXXRecordDecl *LambdaObjTy; }; @@ -2754,7 +2754,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { static constexpr const bool VisitInsideSimpleContainers = false; SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H, const CXXRecordDecl *KernelObj, QualType NameType, - StringRef Name, StringRef StableName, FunctionDecl* KernelFunc) + StringRef Name, StringRef StableName, + FunctionDecl *KernelFunc) : SyclKernelFieldHandler(S), Header(H) { Header.startKernel(Name, NameType, StableName, KernelObj->getLocation()); setThisItemIsCalled(KernelObj, KernelFunc); From 535745fbe80113f3efaf869725f535e174434120 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Mon, 16 Nov 2020 15:24:27 -0800 Subject: [PATCH 07/24] Formatting change. --- clang/lib/Sema/SemaSYCL.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 2f7bb6f7669fd..4460581df7ef5 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -690,7 +690,6 @@ class FindPFLambdaFnVisitor const CXXRecordDecl *LambdaObjTy; }; - class MarkWIScopeFnVisitor : public RecursiveASTVisitor { public: MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {} From 5c6c841ee44231fdf8e5511433bd58970f05cc3c Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Mon, 16 Nov 2020 15:46:37 -0800 Subject: [PATCH 08/24] Removed unneeded files. --- .../free_function_queries.cpp | 173 ------------- .../basic_tests/handler/handler_set_args.cpp | 230 ------------------ 2 files changed, 403 deletions(-) delete mode 100644 sycl/test/basic_tests/free_function_queries/free_function_queries.cpp delete mode 100644 sycl/test/basic_tests/handler/handler_set_args.cpp diff --git a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp deleted file mode 100644 index c410261d06e03..0000000000000 --- a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: env SYCL_DEVICE_TYPE=HOST %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out - -// TODO: Support global work offset on Level Zero. -// XFAIL: level_zero - -//==- free_function_queries.cpp - SYCL free function queries test -=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===------------------------------------------------------------------------===// - -#include - -#include -#include -#include - -int main() { - constexpr std::size_t n = 10; - - int data[n]{}; - int counter{0}; - - { - constexpr int checks_number{3}; - int results[checks_number]{}; - { - sycl::buffer buf(data, sycl::range<1>(n)); - sycl::buffer results_buf(results, sycl::range<1>(checks_number)); - sycl::queue q; - q.submit([&](cl::sycl::handler &cgh) { - sycl::accessor - acc(buf.get_access(cgh)); - sycl::accessor - results_acc(results_buf.get_access(cgh)); - cgh.parallel_for(n, [=](sycl::id<1> i) { - auto that_id = sycl::this_id<1>(); - results_acc[0] = that_id == i; - - auto that_item = sycl::this_item<1>(); - results_acc[1] = that_item.get_id() == i; - results_acc[2] = that_item.get_range() == sycl::range<1>(n); - acc[i]++; - }); - }); - } - ++counter; - for (int i = 0; i < n; i++) { - assert(data[i] == counter); - } - for (auto val : results) { - assert(val == 1); - } - } - - { - constexpr int checks_number{2}; - int results[checks_number]{}; - { - sycl::buffer buf(data, sycl::range<1>(n)); - sycl::buffer results_buf(results, sycl::range<1>(checks_number)); - sycl::queue q; - q.submit([&](cl::sycl::handler &cgh) { - sycl::accessor - acc(buf.get_access(cgh)); - sycl::accessor - results_acc(results_buf.get_access(cgh)); - cgh.parallel_for(n, [=](auto i) { - static_assert(std::is_same>::value, - "lambda arg type is unexpected"); - auto that_id = sycl::this_id<1>(); - results_acc[0] = i.get_id() == that_id; - auto that_item = sycl::this_item<1>(); - results_acc[1] = i == that_item; - acc[i]++; - }); - }); - } - ++counter; - for (int i = 0; i < n; i++) { - assert(data[i] == counter); - } - for (auto val : results) { - assert(val == 1); - } - } - - { - constexpr int checks_number{2}; - int results[checks_number]{}; - { - sycl::buffer buf(data, sycl::range<1>(n)); - sycl::buffer results_buf(results, sycl::range<1>(checks_number)); - sycl::queue q; - sycl::id<1> offset(1); - q.submit([&](cl::sycl::handler &cgh) { - sycl::accessor - acc(buf.get_access(cgh)); - sycl::accessor - results_acc(results_buf.get_access(cgh)); - cgh.parallel_for( - sycl::range<1>{n}, offset, [=](sycl::item<1, true> i) { - auto that_id = sycl::this_id<1>(); - results_acc[0] = i.get_id() == that_id; - auto that_item = sycl::this_item<1>(); - results_acc[1] = i == that_item; - acc[that_item.get_linear_id()]++; - }); - }); - } - ++counter; - for (int i = 0; i < n; i++) { - assert(data[i] == counter); - } - for (auto val : results) { - assert(val == 1); - } - } - - { - constexpr int checks_number{5}; - int results[checks_number]{}; - { - sycl::buffer buf(data, sycl::range<1>(n)); - sycl::buffer results_buf(results, sycl::range<1>(checks_number)); - sycl::queue q; - sycl::nd_range<1> NDR(sycl::range<1>{n}, sycl::range<1>{2}); - q.submit([&](cl::sycl::handler &cgh) { - sycl::accessor - acc(buf.get_access(cgh)); - sycl::accessor - results_acc(results_buf.get_access(cgh)); - cgh.parallel_for(NDR, [=](auto nd_i) { - static_assert(std::is_same>::value, - "lambda arg type is unexpected"); - auto that_nd_item = sycl::this_nd_item<1>(); - results_acc[0] = that_nd_item == nd_i; - auto nd_item_group = that_nd_item.get_group(); - results_acc[1] = nd_item_group == sycl::this_group<1>(); - auto nd_item_id = that_nd_item.get_global_id(); - results_acc[2] = nd_item_id == sycl::this_id<1>(); - auto that_item = sycl::this_item<1>(); - results_acc[3] = nd_item_id == that_item.get_id(); - results_acc[4] = - that_nd_item.get_global_range() == that_item.get_range(); - - acc[that_nd_item.get_global_id(0)]++; - }); - }); - } - ++counter; - for (int i = 0; i < n; i++) { - assert(data[i] == counter); - } - for (auto val : results) { - assert(val == 1); - } - } -} diff --git a/sycl/test/basic_tests/handler/handler_set_args.cpp b/sycl/test/basic_tests/handler/handler_set_args.cpp deleted file mode 100644 index 6e15ecc6fa836..0000000000000 --- a/sycl/test/basic_tests/handler/handler_set_args.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out -//==--------------- handler_set_args.cpp -------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -constexpr bool UseOffset = true; -constexpr bool NoOffset = false; -const cl::sycl::range<1> Range = 1; - -using AccessorT = cl::sycl::accessor; - -struct SingleTaskFunctor { - SingleTaskFunctor(AccessorT acc) : MAcc(acc) {} - - void operator()() const { MAcc[0] = 10; } - - AccessorT MAcc; -}; - -template struct ParallelForRangeIdFunctor { - ParallelForRangeIdFunctor(AccessorT acc) : MAcc(acc) {} - - void operator()(cl::sycl::id<1> id) const { MAcc[0] = 10; } - - AccessorT MAcc; -}; - -template struct ParallelForRangeItemFunctor { - ParallelForRangeItemFunctor(AccessorT acc) : MAcc(acc) {} - - void operator()(cl::sycl::item<1> item) const { MAcc[0] = 10; } - - AccessorT MAcc; -}; - -struct ParallelForNdRangeFunctor { - ParallelForNdRangeFunctor(AccessorT acc) : MAcc(acc) {} - - void operator()(cl::sycl::nd_item<1> ndItem) const { MAcc[0] = 10; } - - AccessorT MAcc; -}; - -template -cl::sycl::kernel getPrebuiltKernel(cl::sycl::queue &queue) { - cl::sycl::program program(queue.get_context()); - program.build_with_kernel_type(); - return program.get_kernel(); -} - -template -void checkApiCall(cl::sycl::queue &queue, kernel_wrapper &&kernelWrapper) { - int result = 0; - { - auto buf = cl::sycl::buffer(&result, Range); - queue.submit([&](cl::sycl::handler &cgh) { - auto acc = buf.get_access(cgh); - kernelWrapper(cgh, acc); - }); - } - assert(result == 10); -} - -int main() { - cl::sycl::queue Queue; - const cl::sycl::id<1> Offset(0); - const cl::sycl::nd_range<1> NdRange(Range, Range); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.single_task(SingleTaskFunctor(acc)); - }); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.parallel_for(Range, ParallelForRangeIdFunctor(acc)); - }); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.parallel_for(Range, Offset, ParallelForRangeIdFunctor(acc)); - }); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.parallel_for(Range, ParallelForRangeItemFunctor(acc)); - }); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.parallel_for(Range, Offset, - ParallelForRangeItemFunctor(acc)); - }); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.parallel_for(NdRange, ParallelForNdRangeFunctor(acc)); - }); - - { - auto preBuiltKernel = getPrebuiltKernel(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.single_task(preBuiltKernel); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for(Range, preBuiltKernel); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for(Range, Offset, preBuiltKernel); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for(Range, preBuiltKernel); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for(Range, Offset, preBuiltKernel); - }); - } - - { - auto preBuiltKernel = getPrebuiltKernel(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for(NdRange, preBuiltKernel); - }); - } - - { - auto preBuiltKernel = getPrebuiltKernel(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.single_task(preBuiltKernel, - [=]() { acc[0] = 10; }); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for( - preBuiltKernel, Range, [=](cl::sycl::id<1> id) { acc[0] = 10; }); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for( - preBuiltKernel, Range, Offset, - [=](cl::sycl::id<1> id) { acc[0] = 10; }); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for( - preBuiltKernel, Range, [=](cl::sycl::item<1> item) { acc[0] = 10; }); - }); - } - - { - auto preBuiltKernel = - getPrebuiltKernel>(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for( - preBuiltKernel, Range, Offset, - [=](cl::sycl::item<1> item) { acc[0] = 10; }); - }); - } - - { - auto preBuiltKernel = getPrebuiltKernel(Queue); - - checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) { - cgh.set_args(acc); - cgh.parallel_for( - preBuiltKernel, NdRange, - [=](cl::sycl::nd_item<1> ndItem) { acc[0] = 10; }); - }); - } - - return 0; -} From dc20fa1e1a56deb4920a6e834f637246f2e00a65 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Mon, 16 Nov 2020 17:34:55 -0800 Subject: [PATCH 09/24] Added some comments requested by reviewers. --- sycl/include/CL/sycl/handler.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 16711e3fde354..d99c196fbae0c 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -765,8 +765,13 @@ class __SYCL_EXPORT handler { (KI::getName() == nullptr || KI::getName()[0] == '\0') || (KI::callsThisItem()); + // Perform range rounding if rounding-up is enabled + // and the user-specified range is not a multiple of a "good" value. if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) { - // Range is not a multiple and rounding-up is allowed + // It is sufficient to round up just the first dimension. + // Multiplying the rounded-up value of the first dimension + // by the values of the remaining dimensions (if any) + // will yield a rounded-up value for the total range. size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * GoodLocalSizeX; From d62e2f1162078ddfe724886eec85775e6e1893af Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Tue, 17 Nov 2020 14:32:58 -0800 Subject: [PATCH 10/24] Added a test for integration header and one execution test. --- clang/test/CodeGenSYCL/Inputs/sycl.hpp | 12 ++ .../CodeGenSYCL/parallel_for_this_item.cpp | 53 ++++++ .../parallel_for_range_roundup.cpp | 153 ++++++++++++++++++ 3 files changed, 218 insertions(+) create mode 100755 clang/test/CodeGenSYCL/parallel_for_this_item.cpp create mode 100755 sycl/test/basic_tests/parallel_for_range_roundup.cpp diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp index 402663eafb595..a0f774ad8d243 100644 --- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp +++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp @@ -118,6 +118,18 @@ struct id { int Data; }; +template struct item { + template + item(T... args) {} // fake constructor +private: + // Some fake field added to see using of item arguments in the + // kernel wrapper + int Data; +}; + +template item +this_item() { return item{}; } + template struct range { template diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp new file mode 100755 index 0000000000000..dbb62b52b22dd --- /dev/null +++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only +// RUN: FileCheck -input-file=%t.h %s + +// This test checks that compiler generates correct kernel description +// for parallel_for kernels that use the this_item API. + +// CHECK: __SYCL_INLINE_NAMESPACE(cl) { +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: namespace detail { + +// CHECK: static constexpr +// CHECK-NEXT: const char* const kernel_names[] = { +// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU", +// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU", +// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL", +// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT" +// CHECK-NEXT: }; + +// CHECK: template <> struct KernelInfo { +// CHECK: __SYCL_DLL_LOCAL +// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; } + +// CHECK: template <> struct KernelInfo { +// CHECK: __SYCL_DLL_LOCAL +// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; } + +// CHECK: template <> struct KernelInfo { +// CHECK: __SYCL_DLL_LOCAL +// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; } + +// CHECK: template <> struct KernelInfo { +// CHECK: __SYCL_DLL_LOCAL +// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; } + +#include "Inputs/sycl.hpp" + +int main() { + cl::sycl::queue myQueue; + myQueue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for(cl::sycl::range<1>(1), + [=](cl::sycl::item<1> I) {}); + cgh.parallel_for( + cl::sycl::range<1>(1), + [=](cl::sycl::item<1> I) { cl::sycl::this_item<1>(); }); + cgh.parallel_for(cl::sycl::range<1>(1), + [=](cl::sycl::id<1> I) {}); + cgh.parallel_for(cl::sycl::range<1>(1), [=](cl::sycl::id<1> I) { + cl::sycl::this_item<1>(); + }); + }); + + return 0; +} diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp new file mode 100755 index 0000000000000..52519abb8e0f0 --- /dev/null +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -0,0 +1,153 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include + +using namespace sycl; + +class PF_init_item; +class PF_init_id; +class PF_init_id1; + +struct SizesInfo { + range<1> ItemGlobalSize = {0}; + range<1> RealGlobalSizeX = {0}; + range<1> RealGlobalSizeY = {0}; + range<1> RealGlobalSizeZ = {0}; +}; + +void check(const char *msg, size_t v, size_t ref) { + std::cout << msg << v << std::endl; + assert(v == ref); +} + +int try_item(size_t size) { + size_t RoundedUpSize = (size + 32 - 1) / 32 * 32; + SizesInfo SInfo; + range<1> Size{size}; + int Counter = 0; + bool OnGpu; + + { + buffer BufSizes(&SInfo, 1); + buffer BufCounter(&Counter, Size); + queue myQueue; + OnGpu = myQueue.get_device().is_gpu(); + + myQueue.submit([&](handler &cgh) { + auto AccSizes = BufSizes.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + + cgh.parallel_for(Size, [=](item<1> ITEM) { + // cgh.parallel_for(Size, [=](int ITEM) { + AccCounter[0].fetch_add(1); + AccSizes[0].ItemGlobalSize = ITEM.get_range(0); +#ifdef __SYCL_DEVICE_ONLY__ + AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; + AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; + AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; +#endif // __SYCL_DEVICE_ONLY__ + }); + }); + } + + std::cout << std::endl; + if (OnGpu) { + std::cout << "Ran on GPU" << std::endl; + check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); + check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); + check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); + } + check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size); + check("Counter = ", Counter, size); + std::cout << std::endl; + return 0; +} + +int try_id(size_t size) { + size_t RoundedUpSize = (size + 32 - 1) / 32 * 32; + SizesInfo SInfo; + range<1> Size{size}; + int Counter = 0; + bool OnGpu; + + { + buffer BufSizes(&SInfo, 1); + buffer BufCounter(&Counter, Size); + queue myQueue; + OnGpu = myQueue.get_device().is_gpu(); + + myQueue.submit([&](handler &cgh) { + auto AccSizes = BufSizes.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + + cgh.parallel_for(Size, [=](id<1> ID) { + AccCounter[0].fetch_add(1); + AccSizes[0].ItemGlobalSize = ID[0]; +#ifdef __SYCL_DEVICE_ONLY__ + AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; + AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; + AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; +#endif // __SYCL_DEVICE_ONLY__ + }); + }); + } + std::cout << std::endl; + if (OnGpu) { + std::cout << "Ran on GPU" << std::endl; + check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); + check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); + check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); + } + check("Counter = ", Counter, size); + std::cout << std::endl; + + { + buffer BufSizes(&SInfo, 1); + buffer BufCounter(&Counter, Size); + queue myQueue; + Counter = 0; + OnGpu = myQueue.get_device().is_gpu(); + + myQueue.submit([&](handler &cgh) { + auto AccSizes = BufSizes.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<1> ID) { + AccCounter[0].fetch_add(1); + AccSizes[0].ItemGlobalSize = ID[0]; +#ifdef __SYCL_DEVICE_ONLY__ + AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; + AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; + AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; +#endif // __SYCL_DEVICE_ONLY__ + }); + }); + } + std::cout << std::endl; + if (OnGpu) { + std::cout << "Ran on GPU" << std::endl; + check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); + check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); + check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); + } + check("Counter = ", Counter, size); + std::cout << std::endl; + + return 0; +} + +int main() { + int x; + + x = 10; + try_item(x); + try_id(x); + x = 256; + try_item(x); + try_id(x); + + return 0; +} \ No newline at end of file From 8677a2b6db7df868cda0cf4996f200bc087aa7ac Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Tue, 17 Nov 2020 16:19:57 -0800 Subject: [PATCH 11/24] Adjustment to test to account for added lines in sycl.hpp. --- clang/test/CodeGenSYCL/kernel-by-reference.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGenSYCL/kernel-by-reference.cpp b/clang/test/CodeGenSYCL/kernel-by-reference.cpp index 6502cddf602d8..f5bbac0e75730 100644 --- a/clang/test/CodeGenSYCL/kernel-by-reference.cpp +++ b/clang/test/CodeGenSYCL/kernel-by-reference.cpp @@ -15,7 +15,7 @@ int simple_add(int i) { int main() { queue q; #if defined(SYCL2020) - // expected-warning@Inputs/sycl.hpp:286 {{Passing kernel functions by value is deprecated in SYCL 2020}} + // expected-warning@Inputs/sycl.hpp:298 {{Passing kernel functions by value is deprecated in SYCL 2020}} // expected-note@+3 {{in instantiation of function template specialization}} #endif q.submit([&](handler &h) { @@ -23,7 +23,7 @@ int main() { }); #if defined(SYCL2017) - // expected-warning@Inputs/sycl.hpp:281 {{Passing of kernel functions by reference is a SYCL 2020 extension}} + // expected-warning@Inputs/sycl.hpp:293 {{Passing of kernel functions by reference is a SYCL 2020 extension}} // expected-note@+3 {{in instantiation of function template specialization}} #endif q.submit([&](handler &h) { From c340ccf649daed65af5f306a8e464d559bb6f095 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Tue, 17 Nov 2020 22:55:43 -0800 Subject: [PATCH 12/24] Changed runtime test. --- .../parallel_for_range_roundup.cpp | 64 ++++--------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index 52519abb8e0f0..367b284473bb1 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -1,8 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: env SYCL_DEVICE_TYPE=HOST %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout +// RUN: %ACC_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout +// RUN: %CPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout +// RUN: %GPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout +// RUN: cat %t.errout | FileCheck %s #include @@ -29,41 +30,25 @@ int try_item(size_t size) { SizesInfo SInfo; range<1> Size{size}; int Counter = 0; - bool OnGpu; { buffer BufSizes(&SInfo, 1); - buffer BufCounter(&Counter, Size); + buffer BufCounter(&Counter, 1); queue myQueue; - OnGpu = myQueue.get_device().is_gpu(); myQueue.submit([&](handler &cgh) { auto AccSizes = BufSizes.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); cgh.parallel_for(Size, [=](item<1> ITEM) { - // cgh.parallel_for(Size, [=](int ITEM) { AccCounter[0].fetch_add(1); AccSizes[0].ItemGlobalSize = ITEM.get_range(0); -#ifdef __SYCL_DEVICE_ONLY__ - AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; - AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; - AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; -#endif // __SYCL_DEVICE_ONLY__ }); }); } - std::cout << std::endl; - if (OnGpu) { - std::cout << "Ran on GPU" << std::endl; - check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); - check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); - check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); - } check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size); check("Counter = ", Counter, size); - std::cout << std::endl; return 0; } @@ -76,9 +61,8 @@ int try_id(size_t size) { { buffer BufSizes(&SInfo, 1); - buffer BufCounter(&Counter, Size); + buffer BufCounter(&Counter, 1); queue myQueue; - OnGpu = myQueue.get_device().is_gpu(); myQueue.submit([&](handler &cgh) { auto AccSizes = BufSizes.get_access(cgh); @@ -87,30 +71,16 @@ int try_id(size_t size) { cgh.parallel_for(Size, [=](id<1> ID) { AccCounter[0].fetch_add(1); AccSizes[0].ItemGlobalSize = ID[0]; -#ifdef __SYCL_DEVICE_ONLY__ - AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; - AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; - AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; -#endif // __SYCL_DEVICE_ONLY__ }); }); } - std::cout << std::endl; - if (OnGpu) { - std::cout << "Ran on GPU" << std::endl; - check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); - check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); - check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); - } check("Counter = ", Counter, size); - std::cout << std::endl; { buffer BufSizes(&SInfo, 1); - buffer BufCounter(&Counter, Size); + buffer BufCounter(&Counter, 1); queue myQueue; Counter = 0; - OnGpu = myQueue.get_device().is_gpu(); myQueue.submit([&](handler &cgh) { auto AccSizes = BufSizes.get_access(cgh); @@ -118,23 +88,10 @@ int try_id(size_t size) { cgh.parallel_for(Size, [=](id<1> ID) { AccCounter[0].fetch_add(1); AccSizes[0].ItemGlobalSize = ID[0]; -#ifdef __SYCL_DEVICE_ONLY__ - AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()}; - AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()}; - AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()}; -#endif // __SYCL_DEVICE_ONLY__ }); }); } - std::cout << std::endl; - if (OnGpu) { - std::cout << "Ran on GPU" << std::endl; - check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize); - check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1); - check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1); - } check("Counter = ", Counter, size); - std::cout << std::endl; return 0; } @@ -145,6 +102,9 @@ int main() { x = 10; try_item(x); try_id(x); + // CHECK: ***** Adjusted size from 10 to 32 ***** + // CHECK: ***** Adjusted size from 10 to 32 ***** + // CHECK: ***** Adjusted size from 10 to 32 ***** x = 256; try_item(x); try_id(x); From e08a478357e3b83a91aac42f469eeae7296c81b4 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Wed, 18 Nov 2020 21:13:09 -0800 Subject: [PATCH 13/24] Changes requested by reviewers, and test modifications. --- clang/lib/Sema/SemaSYCL.cpp | 25 +++-- .../CodeGenSYCL/parallel_for_this_item.cpp | 95 ++++++++++++++----- sycl/include/CL/sycl/handler.hpp | 2 +- .../parallel_for_range_roundup.cpp | 30 ++++-- 4 files changed, 107 insertions(+), 45 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e7b84368a90b7..1b50daf55b518 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -80,6 +80,10 @@ class Util { /// stream class. static bool isSyclStreamType(const QualType &Ty); + /// Checks whether given clang type is a full specialization of the SYCL + /// item class. + static bool isSyclItemType(const QualType &Ty); + /// Checks whether given clang type is a full specialization of the SYCL /// half class. static bool isSyclHalfType(const QualType &Ty); @@ -511,8 +515,8 @@ class MarkDeviceFunction : public RecursiveASTVisitor { FunctionDecl *FD = WorkList.back().first; FunctionDecl *ParentFD = WorkList.back().second; - // To implement rounding-up of a parallel-for range - // a kernel call is modified like this: + // To implement rounding-up of a parallel-for range the + // SYCL header implementation modifies the kernel call like this: // auto Wrapper = [=](TransformedArgType Arg) { // if (Arg[0] >= NumWorkItems[0]) // return; @@ -523,9 +527,8 @@ class MarkDeviceFunction : public RecursiveASTVisitor { // This transformation leads to a condition where a kernel body // function becomes callable from a new kernel body function. // Hence this test. - if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) { + if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) KernelBody = FD; - } if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) { assert(!KernelBody && "inconsistent call graph - only one kernel body " @@ -2752,7 +2755,7 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { // The call graph for this translation unit. CallGraph SYCLCG; SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl()); - typedef std::pair ChildParentPair; + using ChildParentPair = std::pair; llvm::SmallPtrSet Visited; llvm::SmallVector WorkList; WorkList.push_back({WGLambdaFn, nullptr}); @@ -2764,7 +2767,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { continue; // We've already seen this Decl if (FD->isFunctionOrMethod() && FD->getIdentifier() && - !FD->getName().empty() && "this_item" == FD->getName()) { + !FD->getName().empty() && "this_item" == FD->getName() && + Util::isSyclItemType(FD->getReturnType())) { Header.setCallsThisItem(true); return; } @@ -3952,9 +3956,10 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << " __SYCL_DLL_LOCAL\n"; O << " static constexpr bool isESIMD() { return " << K.IsESIMDKernel << "; }\n"; + O << " __SYCL_DLL_LOCAL\n"; O << " static constexpr bool callsThisItem() { return "; O << K.CallsThisItem << "; }\n"; - O << "} ;\n"; + O << "};\n"; CurStart += N; } O << "\n"; @@ -4013,7 +4018,7 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) { } void SYCLIntegrationHeader::setCallsThisItem(bool B) { - auto *K = getCurKernelDesc(); + KernelDesc *K = getCurKernelDesc(); assert(K && "no kernels"); K->CallsThisItem = B; } @@ -4039,6 +4044,10 @@ bool Util::isSyclStreamType(const QualType &Ty) { return isSyclType(Ty, "stream"); } +bool Util::isSyclItemType(const QualType &Ty) { + return isSyclType(Ty, "item", true /*Tmpl*/); +} + bool Util::isSyclHalfType(const QualType &Ty) { const StringRef &Name = "half"; std::array Scopes = { diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp index dbb62b52b22dd..69212e83ea925 100755 --- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp +++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only +// RUN: %clang_cc1 -fsycl -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only // RUN: FileCheck -input-file=%t.h %s // This test checks that compiler generates correct kernel description @@ -16,36 +16,79 @@ // CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT" // CHECK-NEXT: }; -// CHECK: template <> struct KernelInfo { -// CHECK: __SYCL_DLL_LOCAL -// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; } +// CHECK:template <> struct KernelInfo { +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU"; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) { +// CHECK-NEXT: return kernel_signatures[i+0]; +// CHECK-NEXT: } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool isESIMD() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool callsThisItem() { return 0; } +// CHECK-NEXT:}; +// CHECK-NEXT:template <> struct KernelInfo { +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU"; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) { +// CHECK-NEXT: return kernel_signatures[i+0]; +// CHECK-NEXT: } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool isESIMD() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool callsThisItem() { return 1; } +// CHECK-NEXT:}; +// CHECK-NEXT:template <> struct KernelInfo { +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL"; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) { +// CHECK-NEXT: return kernel_signatures[i+0]; +// CHECK-NEXT: } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool isESIMD() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool callsThisItem() { return 0; } +// CHECK-NEXT:}; +// CHECK-NEXT:template <> struct KernelInfo { +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) { +// CHECK-NEXT: return kernel_signatures[i+0]; +// CHECK-NEXT: } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool isESIMD() { return 0; } +// CHECK-NEXT: __SYCL_DLL_LOCAL +// CHECK-NEXT: static constexpr bool callsThisItem() { return 1; } +// CHECK-NEXT:}; -// CHECK: template <> struct KernelInfo { -// CHECK: __SYCL_DLL_LOCAL -// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; } +#include "sycl.hpp" -// CHECK: template <> struct KernelInfo { -// CHECK: __SYCL_DLL_LOCAL -// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; } - -// CHECK: template <> struct KernelInfo { -// CHECK: __SYCL_DLL_LOCAL -// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; } - -#include "Inputs/sycl.hpp" +using namespace cl::sycl; int main() { - cl::sycl::queue myQueue; - myQueue.submit([&](cl::sycl::handler &cgh) { - cgh.parallel_for(cl::sycl::range<1>(1), - [=](cl::sycl::item<1> I) {}); + ::queue myQueue; + myQueue.submit([&](::handler &cgh) { + cgh.parallel_for(::range<1>(1), + [=](::item<1> I) {}); cgh.parallel_for( - cl::sycl::range<1>(1), - [=](cl::sycl::item<1> I) { cl::sycl::this_item<1>(); }); - cgh.parallel_for(cl::sycl::range<1>(1), - [=](cl::sycl::id<1> I) {}); - cgh.parallel_for(cl::sycl::range<1>(1), [=](cl::sycl::id<1> I) { - cl::sycl::this_item<1>(); + ::range<1>(1), + [=](::item<1> I) { ::this_item<1>(); }); + cgh.parallel_for(::range<1>(1), + [=](::id<1> I) {}); + cgh.parallel_for(::range<1>(1), [=](::id<1> I) { + ::this_item<1>(); }); }); diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 36fd72ac2cc41..af4b8ebdc88de 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -791,7 +791,7 @@ class __SYCL_EXPORT handler { GoodLocalSizeX; using NameWT = typename detail::get_kernel_wrapper_name_t::name; if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr) - std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to " + std::cout << "***** Adjusted size from " << NumWorkItems[0] << " to " << NewValX << " *****\n"; auto Wrapper = [=](TransformedArgType Arg) { if (Arg[0] >= NumWorkItems[0]) diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index 367b284473bb1..a53c48abb29e0 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -1,9 +1,6 @@ -// RUN: %clangxx -fsycl %s -o %t.out -// RUN: env SYCL_DEVICE_TYPE=HOST SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout -// RUN: %ACC_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout -// RUN: %CPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout -// RUN: %GPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout -// RUN: cat %t.errout | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_OPT_PFWGS_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_OPT_PFWGS_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER #include @@ -45,6 +42,7 @@ int try_item(size_t size) { AccSizes[0].ItemGlobalSize = ITEM.get_range(0); }); }); + myQueue.wait(); } check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size); @@ -73,6 +71,7 @@ int try_id(size_t size) { AccSizes[0].ItemGlobalSize = ID[0]; }); }); + myQueue.wait(); } check("Counter = ", Counter, size); @@ -90,6 +89,7 @@ int try_id(size_t size) { AccSizes[0].ItemGlobalSize = ID[0]; }); }); + myQueue.wait(); } check("Counter = ", Counter, size); @@ -102,12 +102,22 @@ int main() { x = 10; try_item(x); try_id(x); - // CHECK: ***** Adjusted size from 10 to 32 ***** - // CHECK: ***** Adjusted size from 10 to 32 ***** - // CHECK: ***** Adjusted size from 10 to 32 ***** + x = 256; try_item(x); try_id(x); return 0; -} \ No newline at end of file +} + +// CHECK: ***** Adjusted size from 10 to 32 ***** +// CHECK-NEXT: Size seen by user = 10 +// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: ***** Adjusted size from 10 to 32 ***** +// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: ***** Adjusted size from 10 to 32 ***** +// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: Size seen by user = 256 +// CHECK-NEXT: Counter = 256 +// CHECK-NEXT: Counter = 256 +// CHECK-NEXT: Counter = 256 \ No newline at end of file From 0b878dcf688728d180720448958ee3cc9691497f Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Wed, 18 Nov 2020 22:04:41 -0800 Subject: [PATCH 14/24] Fixed EOL. --- sycl/test/basic_tests/parallel_for_range_roundup.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index a53c48abb29e0..6ebc376dce1b5 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -120,4 +120,4 @@ int main() { // CHECK-NEXT: Size seen by user = 256 // CHECK-NEXT: Counter = 256 // CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Counter = 256 \ No newline at end of file +// CHECK-NEXT: Counter = 256 From d6773cbc2287fbb9be5a3461cca37afea50ea2e9 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Thu, 19 Nov 2020 12:42:20 -0800 Subject: [PATCH 15/24] Modified a test. --- .../CodeGenSYCL/parallel_for_this_item.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp index 69212e83ea925..40684da91b019 100755 --- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp +++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp @@ -77,19 +77,17 @@ using namespace cl::sycl; +SYCL_EXTERNAL item<1> g() { return this_item<1>(); } +SYCL_EXTERNAL item<1> f() { return g(); } + int main() { - ::queue myQueue; + queue myQueue; myQueue.submit([&](::handler &cgh) { - cgh.parallel_for(::range<1>(1), - [=](::item<1> I) {}); - cgh.parallel_for( - ::range<1>(1), - [=](::item<1> I) { ::this_item<1>(); }); - cgh.parallel_for(::range<1>(1), - [=](::id<1> I) {}); - cgh.parallel_for(::range<1>(1), [=](::id<1> I) { - ::this_item<1>(); - }); + cgh.parallel_for(range<1>(1), [=](item<1> I) {}); + cgh.parallel_for(range<1>(1), + [=](::item<1> I) { this_item<1>(); }); + cgh.parallel_for(range<1>(1), [=](id<1> I) {}); + cgh.parallel_for(range<1>(1), [=](id<1> I) { f(); }); }); return 0; From 59ae7782bad07790a5449085dcfac904504b2342 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Fri, 20 Nov 2020 13:07:20 -0800 Subject: [PATCH 16/24] Modified this_item search, and env var names. --- clang/lib/Sema/SemaSYCL.cpp | 44 +++---------------- sycl/include/CL/sycl/handler.hpp | 23 +++++++--- .../parallel_for_range_roundup.cpp | 10 ++--- 3 files changed, 27 insertions(+), 50 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 1b50daf55b518..a3d9f2704615d 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -661,39 +661,6 @@ class FindPFWGLambdaFnVisitor const CXXRecordDecl *LambdaObjTy; }; -// Searches for a call to PF lambda function and captures it. -class FindPFLambdaFnVisitor - : public RecursiveASTVisitor { -public: - // LambdaObjTy - lambda type of the PF lambda object - FindPFLambdaFnVisitor(const CXXRecordDecl *LambdaObjTy) - : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {} - - bool VisitCallExpr(CallExpr *Call) { - auto *M = dyn_cast(Call->getDirectCallee()); - if (!M || (M->getOverloadedOperator() != OO_Call)) - return true; - const int NumPFLambdaArgs = 2; // range and lambda obj - if (Call->getNumArgs() != NumPFLambdaArgs) - return true; - QualType Range = Call->getArg(1)->getType(); - if (!Util::isSyclType(Range, "id", true /*Tmpl*/) && - !Util::isSyclType(Range, "item", true /*Tmpl*/)) - return true; - if (Call->getArg(0)->getType()->getAsCXXRecordDecl() != LambdaObjTy) - return true; - LambdaFn = M; // call to PF lambda found - record the lambda - return false; // ... and stop searching - } - - // Returns the captured lambda function or nullptr; - CXXMethodDecl *getLambdaFn() const { return LambdaFn; } - -private: - CXXMethodDecl *LambdaFn; - const CXXRecordDecl *LambdaObjTy; -}; - class MarkWIScopeFnVisitor : public RecursiveASTVisitor { public: MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {} @@ -2746,22 +2713,21 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor) return; - FindPFLambdaFnVisitor V(KernelObj); - V.TraverseStmt(KernelFunc->getBody()); - CXXMethodDecl *WGLambdaFn = V.getLambdaFn(); + const CXXMethodDecl *WGLambdaFn = getOperatorParens(KernelObj); if (!WGLambdaFn) return; // The call graph for this translation unit. CallGraph SYCLCG; SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl()); - using ChildParentPair = std::pair; - llvm::SmallPtrSet Visited; + using ChildParentPair = + std::pair; + llvm::SmallPtrSet Visited; llvm::SmallVector WorkList; WorkList.push_back({WGLambdaFn, nullptr}); while (!WorkList.empty()) { - FunctionDecl *FD = WorkList.back().first; + const FunctionDecl *FD = WorkList.back().first; WorkList.pop_back(); if (!Visited.insert(FD).second) continue; // We've already seen this Decl diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index af4b8ebdc88de..7c3323778ad46 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -766,16 +766,27 @@ class __SYCL_EXPORT handler { // Disable the rounding-up optimizations under these conditions: // 1. The env var SYCL_OPT_PFWGS_DISABLE is set // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name. - // 3. The kernel is created and invoked without an integration header entry. + // 3. The kernel is provided via an interoperability method. // 4. The API "this_item" is used inside the kernel. // 5. The range is already a multiple of the rounding factor. + // + // Cases 3 and 4 could be supported with extra effort. + // As an optimization for the common case it is an + // implementation choice to not support those scenarios. + // Note that "this_item" is a free function, i.e. not tied to any + // specific id or item. When concurrent parallel_fors are executing + // on a device it is difficult to tell which parallel_for the call is + // being made from. One could replicate portions of the + // call-graph to make this_item calls kernel-specific but this is + // not considered worthwhile. // Get the kernal name to check condition 3. std::string KName = typeid(NameT *).name(); using KI = detail::KernelInfo; bool DisableRounding = - (getenv("SYCL_OPT_PFWGS_DISABLE") != nullptr) || - (KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos) || + (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) || + (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != + std::string::npos) || (KI::getName() == nullptr || KI::getName()[0] == '\0') || (KI::callsThisItem()); @@ -790,9 +801,9 @@ class __SYCL_EXPORT handler { ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) * GoodLocalSizeX; using NameWT = typename detail::get_kernel_wrapper_name_t::name; - if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr) - std::cout << "***** Adjusted size from " << NumWorkItems[0] << " to " - << NewValX << " *****\n"; + if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr) + std::cout << "parallel_for range adjusted from " << NumWorkItems[0] + << " to " << NewValX << std::endl; auto Wrapper = [=](TransformedArgType Arg) { if (Arg[0] >= NumWorkItems[0]) return; diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index 6ebc376dce1b5..e74e8ce51cc98 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -1,6 +1,6 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: env SYCL_OPT_PFWGS_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER -// RUN: env SYCL_OPT_PFWGS_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER #include @@ -110,12 +110,12 @@ int main() { return 0; } -// CHECK: ***** Adjusted size from 10 to 32 ***** +// CHECK: parallel_for range adjusted from 10 to 32 // CHECK-NEXT: Size seen by user = 10 // CHECK-NEXT: Counter = 10 -// CHECK-NEXT: ***** Adjusted size from 10 to 32 ***** +// CHECK-NEXT: parallel_for range adjusted from 10 to 32 // CHECK-NEXT: Counter = 10 -// CHECK-NEXT: ***** Adjusted size from 10 to 32 ***** +// CHECK-NEXT: parallel_for range adjusted from 10 to 32 // CHECK-NEXT: Counter = 10 // CHECK-NEXT: Size seen by user = 256 // CHECK-NEXT: Counter = 256 From 81b777cace323d77f39a15d11ff5cd3bd1772b6c Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Fri, 20 Nov 2020 13:09:29 -0800 Subject: [PATCH 17/24] Added env var documentation. --- sycl/doc/EnvironmentVariables.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 98480cb3f3dec..8731f6c8774df 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -29,6 +29,8 @@ subject to change. Do not rely on these variables in production code. | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. | +| SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE | Any(\*) | Enables tracing of parallel_for invocations with rounded-up ranges. | +| SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING | Any(\*) | Disables automatic rounding-up of parallel_for invocation ranges. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` From 900aca862030fec8a8db0ac9b5fd4ac8d0eae990 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Sat, 21 Nov 2020 15:23:20 -0800 Subject: [PATCH 18/24] Restrict rounding-up optimization to GPU devices. --- sycl/include/CL/sycl/handler.hpp | 22 ++++++++++++++----- sycl/source/handler.cpp | 5 +++++ .../parallel_for_range_roundup.cpp | 1 - 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 7c3323778ad46..5cdc76a05d233 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -760,17 +760,20 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; + // The work group size preferred by this device. // A reasonable choice for rounding up the range is 32. constexpr size_t GoodLocalSizeX = 32; // Disable the rounding-up optimizations under these conditions: - // 1. The env var SYCL_OPT_PFWGS_DISABLE is set - // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name. - // 3. The kernel is provided via an interoperability method. - // 4. The API "this_item" is used inside the kernel. - // 5. The range is already a multiple of the rounding factor. + // 1. The device is not a GPU. Only GPUs benefit from rounding. + // 2. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set. + // 3. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in + // the kernel name. + // 4. The kernel is provided via an interoperability method. + // 5. The API "this_item" is used inside the kernel. + // 6. The range is already a multiple of the rounding factor. // - // Cases 3 and 4 could be supported with extra effort. + // Cases 4 and 5 could be supported with extra effort. // As an optimization for the common case it is an // implementation choice to not support those scenarios. // Note that "this_item" is a free function, i.e. not tied to any @@ -784,6 +787,7 @@ class __SYCL_EXPORT handler { std::string KName = typeid(NameT *).name(); using KI = detail::KernelInfo; bool DisableRounding = + !is_gpu(MQueue) || (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) || (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != std::string::npos) || @@ -1949,6 +1953,12 @@ class __SYCL_EXPORT handler { /// \param Count is a number of bytes to be prefetched. void prefetch(const void *Ptr, size_t Count); + /// Check if the queue being used is for a GPU device + /// + /// \param Queue is the queue for this handler. + /// \return Whether the device is a GPU. + bool is_gpu(shared_ptr_class Queue); + private: shared_ptr_class MQueue; /// The storage for the arguments passed. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index b3dc32070f51e..d4f75d2c472c8 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -343,5 +343,10 @@ void handler::prefetch(const void *Ptr, size_t Count) { MLength = Count; MCGType = detail::CG::PREFETCH_USM; } + +bool handler::is_gpu(shared_ptr_class Queue) { + device Dev = Queue->get_device(); + return Dev.is_gpu(); +} } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index e74e8ce51cc98..cf6749c22d042 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -1,6 +1,5 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER -// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER #include From 700c056c2590ed50f23da972f730b32cb60c38a4 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Sat, 21 Nov 2020 17:04:41 -0800 Subject: [PATCH 19/24] Made a method private. --- sycl/include/CL/sycl/handler.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 5cdc76a05d233..14b1552324ce6 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -899,6 +899,12 @@ class __SYCL_EXPORT handler { #endif + /// Check if the queue being used is for a GPU device + /// + /// \param Queue is the queue for this handler. + /// \return Whether the device is a GPU. + bool is_gpu(shared_ptr_class Queue); + public: handler(const handler &) = delete; handler(handler &&) = delete; @@ -1953,12 +1959,6 @@ class __SYCL_EXPORT handler { /// \param Count is a number of bytes to be prefetched. void prefetch(const void *Ptr, size_t Count); - /// Check if the queue being used is for a GPU device - /// - /// \param Queue is the queue for this handler. - /// \return Whether the device is a GPU. - bool is_gpu(shared_ptr_class Queue); - private: shared_ptr_class MQueue; /// The storage for the arguments passed. From 383ae96f9de5007546ee962ff056da76b556f57c Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Sat, 21 Nov 2020 17:20:42 -0800 Subject: [PATCH 20/24] Merge correction. --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 8731f6c8774df..2b3e7d6577879 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | -| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. | +| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE | Any(\*) | Enables tracing of parallel_for invocations with rounded-up ranges. | | SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING | Any(\*) | Disables automatic rounding-up of parallel_for invocation ranges. | From e8b0872e4eacd5a3c4386c32116a819275c2102f Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Wed, 25 Nov 2020 18:24:11 -0800 Subject: [PATCH 21/24] Test changes and improved this_item call detection. --- clang/lib/Sema/SemaSYCL.cpp | 65 ++++--- .../CodeGenSYCL/parallel_for_this_item.cpp | 24 ++- sycl/include/CL/sycl/handler.hpp | 2 +- sycl/include/CL/sycl/range.hpp | 2 +- .../parallel_for_range_roundup.cpp | 159 +++++++++++++----- 5 files changed, 185 insertions(+), 67 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 206cbe19fe862..1dd2c8bb64fa8 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -80,10 +80,6 @@ class Util { /// stream class. static bool isSyclStreamType(const QualType &Ty); - /// Checks whether given clang type is a full specialization of the SYCL - /// item class. - static bool isSyclItemType(const QualType &Ty); - /// Checks whether given clang type is a full specialization of the SYCL /// half class. static bool isSyclHalfType(const QualType &Ty); @@ -103,10 +99,23 @@ class Util { /// \param Tmpl whether the class is template instantiation or simple record static bool isSyclType(const QualType &Ty, StringRef Name, bool Tmpl = false); + /// Checks whether given function is a standard SYCL API function with given + /// name. + /// \param FD the function being checked. + /// \param Name the function name to be checked against. + static bool isSyclFunction(const FunctionDecl *FD, StringRef Name); + /// Checks whether given clang type is a full specialization of the SYCL /// specialization constant class. static bool isSyclSpecConstantType(const QualType &Ty); + // Checks declaration context hierarchy. + /// \param DC the context of the item to be checked. + /// \param Scopes the declaration scopes leading from the item context to the + /// translation unit (excluding the latter) + static bool matchContext(const DeclContext *DC, + ArrayRef Scopes); + /// Checks whether given clang type is declared in the given hierarchy of /// declaration contexts. /// \param Ty the clang type being checked @@ -2736,9 +2745,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { if (!Visited.insert(FD).second) continue; // We've already seen this Decl - if (FD->isFunctionOrMethod() && FD->getIdentifier() && - !FD->getName().empty() && "this_item" == FD->getName() && - Util::isSyclItemType(FD->getReturnType())) { + // Check whether this call is to sycl::this_item(). + if (Util::isSyclFunction(FD, "this_item")) { Header.setCallsThisItem(true); return; } @@ -4014,10 +4022,6 @@ bool Util::isSyclStreamType(const QualType &Ty) { return isSyclType(Ty, "stream"); } -bool Util::isSyclItemType(const QualType &Ty) { - return isSyclType(Ty, "item", true /*Tmpl*/); -} - bool Util::isSyclHalfType(const QualType &Ty) { const StringRef &Name = "half"; std::array Scopes = { @@ -4064,6 +4068,21 @@ bool Util::isSyclType(const QualType &Ty, StringRef Name, bool Tmpl) { return matchQualifiedTypeName(Ty, Scopes); } +bool Util::isSyclFunction(const FunctionDecl *FD, StringRef Name) { + if (!FD->isFunctionOrMethod() || !FD->getIdentifier() || + FD->getName().empty() || Name != FD->getName()) + return false; + + const DeclContext *DC = FD->getDeclContext(); + if (DC->isTranslationUnit()) + return false; + + std::array Scopes = { + Util::DeclContextDesc{clang::Decl::Kind::Namespace, "cl"}, + Util::DeclContextDesc{clang::Decl::Kind::Namespace, "sycl"}}; + return matchContext(DC, Scopes); +} + bool Util::isAccessorPropertyListType(const QualType &Ty) { const StringRef &Name = "accessor_property_list"; std::array Scopes = { @@ -4074,21 +4093,15 @@ bool Util::isAccessorPropertyListType(const QualType &Ty) { return matchQualifiedTypeName(Ty, Scopes); } -bool Util::matchQualifiedTypeName(const QualType &Ty, - ArrayRef Scopes) { - // The idea: check the declaration context chain starting from the type +bool Util::matchContext(const DeclContext *Ctx, + ArrayRef Scopes) { + // The idea: check the declaration context chain starting from the item // itself. At each step check the context is of expected kind // (namespace) and name. - const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl(); - - if (!RecTy) - return false; // only classes/structs supported - const auto *Ctx = cast(RecTy); StringRef Name = ""; for (const auto &Scope : llvm::reverse(Scopes)) { clang::Decl::Kind DK = Ctx->getDeclKind(); - if (DK != Scope.first) return false; @@ -4102,7 +4115,7 @@ bool Util::matchQualifiedTypeName(const QualType &Ty, Name = cast(Ctx)->getName(); break; default: - llvm_unreachable("matchQualifiedTypeName: decl kind not supported"); + llvm_unreachable("matchContext: decl kind not supported"); } if (Name != Scope.second) return false; @@ -4110,3 +4123,13 @@ bool Util::matchQualifiedTypeName(const QualType &Ty, } return Ctx->isTranslationUnit(); } + +bool Util::matchQualifiedTypeName(const QualType &Ty, + ArrayRef Scopes) { + const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl(); + + if (!RecTy) + return false; // only classes/structs supported + const auto *Ctx = cast(RecTy); + return Util::matchContext(Ctx, Scopes); +} diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp index 40684da91b019..422a1bad33373 100755 --- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp +++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp @@ -80,13 +80,33 @@ using namespace cl::sycl; SYCL_EXTERNAL item<1> g() { return this_item<1>(); } SYCL_EXTERNAL item<1> f() { return g(); } +// This is a similar-looking this_item function but not the real one. +template item this_item(int i) { return item<1>{i}; } + +// This is a method named this_item but not the real one. +class C { +public: + template item this_item() { return item<1>{66}; }; +}; + int main() { queue myQueue; myQueue.submit([&](::handler &cgh) { - cgh.parallel_for(range<1>(1), [=](item<1> I) {}); + // This kernel does not call sycl::this_item + cgh.parallel_for(range<1>(1), + [=](item<1> I) { this_item<1>(55); }); + + // This kernel calls sycl::this_item cgh.parallel_for(range<1>(1), [=](::item<1> I) { this_item<1>(); }); - cgh.parallel_for(range<1>(1), [=](id<1> I) {}); + + // This kernel does not call sycl::this_item + cgh.parallel_for(range<1>(1), [=](id<1> I) { + class C c; + c.this_item<1>(); + }); + + // This kernel calls sycl::this_item cgh.parallel_for(range<1>(1), [=](id<1> I) { f(); }); }); diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 14b1552324ce6..833e9be148514 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -816,7 +816,7 @@ class __SYCL_EXPORT handler { }; range AdjustedRange = NumWorkItems; - AdjustedRange.set_range(NewValX); + AdjustedRange.set_range_dim0(NewValX); #ifdef __SYCL_DEVICE_ONLY__ kernel_parallel_for(Wrapper); #else diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp index 365c852a389dd..32337109f97a9 100644 --- a/sycl/include/CL/sycl/range.hpp +++ b/sycl/include/CL/sycl/range.hpp @@ -148,7 +148,7 @@ template class range : public detail::array { friend class detail::Builder; // Adjust the first dim of the range - void set_range(const size_t dim0) { this->common_array[0] = dim0; } + void set_range_dim0(const size_t dim0) { this->common_array[0] = dim0; } }; #ifdef __cpp_deduction_guides diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index cf6749c22d042..a9d2e883c282e 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -5,93 +5,147 @@ using namespace sycl; -class PF_init_item; -class PF_init_id; -class PF_init_id1; - -struct SizesInfo { - range<1> ItemGlobalSize = {0}; - range<1> RealGlobalSizeX = {0}; - range<1> RealGlobalSizeY = {0}; - range<1> RealGlobalSizeZ = {0}; -}; +range<1> Range1 = {0}; +range<2> Range2 = {0, 0}; +range<3> Range3 = {0, 0, 0}; void check(const char *msg, size_t v, size_t ref) { std::cout << msg << v << std::endl; assert(v == ref); } -int try_item(size_t size) { - size_t RoundedUpSize = (size + 32 - 1) / 32 * 32; - SizesInfo SInfo; +int try_item1(size_t size) { range<1> Size{size}; int Counter = 0; - { - buffer BufSizes(&SInfo, 1); + buffer, 1> BufRange(&Range1, 1); buffer BufCounter(&Counter, 1); queue myQueue; myQueue.submit([&](handler &cgh) { - auto AccSizes = BufSizes.get_access(cgh); + auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - - cgh.parallel_for(Size, [=](item<1> ITEM) { + cgh.parallel_for(Size, [=](item<1> ITEM) { AccCounter[0].fetch_add(1); - AccSizes[0].ItemGlobalSize = ITEM.get_range(0); + AccRange[0] = ITEM.get_range(0); }); }); myQueue.wait(); } - - check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size); + check("Size seen by user = ", Range1.get(0), size); check("Counter = ", Counter, size); return 0; } -int try_id(size_t size) { - size_t RoundedUpSize = (size + 32 - 1) / 32 * 32; - SizesInfo SInfo; - range<1> Size{size}; +int try_item2(size_t size) { + range<2> Size{size, size}; int Counter = 0; - bool OnGpu; - { - buffer BufSizes(&SInfo, 1); + buffer, 1> BufRange(&Range2, 1); buffer BufCounter(&Counter, 1); queue myQueue; myQueue.submit([&](handler &cgh) { - auto AccSizes = BufSizes.get_access(cgh); + auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](item<2> ITEM) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = ITEM.get_range(0); + }); + }); + myQueue.wait(); + } + check("Size seen by user = ", Range2.get(0), size); + check("Counter = ", Counter, size * size); + return 0; +} + +int try_item3(size_t size) { + range<3> Size{size, size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range3, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; - cgh.parallel_for(Size, [=](id<1> ID) { + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](item<3> ITEM) { AccCounter[0].fetch_add(1); - AccSizes[0].ItemGlobalSize = ID[0]; + AccRange[0][0] = ITEM.get_range(0); }); }); myQueue.wait(); } - check("Counter = ", Counter, size); + check("Size seen by user = ", Range3.get(0), size); + check("Counter = ", Counter, size * size * size); + return 0; +} +int try_id1(size_t size) { + range<1> Size{size}; + int Counter = 0; { - buffer BufSizes(&SInfo, 1); + buffer, 1> BufRange(&Range1, 1); buffer BufCounter(&Counter, 1); queue myQueue; - Counter = 0; myQueue.submit([&](handler &cgh) { - auto AccSizes = BufSizes.get_access(cgh); + auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); cgh.parallel_for(Size, [=](id<1> ID) { AccCounter[0].fetch_add(1); - AccSizes[0].ItemGlobalSize = ID[0]; + AccRange[0] = ID[0]; }); }); myQueue.wait(); } check("Counter = ", Counter, size); + return 0; +} + +int try_id2(size_t size) { + range<2> Size{size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range2, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<2> ID) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = ID[0]; + }); + }); + myQueue.wait(); + } + check("Counter = ", Counter, size * size); + return 0; +} + +int try_id3(size_t size) { + range<3> Size{size, size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range3, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<3> ID) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = ID[0]; + }); + }); + myQueue.wait(); + } + check("Counter = ", Counter, size * size * size); return 0; } @@ -99,12 +153,20 @@ int main() { int x; x = 10; - try_item(x); - try_id(x); + try_item1(x); + try_item2(x); + try_item3(x); + try_id1(x); + try_id2(x); + try_id3(x); x = 256; - try_item(x); - try_id(x); + try_item1(x); + try_item2(x); + try_item3(x); + try_id1(x); + try_id2(x); + try_id3(x); return 0; } @@ -113,10 +175,23 @@ int main() { // CHECK-NEXT: Size seen by user = 10 // CHECK-NEXT: Counter = 10 // CHECK-NEXT: parallel_for range adjusted from 10 to 32 -// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: Size seen by user = 10 +// CHECK-NEXT: Counter = 100 +// CHECK-NEXT: parallel_for range adjusted from 10 to 32 +// CHECK-NEXT: Size seen by user = 10 +// CHECK-NEXT: Counter = 1000 // CHECK-NEXT: parallel_for range adjusted from 10 to 32 // CHECK-NEXT: Counter = 10 +// CHECK-NEXT: parallel_for range adjusted from 10 to 32 +// CHECK-NEXT: Counter = 100 +// CHECK-NEXT: parallel_for range adjusted from 10 to 32 +// CHECK-NEXT: Counter = 1000 // CHECK-NEXT: Size seen by user = 256 // CHECK-NEXT: Counter = 256 +// CHECK-NEXT: Size seen by user = 256 +// CHECK-NEXT: Counter = 65536 +// CHECK-NEXT: Size seen by user = 256 +// CHECK-NEXT: Counter = 16777216 // CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Counter = 256 +// CHECK-NEXT: Counter = 65536 +// CHECK-NEXT: Counter = 16777216 From e6d42a0f2c71a52dac98b894eab108d611bb3142 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Fri, 27 Nov 2020 09:28:27 -0800 Subject: [PATCH 22/24] Necessary addition to SYCL symbols. --- sycl/test/abi/sycl_symbols_linux.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index e8677277fca0f..9fda77a8d44fb 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3838,6 +3838,7 @@ _ZN2cl4sycl7handler18extractArgsAndReqsEv _ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb +_ZN2cl4sycl7handler6is_gpuESt10shared_ptrINS0_6detail10queue_implEE _ZN2cl4sycl7handler6memcpyEPvPKvm _ZN2cl4sycl7handler6memsetEPvim _ZN2cl4sycl7handler7barrierERKSt6vectorINS0_5eventESaIS3_EE From 4b9093eb4be1f51f703e024838c83db90176c21a Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Thu, 3 Dec 2020 13:47:42 -0800 Subject: [PATCH 23/24] Minor corrections. --- sycl/include/CL/sycl/handler.hpp | 2 +- sycl/source/handler.cpp | 2 +- .../basic_tests/parallel_for_range_roundup.cpp | 15 +++++---------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 833e9be148514..4566aa09fb148 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -903,7 +903,7 @@ class __SYCL_EXPORT handler { /// /// \param Queue is the queue for this handler. /// \return Whether the device is a GPU. - bool is_gpu(shared_ptr_class Queue); + bool is_gpu(const shared_ptr_class Queue); public: handler(const handler &) = delete; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index d4f75d2c472c8..4e7a62d0edd23 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -344,7 +344,7 @@ void handler::prefetch(const void *Ptr, size_t Count) { MCGType = detail::CG::PREFETCH_USM; } -bool handler::is_gpu(shared_ptr_class Queue) { +bool handler::is_gpu(const shared_ptr_class Queue) { device Dev = Queue->get_device(); return Dev.is_gpu(); } diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp index a9d2e883c282e..a4a8f45c2ae92 100755 --- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp +++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp @@ -37,7 +37,7 @@ int try_item1(size_t size) { return 0; } -int try_item2(size_t size) { +void try_item2(size_t size) { range<2> Size{size, size}; int Counter = 0; { @@ -57,10 +57,9 @@ int try_item2(size_t size) { } check("Size seen by user = ", Range2.get(0), size); check("Counter = ", Counter, size * size); - return 0; } -int try_item3(size_t size) { +void try_item3(size_t size) { range<3> Size{size, size, size}; int Counter = 0; { @@ -80,10 +79,9 @@ int try_item3(size_t size) { } check("Size seen by user = ", Range3.get(0), size); check("Counter = ", Counter, size * size * size); - return 0; } -int try_id1(size_t size) { +void try_id1(size_t size) { range<1> Size{size}; int Counter = 0; { @@ -102,10 +100,9 @@ int try_id1(size_t size) { myQueue.wait(); } check("Counter = ", Counter, size); - return 0; } -int try_id2(size_t size) { +void try_id2(size_t size) { range<2> Size{size, size}; int Counter = 0; { @@ -124,10 +121,9 @@ int try_id2(size_t size) { myQueue.wait(); } check("Counter = ", Counter, size * size); - return 0; } -int try_id3(size_t size) { +void try_id3(size_t size) { range<3> Size{size, size, size}; int Counter = 0; { @@ -146,7 +142,6 @@ int try_id3(size_t size) { myQueue.wait(); } check("Counter = ", Counter, size * size * size); - return 0; } int main() { From a2a6ded4565a098d4a576e37e8d410d2d93d110c Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Thu, 10 Dec 2020 11:38:24 -0800 Subject: [PATCH 24/24] Enabled rounding for CPU also. --- sycl/include/CL/sycl/handler.hpp | 20 ++++++-------------- sycl/source/handler.cpp | 5 ----- sycl/test/abi/sycl_symbols_linux.dump | 1 - 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 4566aa09fb148..f0eb9c4872fc2 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -765,15 +765,14 @@ class __SYCL_EXPORT handler { constexpr size_t GoodLocalSizeX = 32; // Disable the rounding-up optimizations under these conditions: - // 1. The device is not a GPU. Only GPUs benefit from rounding. - // 2. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set. - // 3. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in + // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set. + // 2. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in // the kernel name. - // 4. The kernel is provided via an interoperability method. - // 5. The API "this_item" is used inside the kernel. - // 6. The range is already a multiple of the rounding factor. + // 3. The kernel is provided via an interoperability method. + // 4. The API "this_item" is used inside the kernel. + // 5. The range is already a multiple of the rounding factor. // - // Cases 4 and 5 could be supported with extra effort. + // Cases 3 and 4 could be supported with extra effort. // As an optimization for the common case it is an // implementation choice to not support those scenarios. // Note that "this_item" is a free function, i.e. not tied to any @@ -787,7 +786,6 @@ class __SYCL_EXPORT handler { std::string KName = typeid(NameT *).name(); using KI = detail::KernelInfo; bool DisableRounding = - !is_gpu(MQueue) || (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) || (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != std::string::npos) || @@ -899,12 +897,6 @@ class __SYCL_EXPORT handler { #endif - /// Check if the queue being used is for a GPU device - /// - /// \param Queue is the queue for this handler. - /// \return Whether the device is a GPU. - bool is_gpu(const shared_ptr_class Queue); - public: handler(const handler &) = delete; handler(handler &&) = delete; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 4e7a62d0edd23..b3dc32070f51e 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -343,10 +343,5 @@ void handler::prefetch(const void *Ptr, size_t Count) { MLength = Count; MCGType = detail::CG::PREFETCH_USM; } - -bool handler::is_gpu(const shared_ptr_class Queue) { - device Dev = Queue->get_device(); - return Dev.is_gpu(); -} } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 9fda77a8d44fb..e8677277fca0f 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3838,7 +3838,6 @@ _ZN2cl4sycl7handler18extractArgsAndReqsEv _ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb -_ZN2cl4sycl7handler6is_gpuESt10shared_ptrINS0_6detail10queue_implEE _ZN2cl4sycl7handler6memcpyEPvPKvm _ZN2cl4sycl7handler6memsetEPvim _ZN2cl4sycl7handler7barrierERKSt6vectorINS0_5eventESaIS3_EE