Skip to content

[SYCL] Parallel-for range correction to improve group size selection by GPU driver #2703

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Dec 17, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a23f9ad
[SYCL] Parallel-for range rounding-up for improved group size selecti…
rdeodhar Oct 29, 2020
047123f
Merge branch 'sycl' of https://github.com/intel/llvm into iwgo5
rdeodhar Nov 2, 2020
73f50bd
Correction to wrapper kernel name.
rdeodhar Nov 4, 2020
2aad33d
Test correction to backend interoperability interface.
rdeodhar Nov 5, 2020
ac6bf28
Environment var control to disable optimization; correction to one test.
rdeodhar Nov 6, 2020
da1a3ab
Fixes for set_args and this_item usage, and for review comments.
rdeodhar Nov 16, 2020
e186605
Merge branch 'sycl' of https://github.com/intel/llvm into iwgo5
rdeodhar Nov 16, 2020
eaacd8a
Formatting changes.
rdeodhar Nov 16, 2020
535745f
Formatting change.
rdeodhar Nov 16, 2020
5c6c841
Removed unneeded files.
rdeodhar Nov 16, 2020
dc20fa1
Added some comments requested by reviewers.
rdeodhar Nov 17, 2020
d62e2f1
Added a test for integration header and one execution test.
rdeodhar Nov 17, 2020
b860666
Merge branch 'sycl' of https://github.com/intel/llvm into iwgo5
rdeodhar Nov 17, 2020
8677a2b
Adjustment to test to account for added lines in sycl.hpp.
rdeodhar Nov 18, 2020
c340ccf
Changed runtime test.
rdeodhar Nov 18, 2020
e08a478
Changes requested by reviewers, and test modifications.
rdeodhar Nov 19, 2020
0b878dc
Fixed EOL.
rdeodhar Nov 19, 2020
d6773cb
Modified a test.
rdeodhar Nov 19, 2020
59ae778
Modified this_item search, and env var names.
rdeodhar Nov 20, 2020
81b777c
Added env var documentation.
rdeodhar Nov 20, 2020
900aca8
Restrict rounding-up optimization to GPU devices.
rdeodhar Nov 21, 2020
094c01d
Merge branch 'sycl' into iwgo5
rdeodhar Nov 21, 2020
700c056
Made a method private.
rdeodhar Nov 22, 2020
6bfede9
Merge branch 'sycl' of https://github.com/intel/llvm into iwgo5
rdeodhar Nov 22, 2020
383ae96
Merge correction.
rdeodhar Nov 22, 2020
e8e7f74
Merge branch 'iwgo5' of https://github.com/rdeodhar/llvm into iwgo5
rdeodhar Nov 22, 2020
e8b0872
Test changes and improved this_item call detection.
rdeodhar Nov 26, 2020
e6d42a0
Necessary addition to SYCL symbols.
rdeodhar Nov 27, 2020
4b9093e
Minor corrections.
rdeodhar Dec 3, 2020
a2a6ded
Enabled rounding for CPU also.
rdeodhar Dec 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions clang/include/clang/Sema/Sema.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ class SYCLIntegrationHeader {
/// Registers a specialization constant to emit info for it into the header.
void addSpecConstant(StringRef IDName, QualType IDType);

/// Notes that this_item is called within the kernel.
void setCallsThisItem(bool B);

private:
// Kernel actual parameter descriptor.
struct KernelParamDesc {
Expand Down Expand Up @@ -378,6 +381,9 @@ class SYCLIntegrationHeader {
/// Descriptor of kernel actual parameters.
SmallVector<KernelParamDesc, 8> Params;

// Whether kernel calls this_item()
bool CallsThisItem;

KernelDesc() = default;
};

Expand Down
111 changes: 109 additions & 2 deletions clang/lib/Sema/SemaSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,22 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
FunctionDecl *FD = WorkList.back().first;
FunctionDecl *ParentFD = WorkList.back().second;

// To implement rounding-up of a parallel-for range
// a kernel call is modified like this:
// auto Wrapper = [=](TransformedArgType Arg) {
// if (Arg[0] >= NumWorkItems[0])
// return;
// Arg.set_allowed_range(NumWorkItems);
// KernelFunc(Arg);
// };
//
// This transformation leads to a condition where a kernel body
// function becomes callable from a new kernel body function.
// Hence this test.
if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) {
KernelBody = FD;
}

if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) {
assert(!KernelBody && "inconsistent call graph - only one kernel body "
"function can be called");
Expand Down Expand Up @@ -641,6 +657,39 @@ class FindPFWGLambdaFnVisitor
const CXXRecordDecl *LambdaObjTy;
};

// Searches for a call to PF lambda function and captures it.
class FindPFLambdaFnVisitor
: public RecursiveASTVisitor<FindPFLambdaFnVisitor> {
public:
// LambdaObjTy - lambda type of the PF lambda object
FindPFLambdaFnVisitor(const CXXRecordDecl *LambdaObjTy)
: LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {}

bool VisitCallExpr(CallExpr *Call) {
auto *M = dyn_cast<CXXMethodDecl>(Call->getDirectCallee());
if (!M || (M->getOverloadedOperator() != OO_Call))
return true;
const int NumPFLambdaArgs = 2; // range and lambda obj
if (Call->getNumArgs() != NumPFLambdaArgs)
return true;
QualType Range = Call->getArg(1)->getType();
if (!Util::isSyclType(Range, "id", true /*Tmpl*/) &&
!Util::isSyclType(Range, "item", true /*Tmpl*/))
return true;
if (Call->getArg(0)->getType()->getAsCXXRecordDecl() != LambdaObjTy)
return true;
LambdaFn = M; // call to PF lambda found - record the lambda
return false; // ... and stop searching
}

// Returns the captured lambda function or nullptr;
CXXMethodDecl *getLambdaFn() const { return LambdaFn; }

private:
CXXMethodDecl *LambdaFn;
const CXXRecordDecl *LambdaObjTy;
};

class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
public:
MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {}
Expand Down Expand Up @@ -2653,13 +2702,62 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
return !SemaRef.getASTContext().hasSameType(FD->getType(), Ty);
}

// Sets a flag if the kernel is a parallel_for that calls the
// free function API "this_item".
void setThisItemIsCalled(const CXXRecordDecl *KernelObj,
FunctionDecl *KernelFunc) {
if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor)
return;

FindPFLambdaFnVisitor V(KernelObj);
V.TraverseStmt(KernelFunc->getBody());
CXXMethodDecl *WGLambdaFn = V.getLambdaFn();
if (!WGLambdaFn)
return;

// The call graph for this translation unit.
CallGraph SYCLCG;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK we already build a callgraph in SemaSYCL. Can we try to re-use it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it would be nice to reuse that infrastructure. I first tried pursuing that approach. The result of a scan for calls to this_item would have to be saved somewhere. The existing callgraph traversal lead to various function "attributes" being set. This would be fine, except that calls_this_item is not an attribute. We could define an internal attribute for that. Would that be acceptable? If yes, it would simplify the SemaSYCL changes quite a bit. How to add such an attribute?

Copy link
Contributor

@elizabethandrews elizabethandrews Nov 18, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why not. You can check SYCLRequiresDecomposition for an example of internal attribute. @premanandrao @Fznamznon could you please confirm this is ok?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should be ok.

SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl());
typedef std::pair<FunctionDecl *, FunctionDecl *> ChildParentPair;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
typedef std::pair<FunctionDecl *, FunctionDecl *> ChildParentPair;
using ChildParentPair std::pair<FunctionDecl *, FunctionDecl *>;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

llvm::SmallPtrSet<FunctionDecl *, 16> Visited;
llvm::SmallVector<ChildParentPair, 16> WorkList;
WorkList.push_back({WGLambdaFn, nullptr});

while (!WorkList.empty()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code seems also pretty similar to code which walks through another CallGraph in SemaSYCL. I think we need to merge them one day, but this is not in scope of your PR, I just wanted to capture the problem.

FunctionDecl *FD = WorkList.back().first;
WorkList.pop_back();
if (!Visited.insert(FD).second)
continue; // We've already seen this Decl

if (FD->isFunctionOrMethod() && FD->getIdentifier() &&
!FD->getName().empty() && "this_item" == FD->getName()) {
Header.setCallsThisItem(true);
return;
}

CallGraphNode *N = SYCLCG.getNode(FD);
if (!N)
continue;

for (const CallGraphNode *CI : *N) {
if (auto *Callee = dyn_cast<FunctionDecl>(CI->getDecl())) {
Callee = Callee->getMostRecentDecl();
if (!Visited.count(Callee))
WorkList.push_back({Callee, FD});
}
}
}
}

public:
static constexpr const bool VisitInsideSimpleContainers = false;
SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H,
const CXXRecordDecl *KernelObj, QualType NameType,
StringRef Name, StringRef StableName)
StringRef Name, StringRef StableName,
FunctionDecl *KernelFunc)
: SyclKernelFieldHandler(S), Header(H) {
Header.startKernel(Name, NameType, StableName, KernelObj->getLocation());
setThisItemIsCalled(KernelObj, KernelFunc);
}

bool handleSyclAccessorType(const CXXRecordDecl *RD,
Expand Down Expand Up @@ -3101,7 +3199,7 @@ void Sema::ConstructOpenCLKernel(FunctionDecl *KernelCallerFunc,
SyclKernelIntHeaderCreator int_header(
*this, getSyclIntegrationHeader(), KernelObj,
calculateKernelNameType(Context, KernelCallerFunc), KernelName,
StableName);
StableName, KernelCallerFunc);

KernelObjVisitor Visitor{*this};
Visitor.VisitRecordBases(KernelObj, kernel_decl, kernel_body, int_header);
Expand Down Expand Up @@ -3810,6 +3908,9 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
O << "getParamDesc(unsigned i) {\n";
O << " return kernel_signatures[i+" << CurStart << "];\n";
O << " }\n";
O << " __SYCL_DLL_LOCAL\n";
O << " static constexpr bool callsThisItem() { return ";
O << K.CallsThisItem << "; }\n";
O << "};\n";
CurStart += N;
}
Expand Down Expand Up @@ -3866,6 +3967,12 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) {
SpecConsts.emplace_back(std::make_pair(IDType, IDName.str()));
}

void SYCLIntegrationHeader::setCallsThisItem(bool B) {
auto *K = getCurKernelDesc();
assert(K && "no kernels");
K->CallsThisItem = B;
}

SYCLIntegrationHeader::SYCLIntegrationHeader(DiagnosticsEngine &_Diag,
bool _UnnamedLambdaSupport,
Sema &_S)
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/CL/sycl/detail/kernel_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ template <class KernelNameType> struct KernelInfo {
return Dummy;
}
static constexpr const char *getName() { return ""; }
static bool callsThisItem() { return false; }
};
#else
template <char...> struct KernelInfoData {
Expand All @@ -65,6 +66,7 @@ template <char...> struct KernelInfoData {
return Dummy;
}
static constexpr const char *getName() { return ""; }
static bool callsThisItem() { return false; }
};

// C++14 like index_sequence and make_index_sequence
Expand Down
88 changes: 76 additions & 12 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
using name = Type;
};

// Used when parallel_for range is rounded-up.
template <typename Type> class __pf_kernel_wrapper;

template <typename Type> struct get_kernel_wrapper_name_t {
using name = __pf_kernel_wrapper<
typename get_kernel_name_t<detail::auto_name, Type>::name>;
};

template <typename, typename T> struct check_fn_signature {
static_assert(std::integral_constant<T, false>::value,
"Second template parameter is required to be of function type");
Expand Down Expand Up @@ -728,23 +736,79 @@ class __SYCL_EXPORT handler {
void parallel_for_lambda_impl(range<Dims> NumWorkItems,
KernelType KernelFunc) {
throwIfActionIsCreated();
using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;

// If 1D kernel argument is an integral type, convert it to sycl::item<1>
using TransformedArgType =
typename detail::conditional_t<std::is_integral<LambdaArgType>::value &&
Dims == 1,
item<Dims>, LambdaArgType>;
typename std::conditional<std::is_integral<LambdaArgType>::value &&
Dims == 1,
item<Dims>, LambdaArgType>::type;
using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;

// A reasonable choice for rounding up the range is 32.
constexpr size_t GoodLocalSizeX = 32;

// Disable the rounding-up optimizations under these conditions:
// 1. The env var SYCL_OPT_PFWGS_DISABLE is set
// 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name.
// 3. The kernel is created and invoked without an integration header entry.
// 4. The API "this_item" is used inside the kernel.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the rationale for this?
Also point 3 is not obvious to me...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Allows disabling this transformation at runtime for all parallel_fors without source code changes.
  2. Allows disabling this transformation for a specific parallel_for by changing the source code.
  3. Disables this optimization when kernel argument setting is done manually by using set_args.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My question was actually for the point "4."

Otherwise I find the point "3." quite clearer.
But is the reason that the kernel is provided by an interoperability method instead and thus there is no way to change the kernel from inside? The fact that set_arg is used does not matter I think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. It is an implementation choice not to support this optimization for the interoperability method. It could conceivably be done but was felt to be of low value. The optimization is intended for the common case of parallel_for.
  2. "this_item" is a free function, i.e. not tied to any specific id or item. When concurrent parallel_fors are executing on a device it is not easy to tell which parallel_for the call is being made from, to return the related item. Use of "this_item" is not considered a common case and it is acceptable to give up the optimization when this_item is used within a kernel.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks, very useful. Now I can understand what's going on.
Could you please copy-paste your explanations in the comments? :-)
I have the feeling that if you detect this_item you can clone & rewrite all the call graph to pass explicitly the real item, which looks like some heavy duty... So "giving up" looks like a lazy starting point... :-)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added to the comments.

// 5. The range is already a multiple of the rounding factor.

// Get the kernal name to check condition 3.
std::string KName = typeid(NameT *).name();
using KI = detail::KernelInfo<KernelName>;
bool DisableRounding =
(getenv("SYCL_OPT_PFWGS_DISABLE") != nullptr) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can env var names be less cryptic? Also we need to document them in https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

(KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos) ||
(KI::getName() == nullptr || KI::getName()[0] == '\0') ||
(KI::callsThisItem());

// Perform range rounding if rounding-up is enabled
// and the user-specified range is not a multiple of a "good" value.
if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
// It is sufficient to round up just the first dimension.
// Multiplying the rounded-up value of the first dimension
// by the values of the remaining dimensions (if any)
// will yield a rounded-up value for the total range.
size_t NewValX =
((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
GoodLocalSizeX;
using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to "
<< NewValX << " *****\n";
auto Wrapper = [=](TransformedArgType Arg) {
if (Arg[0] >= NumWorkItems[0])
return;
Arg.set_allowed_range(NumWorkItems);
KernelFunc(Arg);
};

range<Dims> AdjustedRange = NumWorkItems;
AdjustedRange.set_range(NewValX);
#ifdef __SYCL_DEVICE_ONLY__
(void)NumWorkItems;
kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
#else
detail::checkValueRange<Dims>(NumWorkItems);
MNDRDesc.set(std::move(NumWorkItems));
StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
std::move(KernelFunc));
MCGType = detail::CG::KERNEL;
detail::checkValueRange<Dims>(AdjustedRange);
MNDRDesc.set(std::move(AdjustedRange));
StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
std::move(Wrapper));
MCGType = detail::CG::KERNEL;
#endif
} else {
#ifdef __SYCL_DEVICE_ONLY__
(void)NumWorkItems;
kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
#else
detail::checkValueRange<Dims>(NumWorkItems);
MNDRDesc.set(std::move(NumWorkItems));
StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
std::move(KernelFunc));
MCGType = detail::CG::KERNEL;
#endif
}
}

/// Defines and invokes a SYCL kernel function for the specified range.
Expand Down
4 changes: 4 additions & 0 deletions sycl/include/CL/sycl/id.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
__SYCL_GEN_OPT(^=)

#undef __SYCL_GEN_OPT

private:
friend class handler;
void set_allowed_range(range<dimensions> rnwi) { (void)rnwi[0]; }
};

namespace detail {
Expand Down
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/item.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ template <int dimensions = 1, bool with_offset = true> class item {
friend class detail::Builder;

private:
friend class handler;
void set_allowed_range(const range<dimensions> rnwi) { MImpl.MExtent = rnwi; }

detail::ItemBase<dimensions, with_offset> MImpl;
};

Expand Down
8 changes: 8 additions & 0 deletions sycl/include/CL/sycl/range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#pragma once
#include <CL/sycl/detail/array.hpp>
#include <CL/sycl/detail/helpers.hpp>

#include <stdexcept>
#include <type_traits>
Expand Down Expand Up @@ -141,6 +142,13 @@ template <int dimensions = 1> class range : public detail::array<dimensions> {
__SYCL_GEN_OPT(^=)

#undef __SYCL_GEN_OPT

private:
friend class handler;
friend class detail::Builder;

// Adjust the first dim of the range
void set_range(const size_t dim0) { this->common_array[0] = dim0; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sets first dim - better reflect this in the name (e.g. set_zero_dim)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed name.

};

#ifdef __cpp_deduction_guides
Expand Down