Skip to content

[SYCL]Enable Dead Function Elimination in sycl-post-link #2723

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Driver/Action.h
Original file line number Diff line number Diff line change
Expand Up @@ -687,8 +687,13 @@ class SYCLPostLinkJobAction : public JobAction {

bool getRTSetsSpecConstants() const { return RTSetsSpecConsts; }

/// Sets the flag that requests dead function elimination for this post-link
/// action. The flag defaults to false.
void setDeadFunctionElimination(bool Val) { DeadFunctionElimination = Val; }

/// Returns true if dead function elimination was requested for this
/// post-link action.
bool getDeadFunctionElimination() const { return DeadFunctionElimination; }

private:
bool RTSetsSpecConsts = true;
bool DeadFunctionElimination = false;
};

class PartialLinkJobAction : public JobAction {
Expand Down
53 changes: 42 additions & 11 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3760,15 +3760,17 @@ class OffloadingActionBuilder final {
for (auto SDA : SYCLDeviceActions)
SYCLLinkBinaryList.push_back(SDA);
if (WrapDeviceOnlyBinary) {
bool SYCLDeviceLibLinked = false;
// If used without -fintelfpga, -fsycl-link is used to wrap device
// objects for future host link. Device libraries should be linked
// by default to resolve any undefined reference.
if (!Args.hasArg(options::OPT_fintelfpga)) {
const auto *TC = ToolChains.front();
addSYCLDeviceLibs(TC, SYCLLinkBinaryList, true,
C.getDefaultToolChain()
.getTriple()
.isWindowsMSVCEnvironment());
SYCLDeviceLibLinked =
addSYCLDeviceLibs(TC, SYCLLinkBinaryList, true,
C.getDefaultToolChain()
.getTriple()
.isWindowsMSVCEnvironment());
}
// -fsycl-link behavior does the following to the unbundled device
// binaries:
Expand All @@ -3777,17 +3779,28 @@ class OffloadingActionBuilder final {
// 3) Translate final .bc file to .spv
// 4) Wrap the binary with the offload wrapper which can be used
// by any compilation link step.
SYCLPostLinkJobAction *PostLinkDFEAction = nullptr;
SYCLPostLinkJobAction *PostLinkAction = nullptr;
auto *DeviceLinkAction = C.MakeAction<LinkJobAction>(
SYCLLinkBinaryList, types::TY_Image);
auto *PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(
DeviceLinkAction, types::TY_LLVM_BC);
if (!SYCLDeviceLibLinked)
PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(
DeviceLinkAction, types::TY_LLVM_BC);
else {
PostLinkDFEAction = C.MakeAction<SYCLPostLinkJobAction>(
DeviceLinkAction, types::TY_LLVM_BC);
PostLinkDFEAction->setDeadFunctionElimination(true);
PostLinkDFEAction->setRTSetsSpecConstants(false);
PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(
PostLinkDFEAction, types::TY_LLVM_BC);
}
auto *TranslateAction = C.MakeAction<SPIRVTranslatorJobAction>(
PostLinkAction, types::TY_Image);
SYCLLinkBinary = C.MakeAction<OffloadWrapperJobAction>(
TranslateAction, types::TY_Object);
} else {
auto *Link = C.MakeAction<LinkJobAction>(SYCLLinkBinaryList,
types::TY_Image);
types::TY_Image);
SYCLLinkBinary = C.MakeAction<SPIRVTranslatorJobAction>(
Link, types::TY_Image);
}
Expand Down Expand Up @@ -3936,7 +3949,9 @@ class OffloadingActionBuilder final {
SYCLDeviceActions.clear();
}

void addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,
// Return a bool value to indicate whether some device libraries are
// linked with users' device image.
bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,
bool isSpirvAOT, bool isMSVCEnv) {
enum SYCLDeviceLibType {
sycl_devicelib_wrapper,
Expand All @@ -3947,6 +3962,7 @@ class OffloadingActionBuilder final {
StringRef devicelib_option;
};

int NumOfDeviceLibLinked = 0;
bool NoDeviceLibs = false;
// Currently, libc, libm-fp32 will be linked in by default. In order
// to use libm-fp64, -fsycl-device-lib=libm-fp64/all should be used.
Expand Down Expand Up @@ -4005,6 +4021,7 @@ class OffloadingActionBuilder final {
llvm::sys::path::append(LibName, Lib.devicelib_name);
llvm::sys::path::replace_extension(LibName, LibSuffix);
if (llvm::sys::fs::exists(LibName)) {
++NumOfDeviceLibLinked;
Arg *InputArg = MakeInputArg(Args, C.getDriver().getOpts(),
Args.MakeArgString(LibName));
auto *SYCLDeviceLibsInputAction =
Expand All @@ -4020,6 +4037,7 @@ class OffloadingActionBuilder final {
addInputs(sycl_devicelib_wrapper);
if (isSpirvAOT)
addInputs(sycl_devicelib_fallback);
return NumOfDeviceLibLinked != 0;
}

void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
Expand Down Expand Up @@ -4102,6 +4120,7 @@ class OffloadingActionBuilder final {
ActionList DeviceLibObjects;
ActionList LinkObjects;
auto TT = SYCLTripleList[I];
bool SYCLDeviceLibLinked = false;
auto isNVPTX = (*TC)->getTriple().isNVPTX();
bool isSpirvAOT = TT.getSubArch() == llvm::Triple::SPIRSubArch_fpga ||
TT.getSubArch() == llvm::Triple::SPIRSubArch_gen ||
Expand All @@ -4118,7 +4137,7 @@ class OffloadingActionBuilder final {
// device libraries are only needed when current toolchain is using
// AOT compilation.
if (!isNVPTX) {
addSYCLDeviceLibs(
SYCLDeviceLibLinked = addSYCLDeviceLibs(
*TC, LinkObjects, true,
C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment());
}
Expand Down Expand Up @@ -4199,8 +4218,20 @@ class OffloadingActionBuilder final {
types::ID PostLinkOutType = isNVPTX || !MultiFileActionDeps
? types::TY_LLVM_BC
: types::TY_Tempfiletable;
auto *PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(
DeviceLinkAction, PostLinkOutType);

SYCLPostLinkJobAction *PostLinkDFEAction = nullptr;
SYCLPostLinkJobAction *PostLinkAction = nullptr;
if (!SYCLDeviceLibLinked)
PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(DeviceLinkAction,
PostLinkOutType);
else {
PostLinkDFEAction = C.MakeAction<SYCLPostLinkJobAction>(
DeviceLinkAction, types::TY_LLVM_BC);
PostLinkDFEAction->setDeadFunctionElimination(true);
PostLinkDFEAction->setRTSetsSpecConstants(false);
PostLinkAction = C.MakeAction<SYCLPostLinkJobAction>(
PostLinkDFEAction, PostLinkOutType);
}
PostLinkAction->setRTSetsSpecConstants(!isAOT);

if (isNVPTX) {
Expand Down
6 changes: 5 additions & 1 deletion clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8037,6 +8037,7 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
// OPT_fsycl_device_code_split is not checked as it is an alias to
// -fsycl-device-code-split=per_source

auto *SYCLPostLink = llvm::dyn_cast<SYCLPostLinkJobAction>(&JA);
// Turn on Dead Parameter Elimination Optimization with early optimizations
if (!getToolChain().getTriple().isNVPTX() &&
TCArgs.hasFlag(options::OPT_fsycl_dead_args_optimization,
Expand All @@ -8047,14 +8048,17 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
// transformations (like specialization constant intrinsic lowering) and
// output LLVMIR
addArgs(CmdArgs, TCArgs, {"-ir-output-only"});
// DeadFunctionElimination must work with IROutputOnly to clean the
// original LLVMIR
Comment on lines +8051 to +8052
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Side comment: I'm sure I understand the first comment in this scope.

    // single file output requested - this means only perform necessary IR
    // transformations (like specialization constant intrinsic lowering) and
    // output LLVMIR

I think opt is the right tool to do LLVM IR transformations. Adding "DeadFunctionElimination" functionality to sycl-post-link tool seems like a bad idea as it duplicates already existing opt functionality.
I think we should remove "IR transformations" functionality from the sycl-post-link tool and use dedicated opt tool instead. I don't have specific instructions how to re-write the code, but we should work in this direction.

Copy link
Contributor Author

@jinge90 jinge90 Nov 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @bader
I agree that moving the "DeadFunctionElimination" functionality from sycl-post-link to llvm opt is a better idea. This requires us to enable opt in the compiler package, because the opt tool is not currently generated in the SYCL compiler package.
However, the existing llvm "GlobalDCE" pass can't meet our requirements, although it is much more complicated. The reason is that the existing llvm "GlobalDCE" is built on an assumption that does not hold for SYCL device code.
I dumped device code (LLVM IR .bc format) with all device libraries being linked in and used following command to try to eliminate "dead" code:
opt -globaldce redudant.bc -o reduced.bc
However, the redundant functions from device libraries were not eliminated at all. The reason is that all functions with "external" linkage that have a function body in the current module are marked as "Alive" and won't be eliminated: https://github.com/intel/llvm/blob/sycl/llvm/lib/Transforms/IPO/GlobalDCE.cpp#L321
The functions from device libraries are "external" and have function body, so they will survive in "GlobalDCE".
The assumption "GlobalDCE" is based on seems to make sense: the "GlobalDCE" pass doesn't know whether the current module is completely linked or not, so the exported functions may still be used if the current module is linked with other modules later. In our scenario, "DeadFunctionElimination" runs after llvm-link finishes linking everything, including device libraries, so its input is a completely linked device image (in spv online link mode, some "_devicelib*" references will be handled by the sycl runtime), which means that no SYCL function that has a function body will be used by other modules later. The logic of "DeadFunctionElimination" is also much simpler than the existing "GlobalDCE": we only need to go through all spirv functions and, if a function doesn't have any "user" in the current module, we can remove it. The only exception is functions with the "referenced-indirectly" attribute; we need to keep them to support device function pointers.
Existing llvm "GlobalDCE" also includes something which are not necessary in SYCL device scenario such as handling virtual functions, scanning vtable load, handling ifuncs...
So, in order to enable "DeadFunctionElimination" in sycl toolchain, we may have following ways:

  1. Implementing "DeadFunctionElimination" in sycl-post.cpp which seems not to be a good idea
  2. Splitting the "DeadFunctionElimination" into a new pass such as "SYCLDeadFunctionEliminationPass" and run the pass in sycl-post-link tool which is the same way as "SpecConstantsPass"
  3. Modifying the existing llvm "GlobalDCE" pass and make it to meet our requirement for SYCL. It seems that we may need to modify much community code and introduce some sycl specific code.
  4. We add a "SYCLGlobalDCE" and don't touch existing community's "GlobalDCE", the "SYCLGlobalDCE" is much simpler. This is similar to "sycl dead argument elimination".
    In 3 and 4, we need to enable opt after the completely linked device image is generated by llvm-link.

Which one do you prefer or do you have any other good ideas?
Thanks very much.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to just have a small pass cleaning up the code first before using plain GlobalDCE?
Similar to for example https://github.com/triSYCL/llvm/blob/sycl/master/lib/SYCL/SYCLKernelFilter.cpp

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @keryell
It seems to be a different scenario. In the compilation phase, we can run "GlobalDCE" on each module regardless of whether it is "completely linked" or not.
"DeadFunctionElimination" introduced by this patch aims to eliminate SPIRV functions which are not called in device code(mainly for cleaning up unused functions introduced by device libraries), so its input must be a "completely linked" device image. We can only get such "completely linked" device image when llvm-link finishes linking everything.

But I think moving the functionality to a small SYCL pass should be the right direction. At least, we can make sycl-post-link.cpp more clean by doing so.
Thanks very much.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think opt is the right tool to do LLVM IR transformations. Adding "DeadFunctionElimination" functionality to sycl-post-link tool seems like a bad idea as it duplicates already existing opt functionality.
I think we should remove "IR transformations" functionality from the sycl-post-link tool and use dedicated opt tool instead. I don't have specific instructions how to re-write the code, but we should work in this direction.

I agree with @bader here. Originally the post-link tool's purpose was to handle device code splitting. Then spec constants handling was added as it depended on the splitting. Now it turns more and more into an LTO driver. This is not the purpose of the tool.

I'm not sure if we can or it makes sense to reuse the LLVM LTO infra for our device code - maybe @andykaylor has a perspective. To me using opt-based Driver job to run necessary LTO passes like the GlobalDCE before handing off the optimized IR to SYCL post-link is the right way to go at this point. Later we can decide to reuse LTO infra and adjust accordingly. Linking with default device libraries should also be factored out of the post-link into a driver job, IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @kbobrovs
Thank you for suggestion.
Currently, we have enabled functionality to analyze device code to get required device libraries(spv file) in sycl-post-link tool too which is not the purpose of sycl-post-link tool either.
I am not sure whether we can enable the llvm "opt" tool in the device compilation workflow. If "opt" can be enabled, we can put "dead function elimination" or "device code device library requirement analysis" into llvm passes and run them via opt offline. This will require some driver work, since we need to add "opt" to the driver workflow, but it will make sycl-post-link clean.

if (SYCLPostLink && SYCLPostLink->getDeadFunctionElimination())
addArgs(CmdArgs, TCArgs, {"--dead-function-elimination"});
} else {
assert(JA.getType() == types::TY_Tempfiletable);
// Symbol file and specialization constant info generation is mandatory -
// add options unconditionally
addArgs(CmdArgs, TCArgs, {"-symbols"});
}
// specialization constants processing is mandatory
auto *SYCLPostLink = llvm::dyn_cast<SYCLPostLinkJobAction>(&JA);
if (SYCLPostLink && SYCLPostLink->getRTSetsSpecConstants())
addArgs(CmdArgs, TCArgs, {"-spec-const=rt"});
else
Expand Down
53 changes: 48 additions & 5 deletions llvm/tools/sycl-post-link/sycl-post-link.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ static cl::opt<bool> EmitKernelParamInfo{
"emit-param-info", cl::desc("emit kernel parameter optimization info"),
cl::cat(PostLinkCat)};

// Command-line switch "--dead-function-elimination". When it is set together
// with IR-only output, main() runs eliminateDeadFunctions() on the input
// module, saves the result and exits; splitting, spec-constant lowering and
// kernel parameter info are disabled while it is set.
static cl::opt<bool> DeadFunctionElimination{
"dead-function-elimination",
cl::desc("Eliminate dead functions in device image"), cl::cat(PostLinkCat)};

struct ImagePropSaveInfo {
bool NeedDeviceLibReqMask;
bool DoSpecConst;
Expand Down Expand Up @@ -566,6 +570,32 @@ static string_vector saveResultSymbolsLists(string_vector &ResSymbolsLists) {
return std::move(Res);
}

// Eliminate 'dead' functions, i.e. SPIR_FUNC functions that have no users
// left in the device LLVM IR module. There is one exception: functions with
// the "referenced-indirectly" attribute are always kept, since they may be
// called indirectly via a function pointer.
//
// Removing a function can leave the functions it referenced without users,
// so the module is rescanned until a full scan finds nothing more to remove.
//
// NOTE(review): feedback on this change pointed out that it overlaps with
// LLVM's GlobalDCEPass and suggested reusing that pass to avoid unnecessary
// maintenance.
static void eliminateDeadFunctions(Module &M) {
  std::vector<Function *> DeadFunctions;
  do {
    DeadFunctions.clear();
    for (Function &F : M) {
      if (F.user_empty() && (F.getCallingConv() == CallingConv::SPIR_FUNC) &&
          !F.getAttributes().hasFnAttribute("referenced-indirectly")) {
        // Drop the body right away so that functions referenced only from
        // this one can already be seen as unused later in the same scan.
        F.deleteBody();
        DeadFunctions.push_back(&F);
      }
    }

    // Erase only after the scan so the module's function list is not
    // modified while it is being iterated. eraseFromParent() unlinks AND
    // destroys the Function; merely unlinking it from the list would leak
    // the object.
    for (Function *F : DeadFunctions)
      F->eraseFromParent();
  } while (!DeadFunctions.empty());
}

#define CHECK_AND_EXIT(E) \
{ \
Error LocE = std::move(E); \
Expand Down Expand Up @@ -612,11 +642,18 @@ int main(int argc, char **argv) {
"will produce single output file example_p.bc suitable for SPIRV\n"
"translation.\n");

bool DoSplit = SplitMode.getNumOccurrences() > 0;
bool DoSpecConst = SpecConstLower.getNumOccurrences() > 0;
bool DoParamInfo = EmitKernelParamInfo.getNumOccurrences() > 0;

if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo) {
// DeadFunctionElimination is used for removing unused functions from the
// ORIGINAL IR; it must be used together with IROutputOnly and can't be
// combined with other options such as DoSplit, DoSpecConst, DoParamInfo...
bool DoSplit =
(SplitMode.getNumOccurrences() > 0 && !DeadFunctionElimination);
bool DoSpecConst =
(SpecConstLower.getNumOccurrences() > 0 && !DeadFunctionElimination);
bool DoParamInfo =
(EmitKernelParamInfo.getNumOccurrences() > 0 && !DeadFunctionElimination);

if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo &&
!DeadFunctionElimination) {
errs() << "no actions specified; try --help for usage info\n";
return 1;
}
Expand Down Expand Up @@ -648,6 +685,12 @@ int main(int argc, char **argv) {
if (OutputFilename.getNumOccurrences() == 0)
OutputFilename = (Twine(sys::path::stem(InputFilename)) + ".files").str();

if (DeadFunctionElimination && IROutputOnly) {
eliminateDeadFunctions(*MPtr);
saveModule(*MPtr, OutputFilename);
return 0;
}

std::map<StringRef, std::vector<Function *>> GlobalsSet;

if (DoSplit || DoSymGen) {
Expand Down