Skip to content

[SYCL] Generalize GlobalOffset and enable it for AMDGPU #5855

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 5, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -316,5 +316,10 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")

//===----------------------------------------------------------------------===//
// SYCL builtin.
//===----------------------------------------------------------------------===//
BUILTIN(__builtin_amdgcn_implicit_offset, "Ui*5", "nc")

#undef BUILTIN
#undef TARGET_BUILTIN
51 changes: 38 additions & 13 deletions clang/lib/CodeGen/TargetInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,26 @@ static bool occupiesMoreThan(CodeGenTypes &cgt,
return (intCount + fpCount > maxAllRegisters);
}

/// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV,
/// Name, and Operand as operands, and adds the resulting MDNode to the
/// AnnotationName MDNode.
static void addAMDGCOrNVVMMetadata(const char *AnnotationName,
llvm::GlobalValue *GV, StringRef Name,
int Operand) {
llvm::Module *M = GV->getParent();
llvm::LLVMContext &Ctx = M->getContext();

// Get annotations metadata node.
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName);

llvm::Metadata *MDVals[] = {
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
// Append metadata to annotations node.
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

bool SwiftABIInfo::isLegalVectorTypeForSwift(CharUnits vectorSize,
llvm::Type *eltTy,
unsigned numElts) const {
Expand Down Expand Up @@ -7304,18 +7324,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(

void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
StringRef Name, int Operand) {
llvm::Module *M = GV->getParent();
llvm::LLVMContext &Ctx = M->getContext();

// Get "nvvm.annotations" metadata node
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

llvm::Metadata *MDVals[] = {
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
// Append metadata to nvvm.annotations
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
addAMDGCOrNVVMMetadata("nvvm.annotations", GV, Name, Operand);
}

bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
Expand Down Expand Up @@ -9285,6 +9294,12 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
llvm::Type *BlockTy) const override;
bool shouldEmitStaticExternCAliases() const override;
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;

private:
// Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
// resulting MDNode to the amdgcn.annotations MDNode.
static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name,
int Operand);
};
}

Expand All @@ -9301,6 +9316,11 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
}

void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV,
StringRef Name, int Operand) {
addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand);
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
const auto *ReqdWGS =
Expand Down Expand Up @@ -9402,10 +9422,15 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(

const bool IsHIPKernel =
M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();

if (IsHIPKernel)
F->addFnAttr("uniform-work-group-size", "true");

// Create !{<func-ref>, metadata !"kernel", i32 1} node for SYCL kernels.
const bool IsSYCLKernel =
FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>();
if (IsSYCLKernel)
addAMDGCNMetadata(F, "kernel", 1);

if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

Expand Down
6 changes: 0 additions & 6 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5865,12 +5865,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-treat-scalable-fixed-error-as-warning");
}

// Enable local accessor to shared memory pass for SYCL.
if (isa<BackendJobAction>(JA) && IsSYCLOffloadDevice &&
(Triple.isNVPTX() || Triple.isAMDGCN())) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-sycl-enable-local-accessor");
}
// These two are potentially updated by AddClangCLArgs.
codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;
bool EmitCodeView = false;
Expand Down
8 changes: 2 additions & 6 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
const llvm::opt::ArgList &Args) const {
// Construct lld command.
// The output from ld.lld is an HSA code object file.
ArgStringList LldArgs{"-flavor",
"gnu",
"--no-undefined",
"-shared",
"-plugin-opt=-amdgpu-internalize-symbols",
"-plugin-opt=-sycl-enable-local-accessor"};
ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared",
"-plugin-opt=-amdgpu-internalize-symbols"};

auto &TC = getToolChain();
auto &D = TC.getDriver();
Expand Down
48 changes: 48 additions & 0 deletions clang/test/CodeGenSYCL/kernel-annotation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// RUN: %clang_cc1 -triple spir64-unknown-unknown -fsycl-is-device \
// RUN: -S -emit-llvm %s -o %temp.ll
// RUN: FileCheck -check-prefix=CHECK-SPIR --input-file %temp.ll %s

// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fsycl-is-device \
// RUN: -S -emit-llvm %s -o %temp.ll
// RUN: FileCheck -check-prefix=CHECK-NVPTX --input-file %temp.ll %s

// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -fsycl-is-device \
// RUN: -S -emit-llvm %s -o %temp.ll
// RUN: FileCheck -check-prefix=CHECK-AMDGCN --input-file %temp.ll %s

// The test makes sure that `[nnvm|amdgcn].annotations are correctly generated
// only for their respective targets.

#include "Inputs/sycl.hpp"

sycl::handler H;

class Functor {
public:
void operator()() const {}
};

// CHECK-SPIR-NOT: annotations =

// CHECK-NVPTX: nvvm.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]}
// CHECK-NVPTX: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1}
// CHECK-NVPTX: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1}

// CHECK-AMDGCN: amdgcn.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]}
// CHECK-AMDGCN: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1}
// CHECK-AMDGCN: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1}

int main() {
sycl::queue q;
q.submit([&](sycl::handler &cgh) {
Functor foo{};
cgh.single_task(foo);
});

q.submit([&](cl::sycl::handler &cgh) {
cgh.parallel_for<class foo_2>(cl::sycl::range<1>(1),
[=](cl::sycl::item<1> item) {
});
});
return 0;
}
14 changes: 0 additions & 14 deletions clang/test/Driver/sycl-local-accessor-opt.cpp

This file was deleted.

1 change: 1 addition & 0 deletions libclc/amdgcn-amdhsa/libspirv/SOURCES
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@

workitem/get_global_offset.ll
group/collectives.cl
group/collectives_helpers.ll
atomic/loadstore_helpers.ll
Expand Down
42 changes: 42 additions & 0 deletions libclc/amdgcn-amdhsa/libspirv/workitem/get_global_offset.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
;;===----------------------------------------------------------------------===//
;;
;; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
;; See https://llvm.org/LICENSE.txt for license information.
;; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;;
;;===----------------------------------------------------------------------===//

#if __clang_major__ >= 7
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
#else
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
#endif

; Function Attrs: nounwind readnone speculatable
declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset()

define hidden i64 @_Z22__spirv_GlobalOffset_xv() nounwind alwaysinline {
entry:
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
%1 = load i32, i32 addrspace(5)* %0, align 4
%zext = zext i32 %1 to i64
ret i64 %zext
}

define hidden i64 @_Z22__spirv_GlobalOffset_yv() nounwind alwaysinline {
entry:
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
%arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 1
%1 = load i32, i32 addrspace(5)* %arrayidx, align 4
%zext = zext i32 %1 to i64
ret i64 %zext
}

define hidden i64 @_Z22__spirv_GlobalOffset_zv() nounwind alwaysinline {
entry:
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
%arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2
%1 = load i32, i32 addrspace(5)* %arrayidx, align 4
%zext = zext i32 %1 to i64
ret i64 %zext
}
1 change: 0 additions & 1 deletion libclc/amdgcn/libspirv/SOURCES
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
workitem/get_global_offset.cl
workitem/get_group_id.cl
workitem/get_global_size.cl
workitem/get_local_id.cl
Expand Down
25 changes: 0 additions & 25 deletions libclc/amdgcn/libspirv/workitem/get_global_offset.cl

This file was deleted.

27 changes: 27 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14985,6 +14985,33 @@ track the usage for each kernel. However, in some cases careful organization of
the kernels and functions in the source file means there is minimal additional
effort required to accurately calculate GPR usage.

SYCL Kernel Metadata
====================

This section describes the additional metadata that is inserted for SYCL
kernels. As SYCL is a single source programming model functions can either
execute on a host or a device (i.e. GPU). Device kernels are akin to kernel
entry-points in GPU program. To mark an LLVM IR function as a device kernel
function, we make use of special LLVM metadata. The AMDGCN back-end will look
for a named metadata node called ``amdgcn.annotations``. This named metadata
must contain a list of metadata that describe the kernel IR. For our purposes,
we need to declare a metadata node that assigns the `"kernel"` attribute to the
LLVM IR function that should be emitted as a SYCL kernel function. These
metadata nodes take the form:

.. code-block:: text

!{<function ref>, metadata !"kernel", i32 1}

Consider the metadata generated by global-offset pass, showing a void kernel
function `example_kernel_with_offset` taking one argument, a pointer to 3 i32
integers:

.. code-block:: llvm

!amdgcn.annotations = !{!0}
!0 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1}

Additional Documentation
========================

Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2059,4 +2059,8 @@ def int_amdgcn_reloc_constant : Intrinsic<
[llvm_i32_ty], [llvm_metadata_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// SYCL
def int_amdgcn_implicit_offset : GCCBuiltin<"__builtin_amdgcn_implicit_offset">,
Intrinsic<[LLVMQualPointerType<llvm_i32_ty, 5>], [], [IntrNoMem, IntrSpeculatable]>;
}
43 changes: 43 additions & 0 deletions llvm/include/llvm/SYCLLowerIR/TargetHelpers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//===------------ TargetHelpers.h - Helpers for SYCL kernels ------------- ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Helper functions for processing SYCL kernels.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H
#define LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

namespace llvm {
namespace TargetHelpers {

enum class ArchType { Cuda, AMDHSA, Unsupported };

struct KernelPayload {
KernelPayload(Function *Kernel, MDNode *MD = nullptr);
Function *Kernel;
MDNode *MD;
};

ArchType getArchType(const Module &M);

std::string getAnnotationString(ArchType AT);

void populateKernels(Module &M, SmallVectorImpl<KernelPayload> &Kernels,
TargetHelpers::ArchType AT);

} // end namespace TargetHelpers
} // end namespace llvm

#endif
2 changes: 2 additions & 0 deletions llvm/lib/SYCLLowerIR/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ add_llvm_component_library(LLVMSYCLLowerIR
MutatePrintfAddrspace.cpp

LocalAccessorToSharedMemory.cpp
GlobalOffset.cpp
TargetHelpers.cpp

ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR
Expand Down
Loading