Skip to content

Commit 2dc3c06

Browse files
authored
[SYCL] Generalize GlobalOffset and enable it for AMDGPU (#5855)
The purpose of this patch is to generalize SYCL global offset pass and enable it for AMDGPU. * enable global offset in AMD's HIP * decorate SYCL kernel with dedicated MDNode: This removes the need for command line options added by the SYCL driver, discussed here: [SYCL] Generalize local accessor to shared mem pass #5149 (comment) * extract common helpers for local accessor and global offset passes * generalize the pass * introduce builtin_amdgcn_implicit_offset and enable the pass for ADMGPU * implement spirv_GlobalOffset_[x,y,z] * update the docs The main deviation from the NVPTX is the need for supporting address spaces. For AMD kernel arguments reside in constant address space, which for the case with offset forces a copy to private AS, in order to keep the call-graph interface coherent (we can't allocate const address space for the case without offset). Corresponding test-suit PR: intel/llvm-test-suite#941
1 parent 05fe5ae commit 2dc3c06

File tree

45 files changed

+843
-294
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+843
-294
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,5 +316,10 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi",
316316
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
317317
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")
318318

319+
//===----------------------------------------------------------------------===//
320+
// SYCL builtin.
321+
//===----------------------------------------------------------------------===//
322+
BUILTIN(__builtin_amdgcn_implicit_offset, "Ui*5", "nc")
323+
319324
#undef BUILTIN
320325
#undef TARGET_BUILTIN

clang/lib/CodeGen/TargetInfo.cpp

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,26 @@ static bool occupiesMoreThan(CodeGenTypes &cgt,
151151
return (intCount + fpCount > maxAllRegisters);
152152
}
153153

154+
/// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV,
155+
/// Name, and Operand as operands, and adds the resulting MDNode to the
156+
/// AnnotationName MDNode.
157+
static void addAMDGCOrNVVMMetadata(const char *AnnotationName,
158+
llvm::GlobalValue *GV, StringRef Name,
159+
int Operand) {
160+
llvm::Module *M = GV->getParent();
161+
llvm::LLVMContext &Ctx = M->getContext();
162+
163+
// Get annotations metadata node.
164+
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName);
165+
166+
llvm::Metadata *MDVals[] = {
167+
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
168+
llvm::ConstantAsMetadata::get(
169+
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
170+
// Append metadata to annotations node.
171+
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
172+
}
173+
154174
bool SwiftABIInfo::isLegalVectorTypeForSwift(CharUnits vectorSize,
155175
llvm::Type *eltTy,
156176
unsigned numElts) const {
@@ -7327,18 +7347,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
73277347

73287348
void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
73297349
StringRef Name, int Operand) {
7330-
llvm::Module *M = GV->getParent();
7331-
llvm::LLVMContext &Ctx = M->getContext();
7332-
7333-
// Get "nvvm.annotations" metadata node
7334-
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
7335-
7336-
llvm::Metadata *MDVals[] = {
7337-
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
7338-
llvm::ConstantAsMetadata::get(
7339-
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
7340-
// Append metadata to nvvm.annotations
7341-
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
7350+
addAMDGCOrNVVMMetadata("nvvm.annotations", GV, Name, Operand);
73427351
}
73437352

73447353
bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
@@ -9308,6 +9317,12 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
93089317
llvm::Type *BlockTy) const override;
93099318
bool shouldEmitStaticExternCAliases() const override;
93109319
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
9320+
9321+
private:
9322+
// Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
9323+
// resulting MDNode to the amdgcn.annotations MDNode.
9324+
static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name,
9325+
int Operand);
93119326
};
93129327
}
93139328

@@ -9324,6 +9339,11 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
93249339
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
93259340
}
93269341

9342+
void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV,
9343+
StringRef Name, int Operand) {
9344+
addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand);
9345+
}
9346+
93279347
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
93289348
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
93299349
const auto *ReqdWGS =
@@ -9425,10 +9445,15 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
94259445

94269446
const bool IsHIPKernel =
94279447
M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();
9428-
94299448
if (IsHIPKernel)
94309449
F->addFnAttr("uniform-work-group-size", "true");
94319450

9451+
// Create !{<func-ref>, metadata !"kernel", i32 1} node for SYCL kernels.
9452+
const bool IsSYCLKernel =
9453+
FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>();
9454+
if (IsSYCLKernel)
9455+
addAMDGCNMetadata(F, "kernel", 1);
9456+
94329457
if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
94339458
F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
94349459

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5842,12 +5842,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
58425842
CmdArgs.push_back("-treat-scalable-fixed-error-as-warning");
58435843
}
58445844

5845-
// Enable local accessor to shared memory pass for SYCL.
5846-
if (isa<BackendJobAction>(JA) && IsSYCLOffloadDevice &&
5847-
(Triple.isNVPTX() || Triple.isAMDGCN())) {
5848-
CmdArgs.push_back("-mllvm");
5849-
CmdArgs.push_back("-sycl-enable-local-accessor");
5850-
}
58515845
// These two are potentially updated by AddClangCLArgs.
58525846
codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;
58535847
bool EmitCodeView = false;

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
7878
const llvm::opt::ArgList &Args) const {
7979
// Construct lld command.
8080
// The output from ld.lld is an HSA code object file.
81-
ArgStringList LldArgs{"-flavor",
82-
"gnu",
83-
"--no-undefined",
84-
"-shared",
85-
"-plugin-opt=-amdgpu-internalize-symbols",
86-
"-plugin-opt=-sycl-enable-local-accessor"};
81+
ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared",
82+
"-plugin-opt=-amdgpu-internalize-symbols"};
8783

8884
auto &TC = getToolChain();
8985
auto &D = TC.getDriver();
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// RUN: %clang_cc1 -triple spir64-unknown-unknown -fsycl-is-device \
2+
// RUN: -S -emit-llvm %s -o %temp.ll
3+
// RUN: FileCheck -check-prefix=CHECK-SPIR --input-file %temp.ll %s
4+
5+
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fsycl-is-device \
6+
// RUN: -S -emit-llvm %s -o %temp.ll
7+
// RUN: FileCheck -check-prefix=CHECK-NVPTX --input-file %temp.ll %s
8+
9+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -fsycl-is-device \
10+
// RUN: -S -emit-llvm %s -o %temp.ll
11+
// RUN: FileCheck -check-prefix=CHECK-AMDGCN --input-file %temp.ll %s
12+
13+
// The test makes sure that `[nnvm|amdgcn].annotations are correctly generated
14+
// only for their respective targets.
15+
16+
#include "Inputs/sycl.hpp"
17+
18+
sycl::handler H;
19+
20+
class Functor {
21+
public:
22+
void operator()() const {}
23+
};
24+
25+
// CHECK-SPIR-NOT: annotations =
26+
27+
// CHECK-NVPTX: nvvm.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]}
28+
// CHECK-NVPTX: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1}
29+
// CHECK-NVPTX: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1}
30+
31+
// CHECK-AMDGCN: amdgcn.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]}
32+
// CHECK-AMDGCN: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1}
33+
// CHECK-AMDGCN: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1}
34+
35+
int main() {
36+
sycl::queue q;
37+
q.submit([&](sycl::handler &cgh) {
38+
Functor foo{};
39+
cgh.single_task(foo);
40+
});
41+
42+
q.submit([&](cl::sycl::handler &cgh) {
43+
cgh.parallel_for<class foo_2>(cl::sycl::range<1>(1),
44+
[=](cl::sycl::item<1> item) {
45+
});
46+
});
47+
return 0;
48+
}

clang/test/Driver/sycl-local-accessor-opt.cpp

Lines changed: 0 additions & 14 deletions
This file was deleted.

libclc/amdgcn-amdhsa/libspirv/SOURCES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11

2+
workitem/get_global_offset.ll
23
group/collectives.cl
34
group/collectives_helpers.ll
45
atomic/loadstore_helpers.ll
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
;;===----------------------------------------------------------------------===//
2+
;;
3+
;; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
;; See https://llvm.org/LICENSE.txt for license information.
5+
;; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
;;
7+
;;===----------------------------------------------------------------------===//
8+
9+
#if __clang_major__ >= 7
10+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
11+
#else
12+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
13+
#endif
14+
15+
; Function Attrs: nounwind readnone speculatable
16+
declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
17+
18+
define hidden i64 @_Z22__spirv_GlobalOffset_xv() nounwind alwaysinline {
19+
entry:
20+
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
21+
%1 = load i32, i32 addrspace(5)* %0, align 4
22+
%zext = zext i32 %1 to i64
23+
ret i64 %zext
24+
}
25+
26+
define hidden i64 @_Z22__spirv_GlobalOffset_yv() nounwind alwaysinline {
27+
entry:
28+
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
29+
%arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 1
30+
%1 = load i32, i32 addrspace(5)* %arrayidx, align 4
31+
%zext = zext i32 %1 to i64
32+
ret i64 %zext
33+
}
34+
35+
define hidden i64 @_Z22__spirv_GlobalOffset_zv() nounwind alwaysinline {
36+
entry:
37+
%0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset()
38+
%arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2
39+
%1 = load i32, i32 addrspace(5)* %arrayidx, align 4
40+
%zext = zext i32 %1 to i64
41+
ret i64 %zext
42+
}

libclc/amdgcn/libspirv/SOURCES

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
workitem/get_global_offset.cl
21
workitem/get_group_id.cl
32
workitem/get_global_size.cl
43
workitem/get_local_id.cl

libclc/amdgcn/libspirv/workitem/get_global_offset.cl

Lines changed: 0 additions & 25 deletions
This file was deleted.

llvm/docs/AMDGPUUsage.rst

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15001,6 +15001,33 @@ track the usage for each kernel. However, in some cases careful organization of
1500115001
the kernels and functions in the source file means there is minimal additional
1500215002
effort required to accurately calculate GPR usage.
1500315003

15004+
SYCL Kernel Metadata
15005+
====================
15006+
15007+
This section describes the additional metadata that is inserted for SYCL
15008+
kernels. As SYCL is a single source programming model functions can either
15009+
execute on a host or a device (i.e. GPU). Device kernels are akin to kernel
15010+
entry-points in GPU program. To mark an LLVM IR function as a device kernel
15011+
function, we make use of special LLVM metadata. The AMDGCN back-end will look
15012+
for a named metadata node called ``amdgcn.annotations``. This named metadata
15013+
must contain a list of metadata that describe the kernel IR. For our purposes,
15014+
we need to declare a metadata node that assigns the `"kernel"` attribute to the
15015+
LLVM IR function that should be emitted as a SYCL kernel function. These
15016+
metadata nodes take the form:
15017+
15018+
.. code-block:: text
15019+
15020+
!{<function ref>, metadata !"kernel", i32 1}
15021+
15022+
Consider the metadata generated by global-offset pass, showing a void kernel
15023+
function `example_kernel_with_offset` taking one argument, a pointer to 3 i32
15024+
integers:
15025+
15026+
.. code-block:: llvm
15027+
15028+
!amdgcn.annotations = !{!0}
15029+
!0 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1}
15030+
1500415031
Additional Documentation
1500515032
========================
1500615033

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2061,4 +2061,8 @@ def int_amdgcn_reloc_constant : Intrinsic<
20612061
[llvm_i32_ty], [llvm_metadata_ty],
20622062
[IntrNoMem, IntrSpeculatable, IntrWillReturn]
20632063
>;
2064+
2065+
// SYCL
2066+
def int_amdgcn_implicit_offset : GCCBuiltin<"__builtin_amdgcn_implicit_offset">,
2067+
Intrinsic<[LLVMQualPointerType<llvm_i32_ty, 5>], [], [IntrNoMem, IntrSpeculatable]>;
20642068
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//===------------ TargetHelpers.h - Helpers for SYCL kernels ------------- ===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Helper functions for processing SYCL kernels.
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H
14+
#define LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H
15+
16+
#include "llvm/ADT/SmallVector.h"
17+
#include "llvm/IR/Function.h"
18+
#include "llvm/IR/Module.h"
19+
20+
using namespace llvm;
21+
22+
namespace llvm {
23+
namespace TargetHelpers {
24+
25+
enum class ArchType { Cuda, AMDHSA, Unsupported };
26+
27+
struct KernelPayload {
28+
KernelPayload(Function *Kernel, MDNode *MD = nullptr);
29+
Function *Kernel;
30+
MDNode *MD;
31+
};
32+
33+
ArchType getArchType(const Module &M);
34+
35+
std::string getAnnotationString(ArchType AT);
36+
37+
void populateKernels(Module &M, SmallVectorImpl<KernelPayload> &Kernels,
38+
TargetHelpers::ArchType AT);
39+
40+
} // end namespace TargetHelpers
41+
} // end namespace llvm
42+
43+
#endif

llvm/lib/SYCLLowerIR/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ add_llvm_component_library(LLVMSYCLLowerIR
5959
MutatePrintfAddrspace.cpp
6060

6161
LocalAccessorToSharedMemory.cpp
62+
GlobalOffset.cpp
63+
TargetHelpers.cpp
6264

6365
ADDITIONAL_HEADER_DIRS
6466
${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR

0 commit comments

Comments
 (0)