From 1f4034e4728f744b7b18472002385380b228bcf3 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 08:24:18 +0000 Subject: [PATCH 01/10] [SYCL] Correctly set global offset for HIP --- sycl/plugins/hip/pi_hip.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index a715c7e064386..527c820f5f29b 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -2625,8 +2625,7 @@ pi_result hip_piEnqueueKernelLaunch( hip_implicit_offset[i] = static_cast(global_work_offset[i]); if (global_work_offset[i] != 0) { - cl::sycl::detail::pi::die("Global offsets different from 0 are not " - "implemented in the HIP backend."); + hipFunc = kernel->get_with_offset_parameter(); } } } From 8d33c4a276228bbb0d28f393032fa3284ba4795f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 11:05:21 +0000 Subject: [PATCH 02/10] [SYCL] Use MDNode to annotate SYCL kernels on AMD The structure of the metadata is as follows: ``` !amdgcn.annotations = !{!0} !0 = distinct !{void (, !"kernel", i32 1 ``` This allows the removal of `-sycl-enable-local-accessor` as the pass can recognise SYCL kernels. 
--- clang/lib/CodeGen/TargetInfo.cpp | 52 ++++++++++++++----- clang/lib/Driver/ToolChains/Clang.cpp | 6 --- clang/lib/Driver/ToolChains/HIPAMD.cpp | 8 +-- clang/test/Driver/sycl-local-accessor-opt.cpp | 14 ----- ...r-to-shared-memory-basic-transformation.ll | 11 +++- ...sor-to-shared-memory-multiple-functions.ll | 11 +++- ...cessor-to-shared-memory-no-entry-points.ll | 9 +++- ...cessor-to-shared-memory-preserves-types.ll | 11 +++- .../local-accessor-to-shared-memory-triple.ll | 12 ++++- ...-accessor-to-shared-memory-valid-triple.ll | 30 ++++++----- ...r-to-shared-memory-basic-transformation.ll | 2 +- ...ccessor-to-shared-memory-invalid-triple.ll | 2 +- ...sor-to-shared-memory-multiple-functions.ll | 2 +- ...cessor-to-shared-memory-no-entry-points.ll | 2 +- ...cessor-to-shared-memory-preserves-types.ll | 2 +- .../local-accessor-to-shared-memory-triple.ll | 4 +- ...-accessor-to-shared-memory-valid-triple.ll | 7 +-- 17 files changed, 116 insertions(+), 69 deletions(-) delete mode 100644 clang/test/Driver/sycl-local-accessor-opt.cpp diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 46edb86e89aa2..37dd71b86228d 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -151,6 +151,26 @@ static bool occupiesMoreThan(CodeGenTypes &cgt, return (intCount + fpCount > maxAllRegisters); } +/// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV, +/// Name, and Operand as operands, and adds the resulting MDNode to the +/// AnnotationName MDNode. +static void addAMDGCOrNVVMMetadata(const char *AnnotationName, + llvm::GlobalValue *GV, StringRef Name, + int Operand) { + llvm::Module *M = GV->getParent(); + llvm::LLVMContext &Ctx = M->getContext(); + + // Get annotations metadata node. 
+ llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName); + + llvm::Metadata *MDVals[] = { + llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name), + llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))}; + // Append metadata to annotations node. + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); +} + bool SwiftABIInfo::isLegalVectorTypeForSwift(CharUnits vectorSize, llvm::Type *eltTy, unsigned numElts) const { @@ -7304,18 +7324,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name, int Operand) { - llvm::Module *M = GV->getParent(); - llvm::LLVMContext &Ctx = M->getContext(); - - // Get "nvvm.annotations" metadata node - llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); - - llvm::Metadata *MDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))}; - // Append metadata to nvvm.annotations - MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); + addAMDGCOrNVVMMetadata("nvvm.annotations", GV, Name, Operand); } bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const { @@ -9285,6 +9294,12 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Type *BlockTy) const override; bool shouldEmitStaticExternCAliases() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; + +private: + // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the + // resulting MDNode to the amdgcn.annotations MDNode. 
+ static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name, + int Operand); }; } @@ -9301,6 +9316,11 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D, cast(D)->getType()->isCUDADeviceBuiltinTextureType())); } +void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV, + StringRef Name, int Operand) { + addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand); +} + void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const { const auto *ReqdWGS = @@ -9402,10 +9422,16 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( const bool IsHIPKernel = M.getLangOpts().HIP && FD && FD->hasAttr(); - if (IsHIPKernel) F->addFnAttr("uniform-work-group-size", "true"); + const bool IsSYCLKernel = + FD && M.getLangOpts().SYCLIsDevice && + F->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL; + // Create !{, metadata !"kernel", i32 1} node for SYCL kernels. + if (IsSYCLKernel) + addAMDGCNMetadata(F, "kernel", 1); + if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6bd86e16b193f..3e0551e5473d3 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5865,12 +5865,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-treat-scalable-fixed-error-as-warning"); } - // Enable local accessor to shared memory pass for SYCL. - if (isa(JA) && IsSYCLOffloadDevice && - (Triple.isNVPTX() || Triple.isAMDGCN())) { - CmdArgs.push_back("-mllvm"); - CmdArgs.push_back("-sycl-enable-local-accessor"); - } // These two are potentially updated by AddClangCLArgs. 
codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo; bool EmitCodeView = false; diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 9fcecd94e0018..f4dedb2c6b45e 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -78,12 +78,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, const llvm::opt::ArgList &Args) const { // Construct lld command. // The output from ld.lld is an HSA code object file. - ArgStringList LldArgs{"-flavor", - "gnu", - "--no-undefined", - "-shared", - "-plugin-opt=-amdgpu-internalize-symbols", - "-plugin-opt=-sycl-enable-local-accessor"}; + ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared", + "-plugin-opt=-amdgpu-internalize-symbols"}; auto &TC = getToolChain(); auto &D = TC.getDriver(); diff --git a/clang/test/Driver/sycl-local-accessor-opt.cpp b/clang/test/Driver/sycl-local-accessor-opt.cpp deleted file mode 100644 index 154eee5178d8c..0000000000000 --- a/clang/test/Driver/sycl-local-accessor-opt.cpp +++ /dev/null @@ -1,14 +0,0 @@ -/// Check the correct handling of sycl-enable-local-accessor option. 
- -// REQUIRES: clang-driver - -// RUN: %clang -fsycl -### %s 2>&1 \ -// RUN: | FileCheck -check-prefix=OPT-CHECK %s - -// RUN: %clang -fsycl -S -### %s 2>&1 \ -// RUN: | FileCheck -check-prefix=OPT-CHECK %s -// OPT-CHECK-NOT: "-sycl-enable-local-accessor" - -// RUN: %clang -fsycl -fsycl-targets=nvptx64-nvidia-cuda -### %s 2>&1 \ -// RUN: | FileCheck %s -// CHECK: "-sycl-enable-local-accessor" diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll index 41873b5a70bd8..491e5943229d1 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'basic-transformation.bc' source_filename = "basic-transformation.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" @@ -22,3 +22,12 @@ entry: ; CHECK: %4 = add i32 %c, %c ret void } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} + +!0 = distinct !{void (i32 addrspace(3)*, i32 addrspace(1)*, i32)* @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: !0 = distinct !{void (i32, i32 addrspace(1)*, i32)* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll 
b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll index 050491546dd79..7bd4f9dfd7aa2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'multiple-functions.bc' source_filename = "multiple-functions.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" @@ -29,3 +29,12 @@ entry: ; CHECK: call void @_ZTS14other_function(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) ret void } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} + +!0 = distinct !{void (i32 addrspace(3)*, i32 addrspace(1)*, i32)* @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: !0 = distinct !{void (i32, i32 addrspace(1)*, i32)* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll index 45e1823a6aee5..c45d9af01a898 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; 
ModuleID = 'no-entry-points.bc' source_filename = "no-entry-points.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" @@ -18,3 +18,10 @@ entry: ; CHECK: %2 = add i32 %c, %c ret void } + +!amdgcn.annotations = !{!0, !1, !0, !2, !2, !2, !2, !3, !3, !2} + +!0 = !{null, !"align", i32 8} +!1 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!2 = !{null, !"align", i32 16} +!3 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll index 3b39347118a64..719bf6fe81cd1 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'bitcasts.bc' source_filename = "bitcasts.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" @@ -30,3 +30,12 @@ entry: ; CHECK: %11 = load i8, i8 addrspace(3)* %d ret void } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} + +!0 = distinct !{void (i32 addrspace(3)*, i64 addrspace(3)*, i16 addrspace(3)*, i8 addrspace(3)*)* @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: !0 = distinct !{void (i32, i32, i32, i32)* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = 
!{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll index 05247d67e26aa..a662b8607b739 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll @@ -1,7 +1,7 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; `amdgcn-amd-amdhsa` triple, but not with `amdgcn-amd-amdpal`. -; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-VALID %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-INVALID %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-VALID %s +; RUN: llc -mtriple=amdgcn-amd-amdpal < %s | FileCheck --check-prefix=CHECK-INVALID %s ; ModuleID = 'local-accessor-to-shared-memory-triple.ll' source_filename = "local-accessor-to-shared-memory-triple.ll" @@ -23,3 +23,11 @@ entry: %0 = load i32, i32 addrspace(3)* %a ret void } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} + +!0 = distinct !{void (i32 addrspace(3)*)* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll index d7ef94977dc1e..f39094eaa9a6a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll @@ -1,9 +1,7 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; 
`amdgcn-amd-amdhsa` triple and does not if the option is not present. -; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor=true < %s | FileCheck --check-prefix=CHECK-OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-NO-OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor=false < %s | FileCheck --check-prefix=CHECK-NO-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-OPT %s ; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' source_filename = "local-accessor-to-shared-memory-valid-triple.ll" @@ -16,17 +14,25 @@ target triple = "amdgcn-amd-amdhsa" ; CHECK-OPT-NEXT: .offset: 0 ; CHECK-OPT-NEXT: .size: 4 ; CHECK-OPT-NEXT: .value_kind: by_value -; CHECK-NO-OPT: .globl _ZTS14example_kernel -; CHECK-NO-OPT: - .args: -; CHECK-NO-OPT-NEXT: .address_space: local -; CHECK-NO-OPT-NEXT: .name: a -; CHECK-NO-OPT-NEXT: .offset: 0 -; CHECK-NO-OPT-NEXT: .pointee_align: 1 -; CHECK-NO-OPT-NEXT: .size: 4 -; CHECK-NO-OPT-NEXT: .value_kind: dynamic_shared_pointer ; Function Attrs: noinline define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a) { entry: %0 = load i32, i32 addrspace(3)* %a ret void } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.ident = !{!7, !8} +!llvm.module.flags = !{!9, !10} + +!0 = distinct !{void (i32 addrspace(3)*)* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, i32 2} +!6 = !{i32 4, i32 100000} +!7 = !{!"clang version 9.0.0"} +!8 = !{!"clang version 9.0.0"} +!9 = !{i32 2, !"SDK Version", [2 x i32] 
[i32 10, i32 0]} +!10 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll index 19359bfe458a9..21e16fcdb3ec1 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'basic-transformation.bc' source_filename = "basic-transformation.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll index 7570adf1e8480..1d04536a8da66 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll @@ -1,6 +1,6 @@ ; This test checks that the Local Accessor to Shared Memory pass does not run with the ; `nvptx64-nvidia-nvcl` triple. 
-; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor < %s | FileCheck %s +; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s ; CHECK: .param .u64 .ptr .shared .align 1 _ZTS14example_kernel_param_0 ; ModuleID = 'local-accessor-to-shared-memory-invalid-triple.ll' diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll index fe9055d676da2..00484c1ffe81b 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'multiple-functions.bc' source_filename = "multiple-functions.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll index 0f434e3491284..62799f05bf134 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'no-entry-points.bc' source_filename = "no-entry-points.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll index bcd0a03b5b051..d38d05636ed24 100644 --- 
a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'bitcasts.bc' source_filename = "bitcasts.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll index 7d648b68c2fd9..5e5e2b6138f10 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll @@ -1,7 +1,7 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; `nvptx64-nvidia-cuda` triple. -; RUN: llc -mtriple=nvptx64-nvidia-cuda -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-VALID %s -; RUN: llc -mtriple=nvptx64-nvidia-nvcl -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-INVALID %s +; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck --check-prefix=CHECK-VALID %s +; RUN: llc -mtriple=nvptx64-nvidia-nvcl < %s | FileCheck --check-prefix=CHECK-INVALID %s ; CHECK-VALID: .param .u32 _ZTS14example_kernel_param_0 ; CHECK-INVALID: .param .u64 .ptr .shared .align 1 _ZTS14example_kernel_param_0 diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll index c49e7c5bce550..43688a9bbb489 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll @@ -1,11 +1,8 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; `nvptx64-nvidia-cuda` triple. 
-; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-OPT %s -; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor=true < %s | FileCheck --check-prefix=CHECK-OPT %s -; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck --check-prefix=CHECK-NO-OPT %s -; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor=false < %s | FileCheck --check-prefix=CHECK-NO-OPT %s +; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck --check-prefix=CHECK-OPT %s +; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck --check-prefix=CHECK-OPT %s ; CHECK-OPT: .param .u32 _ZTS14example_kernel_param_0 -; CHECK-NO-OPT-NOT: .param .u32 _ZTS14example_kernel_param_0 ; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' source_filename = "local-accessor-to-shared-memory-valid-triple.ll" From 47f17217b8dd5d6c961248233bfa909fb850e7cb Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 11:37:23 +0000 Subject: [PATCH 03/10] [SYCL] Introduce SYCL lower IR target helpers And make use of them in local-to-shared mem pass. --- llvm/include/llvm/SYCLLowerIR/TargetHelpers.h | 43 ++++++ llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + .../LocalAccessorToSharedMemory.cpp | 128 ++---------------- llvm/lib/SYCLLowerIR/TargetHelpers.cpp | 88 ++++++++++++ ...r-to-shared-memory-multiple-annotations.ll | 2 +- 5 files changed, 143 insertions(+), 119 deletions(-) create mode 100644 llvm/include/llvm/SYCLLowerIR/TargetHelpers.h create mode 100644 llvm/lib/SYCLLowerIR/TargetHelpers.cpp diff --git a/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h b/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h new file mode 100644 index 0000000000000..b2b383237a705 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h @@ -0,0 +1,43 @@ +//===------------ TargetHelpers.h - Helpers for SYCL kernels ------------- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper functions for processing SYCL kernels. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H +#define LLVM_SYCL_SYCL_LOWER_IR_TARGET_HELPERS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +namespace llvm { +namespace TargetHelpers { + +enum class ArchType { Cuda, AMDHSA, Unsupported }; + +struct KernelPayload { + KernelPayload(Function *Kernel, MDNode *MD = nullptr); + Function *Kernel; + MDNode *MD; +}; + +ArchType getArchType(const Module &M); + +std::string getAnnotationString(ArchType AT); + +void populateKernels(Module &M, SmallVectorImpl &Kernels, + TargetHelpers::ArchType AT); + +} // end namespace TargetHelpers +} // end namespace llvm + +#endif diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index fab78d7d30bfa..b5294115dc766 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_component_library(LLVMSYCLLowerIR MutatePrintfAddrspace.cpp LocalAccessorToSharedMemory.cpp + TargetHelpers.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR diff --git a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp index 2afaddbb18d71..92b2cc727c059 100644 --- a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp @@ -15,11 +15,11 @@ //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/IR/Constants.h" #include 
"llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/SYCLLowerIR/TargetHelpers.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -27,13 +27,6 @@ using namespace llvm; #define DEBUG_TYPE "localaccessortosharedmemory" -static bool EnableLocalAccessor; - -static cl::opt EnableLocalAccessorFlag( - "sycl-enable-local-accessor", cl::Hidden, - cl::desc("Enable local accessor to shared memory optimisation."), - cl::location(EnableLocalAccessor), cl::init(false)); - namespace llvm { void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); } // namespace llvm @@ -42,30 +35,19 @@ namespace { class LocalAccessorToSharedMemory : public ModulePass { private: - enum class ArchType { Cuda, AMDHSA, Unsupported }; - - struct KernelPayload { - KernelPayload(Function *Kernel, MDNode *MD = nullptr) - : Kernel(Kernel), MD(MD){}; - Function *Kernel; - MDNode *MD; - }; + using KernelPayload = TargetHelpers::KernelPayload; + using ArchType = TargetHelpers::ArchType; - unsigned SharedASValue = 0; + // The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen to + // be 3. 
+ const unsigned SharedASValue = 3; public: static char ID; LocalAccessorToSharedMemory() : ModulePass(ID) {} bool runOnModule(Module &M) override { - if (!EnableLocalAccessor) - return false; - - auto AT = StringSwitch(M.getTargetTriple().c_str()) - .Case("nvptx64-nvidia-cuda", ArchType::Cuda) - .Case("nvptx-nvidia-cuda", ArchType::Cuda) - .Case("amdgcn-amd-amdhsa", ArchType::AMDHSA) - .Default(ArchType::Unsupported); + const auto AT = TargetHelpers::getArchType(M); // Invariant: This pass is only intended to operate on SYCL kernels being // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` @@ -76,23 +58,9 @@ class LocalAccessorToSharedMemory : public ModulePass { if (skipModule(M)) return false; - switch (AT) { - case ArchType::Cuda: - // ADDRESS_SPACE_SHARED = 3, - SharedASValue = 3; - break; - case ArchType::AMDHSA: - // LOCAL_ADDRESS = 3, - SharedASValue = 3; - break; - default: - SharedASValue = 0; - break; - } - SmallVector Kernels; + TargetHelpers::populateKernels(M, Kernels, AT); SmallVector> NewToOldKernels; - populateKernels(M, Kernels, AT); if (Kernels.empty()) return false; @@ -106,7 +74,7 @@ class LocalAccessorToSharedMemory : public ModulePass { if (NewToOldKernels.empty()) return false; - postProcessKernels(NewToOldKernels, AT); + postProcessKernels(NewToOldKernels); return true; } @@ -242,89 +210,13 @@ class LocalAccessorToSharedMemory : public ModulePass { return NF; } - void populateCudaKernels(Module &M, SmallVector &Kernels) { - // Access `nvvm.annotations` to determine which functions are kernel entry - // points. - auto *NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); - if (!NvvmMetadata) - return; - - // It is possible that the annotations node contains multiple pointers to - // the same metadata, recognise visited ones. 
- SmallSet Visited; - for (auto *MetadataNode : NvvmMetadata->operands()) { - if (Visited.contains(MetadataNode) || MetadataNode->getNumOperands() != 3) - continue; - - Visited.insert(MetadataNode); - - // NVPTX identifies kernel entry points using metadata nodes of the form: - // !X = !{, !"kernel", i32 1} - const MDOperand &TypeOperand = MetadataNode->getOperand(1); - auto *Type = dyn_cast(TypeOperand); - if (!Type) - continue; - // Only process kernel entry points. - if (Type->getString() != "kernel") - continue; - - // Get a pointer to the entry point function from the metadata. - const MDOperand &FuncOperand = MetadataNode->getOperand(0); - if (!FuncOperand) - continue; - auto *FuncConstant = dyn_cast(FuncOperand); - if (!FuncConstant) - continue; - auto *Func = dyn_cast(FuncConstant->getValue()); - if (!Func) - continue; - - Kernels.push_back(KernelPayload(Func, MetadataNode)); - } - } - - void populateAMDKernels(Module &M, SmallVector &Kernels) { - for (auto &F : M) { - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) - Kernels.push_back(KernelPayload(&F)); - } - } - - void populateKernels(Module &M, SmallVector &Kernels, - ArchType AT) { - switch (AT) { - case ArchType::Cuda: - return populateCudaKernels(M, Kernels); - case ArchType::AMDHSA: - return populateAMDKernels(M, Kernels); - default: - llvm_unreachable("Unsupported arch type."); - } - } - - void postProcessCudaKernels( + void postProcessKernels( SmallVector> &NewToOldKernels) { for (auto &Pair : NewToOldKernels) { std::get<1>(Pair).MD->replaceOperandWith( 0, llvm::ConstantAsMetadata::get(std::get<0>(Pair))); } } - - void postProcessAMDKernels( - SmallVector> &NewToOldKernels) {} - - void postProcessKernels( - SmallVector> &NewToOldKernels, - ArchType AT) { - switch (AT) { - case ArchType::Cuda: - return postProcessCudaKernels(NewToOldKernels); - case ArchType::AMDHSA: - return postProcessAMDKernels(NewToOldKernels); - default: - llvm_unreachable("Unsupported arch type."); - } - } }; } // end 
anonymous namespace diff --git a/llvm/lib/SYCLLowerIR/TargetHelpers.cpp b/llvm/lib/SYCLLowerIR/TargetHelpers.cpp new file mode 100644 index 0000000000000..a4a7e35cfc297 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/TargetHelpers.cpp @@ -0,0 +1,88 @@ +//===----------- TargetHelpers.cpp - Helpers for SYCL kernels ------------ ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper functions for processing SYCL kernels. +// +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/TargetHelpers.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Metadata.h" + +using namespace llvm; + +namespace llvm { +namespace TargetHelpers { + +KernelPayload::KernelPayload(Function *Kernel, MDNode *MD) + : Kernel(Kernel), MD(MD) {} + +ArchType getArchType(const Module &M) { + return StringSwitch(M.getTargetTriple().c_str()) + .Case("nvptx64-nvidia-cuda", ArchType::Cuda) + .Case("nvptx-nvidia-cuda", ArchType::Cuda) + .Case("amdgcn-amd-amdhsa", ArchType::AMDHSA) + .Case("amdgcn--amdhsa", ArchType::AMDHSA) + .Default(ArchType::Unsupported); +} + +std::string getAnnotationString(ArchType AT) { + switch (AT) { + case TargetHelpers::ArchType::Cuda: + return std::string("nvvm.annotations"); + break; + case TargetHelpers::ArchType::AMDHSA: + return std::string("amdgcn.annotations"); + break; + default: + llvm_unreachable("Unsupported arch type."); + } + return std::string(); +} + +void populateKernels(Module &M, SmallVectorImpl &Kernels, + ArchType AT) { + // Access `{amdgcn|nvvm}.annotations` to determine which functions are kernel + // entry points. 
+ std::string Annotation = getAnnotationString(AT); + auto *AnnotationMetadata = M.getNamedMetadata(Annotation); + // No kernels in the module, early exit. + if (!AnnotationMetadata) + return; + + // It is possible that the annotations node contains multiple pointers to the + // same metadata, recognise visited ones. + SmallSet Visited; + for (auto *MetadataNode : AnnotationMetadata->operands()) { + if (Visited.contains(MetadataNode) || MetadataNode->getNumOperands() != 3) + continue; + + Visited.insert(MetadataNode); + + // Kernel entry points are identified using metadata nodes of the form: + // !X = !{, !"kernel", i32 1} + auto *Type = dyn_cast(MetadataNode->getOperand(1)); + if (!Type) + continue; + // Only process kernel entry points. + if (Type->getString() != "kernel") + continue; + + // Get a pointer to the entry point function from the metadata. + const MDOperand &FuncOperand = MetadataNode->getOperand(0); + if (!FuncOperand) + continue; + if (auto *FuncConstant = dyn_cast(FuncOperand)) + if (auto *Func = dyn_cast(FuncConstant->getValue())) + Kernels.push_back(KernelPayload(Func, MetadataNode)); + } +} + +} // namespace TargetHelpers +} // namespace llvm diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll index 458a00bab5c28..e16501cced35b 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s ; ModuleID = 'multiple-annotations.bc' source_filename = "multiple-annotations.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" From 8212755de70572f2bc25d7b044bf36e821ce4eba Mon Sep 17 00:00:00 
2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 14:33:07 +0000 Subject: [PATCH 04/10] [SYCL] Generalize SYCL global offset pass --- .../llvm/SYCLLowerIR}/GlobalOffset.h | 0 llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + .../SYCL => SYCLLowerIR}/GlobalOffset.cpp | 175 +++++++++++------- .../LocalAccessorToSharedMemory.cpp | 8 +- llvm/lib/Target/NVPTX/CMakeLists.txt | 1 - llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3 +- 6 files changed, 117 insertions(+), 71 deletions(-) rename llvm/{lib/Target/NVPTX/SYCL => include/llvm/SYCLLowerIR}/GlobalOffset.h (100%) rename llvm/lib/{Target/NVPTX/SYCL => SYCLLowerIR}/GlobalOffset.cpp (68%) diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h b/llvm/include/llvm/SYCLLowerIR/GlobalOffset.h similarity index 100% rename from llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h rename to llvm/include/llvm/SYCLLowerIR/GlobalOffset.h diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index b5294115dc766..c9f8b12c73241 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_component_library(LLVMSYCLLowerIR MutatePrintfAddrspace.cpp LocalAccessorToSharedMemory.cpp + GlobalOffset.cpp TargetHelpers.cpp ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp similarity index 68% rename from llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp rename to llvm/lib/SYCLLowerIR/GlobalOffset.cpp index e76d4f14ace9d..c5c52bd379020 100644 --- a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp +++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp @@ -6,29 +6,34 @@ // //===----------------------------------------------------------------------===// // -// This pass operates on SYCL kernels being compiled to CUDA. It looks for uses -// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with a offset -// parameter which will be threaded through from the kernel entry point. +// This pass operates on SYCL kernels. 
It looks for uses of the +// `llvm.{amdgcn|nvvm}.implicit.offset` intrinsic and replaces it with an +// offset parameter which will be threaded through from the kernel entry point. // //===----------------------------------------------------------------------===// -#include "GlobalOffset.h" - -#include "../MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/PassManager.h" +#include "llvm/SYCLLowerIR/TargetHelpers.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; +using namespace TargetHelpers; #define DEBUG_TYPE "globaloffset" +#include "llvm/Support/CommandLine.h" +static cl::opt + EnableGlobalOffset("enable-global-offset", cl::Hidden, cl::init(true), + cl::desc("Enabel SYCL global offset pass")); namespace llvm { +ModulePass *createGlobalOffsetPass(); void initializeGlobalOffsetPass(PassRegistry &); } // end namespace llvm @@ -43,24 +48,35 @@ class GlobalOffset : public ModulePass { if (skipModule(M)) return false; - llvm::Function *ImplicitOffsetIntrinsic = - M.getFunction(Intrinsic::getName(Intrinsic::nvvm_implicit_offset)); + if (!EnableGlobalOffset) + return false; + + AT = getArchType(M); + llvm::Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName( + AT == ArchType::Cuda ? Intrinsic::nvvm_implicit_offset + : Intrinsic::amdgcn_implicit_offset)); if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) { return false; } + // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is + // happy with ADDRESS_SPACE_GENERIC (0). + TargetAS = AT == ArchType::Cuda ? 
0 : 5; KernelImplicitArgumentType = ArrayType::get(Type::getInt32Ty(M.getContext()), 3); - ImplicitOffsetPtrType = Type::getInt32Ty(M.getContext())->getPointerTo(); + ImplicitOffsetPtrType = + Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS); assert( (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && - "Intrinsic::nvvm_implicit_offset does not return the expected " - "type"); + "Implicit offset intrinsic does not return the expected type"); - // Find all entry points. - EntryPointMetadata = getEntryPointMetadata(M); + SmallVector KernelPayloads; + populateKernels(M, KernelPayloads, AT); + + // Validate kernels and populate entry map + EntryPointMetadata = validateKernels(M, KernelPayloads); // Add implicit parameters to all direct and indirect users of the offset addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); @@ -81,36 +97,41 @@ class GlobalOffset : public ModulePass { LLVMContext &Ctx = M.getContext(); MDNode *FuncMetadata = EntryPointMetadata[Func]; - bool AlreadyProcessed = ProcessedFunctions.count(Func) == 1; - if (AlreadyProcessed) + // Already processed. + if (ProcessedFunctions.count(Func) == 1) return; // Add the new argument to all other kernel entry points, despite not // using the global offset. 
- auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); - assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); + auto *KernelMetadata = M.getNamedMetadata(getAnnotationString(AT).c_str()); + assert(KernelMetadata && "IR compiled must have correct annotations"); - auto NewFunc = addOffsetArgumentToFunction( - M, Func, KernelImplicitArgumentType->getPointerTo(), - /*KeepOriginal=*/true) - .first; + auto *NewFunc = addOffsetArgumentToFunction( + M, Func, KernelImplicitArgumentType->getPointerTo(), + /*KeepOriginal=*/true) + .first; Argument *NewArgument = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); - // Pass the values by value to the kernel - NewArgument->addAttr( - Attribute::getWithByValType(Ctx, KernelImplicitArgumentType)); + // Pass byval to the kernel for NVIDIA, AMD's calling convention disallows + // byval args, use byref. + auto Attr = + AT == ArchType::Cuda + ? Attribute::getWithByValType(Ctx, KernelImplicitArgumentType) + : Attribute::getWithByRefType(Ctx, KernelImplicitArgumentType); + NewArgument->addAttr(Attr); // Add the metadata. 
Metadata *NewMetadata[] = {ConstantAsMetadata::get(NewFunc), FuncMetadata->getOperand(1), FuncMetadata->getOperand(2)}; - NvvmMetadata->addOperand(MDNode::get(Ctx, NewMetadata)); + KernelMetadata->addOperand(MDNode::get(Ctx, NewMetadata)); // Create alloca of zeros for the implicit offset in original func BasicBlock *EntryBlock = &Func->getEntryBlock(); IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); Type *ImplicitOffsetType = ArrayType::get(Type::getInt32Ty(M.getContext()), 3); - AllocaInst *ImplicitOffset = Builder.CreateAlloca(ImplicitOffsetType); + AllocaInst *ImplicitOffset = + Builder.CreateAlloca(ImplicitOffsetType, TargetAS); uint64_t AllocByteSize = ImplicitOffset->getAllocationSizeInBits(M.getDataLayout()).getValue() / 8; @@ -166,7 +187,7 @@ class GlobalOffset : public ModulePass { if (!CallToOld) return; - auto Caller = CallToOld->getFunction(); + auto *Caller = CallToOld->getFunction(); // Determine if `Caller` needs processed or if this is another callsite // from an already-processed function. @@ -193,7 +214,7 @@ class GlobalOffset : public ModulePass { // Replace call to other function (which now has a new parameter), // with a call including the new parameter to that same function. - auto NewCaller = CallInst::Create( + auto *NewCaller = CallInst::Create( /* Ty= */ CalleeWithImplicitParam->getFunctionType(), /* Func= */ CalleeWithImplicitParam, /* Args= */ ImplicitOffsets, @@ -244,8 +265,7 @@ class GlobalOffset : public ModulePass { } // Add the offset argument. Must be the same type as returned by - // `llvm.nvvm.implicit.offset`. - + // `llvm.{amdgcn|nvvm}.implicit.offset`. Arguments.push_back(ImplicitArgumentType); ArgumentAttributes.push_back(AttributeSet()); @@ -263,6 +283,8 @@ class GlobalOffset : public ModulePass { // Keep original function ordering. 
M.getFunctionList().insertAfter(Func->getIterator(), NewFunc); + Value *ImplicitOffset = nullptr; + bool ImplicitOffsetAllocaInserted = false; if (KeepOriginal) { // TODO: Are there better naming alternatives that allow for unmangling? NewFunc->setName(Func->getName() + "_with_offset"); @@ -278,6 +300,40 @@ class GlobalOffset : public ModulePass { SmallVector Returns; CloneFunctionInto(NewFunc, Func, VMap, CloneFunctionChangeType::GlobalChanges, Returns); + // In order to keep the signatures of functions called by the kernel + // unified, the pass has to copy global offset to an array allocated in + // addrspace(3). This is done as kernels can't allocate and fill the + // array in constant address space, which would be required for the case + // with no global offset. + if (AT == ArchType::AMDHSA) { + BasicBlock *EntryBlock = &NewFunc->getEntryBlock(); + IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); + Type *ImplicitOffsetType = + ArrayType::get(Type::getInt32Ty(M.getContext()), 3); + Value *OrigImplicitOffset = + NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + AllocaInst *ImplicitOffsetAlloca = + Builder.CreateAlloca(ImplicitOffsetType, TargetAS); + auto DL = M.getDataLayout(); + uint64_t AllocByteSize = + ImplicitOffsetAlloca->getAllocationSizeInBits(DL).getValue() / 8; + // After AMD's kernel arg lowering pass runs the accesses to arguments + // are replaced with uses of kernarg.segment.ptr which is in + // addrspace(4), cast implicit offset arg to constant memory so the + // memcpy is issued into a correct address space. 
+ auto OrigImplicitOffsetAS4 = Builder.CreateAddrSpaceCast( + OrigImplicitOffset, + Type::getInt8Ty(M.getContext())->getPointerTo(4)); + Builder.CreateMemCpy( + ImplicitOffsetAlloca, ImplicitOffsetAlloca->getAlign(), + OrigImplicitOffsetAS4, + OrigImplicitOffsetAS4->getPointerAlignment(DL), AllocByteSize); + ImplicitOffset = ImplicitOffsetAlloca; + ImplicitArgumentType = ImplicitOffset->getType(); + ImplicitOffsetAllocaInserted = true; + } else { + ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + } } else { NewFunc->copyAttributesFrom(Func); NewFunc->setComdat(Func->getComdat()); @@ -300,15 +356,23 @@ class GlobalOffset : public ModulePass { Func->getAllMetadata(MDs); for (auto MD : MDs) NewFunc->addMetadata(MD.first, *MD.second); + + ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); } + assert(ImplicitOffset && "Value of implicit offset must be set."); - Value *ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); // Add bitcast to match the return type of the intrinsic if needed. if (ImplicitArgumentType != ImplicitOffsetPtrType) { BasicBlock *EntryBlock = &NewFunc->getEntryBlock(); - IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); - ImplicitOffset = - Builder.CreateBitCast(ImplicitOffset, ImplicitOffsetPtrType); + // Make sure bitcast is inserted after alloca, if present. + BasicBlock::iterator InsertionPt = + ImplicitOffsetAllocaInserted + ? 
std::next(((AllocaInst *)ImplicitOffset)->getIterator()) + : EntryBlock->getFirstInsertionPt(); + IRBuilder<> Builder(EntryBlock, InsertionPt); + ImplicitOffset = Builder.CreateBitCast( + ImplicitOffset, + Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS)); } ProcessedFunctions[NewFunc] = ImplicitOffset; @@ -317,10 +381,8 @@ class GlobalOffset : public ModulePass { return {NewFunc, ImplicitOffset}; } - static llvm::DenseMap getEntryPointMetadata(Module &M) { - auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); - assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); - + static DenseMap + validateKernels(Module &M, SmallVectorImpl &KernelPayloads) { SmallPtrSet Used; SmallVector Vec; collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/false); @@ -333,36 +395,18 @@ class GlobalOffset : public ModulePass { return !GV->hasOneUse() || !Used.count(GV); }; - llvm::DenseMap NvvmEntryPointMetadata; - for (auto MetadataNode : NvvmMetadata->operands()) { - if (MetadataNode->getNumOperands() != 3) - continue; - - // NVPTX identifies kernel entry points using metadata nodes of the form: - // !X = !{, !"kernel", i32 1} - auto Type = dyn_cast(MetadataNode->getOperand(1)); - // Only process kernel entry points. - if (!Type || Type->getString() != "kernel") - continue; + llvm::DenseMap EntryPointMetadata; + for (auto &KP : KernelPayloads) { + if (HasUseOtherThanLLVMUsed(KP.Kernel)) + llvm_unreachable("Kernel entry point can't have uses."); - // Get a pointer to the entry point function from the metadata. 
- const auto &FuncOperand = MetadataNode->getOperand(0); - if (!FuncOperand) - continue; - auto FuncConstant = dyn_cast(FuncOperand); - if (!FuncConstant) - continue; - auto Func = dyn_cast(FuncConstant->getValue()); - if (!Func) - continue; - - assert(!HasUseOtherThanLLVMUsed(Func) && "Kernel entry point with uses"); - NvvmEntryPointMetadata[Func] = MetadataNode; + EntryPointMetadata[KP.Kernel] = KP.MD; } - return NvvmEntryPointMetadata; + + return EntryPointMetadata; } - virtual llvm::StringRef getPassName() const { + virtual llvm::StringRef getPassName() const override { return "Add implicit SYCL global offset"; } @@ -373,6 +417,9 @@ class GlobalOffset : public ModulePass { llvm::DenseMap EntryPointMetadata; llvm::Type *KernelImplicitArgumentType; llvm::Type *ImplicitOffsetPtrType; + + TargetHelpers::ArchType AT; + unsigned TargetAS = 0; }; } // end anonymous namespace diff --git a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp index 92b2cc727c059..b7b799ecabade 100644 --- a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp @@ -38,7 +38,7 @@ class LocalAccessorToSharedMemory : public ModulePass { using KernelPayload = TargetHelpers::KernelPayload; using ArchType = TargetHelpers::ArchType; - // The ualue for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen to + // The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen to // be 3. 
const unsigned SharedASValue = 3; @@ -58,9 +58,9 @@ class LocalAccessorToSharedMemory : public ModulePass { if (skipModule(M)) return false; - SmallVector Kernels; + SmallVector Kernels; TargetHelpers::populateKernels(M, Kernels, AT); - SmallVector> NewToOldKernels; + SmallVector, 4> NewToOldKernels; if (Kernels.empty()) return false; @@ -211,7 +211,7 @@ class LocalAccessorToSharedMemory : public ModulePass { } void postProcessKernels( - SmallVector> &NewToOldKernels) { + SmallVectorImpl> &NewToOldKernels) { for (auto &Pair : NewToOldKernels) { std::get<1>(Pair).MD->replaceOperandWith( 0, llvm::ConstantAsMetadata::get(std::get<0>(Pair))); diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index 8c53d3173f4ff..4396471490f6d 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -36,7 +36,6 @@ set(NVPTXCodeGen_sources NVVMIntrRange.cpp NVVMReflect.cpp NVPTXProxyRegErasure.cpp - SYCL/GlobalOffset.cpp ) add_llvm_target(NVPTXCodeGen diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 2b4511205234c..31afc8059ba47 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -17,7 +17,6 @@ #include "NVPTXLowerAggrCopies.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "SYCL/GlobalOffset.h" #include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" @@ -29,6 +28,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/GlobalOffset.h" #include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" @@ -72,7 +72,6 @@ static cl::opt namespace llvm { -void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); void initializeNVVMIntrRangePass(PassRegistry&); void 
initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); From 4fb0d977fbab84c4e25674b692b0f8163612e001 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 14:53:57 +0000 Subject: [PATCH 05/10] [SYCL] Run global offset on AMDGPU --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 + llvm/lib/Target/AMDGPU/AMDGPU.h | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 +- llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll | 57 +++++++++ .../AMDGPU/global-offset-invalid-triple.ll | 33 ++++++ ...offset-multiple-calls-from-one-function.ll | 68 +++++++++++ .../global-offset-multiple-entry-points.ll | 111 ++++++++++++++++++ .../CodeGen/AMDGPU/global-offset-simple.ll | 53 +++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../global-offset-multiple-entry-points.ll | 2 +- 11 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-simple.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index afcfa07f6df13..4a215caaf6aca 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -316,5 +316,10 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi", TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts") TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts") +//===----------------------------------------------------------------------===// +// SYCL builtin. 
+//===----------------------------------------------------------------------===// +BUILTIN(__builtin_amdgcn_implicit_offset, "Ui*5", "nc") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index c2cf523216c0c..aad4284afdb85 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2059,4 +2059,8 @@ def int_amdgcn_reloc_constant : Intrinsic< [llvm_i32_ty], [llvm_metadata_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] >; + +// SYCL +def int_amdgcn_implicit_offset : GCCBuiltin<"__builtin_amdgcn_implicit_offset">, + Intrinsic<[LLVMQualPointerType], [], [IntrNoMem, IntrSpeculatable]>; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index be81868650b7c..ea8213452447d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -26,6 +26,7 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone); void initializeAMDGPURegBankCombinerPass(PassRegistry &); +void initializeGlobalOffsetPass(PassRegistry &); void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); // SI Passes diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9b1ae18b4016b..8bf597de13b4f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -44,6 +44,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/GlobalOffset.h" #include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" @@ -383,6 +384,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNPreRAOptimizationsPass(*PR); // SYCL-specific passes, needed here to be 
available to `opt`. + initializeGlobalOffsetPass(*PR); initializeLocalAccessorToSharedMemoryPass(*PR); } @@ -1042,8 +1044,10 @@ void AMDGPUPassConfig::addIRPasses() { addEarlyCSEOrGVNPass(); if (TM.getTargetTriple().getArch() == Triple::amdgcn && - TM.getTargetTriple().getOS() == Triple::OSType::AMDHSA) + TM.getTargetTriple().getOS() == Triple::OSType::AMDHSA) { addPass(createLocalAccessorToSharedMemoryPass()); + addPass(createGlobalOffsetPass()); + } } void AMDGPUPassConfig::addCodeGenPrepare() { diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll b/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll new file mode 100644 index 0000000000000..9fdcae74f0eeb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll @@ -0,0 +1,57 @@ +; RUN: opt -enable-new-pm=0 -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'simple_debug.bc' +source_filename = "global-offset-dbg.ll" + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that debug information on functions and callsites are preserved + +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: llvm.amdgcn.implicit.offset + +define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32 addrspace(5)* %0) !dbg !11 { + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 { +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 { +entry: + %0 = call i64 @_ZTS14other_function(), !dbg !15 +; CHECK: %3 = call i64 @_ZTS14other_function(i32 addrspace(5)* 
%2), !dbg !15 + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byref([3 x i32]) %0) !dbg !16 { +; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5), !dbg !17 +; CHECK: %2 = bitcast [3 x i32] addrspace(5)* %1 to i32 addrspace(5)*, !dbg !17 +; CHECK: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 4 %4, i8 addrspace(4)* align 1 %3, i64 12, i1 false), !dbg !17 +; CHECK: %5 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2), !dbg !17 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!amdgcn.annotations = !{!5, !6, !7, !6, !8, !8, !8, !8, !9, !9, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 0.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "global-offset-debug.cpp", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!6 = !{i32 1, i32 4} +!7 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!8 = !{null, !"align", i32 16} +!9 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!11 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!12 = !DISubroutineType(types: !13) +!13 = !{null} +!14 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!15 = !DILocation(line: 1, column: 2, scope: !14) +; CHECK: !16 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +; CHECK: !17 = !DILocation(line: 1, 
column: 2, scope: !16) diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll b/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll new file mode 100644 index 0000000000000..d5e4140f7f6a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll @@ -0,0 +1,33 @@ +; RUN: not --crash llc -march=amdgcn -mcpu=hawaii %s -o - 2>&1 | FileCheck %s +; ModuleID = 'global-offset-invalid-triple.bc' +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.implicit.offset +source_filename = "global-offset-invalid-triple.ll" + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; This test checks that the pass does not run on nvcl triples. + +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll new file mode 100644 index 0000000000000..647c59ed9ea9e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll @@ -0,0 +1,68 @@ 
+; RUN: opt -enable-new-pm=0 -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'global-offset-multiple-calls-from-one-function.bc' +source_filename = "global-offset-multiple-calls-from-one-function.ll" + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that when there are multiple calls to a function that uses +; the intrinsic that the caller and the callee only have a single clone each +; with the offset parameter. It also checks that the clone with multiple calls +; to other functions that has a variant that takes an offset parameter will have +; all calls redirected to the corresponding variants. + +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32 addrspace(5)* %0) { + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + + %5 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %6 = getelementptr inbounds i32, i32 addrspace(5)* %5, i64 2 +; CHECK: %5 = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %7 = load i32, i32 addrspace(5)* %6, align 4 + %8 = zext i32 %7 to i64 + + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %1 = bitcast [3 x i32] 
addrspace(5)* %0 to i8 addrspace(5)* +; CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) + %1 = call i64 @_ZTS14other_function() +; CHECK: %4 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byref([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %2 = bitcast [3 x i32] addrspace(5)* %1 to i32 addrspace(5)* +; CHECK: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 4 %4, i8 addrspace(4)* align 1 %3, i64 12, i1 false) +; CHECK: %5 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) +; CHECK: %6 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) +; CHECK: ret void +; CHECK: } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll new file mode 100644 index 0000000000000..a38f4e1023dad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll @@ -0,0 +1,111 @@ +; RUN: opt -enable-new-pm=0 -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'global-offset-multiple-entry-points.bc' +source_filename = 
"global-offset-multiple-entry-points.ll" + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that the pass works with multiple entry points. + +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + +; This function is a kernel entry point that does not use global offset. It will +; not get a clone with a global offset parameter. +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12third_kernel() { +entry: + ret void +} + +define weak_odr dso_local i64 @_ZTS15common_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15common_function(i32 addrspace(5)* %0) { + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK: %2 = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +define weak_odr dso_local i64 @_ZTS14first_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14first_function(i32 addrspace(5)* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32 addrspace(5)* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12first_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32] addrspace(5)* %0 to i8 addrspace(5)* +; CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14first_function() +; CHECK: %3 = call i64 
@_ZTS14first_function(i32 addrspace(5)* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset([3 x i32]* byref([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %2 = bitcast [3 x i32] addrspace(5)* %1 to i32 addrspace(5)* +; CHECK: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 4 %4, i8 addrspace(4)* align 1 %3, i64 12, i1 false) +; CHECK: %5 = call i64 @_ZTS14first_function(i32 addrspace(5)* %2) +; CHECK: ret void +; CHECK: } + +define weak_odr dso_local i64 @_ZTS15second_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15second_function(i32 addrspace(5)* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32 addrspace(5)* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS13second_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32] addrspace(5)* %0 to i8 addrspace(5)* +; CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %0, i32 0, i32 0 + %0 = call i64 @_ZTS15second_function() +; CHECK: %3 = call i64 @_ZTS15second_function(i32 addrspace(5)* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset([3 x i32]* byref([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %2 = bitcast [3 x i32] addrspace(5)* %1 to i32 addrspace(5)* +; CHECK: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 4 %4, i8 addrspace(4)* align 1 %3, i64 12, i1 false) +; CHECK: %5 = call i64 @_ZTS15second_function(i32 addrspace(5)* %2) +; CHECK: ret void +; CHECK: } + +; This function doesn't get called by a kernel entry point. 
+define weak_odr dso_local i64 @_ZTS15no_entry_point() { +; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point(i32 addrspace(5)* %0) { + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6} +; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8} + +!0 = distinct !{void ()* @_ZTS12first_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = distinct !{void ()* @_ZTS13second_kernel, !"kernel", i32 1} +!6 = distinct !{void ()* @_ZTS12third_kernel, !"kernel", i32 1} +; CHECK: !7 = !{void ([3 x i32]*)* @_ZTS13second_kernel_with_offset, !"kernel", i32 1} +; CHECK: !8 = !{void ([3 x i32]*)* @_ZTS12first_kernel_with_offset, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll b/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll new file mode 100644 index 0000000000000..7b77cabe1c43b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll @@ -0,0 +1,53 @@ +; RUN: opt -enable-new-pm=0 -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'global-offset-simple.bc' +source_filename = "global-offset-simple.ll" + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that the transformation is applied in the basic case. 
+ +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: llvm.amdgcn.implicit.offset + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32 addrspace(5)* %0) { +; CHECK: %2 = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %1 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() +; CHECK-NOT: tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %2 = getelementptr inbounds i32, i32 addrspace(5)* %1, i64 2 + %3 = load i32, i32 addrspace(5)* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %1 = bitcast [3 x i32] addrspace(5)* %0 to i8 addrspace(5)* +; CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byref([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) +; CHECK: %2 = bitcast [3 x i32] addrspace(5)* %1 to i32 addrspace(5)* +; CHECK: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 4 %4, i8 addrspace(4)* align 1 %3, i64 12, i1 false) +; CHECK: %5 = call i64 @_ZTS14other_function(i32 addrspace(5)* %2) +; CHECK: ret void +; CHECK: } + +!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} 
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 3b2d282dfe61c..d3c155ddd897b 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -49,6 +49,7 @@ ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O0-NEXT: Expand reduction intrinsics ; GCN-O0-NEXT: SYCL Local Accessor to Shared Memory +; GCN-O0-NEXT: Add implicit SYCL global offset ; GCN-O0-NEXT: AMDGPU Attributor ; GCN-O0-NEXT: CallGraph Construction ; GCN-O0-NEXT: Call Graph SCC Pass Manager @@ -218,6 +219,7 @@ ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: TLS Variable Hoist ; GCN-O1-NEXT: SYCL Local Accessor to Shared Memory +; GCN-O1-NEXT: Add implicit SYCL global offset ; GCN-O1-NEXT: AMDGPU Attributor ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager @@ -492,6 +494,7 @@ ; GCN-O1-OPTS-NEXT: TLS Variable Hoist ; GCN-O1-OPTS-NEXT: Early CSE ; GCN-O1-OPTS-NEXT: SYCL Local Accessor to Shared Memory +; GCN-O1-OPTS-NEXT: Add implicit SYCL global offset ; GCN-O1-OPTS-NEXT: AMDGPU Attributor ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager @@ -780,6 +783,7 @@ ; GCN-O2-NEXT: TLS Variable Hoist ; GCN-O2-NEXT: Early CSE ; GCN-O2-NEXT: SYCL Local Accessor to Shared Memory +; GCN-O2-NEXT: Add implicit SYCL global offset ; GCN-O2-NEXT: AMDGPU Attributor ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager @@ -1082,6 +1086,7 @@ ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering ; GCN-O3-NEXT: SYCL Local Accessor to Shared Memory +; GCN-O3-NEXT: Add implicit SYCL global offset ; GCN-O3-NEXT: AMDGPU Attributor ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager diff --git 
a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll index f3341a2369a5f..22253c187d2ef 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll @@ -2,7 +2,7 @@ ; ModuleID = 'multiple-entry-points.bc' source_filename = "multiple-entry-points.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-nvcl" +target triple = "nvptx64-nvidia-cuda" ; This test checks that the pass works with multiple entry points. From a916ce6ebebec05284eae631aa0b7cdb0ca5d6ca Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Feb 2022 14:55:34 +0000 Subject: [PATCH 06/10] [LIBCLC] Add proper support for get_global_offset for AMD --- libclc/amdgcn-amdhsa/libspirv/SOURCES | 1 + .../libspirv/workitem/get_global_offset.ll | 42 +++++++++++++++++++ libclc/amdgcn/libspirv/SOURCES | 1 - .../libspirv/workitem/get_global_offset.cl | 25 ----------- 4 files changed, 43 insertions(+), 26 deletions(-) create mode 100644 libclc/amdgcn-amdhsa/libspirv/workitem/get_global_offset.ll delete mode 100644 libclc/amdgcn/libspirv/workitem/get_global_offset.cl diff --git a/libclc/amdgcn-amdhsa/libspirv/SOURCES b/libclc/amdgcn-amdhsa/libspirv/SOURCES index 248230e2c4cda..8230b26ff8458 100644 --- a/libclc/amdgcn-amdhsa/libspirv/SOURCES +++ b/libclc/amdgcn-amdhsa/libspirv/SOURCES @@ -1,4 +1,5 @@ +workitem/get_global_offset.ll group/collectives.cl group/collectives_helpers.ll atomic/loadstore_helpers.ll diff --git a/libclc/amdgcn-amdhsa/libspirv/workitem/get_global_offset.ll b/libclc/amdgcn-amdhsa/libspirv/workitem/get_global_offset.ll new file mode 100644 index 0000000000000..30751d14a7ab5 --- /dev/null +++ b/libclc/amdgcn-amdhsa/libspirv/workitem/get_global_offset.ll @@ -0,0 +1,42 @@ +;;===----------------------------------------------------------------------===// +;; +;; Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +;; See https://llvm.org/LICENSE.txt for license information. +;; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +;; +;;===----------------------------------------------------------------------===// + +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +#else +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +#endif + +; Function Attrs: nounwind readnone speculatable +declare i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + +define hidden i64 @_Z22__spirv_GlobalOffset_xv() nounwind alwaysinline { +entry: + %0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %1 = load i32, i32 addrspace(5)* %0, align 4 + %zext = zext i32 %1 to i64 + ret i64 %zext +} + +define hidden i64 @_Z22__spirv_GlobalOffset_yv() nounwind alwaysinline { +entry: + %0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 1 + %1 = load i32, i32 addrspace(5)* %arrayidx, align 4 + %zext = zext i32 %1 to i64 + ret i64 %zext +} + +define hidden i64 @_Z22__spirv_GlobalOffset_zv() nounwind alwaysinline { +entry: + %0 = tail call i32 addrspace(5)* @llvm.amdgcn.implicit.offset() + %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %0, i64 2 + %1 = load i32, i32 addrspace(5)* %arrayidx, align 4 + %zext = zext i32 %1 to i64 + ret i64 %zext +} diff --git a/libclc/amdgcn/libspirv/SOURCES b/libclc/amdgcn/libspirv/SOURCES index 300e54c4769e3..f4196b533c49f 100644 --- a/libclc/amdgcn/libspirv/SOURCES +++ b/libclc/amdgcn/libspirv/SOURCES @@ -1,4 +1,3 @@ -workitem/get_global_offset.cl workitem/get_group_id.cl workitem/get_global_size.cl 
workitem/get_local_id.cl diff --git a/libclc/amdgcn/libspirv/workitem/get_global_offset.cl b/libclc/amdgcn/libspirv/workitem/get_global_offset.cl deleted file mode 100644 index 3e79449352186..0000000000000 --- a/libclc/amdgcn/libspirv/workitem/get_global_offset.cl +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#if __clang_major__ >= 8 -#define CONST_AS __constant -#elif __clang_major__ >= 7 -#define CONST_AS __attribute__((address_space(4))) -#else -#define CONST_AS __attribute__((address_space(2))) -#endif - -// TODO: implement proper support for global offsets, this also requires -// changes in the compiler and the HIP plugin. -_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_x() { return 0; } - -_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_y() { return 0; } - -_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_z() { return 0; } From ffddff3a4be21dff8ea3f1fed36b8b55594c91df Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 28 Feb 2022 09:40:45 +0000 Subject: [PATCH 07/10] [SYCL] Global offset docs --- llvm/docs/AMDGPUUsage.rst | 27 +++++++++++++++++++++ sycl/doc/design/CompilerAndRuntimeDesign.md | 13 +++++----- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index bd6cdd84e3a50..67683cc646085 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -14985,6 +14985,33 @@ track the usage for each kernel. However, in some cases careful organization of the kernels and functions in the source file means there is minimal additional effort required to accurately calculate GPR usage. 
+SYCL Kernel Metadata +==================== + +This section describes the additional metadata that is inserted for SYCL +kernels. As SYCL is a single-source programming model, functions can either +execute on a host or a device (i.e. GPU). Device kernels are akin to kernel +entry-points in a GPU program. To mark an LLVM IR function as a device kernel +function, we make use of special LLVM metadata. The AMDGCN back-end will look +for a named metadata node called ``amdgcn.annotations``. This named metadata +must contain a list of metadata that describe the kernel IR. For our purposes, +we need to declare a metadata node that assigns the `"kernel"` attribute to the +LLVM IR function that should be emitted as a SYCL kernel function. These +metadata nodes take the form: + +.. code-block:: text + + !{, metadata !"kernel", i32 1} + +Consider the metadata generated by the global-offset pass, showing a void kernel +function `example_kernel_with_offset` taking one argument, a pointer to 3 i32 +integers: + +.. code-block:: llvm + + !amdgcn.annotations = !{!0} + !0 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} + Additional Documentation ======================== diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 7ae691ed92f90..2183b3d19532f 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -659,11 +659,12 @@ PI interface. The CUDA API does not natively support the global offset parameter expected by the SYCL. -In order to emulate this and make generated kernel compliant, an -intrinsic `llvm.nvvm.implicit.offset` (clang builtin -`__builtin_ptx_implicit_offset`) was introduced materializing the use -of this implicit parameter for the NVPTX backend. The intrinsic returns -a pointer to `i32` referring to a 3 elements array. 
+In order to emulate this and make generated kernel compliant, an intrinsic +`llvm.nvvm.implicit.offset` (clang builtin `__builtin_ptx_implicit_offset`) was +introduced materializing the use of this implicit parameter for the NVPTX +backend. AMDGCN uses the same approach with `llvm.andgpu.implicit.offset` and +`__builtin_amdgcn_implicit_offset`. The intrinsic returns a pointer to `i32` +referring to a 3 elements array. Each non-kernel function reaching the implicit offset intrinsic in the call graph is augmented with an extra implicit parameter of type @@ -682,7 +683,7 @@ on the following logic: - If the 2 versions exist, the original kernel is called if global offset is 0 otherwise it will call the cloned one and pass the - offset by value; + offset by value (for CUDA backend), or by ref for AMD; - If only 1 function exist, it is assumed that the kernel makes no use of this parameter and therefore ignores it. From fa945e6ee75d6f453fb2a2f22158c2712a87f6b4 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 25 Mar 2022 08:24:31 +0000 Subject: [PATCH 08/10] [SYCL] Use SYCLKernelAttr for AMDGCN metadata generation --- clang/lib/CodeGen/TargetInfo.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 37dd71b86228d..5d29cffa498f8 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -9425,10 +9425,9 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( if (IsHIPKernel) F->addFnAttr("uniform-work-group-size", "true"); - const bool IsSYCLKernel = - FD && M.getLangOpts().SYCLIsDevice && - F->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL; // Create !{, metadata !"kernel", i32 1} node for SYCL kernels. 
+ const bool IsSYCLKernel = + FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr(); if (IsSYCLKernel) addAMDGCNMetadata(F, "kernel", 1); From 7997a2d7b83302318d09066b859aa654fe81044a Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 29 Mar 2022 08:30:35 +0000 Subject: [PATCH 09/10] [SYCL] Add test for kernel annotations --- clang/test/CodeGenSYCL/kernel-annotation.cpp | 48 ++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 clang/test/CodeGenSYCL/kernel-annotation.cpp diff --git a/clang/test/CodeGenSYCL/kernel-annotation.cpp b/clang/test/CodeGenSYCL/kernel-annotation.cpp new file mode 100644 index 0000000000000..05da46a1226d3 --- /dev/null +++ b/clang/test/CodeGenSYCL/kernel-annotation.cpp @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -triple spir64-unknown-unknown -fsycl-is-device \ +// RUN: -S -emit-llvm %s -o %temp.ll +// RUN: FileCheck -check-prefix=CHECK-SPIR --input-file %temp.ll %s + +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fsycl-is-device \ +// RUN: -S -emit-llvm %s -o %temp.ll +// RUN: FileCheck -check-prefix=CHECK-NVPTX --input-file %temp.ll %s + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -fsycl-is-device \ +// RUN: -S -emit-llvm %s -o %temp.ll +// RUN: FileCheck -check-prefix=CHECK-AMDGCN --input-file %temp.ll %s + +// The test makes sure that `[nvvm|amdgcn].annotations` are correctly generated +// only for their respective targets. 
+ +#include "Inputs/sycl.hpp" + +sycl::handler H; + +class Functor { +public: + void operator()() const {} +}; + +// CHECK-SPIR-NOT: annotations = + +// CHECK-NVPTX: nvvm.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]} +// CHECK-NVPTX: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1} +// CHECK-NVPTX: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1} + +// CHECK-AMDGCN: amdgcn.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]} +// CHECK-AMDGCN: [[FIRST]] = !{void ()* @_ZTS7Functor, !"kernel", i32 1} +// CHECK-AMDGCN: [[SECOND]] = !{void ()* @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE0_clES2_E5foo_2, !"kernel", i32 1} + +int main() { + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + Functor foo{}; + cgh.single_task(foo); + }); + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for(cl::sycl::range<1>(1), + [=](cl::sycl::item<1> item) { + }); + }); + return 0; +} From 2c433e9d7d1aed5d7792ada58ecffceda74960e3 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 5 May 2022 11:06:36 +0000 Subject: [PATCH 10/10] PR comments addressed. 
--- llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 2 +- sycl/doc/design/CompilerAndRuntimeDesign.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp index c5c52bd379020..df696111a68f8 100644 --- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp +++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp @@ -31,7 +31,7 @@ using namespace TargetHelpers; #include "llvm/Support/CommandLine.h" static cl::opt EnableGlobalOffset("enable-global-offset", cl::Hidden, cl::init(true), - cl::desc("Enabel SYCL global offset pass")); + cl::desc("Enable SYCL global offset pass")); namespace llvm { ModulePass *createGlobalOffsetPass(); void initializeGlobalOffsetPass(PassRegistry &); diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 2183b3d19532f..2cc321ffca81c 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -662,7 +662,7 @@ expected by the SYCL. In order to emulate this and make generated kernel compliant, an intrinsic `llvm.nvvm.implicit.offset` (clang builtin `__builtin_ptx_implicit_offset`) was introduced materializing the use of this implicit parameter for the NVPTX -backend. AMDGCN uses the same approach with `llvm.andgpu.implicit.offset` and +backend. AMDGCN uses the same approach with `llvm.amdgpu.implicit.offset` and `__builtin_amdgcn_implicit_offset`. The intrinsic returns a pointer to `i32` referring to a 3 elements array.