Skip to content

Commit 9d899d8

Browse files
committed
[HIP] Support -fgpu-default-stream
Introduce -fgpu-default-stream={legacy|per-thread} option to support per-thread default stream for HIP runtime. When -fgpu-default-stream=per-thread, HIP kernels are launched through hipLaunchKernel_spt instead of hipLaunchKernel. Also HIP_API_PER_THREAD_DEFAULT_STREAM=1 is defined by the preprocessor to enable other per-thread stream API's. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D120298
1 parent b01430a commit 9d899d8

File tree

9 files changed

+68
-7
lines changed

9 files changed

+68
-7
lines changed

clang/include/clang/Basic/LangOptions.h

+10
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ class LangOptions : public LangOptionsBase {
309309
ExtendTo64
310310
};
311311

312+
enum class GPUDefaultStreamKind {
313+
/// Legacy default stream
314+
Legacy,
315+
/// Per-thread default stream
316+
PerThread,
317+
};
318+
312319
public:
313320
/// The used language standard.
314321
LangStandard::Kind LangStd;
@@ -402,6 +409,9 @@ class LangOptions : public LangOptionsBase {
402409
/// input is a header file (i.e. -x c-header).
403410
bool IsHeaderFile = false;
404411

412+
/// The default stream kind used for HIP kernel launching.
413+
GPUDefaultStreamKind GPUDefaultStream;
414+
405415
LangOptions();
406416

407417
// Define accessors/mutators for language options of enumeration type.

clang/include/clang/Driver/Options.td

+7
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,13 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
959959
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
960960
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
961961
NegFlag<SetFalse>>;
962+
def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
963+
HelpText<"Specify default stream. Valid values are 'legacy' and 'per-thread'. The default value is 'legacy'. (HIP only)">,
964+
Flags<[CC1Option]>,
965+
Values<"legacy,per-thread">,
966+
NormalizedValuesScope<"LangOptions::GPUDefaultStreamKind">,
967+
NormalizedValues<["Legacy", "PerThread"]>,
968+
MarshallingInfoEnum<LangOpts<"GPUDefaultStream">, "Legacy">;
962969
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
963970
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
964971
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,

clang/lib/CodeGen/CGCUDANV.cpp

+11-4
Original file line numberDiff line numberDiff line change
@@ -332,15 +332,22 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
332332
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
333333

334334
// Lookup cudaLaunchKernel/hipLaunchKernel function.
335+
// HIP kernel launching API name depends on -fgpu-default-stream option. For
336+
// the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
337+
// it is hipLaunchKernel_spt.
335338
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
336339
// void **args, size_t sharedMem,
337340
// cudaStream_t stream);
338-
// hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
339-
// void **args, size_t sharedMem,
340-
// hipStream_t stream);
341+
// hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
342+
// dim3 blockDim, void **args,
343+
// size_t sharedMem, hipStream_t stream);
341344
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
342345
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
343-
auto LaunchKernelName = addPrefixToName("LaunchKernel");
346+
std::string KernelLaunchAPI = "LaunchKernel";
347+
if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
348+
LangOptions::GPUDefaultStreamKind::PerThread)
349+
KernelLaunchAPI = KernelLaunchAPI + "_spt";
350+
auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
344351
IdentifierInfo &cudaLaunchKernelII =
345352
CGM.getContext().Idents.get(LaunchKernelName);
346353
FunctionDecl *cudaLaunchKernelFD = nullptr;

clang/lib/Driver/ToolChains/Clang.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -6915,8 +6915,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
69156915
CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID)));
69166916
}
69176917

6918-
if (IsHIP)
6918+
if (IsHIP) {
69196919
CmdArgs.push_back("-fcuda-allow-variadic-functions");
6920+
Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ);
6921+
}
69206922

69216923
if (IsCudaDevice || IsHIPDevice) {
69226924
StringRef InlineThresh =

clang/lib/Frontend/InitPreprocessor.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,9 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
538538
Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
539539
if (LangOpts.CUDAIsDevice)
540540
Builder.defineMacro("__HIP_DEVICE_COMPILE__");
541+
if (LangOpts.GPUDefaultStream ==
542+
LangOptions::GPUDefaultStreamKind::PerThread)
543+
Builder.defineMacro("HIP_API_PER_THREAD_DEFAULT_STREAM");
541544
}
542545
}
543546

clang/test/CodeGenCUDA/Inputs/cuda.h

+7
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,18 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
3535
extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
3636
size_t sharedSize = 0,
3737
hipStream_t stream = 0);
38+
#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
3839
extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
3940
dim3 blockDim, void **args,
4041
size_t sharedMem,
4142
hipStream_t stream);
4243
#else
44+
extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
45+
dim3 blockDim, void **args,
46+
size_t sharedMem,
47+
hipStream_t stream);
48+
#endif //HIP_API_PER_THREAD_DEFAULT_STREAM
49+
#else
4350
typedef struct cudaStream *cudaStream_t;
4451
typedef enum cudaError {} cudaError_t;
4552
extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,

clang/test/CodeGenCUDA/kernel-call.cu

+9-2
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,22 @@
55
// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
66
// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
77
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
8-
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK
8+
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
9+
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
10+
// RUN: -fgpu-default-stream=legacy \
11+
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
12+
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
13+
// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \
14+
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,PTH,CHECK
915

1016
#include "Inputs/cuda.h"
1117

1218
// CHECK-LABEL: define{{.*}}g1
1319
// HIP-OLD: call{{.*}}hipSetupArgument
1420
// HIP-OLD: call{{.*}}hipLaunchByPtr
1521
// HIP-NEW: call{{.*}}__hipPopCallConfiguration
16-
// HIP-NEW: call{{.*}}hipLaunchKernel
22+
// LEGACY: call{{.*}}hipLaunchKernel
23+
// PTH: call{{.*}}hipLaunchKernel_spt
1724
// CUDA-OLD: call{{.*}}cudaSetupArgument
1825
// CUDA-OLD: call{{.*}}cudaLaunch
1926
// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration

clang/test/Driver/hip-options.hip

+8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@
1414
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
1515
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
1616

17+
// Check -fgpu-default-stream=per-thread.
18+
// RUN: %clang -### -nogpuinc -nogpulib -fgpu-default-stream=per-thread \
19+
// RUN: %s -save-temps 2>&1 | FileCheck -check-prefix=PTH %s
20+
// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
21+
// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
22+
// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
23+
// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
24+
1725
// RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \
1826
// RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \
1927
// RUN: FileCheck -check-prefix=MLLVM %s

clang/test/Preprocessor/predefined-macros.c

+10
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@
247247
// CHECK-HIP-NEG-NOT: #define __CUDA_ARCH__
248248
// CHECK-HIP-NEG-NOT: #define __HIP_DEVICE_COMPILE__ 1
249249
// CHECK-HIP-NEG-NOT: #define __CLANG_RDC__ 1
250+
// CHECK-HIP-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
250251

251252
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
252253
// RUN: -fcuda-is-device \
@@ -265,6 +266,7 @@
265266
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-DEV-NEG
266267
// CHECK-HIP-DEV-NEG-NOT: #define __CUDA_ARCH__
267268
// CHECK-HIP-DEV-NEG-NOT: #define __CLANG_RDC__ 1
269+
// CHECK-HIP-DEV-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
268270

269271
// RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple x86_64-unknown-linux-gnu \
270272
// RUN: -fgpu-rdc | FileCheck %s --check-prefix=CHECK-RDC
@@ -277,3 +279,11 @@
277279
// RUN: -fgpu-rdc -fcuda-is-device \
278280
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RDC
279281
// CHECK-RDC: #define __CLANG_RDC__ 1
282+
283+
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
284+
// RUN: -fgpu-default-stream=per-thread \
285+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
286+
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
287+
// RUN: -fcuda-is-device -fgpu-default-stream=per-thread \
288+
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
289+
// CHECK-PTH: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1

0 commit comments

Comments
 (0)