-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Clang] Make -Xarch_
handling generic for all toolchains
#125421
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) ChangesSummary: The main benefit here is that we now have a more generic version of This patch is done in preparation for making selecting offloading Full diff: https://github.com/llvm/llvm-project/pull/125421.diff 5 Files Affected:
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d8123cc39fdc951..6dd9f2e8a9b1fc4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -932,7 +932,9 @@ def W_Joined : Joined<["-"], "W">, Group<W_Group>,
def Xanalyzer : Separate<["-"], "Xanalyzer">,
HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
Group<StaticAnalyzer_Group>;
-def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[NoXarchOption]>;
+def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[NoXarchOption]>,
+ HelpText<"Pass <arg> to the compiliation if the target matches <arch>">,
+ MetaVarName<"<arch> <arg>">;
def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>,
HelpText<"Pass <arg> to the CUDA/HIP host compilation">, MetaVarName<"<arg>">;
def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>,
@@ -1115,14 +1117,13 @@ def fno_convergent_functions : Flag<["-"], "fno-convergent-functions">,
// Common offloading options
let Group = offload_Group in {
-def offload_arch_EQ : Joined<["--"], "offload-arch=">, Flags<[NoXarchOption]>,
+def offload_arch_EQ : Joined<["--"], "offload-arch=">,
Visibility<[ClangOption, FlangOption]>,
HelpText<"Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g. sm_35). "
"If 'native' is used the compiler will detect locally installed architectures. "
"For HIP offloading, the device architecture can be followed by target ID features "
"delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.">;
def no_offload_arch_EQ : Joined<["--"], "no-offload-arch=">,
- Flags<[NoXarchOption]>,
Visibility<[ClangOption, FlangOption]>,
HelpText<"Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. "
"'all' resets the list to its default value.">;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index ebc982096595e61..c25d1b6be14b50d 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1648,7 +1648,8 @@ void ToolChain::TranslateXarchArgs(
A->getOption().matches(options::OPT_Xarch_host))
ValuePos = 0;
- unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(ValuePos));
+ const InputArgList &BaseArgs = Args.getBaseArgs();
+ unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos));
unsigned Prev = Index;
std::unique_ptr<llvm::opt::Arg> XarchArg(Opts.ParseOneArg(Args, Index));
@@ -1672,8 +1673,31 @@ void ToolChain::TranslateXarchArgs(
Diags.Report(DiagID) << A->getAsString(Args);
return;
}
+
XarchArg->setBaseArg(A);
A = XarchArg.release();
+
+ // Linker input arguments require custom handling. The problem is that we
+ // have already constructed the phase actions, so we can not treat them as
+ // "input arguments".
+ if (A->getOption().hasFlag(options::LinkerInput)) {
+ // Convert the argument into individual Zlinker_input_args. Need to do this
+ // manually to avoid memory leaks with the allocated arguments.
+ for (const char *Value : A->getValues()) {
+ auto Opt = Opts.getOption(options::OPT_Zlinker_input);
+ unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value);
+ auto NewArg =
+ new Arg(Opt, BaseArgs.MakeArgString(Opt.getPrefix() + Opt.getName()),
+ Index, BaseArgs.getArgString(Index + 1), A);
+
+ DAL->append(NewArg);
+ if (!AllocatedArgs)
+ DAL->AddSynthesizedArg(NewArg);
+ else
+ AllocatedArgs->push_back(NewArg);
+ }
+ }
+
if (!AllocatedArgs)
DAL->AddSynthesizedArg(A);
else
@@ -1697,19 +1721,17 @@ llvm::opt::DerivedArgList *ToolChain::TranslateXarchArgs(
} else if (A->getOption().matches(options::OPT_Xarch_host)) {
NeedTrans = !IsDevice;
Skip = IsDevice;
- } else if (A->getOption().matches(options::OPT_Xarch__) && IsDevice) {
- // Do not translate -Xarch_ options for non CUDA/HIP toolchain since
- // they may need special translation.
- // Skip this argument unless the architecture matches BoundArch
- if (BoundArch.empty() || A->getValue(0) != BoundArch)
- Skip = true;
- else
- NeedTrans = true;
+ } else if (A->getOption().matches(options::OPT_Xarch__)) {
+ NeedTrans = A->getValue() == getArchName() ||
+ (!BoundArch.empty() && A->getValue() == BoundArch);
+ Skip = !NeedTrans;
}
if (NeedTrans || Skip)
Modified = true;
- if (NeedTrans)
+ if (NeedTrans) {
+ A->claim();
TranslateXarchArgs(Args, A, DAL, AllocatedArgs);
+ }
if (!Skip)
DAL->append(A);
}
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 9a276c55bf7bcbf..b26c5bf1a909ed6 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -2777,30 +2777,6 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args,
// and try to push it down into tool specific logic.
for (Arg *A : Args) {
- if (A->getOption().matches(options::OPT_Xarch__)) {
- // Skip this argument unless the architecture matches either the toolchain
- // triple arch, or the arch being bound.
- StringRef XarchArch = A->getValue(0);
- if (!(XarchArch == getArchName() ||
- (!BoundArch.empty() && XarchArch == BoundArch)))
- continue;
-
- Arg *OriginalArg = A;
- TranslateXarchArgs(Args, A, DAL);
-
- // Linker input arguments require custom handling. The problem is that we
- // have already constructed the phase actions, so we can not treat them as
- // "input arguments".
- if (A->getOption().hasFlag(options::LinkerInput)) {
- // Convert the argument into individual Zlinker_input_args.
- for (const char *Value : A->getValues()) {
- DAL->AddSeparateArg(
- OriginalArg, Opts.getOption(options::OPT_Zlinker_input), Value);
- }
- continue;
- }
- }
-
// Sob. These is strictly gcc compatible for the time being. Apple
// gcc translates options twice, which means that self-expanding
// options add duplicates.
diff --git a/clang/test/Driver/Xarch.c b/clang/test/Driver/Xarch.c
index f7693fb689d588e..f35e2926f9c8d5f 100644
--- a/clang/test/Driver/Xarch.c
+++ b/clang/test/Driver/Xarch.c
@@ -1,8 +1,13 @@
// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -Xarch_x86_64 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -target x86_64-unknown-windows-msvc -Xarch_x86_64 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -target aarch64-unknown-linux-gnu -Xarch_aarch64 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -Xarch_powerpc64le -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
// O3ONCE: "-O3"
// O3ONCE-NOT: "-O3"
// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s
// O3NONE-NOT: "-O3"
// O3NONE: argument unused during compilation: '-Xarch_i386 -O3'
@@ -10,3 +15,6 @@
// INVALID: error: invalid Xarch argument: '-Xarch_i386 -o'
// INVALID: error: invalid Xarch argument: '-Xarch_i386 -S'
// INVALID: error: invalid Xarch argument: '-Xarch_i386 -o'
+
+// RUN: %clang -target x86_64-unknown-linux-gnu -Xarch_x86_64 -Wl,foo %s -### 2>&1 | FileCheck -check-prefix=LINKER %s
+// LINKER: "foo"
diff --git a/clang/test/Driver/offload-Xarch.c b/clang/test/Driver/offload-Xarch.c
new file mode 100644
index 000000000000000..07b29b8e4884121
--- /dev/null
+++ b/clang/test/Driver/offload-Xarch.c
@@ -0,0 +1,31 @@
+// RUN: %clang -x cuda %s -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -x hip %s -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc \
+// RUN: -Xarch_amdgcn -march=gfx90a -Xarch_amdgcn -O3 -S -### %s 2>&1 \
+// RUN: | FileCheck -check-prefix=O3ONCE %s
+// RUN: %clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc \
+// RUN: -Xarch_nvptx64 -march=sm_52 -Xarch_nvptx64 -O3 -S -### %s 2>&1 \
+// RUN: | FileCheck -check-prefix=O3ONCE %s
+// O3ONCE: "-O3"
+// O3ONCE-NOT: "-O3"
+
+// RUN: %clang -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -nogpulib \
+// RUN: --target=x86_64-unknown-linux-gnu -Xarch_nvptx64 --offload-arch=sm_52,sm_60 -nogpuinc \
+// RUN: -Xarch_amdgcn --offload-arch=gfx90a,gfx1030 -ccc-print-bindings -### %s 2>&1 \
+// RUN: | FileCheck -check-prefix=OPENMP %s
+//
+// OPENMP: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// OPENMP: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[GFX1030_BC:.+]]"
+// OPENMP: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[GFX90A_BC:.+]]"
+// OPENMP: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[SM52_PTX:.+]]"
+// OPENMP: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[SM52_PTX]]"], output: "[[SM52_CUBIN:.+]]"
+// OPENMP: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[SM60_PTX:.+]]"
+// OPENMP: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[SM60_PTX]]"], output: "[[SM60_CUBIN:.+]]"
+// OPENMP: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[GFX1030_BC]]", "[[GFX90A_BC]]", "[[SM52_CUBIN]]", "[[SM60_CUBIN]]"], output: "[[BINARY:.+]]"
+// OPENMP: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
+// OPENMP: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -x cuda %s --offload-arch=sm_52,sm_60 -Xarch_sm_52 -O3 -Xarch_sm_60 -O0 \
+// RUN: -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=CUDA %s
+// CUDA: "-cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}}"-target-cpu" "sm_52" {{.*}}"-O3"
+// CUDA: "-cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}}"-target-cpu" "sm_60" {{.*}}"-O0"
|
I think this could use some editing.
This is the confusing part. I'm pretty sure IMO, a better way to describe the situation is that MachO is the last remaining special case implementation of Xarch and the patch folds it into a common |
I don't think a single
Yeah, I wasn't counting the host / device parts since they're separate flags. We do support the architecture part already but it's a special case that doesn't need to be there. Realistically this is just removing the MachO handling to simplify it and make it work everywhere. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM overall, with a few nits.
d8f565a
to
0385210
Compare
Here's something fun, |
Also see: #110325 |
That's fun, well for now I've updated it to not hit that bug and also accept |
0385210
to
79000d0
Compare
It's just another argument combination we need to handle in a sensible way. We do have a number of options that are only meaningful for the top-level driver. IMO the reasonable approaches in this case would be :
I would argue that the latter is a better approach, as it cleanly divides the responsibilities. The contract for "-X" options is "we'll pass whatever you give us to " and it's user's job to know what they are doing, and In this particular case, the question becomes a plain "what does cc1 do with --offload-arch=sm_52" and the answer should be as straightforward. |
What do you suggest for this case? Right now it works as I'd expect, it passes |
What happens with that |
Good
I don't think there's any use of |
I agree. Yet we do need to deal with such nonsensical input in a consistent manner. We do not control what the users give us, but we control how we respond. |
Right now if someone passes
|
79000d0
to
2f3002d
Compare
This part SGTM, too. However, I don't think I've seen the answer what happens when we do pass --offload arch to cc1. E.g. a user may accidentally paste an argument in the wrong place and instead of intended |
|
Silently? That would be wrong, imo. It should be diagnosed somewhere. |
It's already an error if you pass it directly via - |
2f3002d
to
137b87c
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
--offload-arch= isn't an accepted -cc1 argument so it won't be forwarded at all.
Silently? That would be wrong, imo. It should be diagnosed somewhere.
It's already an error if you pass it directly via -
Xclang
because it's not an accepted-cc1
argument. A lot of driver arguments are both driver andcc1
arguments so those get marshalled or forwarded.
Yes, I'm aware of that. My question is -- with this patch, and my example command above which wants to forward --offload-arch, does clang -cc1
report an error, or stays silent because the argument "won't be forwarded at all." ?
Perhaps we should take a step back, document desired behavior/interactions between -Xarch, --offload-arch, and OpenMP/CUDA/HIP offloading modes, so we have somewhat consistent (or at least documented) behavior. Right now we seem to chase corner cases.
clang/test/Driver/offload-Xarch.c
Outdated
// RUN: %clang -x cuda %s -nogpulib -nogpuinc \ | ||
// RUN: -Xarch_sm_51 --offload-arch=sm_52 -S -### 2>&1 \ | ||
// RUN: | FileCheck -check-prefix=SPECIFIC-WARN %s | ||
// SPECIFIC-WARN: warning: argument unused during compilation: '-Xarch_sm_51 --offload-arch=sm_52' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What will happen when we do pass -offload-arch=sm_80
to cc1?
--offload-arch=sm_52 -Xarch_sm52 --offload-arch=sm_80 --some-option-for_sm52
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this case, --offload-arch=sm_52
would enable sm_52
as a bound job, then it would make -Xarch-sm_52
enable --offload-arch=sm_80
but by the time it sees that the list was already generated so it wouldn't do anything.
clang/test/Driver/offload-Xarch.c
Outdated
// RUN: %clang -x cuda %s -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ | ||
// RUN: -Xarch_sm_52 --offload-arch=sm_52 -S -nogpulib -nogpuinc -### 2>&1 \ | ||
// RUN: | FileCheck -check-prefix=SPECIFIC %s | ||
// SPECIFIC: "-cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}}"-target-cpu" "sm_52" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO this special case makes no sense.
If I were to look at this command line in real life, my assumption would be that a user made a mistake and intended to write --offload-arch=sm_52 -Xarch_sm_52 -some-option
. I.e. they targeted sm_52, and then wanted to tweak that compilation. In this case we effectively ignoring -Xarch_sm_52
which was very likely not the user's intent.
cc1
reporting an error when it got --offload-arch
would be a better approach IMO, giving the feedback that the user is doing something wrong.
On the other hand you've mentioned that:
using -Xarch_amdgcn --offload-arch=gfx1030 is very meaningful for OpenMP where the user can enable multiple toolchains at the same time.
So, it looks like handling of these options is also language-dependent. For CUDA, blindly passing Xarch* options down to the compilation selected by Xarch kind (back-end, or specific GPU) and letting cc1 deal with those options would probably be acceptable.
The same approach may also work for OpenMP, where cc1 can do something sensible with --offload-arch passed to it and does not have to error out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we want -Xarch_amdgcn --offloard-arch=gfx90a
to work for cases like OpenMP which can combine multiple targets into a single compile and needs the lists to be separate. I guess we could go with @yxsamliu's suggestion and just error if the value isn't a valid triple architecture.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I understand what exactly it's intended to do and how is that supposed to work. I'm missing something here. Can you elaborate on the intended use case here and walk me through it?
So, the top-level driver sees -Xarch_amdgcn
. I would assume that we want it to pass the following --offloard-arch=gfx90a
to all cc1 subcompilations using amdgcn. What is --offload-arch=gfx90a
expected to do in this case, once it's passed to cc1? I can see how it might be used if we have a single cc1 subcompilation to tell that cc1 invocation to target gfx90a, but that looks like an odd fix for an odd problem. IMO that's something that should be done by the top-level driver.
If we have multiple cc1 subcompilation using amdgcn, then we'll end up with multiple cc1 invocations with potentially identical target... Not sure if that's going to cause troubles further down the compilation pipeline. E.g. can we incorporate N binaries for the same target? How will runtime figure out which one to load?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently we have -Xopenmp-target=<triple>
which does the same thing. OpenMP can do something like the following:
clang foo.c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa,nvptx64-nvidia-cuda -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx1030,gfx90a -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_52,sm_90
These are passed to the driver stage because we call getArgsForToolchain
while processing the offloading architecture inputs. This call has no bound architecture which is the problem with parsing -Xarch_sm_52 --offload-arch=sm_52
. There is no -cc1
call here, it passes -offload-arch
to the offloading toolchain, which then results in us building N
compilation jobs for each of those architectures. --offload-arch
is not accepted by -cc1
at all, so it's not relevant here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It all seems to boil down to "what does Xarch mean". The conventional -X<something>
model is "pass the arg to in the compilation pipeline". You seem to propose that for openMP it will be "use it to construct compiler pipeline". Not sure I like this dichotomy where we commingling pipeline construction with tweaking of already constructed pipeline.
Perhaps a cleaner way would be to figure out a better way tof OpenMP to set up the build pipeline structure (that's what --offload-arch is for), and keep Xarch
exclusively as a way to tweak the options for the already constructed pipeline.
Presumably OpenMP already has the ways to specify the desired offloading targets. Xarch
does not look like the right tool for that job. It can do it, but it opens a can of corner cases.
If openMP does not have a viable better way to configure the compilation pipeline, we could make -Xarch=backend. --offload-arch=target
a documented special case for OpenMP only. I would strongly prefer to keep CUDA/HIP to continue diagnosing --offload-arch
being passed to cc1
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That being said, what is your recommended solution for mixing targets? This is not unique to OpenMP, we can already target HIP and CUDA from SPIRV-64 which is a different toolchain that can't accept --offload-arch
arguments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is your recommended solution for mixing targets
Can you be more specific? We seem to view things from very different angles, so I want to make sure we're talking about the same things here.
we can already target HIP and CUDA from SPIRV-64
That's one example where we're looking at it differently. The way I see it it's HIP/CUDA (as in front-end) can use spir-v as the target toolchain.
Is your question -- how do we tell the driver to construct yet another cc1 subcompilation variant?
I do not have a ready answer for that, but if I had one, -Xarch
probably would not be it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the question is how do we separate architectures when we don't assume a single offloading toolchain. Right now we just wing it and guess based off of hard coded string values. I'd like to have some solution, currently we do something like --offload-arch=amdgcnspirv,gfx1030
or the extremely broken -offload=
flag. Compared to those I really don't think -Xarch_
is a big deal, since it does what you want, passes those arguments only to the Triple toolchain.
I removed the contentious part, so can we at least land the easy fix.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're probably the one holding most of the puzzle pieces here. For CUDA/HIP things (mostly) work well enough. Even though there are somewhat useful attempts to target spir-V, they are half-baked at best. For OpenMP the target set is widely open, so I can see how it would need a more flexible way to customize the compilation pipeline construction.
In abstract, part of the problem boils down to naming. You need to be able to tell the driver, what we want to target (currently we use --offload-arch= and -offload options for that) and how to select subsets of the constructed pipeline (-X<host|device|arch|something else>).
--offload-arch=amdgcnspirv,gfx1030
looks like a reasonable approach to me, and the same naming scheme could be used as a selector part of -Xarch
. What we have now may not be perfect, but I think it's reasonably functional. Is there a particular reason --offload-arch=amdgcnspirv,gfx1030
is not sufficient to drive pipeline construction? I'm not sure why we want -Xarch to do that job.
Compared to those I really don't think -Xarch_ is a big deal, since it does what you want, passes those arguments only to the Triple toolchain.
My issue is that the patch was making -Xarch do the job it's not intended for and hiding real errors in the process. The "passing the arguments to the triple{-selected} toolchain" (as an wider-scope selector, compared to per-GPU or host/device ones we have now) part is fine. Let's handle pipeline creation challenge separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My complaints with --offload-arch=
is that it's pulling double duty on -mcpu
and --target
options. It's wholly insufficient for OpenMP when targeting anything that isn't a GPU because of that. I'd prefer to have the tools to individually enable and control the toolchains. My perspective is that offloading is a product of all --target=
values and -mcpu=
values. That's what it boils down to internally, and I'd like to be able to compile things that way if necessary, i.e. clang --offload-target=spirv64 foo.hip
makes more sense to me.
Either way, does this patch LG now that part has been delayed until later?
Summary: Currently, `-Xarch_` is used to forward argument specially to certain toolchains. Currently, this is only supported by the Darwin toolchain. We want to be able to use this generically, and for offloading too. This patch moves the handling out of the Darwin Toolchain and places it in the `getArgsForToolchain` helper which is run before the arguments get passed to the tools. The main benefit here is that we now have a more generic version of `-Xopenmp-target=`, which should probably just be deprecated. Additionally, it allows us to specially pass arguments to different architectures for offloading. This patch is done in preparation for making selecting offloading toolchains more generic, this will be helpful while people are moving toward compile jobs that include multiple toolchins (SPIR-V, AMDGCN, NVPTX).
137b87c
to
3ed4040
Compare
// O3ONCE: "-O3" | ||
// O3ONCE-NOT: "-O3" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I'd also add the check that -O3 is present in a compilation with fcuda-is-device. Otherwise the tests would succeed somewhere else.
// RUN: | FileCheck -check-prefix=CUDA %s | ||
// CUDA: "-cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}}"-target-cpu" "sm_52" {{.*}}"-O3" | ||
// CUDA: "-cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}}"-target-cpu" "sm_60" {{.*}}"-O0" | ||
// CUDA: "-cc1" "-triple" "x86_64-unknown-linux-gnu" {{.*}}"-O3" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Using -D<something readable>
as an argument to pass may let you unambiguously mark each subcompilation, and avoid triggering that -O1/-O2 problem.
@@ -932,7 +932,9 @@ def W_Joined : Joined<["-"], "W">, Group<W_Group>, | |||
def Xanalyzer : Separate<["-"], "Xanalyzer">, | |||
HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">, | |||
Group<StaticAnalyzer_Group>; | |||
def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[NoXarchOption]>; | |||
def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[NoXarchOption]>, | |||
HelpText<"Pass <arg> to the compiliation if the target matches <arch>">, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do need a better documentation. It's not obvious that could be a GPU name or an arch name from the target triple.
This fixes an issue where your host triple is not compatible with the 64 bit ptx being the offload architecture. At least, that's my guess. This failed on our Arm 32 bit bot: https://lab.llvm.org/buildbot/#/builders/154/builds/11413/steps/5/logs/FAIL__Clang__offload-Xarch_c Crucially it outputted: clang: warning: argument unused during compilation: '-Xarch_nvptx64 -O3' [-Wunused-command-line-argument] Making the triple always something 64 bit means this will work everywhere. Fixes 455cedc / #125421.
I've pushed a fix for one of the tests on 32 bit Arm, as it failed on our bot: Maybe your intent was specifically to have a line that uses the host architecture, if that was the case, we can figure out a The other way to fix it was to use Let me know if you want to follow up on this. |
Thanks! I totally forgot that |
This fixes an issue where your host triple is not compatible with the 64 bit ptx being the offload architecture. At least, that's my guess. This failed on our Arm 32 bit bot: https://lab.llvm.org/buildbot/#/builders/154/builds/11413/steps/5/logs/FAIL__Clang__offload-Xarch_c Crucially it outputted: clang: warning: argument unused during compilation: '-Xarch_nvptx64 -O3' [-Wunused-command-line-argument] Making the triple always something 64 bit means this will work everywhere. Fixes 455cedc / llvm/llvm-project#125421.
Summary: Currently, `-Xarch_` is handled specially between different toolchains, (i.e. Mach-O). This patch unifies the handling so that it can be used generically. The main benefit here is that we now have a more generic version of `-Xopenmp-target=`, which should probably just be deprecated. Additionally, it allows us to specially pass arguments to different architectures for offloading. This patch is done in preparation for making selecting offloading toolchains more generic, this will be helpful while people are moving toward compile jobs that include multiple toolchains (SPIR-V, AMDGCN, NVPTX).
This fixes an issue where your host triple is not compatible with the 64 bit ptx being the offload architecture. At least, that's my guess. This failed on our Arm 32 bit bot: https://lab.llvm.org/buildbot/#/builders/154/builds/11413/steps/5/logs/FAIL__Clang__offload-Xarch_c Crucially it outputted: clang: warning: argument unused during compilation: '-Xarch_nvptx64 -O3' [-Wunused-command-line-argument] Making the triple always something 64 bit means this will work everywhere. Fixes 455cedc / llvm#125421.
Summary:
Currently,
-Xarch_
is handled specially between different toolchains, (i.e. Mach-O).This patch unifies the handling so that it can be used generically.
The main benefit here is that we now have a more generic version of
-Xopenmp-target=
, which should probably just be deprecated.Additionally, it allows us to specially pass arguments to different
architectures for offloading.
This patch is done in preparation for making selecting offloading
toolchains more generic, this will be helpful while people are moving
toward compile jobs that include multiple toolchains (SPIR-V, AMDGCN,
NVPTX).