
Commit dd096ac
[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases
It is known that vectors whose elements fit in 16 bits will be split and scalarized by SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). The LiveRegOptimizer (LRO) attempts to undo that scalarization across basic block boundaries and shoehorn the values into packed VGPRs. LRO is beneficial for operations that natively work on illegal vector types, because it prevents flip-flopping between unpacked and packed forms. But if we know that the operations on a vector will be split and scalarized anyway, we do not want to shoehorn it back into a packed VGPR. Operations known to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, insert/extract element, and phi nodes, to name a few.
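As a hedged illustration of the unprofitable case (the IR below is invented for this description, not taken from the commit or its tests): a <4 x i8> value that lives across a block boundary but is only consumed by more <4 x i8> arithmetic will be split and scalarized by the legalizer regardless, so packing it into an i32 at the edge would merely insert pack/unpack code:

; Hypothetical IR, for illustration only.
define <4 x i8> @not_profitable(<4 x i8> %a, <4 x i8> %b) {
entry:
  %v = add <4 x i8> %a, %b   ; illegal <4 x i8>, live across the edge
  br label %bb

bb:
  ; The sole cross-block user is more <4 x i8> arithmetic, which the
  ; type legalizer splits and scalarizes anyway; coercing %v into a
  ; packed i32 here would only add pack/unpack instructions.
  %w = mul <4 x i8> %v, %a
  ret <4 x i8> %w
}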
Parent: 1b46db7

5 files changed: +344 −220 lines


llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 118 additions & 1 deletion
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -75,6 +76,7 @@ class LiveRegOptimizer {
   Module &Mod;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+
   /// The scalar type to convert to
   Type *const ConvertToScalar;
   /// The set of visited Instructions
@@ -125,6 +127,119 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
+  /// Check if intrinsic natively operates on 8-bit or 16-bit
+  bool isNativeIntrinsic(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool isOpLegal(Instruction *I) {
+    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID ID = Intr->getIntrinsicID();
+      if (isNativeIntrinsic(ID))
+        return true;
+    }
+    // Stores
+    if (isa<StoreInst>(I))
+      return true;
+    return false;
+  }
+
+  bool isCoercionProfitable(Instruction *II) {
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+
+    // Check users for profitable conditions (across block user which can
+    // natively handle the illegal vector).
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
+    auto IsLookThru = [](Instruction *II) {
+      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
+        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
+      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
+             isa<InsertElementInst>(II) || isa<ExtractElementInst>(II) ||
+             isa<CastInst>(II);
+    };
+
+    while (!UserList.empty()) {
+      auto CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
+      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+        continue;
+
+      if (isOpLegal(CII))
+        return true;
+
+      if (IsLookThru(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+    }
+    return false;
+  }
+
   LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
       : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
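To make the walk concrete, a sketch under the same caveat (hypothetical IR, not from the commit's tests): isCoercionProfitable starts from the candidate's users, treats phis, shuffles, insert/extract element, casts, and llvm.amdgcn.perm as transparent, and succeeds as soon as the chain reaches a store or one of the native intrinsics listed above. Here the shufflevector is looked through and the store makes coercion profitable:

; Hypothetical IR, for illustration only.
define void @profitable(<4 x i8> %a, <4 x i8> %b, ptr addrspace(1) %out) {
entry:
  %v = add <4 x i8> %a, %b   ; illegal <4 x i8>, live across the edge
  br label %bb

bb:
  ; shufflevector is a look-through user...
  %s = shufflevector <4 x i8> %v, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; ...and the store is accepted by isOpLegal, so keeping %v packed in
  ; a VGPR across the edge avoids unpack/repack traffic.
  store <4 x i8> %s, ptr addrspace(1) %out
  ret void
}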
@@ -259,6 +374,9 @@ bool LiveRegOptimizer::optimizeLiveType(
     if (!shouldReplace(II->getType()))
       continue;
 
+    if (!isCoercionProfitable(II))
+      continue;
+
     if (PHINode *Phi = dyn_cast<PHINode>(II)) {
       PhiNodes.insert(Phi);
       // Collect all the incoming values of problematic PHI nodes.
@@ -478,7 +596,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
   AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
