
Commit edb17b2

[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases
It is known that vectors whose elements fit in i16 will be split and scalarized in SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). The LiveRegOptimizer (LRO) attempts to undo that scalarization for vectors that live across basic-block boundaries and shoehorn their values into VGPRs. LRO is beneficial for operations that natively work on illegal vector types, since it prevents flip-flopping between unpacked and packed layouts. However, if we know that operations on a vector will be split and scalarized anyway, we do not want to shoehorn them back into a packed VGPR. Operations that we know to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, insert/extract element, and phi nodes, to name a few.
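To illustrate what the new filter distinguishes, here is a minimal hypothetical IR sketch (not taken from the patch or its tests; the kernels and the choice of intrinsic are illustrative only). In the first kernel the cross-block user chain is a bitcast (a look-through instruction) feeding an MFMA intrinsic that consumes packed 8-bit data natively, so coercing %v into an i32 VGPR pays off. In the second, the only cross-block users are extracts feeding scalar arithmetic that the type legalizer would split anyway, so the filter skips the value:

declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)

; Profitable: %v reaches a native packed-data consumer across the block
; boundary (bitcast is a look-through; the MFMA intrinsic is op-legal).
define amdgpu_kernel void @profitable(ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
  %v = load <8 x i8>, ptr addrspace(1) %in
  br label %bb

bb:
  %packed = bitcast <8 x i8> %v to i64
  %mai = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %packed, i64 %packed, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
  store <4 x i32> %mai, ptr addrspace(1) %out
  ret void
}

; Not profitable: the extracts feed a scalar add, which the type
; legalizer scalarizes anyway; packing %v into a VGPR would just be
; undone, so LRO now leaves it alone.
define amdgpu_kernel void @unprofitable(ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
  %v = load <8 x i8>, ptr addrspace(1) %in
  br label %bb

bb:
  %e0 = extractelement <8 x i8> %v, i64 0
  %e1 = extractelement <8 x i8> %v, i64 1
  %sum = add i8 %e0, %e1
  store i8 %sum, ptr addrspace(1) %out
  ret void
}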
1 parent 0d2722c commit edb17b2

5 files changed (+334, -215 lines)


llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 115 additions & 1 deletion

@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -75,6 +76,7 @@ class LiveRegOptimizer {
   Module &Mod;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+
   /// The scalar type to convert to
   Type *const ConvertToScalar;
   /// The set of visited Instructions
@@ -125,6 +127,116 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
+  /// Check if the intrinsic natively operates on 8-bit or 16-bit types.
+  bool isNativeIntrinsic(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool isOpLegal(Instruction *I) {
+    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID ID = Intr->getIntrinsicID();
+      if (isNativeIntrinsic(ID))
+        return true;
+    }
+    // Stores.
+    if (isa<StoreInst>(I))
+      return true;
+    return false;
+  }
+
+  bool isCoercionProfitable(Instruction *II) {
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+
+    // Check users for profitable conditions (i.e. an across-block user
+    // which can natively handle the illegal vector).
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
+    auto IsLookThru = [](Instruction *II) {
+      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
+             isa<InsertElementInst>(II) || isa<ExtractElementInst>(II) ||
+             isa<CastInst>(II);
+    };
+
+    while (!UserList.empty()) {
+      auto CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
+      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+        continue;
+
+      if (isOpLegal(CII))
+        return true;
+
+      if (IsLookThru(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+    }
+    return false;
+  }
+
   LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
       : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
@@ -259,6 +371,9 @@ bool LiveRegOptimizer::optimizeLiveType(
     if (!shouldReplace(II->getType()))
      continue;
 
+    if (!isCoercionProfitable(II))
+      continue;
+
     if (PHINode *Phi = dyn_cast<PHINode>(II)) {
       PhiNodes.insert(Phi);
       // Collect all the incoming values of problematic PHI nodes.
@@ -478,7 +593,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
   AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
 