Skip to content

Commit 05e0d64

Browse files
committed
[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases
It is known that vectors whose elements fit in i16 will be split and scalarized in SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). LRO attempts to undo the scalarization of vectors across basic block boundaries and shoehorn Values into VGPRs. LRO is beneficial for operations that natively work on illegal vector types, because it prevents flip-flopping between unpacked and packed forms. If we know that operations on a vector will be split and scalarized, then we don't want to shoehorn them back into packed VGPRs. Operations that we know to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, and phi nodes, to name a few.
1 parent 83cad68 commit 05e0d64

File tree

10 files changed

+123
-35
lines changed

10 files changed

+123
-35
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,9 @@ class TargetTransformInfo {
890890
/// Return true if this type is legal.
891891
bool isTypeLegal(Type *Ty) const;
892892

893+
/// Return true if this operation is legal.
894+
bool isOpLegal(Instruction *I) const;
895+
893896
/// Returns the estimated number of registers required to represent \p Ty.
894897
unsigned getRegUsageForType(Type *Ty) const;
895898

@@ -2037,6 +2040,7 @@ class TargetTransformInfo::Concept {
20372040
virtual bool isProfitableToHoist(Instruction *I) = 0;
20382041
virtual bool useAA() = 0;
20392042
virtual bool isTypeLegal(Type *Ty) = 0;
2043+
virtual bool isOpLegal(Instruction *I) = 0;
20402044
virtual unsigned getRegUsageForType(Type *Ty) = 0;
20412045
virtual bool shouldBuildLookupTables() = 0;
20422046
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
@@ -2621,6 +2625,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26212625
}
26222626
bool useAA() override { return Impl.useAA(); }
26232627
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
2628+
bool isOpLegal(Instruction *I) override {return Impl.isOpLegal(I); }
26242629
unsigned getRegUsageForType(Type *Ty) override {
26252630
return Impl.getRegUsageForType(Ty);
26262631
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ class TargetTransformInfoImplBase {
377377

378378
bool isTypeLegal(Type *Ty) const { return false; }
379379

380+
bool isOpLegal(Instruction *I) const { return false; }
381+
380382
unsigned getRegUsageForType(Type *Ty) const { return 1; }
381383

382384
bool shouldBuildLookupTables() const { return true; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,11 +465,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
465465

466466
bool useAA() const { return getST()->useAA(); }
467467

468-
bool isTypeLegal(Type *Ty) {
468+
bool isTypeLegal(Type *Ty) const {
469469
EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
470470
return getTLI()->isTypeLegal(VT);
471471
}
472472

473+
bool isOpLegal(Instruction *I) const {
474+
return isTypeLegal(I->getType());
475+
}
476+
473477
unsigned getRegUsageForType(Type *Ty) {
474478
EVT ETy = getTLI()->getValueType(DL, Ty);
475479
return getTLI()->getNumRegisters(Ty->getContext(), ETy);

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,10 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
584584
return TTIImpl->isTypeLegal(Ty);
585585
}
586586

587+
bool TargetTransformInfo::isOpLegal(Instruction *I) const {
588+
return TTIImpl->isOpLegal(I);
589+
}
590+
587591
unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
588592
return TTIImpl->getRegUsageForType(Ty);
589593
}

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "AMDGPU.h"
1616
#include "AMDGPUTargetMachine.h"
17+
#include "AMDGPUTargetTransformInfo.h"
1718
#include "llvm/Analysis/AssumptionCache.h"
1819
#include "llvm/Analysis/UniformityAnalysis.h"
1920
#include "llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
4546
Function &F;
4647
const DataLayout &DL;
4748
const GCNSubtarget &ST;
49+
const TargetTransformInfo &TTI;
4850

4951
AssumptionCache *const AC;
5052
UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
5355

5456
public:
5557
AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
56-
AssumptionCache *AC, UniformityInfo &UA)
57-
: F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
58+
const TargetTransformInfo &TTI, AssumptionCache *AC,
59+
UniformityInfo &UA)
60+
: F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
5861
bool run();
5962
bool visitInstruction(Instruction &) { return false; }
6063

@@ -75,6 +78,8 @@ class LiveRegOptimizer {
7578
Module &Mod;
7679
const DataLayout &DL;
7780
const GCNSubtarget &ST;
81+
const TargetTransformInfo &TTI;
82+
7883
/// The scalar type to convert to
7984
Type *const ConvertToScalar;
8085
/// The set of visited Instructions
@@ -125,8 +130,45 @@ class LiveRegOptimizer {
125130
return LK.first != TargetLoweringBase::TypeLegal;
126131
}
127132

128-
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
129-
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
133+
// Filtering based on operation or its cost.
134+
// If an operation incurs high enough cost or natively work on
135+
// vector of illegal type, ie. v2i8, then it makes sense to try
136+
// to coerce them as packed VGPR across BB.
137+
bool shouldReplaceBasedOnOp(Instruction *II) {
138+
static const int SCALARIZE_INST_COST = 2;
139+
static const int LRO_COST_THRES = 12;
140+
141+
// Ignore pseudos
142+
if (II->isDebugOrPseudoInst())
143+
return false;
144+
145+
// Instruction Cost
146+
auto Cost = TTI.getInstructionCost(
147+
II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
148+
if (const auto *Def = II->getOperand(0)) {
149+
if (const auto *DefTy = dyn_cast<FixedVectorType>(Def->getType())) {
150+
const auto *ElTy = dyn_cast<IntegerType>(DefTy->getElementType());
151+
// Assume vNi8 and vNi16 will be scalarized.
152+
if (ElTy && ElTy->getBitWidth() <= 16) {
153+
const auto ElCount = DefTy->getElementCount().getFixedValue();
154+
Cost += SCALARIZE_INST_COST * ElCount;
155+
}
156+
}
157+
}
158+
LLVM_DEBUG(dbgs() << "shouldReplaceBasedOnOp: " << *II << " Cost=" << Cost
159+
<< '\n';);
160+
if (Cost >= LRO_COST_THRES)
161+
return true;
162+
163+
if (TTI.isOpLegal(II))
164+
return true;
165+
166+
return false;
167+
}
168+
169+
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
170+
const TargetTransformInfo &TTI)
171+
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
130172
ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
131173
};
132174

@@ -140,7 +182,7 @@ bool AMDGPULateCodeGenPrepare::run() {
140182
// vectors to equivalent vectors of legal type (which are converted back
141183
// before uses in subsequent blocks), to pack the bits into fewer physical
142184
// registers (used in CopyToReg/CopyFromReg pairs).
143-
LiveRegOptimizer LRO(*F.getParent(), ST);
185+
LiveRegOptimizer LRO(*F.getParent(), ST, TTI);
144186

145187
bool Changed = false;
146188

@@ -291,6 +333,9 @@ bool LiveRegOptimizer::optimizeLiveType(
291333
}
292334

293335
Instruction *UseInst = cast<Instruction>(V);
336+
if (!shouldReplaceBasedOnOp(UseInst))
337+
break; // reject this II.
338+
294339
// Collect all uses of PHINodes and any use the crosses BB boundaries.
295340
if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
296341
Uses.insert(UseInst);
@@ -478,11 +523,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
478523
PreservedAnalyses
479524
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
480525
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
526+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
481527

482528
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
483529
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
484530

485-
bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
531+
bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
486532

487533
if (!Changed)
488534
return PreservedAnalyses::all();
@@ -518,13 +564,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
518564
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
519565
const TargetMachine &TM = TPC.getTM<TargetMachine>();
520566
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
567+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
521568

522569
AssumptionCache &AC =
523570
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
524571
UniformityInfo &UI =
525572
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
526573

527-
return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
574+
return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
528575
}
529576

530577
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,3 +1446,25 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14461446
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14471447
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14481448
}
1449+
1450+
/// Check if operation is legal.
1451+
/// TODO: If we had IR<->SDag mapping, we could use TLI->isOperationLegal
1452+
bool GCNTTIImpl::isOpLegal(Instruction *I) const {
1453+
Type *T = I->getType();
1454+
if (!isTypeLegal(T)) {
1455+
// Intrinsics - assume they natively handle illegal type
1456+
if (isa<IntrinsicInst>(I))
1457+
return true;
1458+
1459+
// Stores
1460+
if (isa<StoreInst>(I))
1461+
return true;
1462+
1463+
// Shuffles
1464+
if (isa<ShuffleVectorInst>(I))
1465+
return true;
1466+
1467+
return false;
1468+
}
1469+
return true;
1470+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
276276
void collectKernelLaunchBounds(
277277
const Function &F,
278278
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
279+
280+
/// Query if operation is legal
281+
bool isOpLegal(Instruction *I) const;
279282
};
280283

281284
} // end namespace llvm

llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,24 @@ define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace
1313
; CHECK-NEXT: s_clause 0x1
1414
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1515
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
16-
; CHECK-NEXT: v_mov_b32_e32 v2, 8
1716
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
18-
; CHECK-NEXT: s_clause 0x1
19-
; CHECK-NEXT: global_load_ushort v1, v0, s[0:1]
20-
; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2
17+
; CHECK-NEXT: global_load_sbyte v0, v0, s[0:1] offset:2
2118
; CHECK-NEXT: s_bitcmp1_b32 s2, 0
2219
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
2320
; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
24-
; CHECK-NEXT: s_waitcnt vmcnt(1)
25-
; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
26-
; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
27-
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
28-
; CHECK-NEXT: s_waitcnt vmcnt(0)
29-
; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1
3021
; CHECK-NEXT: s_cbranch_vccz .LBB0_2
3122
; CHECK-NEXT: ; %bb.1: ; %bb19
3223
; CHECK-NEXT: v_mov_b32_e32 v1, 0
3324
; CHECK-NEXT: ds_write_b32 v1, v1
3425
; CHECK-NEXT: .LBB0_2: ; %bb20
35-
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3626
; CHECK-NEXT: s_mov_b32 s0, exec_lo
27+
; CHECK-NEXT: s_waitcnt vmcnt(0)
3728
; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0
3829
; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
3930
; CHECK-NEXT: s_cbranch_execz .LBB0_4
4031
; CHECK-NEXT: ; %bb.3: ; %bb11
4132
; CHECK-NEXT: v_mov_b32_e32 v1, 2
42-
; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
33+
; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
4334
; CHECK-NEXT: v_mov_b32_e32 v1, 0
4435
; CHECK-NEXT: ds_write_b32 v0, v1 offset:84
4536
; CHECK-NEXT: .LBB0_4: ; %bb14

llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2102,15 +2102,18 @@ define void @crash_lshlrevb16_not_reg_op() {
21022102
; NOSDWA: ; %bb.0: ; %bb0
21032103
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21042104
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
2105+
; NOSDWA-NEXT: s_and_b32 s6, s4, 0xff
2106+
; NOSDWA-NEXT: s_bitset1_b32 s6, 8
2107+
; NOSDWA-NEXT: s_and_b32 s6, s6, 0x1ff
21052108
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
21062109
; NOSDWA-NEXT: .LBB22_1: ; %bb1
21072110
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
2108-
; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
2111+
; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3
21092112
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
2110-
; NOSDWA-NEXT: s_lshr_b32 s6, 0x100, s6
2113+
; NOSDWA-NEXT: s_lshr_b32 s7, s6, s7
21112114
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
21122115
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
2113-
; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
2116+
; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
21142117
; NOSDWA-NEXT: flat_store_byte v[0:1], v2
21152118
; NOSDWA-NEXT: s_mov_b64 vcc, vcc
21162119
; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2122,15 +2125,18 @@ define void @crash_lshlrevb16_not_reg_op() {
21222125
; GFX89: ; %bb.0: ; %bb0
21232126
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21242127
; GFX89-NEXT: s_mov_b64 s[4:5], 0
2128+
; GFX89-NEXT: s_and_b32 s6, s4, 0xff
2129+
; GFX89-NEXT: s_bitset1_b32 s6, 8
2130+
; GFX89-NEXT: s_and_b32 s6, s6, 0x1ff
21252131
; GFX89-NEXT: s_and_b64 vcc, exec, -1
21262132
; GFX89-NEXT: .LBB22_1: ; %bb1
21272133
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
2128-
; GFX89-NEXT: s_lshl_b32 s6, s4, 3
2134+
; GFX89-NEXT: s_lshl_b32 s7, s4, 3
21292135
; GFX89-NEXT: v_mov_b32_e32 v0, s4
2130-
; GFX89-NEXT: s_lshr_b32 s6, 0x100, s6
2136+
; GFX89-NEXT: s_lshr_b32 s7, s6, s7
21312137
; GFX89-NEXT: v_mov_b32_e32 v1, s5
21322138
; GFX89-NEXT: s_mov_b64 s[4:5], 1
2133-
; GFX89-NEXT: v_mov_b32_e32 v2, s6
2139+
; GFX89-NEXT: v_mov_b32_e32 v2, s7
21342140
; GFX89-NEXT: flat_store_byte v[0:1], v2
21352141
; GFX89-NEXT: s_mov_b64 vcc, vcc
21362142
; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2142,15 +2148,18 @@ define void @crash_lshlrevb16_not_reg_op() {
21422148
; GFX9: ; %bb.0: ; %bb0
21432149
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21442150
; GFX9-NEXT: s_mov_b64 s[4:5], 0
2151+
; GFX9-NEXT: s_and_b32 s6, s4, 0xff
2152+
; GFX9-NEXT: s_bitset1_b32 s6, 8
2153+
; GFX9-NEXT: s_and_b32 s6, s6, 0x1ff
21452154
; GFX9-NEXT: s_and_b64 vcc, exec, -1
21462155
; GFX9-NEXT: .LBB22_1: ; %bb1
21472156
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2148-
; GFX9-NEXT: s_lshl_b32 s6, s4, 3
2157+
; GFX9-NEXT: s_lshl_b32 s7, s4, 3
21492158
; GFX9-NEXT: v_mov_b32_e32 v0, s4
2150-
; GFX9-NEXT: s_lshr_b32 s6, 0x100, s6
2159+
; GFX9-NEXT: s_lshr_b32 s7, s6, s7
21512160
; GFX9-NEXT: v_mov_b32_e32 v1, s5
21522161
; GFX9-NEXT: s_mov_b64 s[4:5], 1
2153-
; GFX9-NEXT: v_mov_b32_e32 v2, s6
2162+
; GFX9-NEXT: v_mov_b32_e32 v2, s7
21542163
; GFX9-NEXT: flat_store_byte v[0:1], v2
21552164
; GFX9-NEXT: s_mov_b64 vcc, vcc
21562165
; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2161,14 +2170,17 @@ define void @crash_lshlrevb16_not_reg_op() {
21612170
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
21622171
; GFX10: ; %bb.0: ; %bb0
21632172
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2164-
; GFX10-NEXT: s_mov_b64 s[4:5], 0
2173+
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
21652174
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
2175+
; GFX10-NEXT: s_or_b32 s6, s4, 0x100
2176+
; GFX10-NEXT: s_mov_b64 s[4:5], 0
2177+
; GFX10-NEXT: s_and_b32 s6, s6, 0x1ff
21662178
; GFX10-NEXT: .LBB22_1: ; %bb1
21672179
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2168-
; GFX10-NEXT: s_lshl_b32 s6, s4, 3
2180+
; GFX10-NEXT: s_lshl_b32 s7, s4, 3
21692181
; GFX10-NEXT: v_mov_b32_e32 v0, s4
21702182
; GFX10-NEXT: v_mov_b32_e32 v1, s5
2171-
; GFX10-NEXT: s_lshr_b32 s4, 0x100, s6
2183+
; GFX10-NEXT: s_lshr_b32 s4, s6, s7
21722184
; GFX10-NEXT: v_mov_b32_e32 v2, s4
21732185
; GFX10-NEXT: s_mov_b64 s[4:5], 1
21742186
; GFX10-NEXT: flat_store_byte v[0:1], v2

llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -397,13 +397,11 @@ define amdgpu_kernel void @reuseOp() {
397397
; GFX906-SAME: ) #[[ATTR0]] {
398398
; GFX906-NEXT: entry:
399399
; GFX906-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
400-
; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
401400
; GFX906-NEXT: br label [[BB_1:%.*]]
402401
; GFX906: bb.1:
403-
; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
404402
; GFX906-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
405403
; GFX906-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
406-
; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
404+
; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1]], <16 x i8> [[SEL0]]
407405
; GFX906-NEXT: br label [[BB_2:%.*]]
408406
; GFX906: bb.2:
409407
; GFX906-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>

0 commit comments

Comments
 (0)