Skip to content

Commit 8fb6ceb

Browse files
alexey-bataev authored and justinfargnoli committed
[SLP][TTI]Improve detection of the insert-subvector pattern for SLP. (llvm#74749)
The SLP vectorizer passes the type of the subvector and the mask, whose size determines the size of the resulting vector. TTI should support this pattern to improve cost estimation of the insert_subvector shuffle pattern.
1 parent fcb2a91 commit 8fb6ceb

File tree

2 files changed

+42
-21
lines changed

2 files changed

+42
-21
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6894,6 +6894,31 @@ class BaseShuffleAnalysis {
68946894
};
68956895
} // namespace
68966896

6897+
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6898+
/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
6899+
/// subvector pattern.
6900+
static InstructionCost
6901+
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6902+
VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
6903+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6904+
int Index = 0, VectorType *SubTp = nullptr,
6905+
ArrayRef<const Value *> Args = std::nullopt) {
6906+
if (Kind != TTI::SK_PermuteTwoSrc)
6907+
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
6908+
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6909+
int NumSubElts;
6910+
if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
6911+
Mask, NumSrcElts, NumSubElts, Index)) {
6912+
if (Index + NumSubElts > NumSrcElts &&
6913+
Index + NumSrcElts <= static_cast<int>(Mask.size()))
6914+
return TTI.getShuffleCost(
6915+
TTI::SK_InsertSubvector,
6916+
FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt,
6917+
TTI::TCK_RecipThroughput, Index, Tp);
6918+
}
6919+
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
6920+
}
6921+
68976922
/// Merges shuffle masks and emits final shuffle instruction, if required. It
68986923
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
68996924
/// when the actual shuffle instruction is generated only if this is actually
@@ -7141,15 +7166,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
71417166
std::optional<TTI::ShuffleKind> RegShuffleKind =
71427167
CheckPerRegistersShuffle(SubMask);
71437168
if (!RegShuffleKind) {
7144-
Cost += TTI.getShuffleCost(
7145-
*ShuffleKinds[Part],
7169+
Cost += ::getShuffleCost(
7170+
TTI, *ShuffleKinds[Part],
71467171
FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
71477172
continue;
71487173
}
71497174
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
71507175
!ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
7151-
Cost += TTI.getShuffleCost(
7152-
*RegShuffleKind,
7176+
Cost += ::getShuffleCost(
7177+
TTI, *RegShuffleKind,
71537178
FixedVectorType::get(VL.front()->getType(), EltsPerVector),
71547179
SubMask);
71557180
}
@@ -7222,8 +7247,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
72227247
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
72237248
if (isEmptyOrIdentity(Mask, VF))
72247249
return TTI::TCC_Free;
7225-
return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
7226-
cast<VectorType>(V1->getType()), Mask);
7250+
return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
7251+
cast<VectorType>(V1->getType()), Mask);
72277252
}
72287253
InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
72297254
// Empty mask or identity mask are free.
@@ -8101,7 +8126,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
81018126
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
81028127
Mask[I] =
81038128
((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
8104-
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
8129+
Cost +=
8130+
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
81058131
}
81068132
}
81078133
return Cost;
@@ -8428,8 +8454,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
84288454
return I->getOpcode() == E->getAltOpcode();
84298455
},
84308456
Mask);
8431-
VecCost += TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
8432-
FinalVecTy, Mask);
8457+
VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
8458+
FinalVecTy, Mask);
84338459
// Patterns like [fadd,fsub] can be combined into a single instruction
84348460
// in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
84358461
// need to take into account their order when looking for the most used
@@ -9133,7 +9159,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
91339159
auto *FTy =
91349160
FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
91359161
InstructionCost C =
9136-
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
9162+
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
91379163
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
91389164
<< " for final shuffle of vector node and external "
91399165
"insertelement users.\n";

llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
; YAML-NEXT: Function: test
99
; YAML-NEXT: Args:
1010
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
11-
; YAML-NEXT: - Cost: '9'
11+
; YAML-NEXT: - Cost: '3'
1212
; YAML-NEXT: - String: ' and with tree size '
1313
; YAML-NEXT: - TreeSize: '7'
1414

@@ -19,20 +19,15 @@ define void @test() {
1919
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
2020
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4
2121
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4
22-
; CHECK-NEXT: [[V9IDX:%.*]] = getelementptr i8, ptr null, i32 4
23-
; CHECK-NEXT: [[V14IDX:%.*]] = getelementptr i8, ptr null, i32 8
2422
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
2523
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
2624
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
2725
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
28-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
29-
; CHECK-NEXT: [[V0_0:%.*]] = select i1 [[TMP7]], float [[TMP0]], float 0.000000e+00
30-
; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP3]], <2 x float> zeroinitializer
31-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
32-
; CHECK-NEXT: [[V9_0:%.*]] = select i1 [[TMP9]], float [[TMP2]], float 0.000000e+00
33-
; CHECK-NEXT: store float [[V0_0]], ptr null, align 4
34-
; CHECK-NEXT: store float [[V9_0]], ptr [[V9IDX]], align 4
35-
; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[V14IDX]], align 4
26+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
27+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
28+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
29+
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> zeroinitializer
30+
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr null, align 4
3631
; CHECK-NEXT: ret void
3732
;
3833
entry:

0 commit comments

Comments
 (0)