diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8e22b54f002d1..0ce5d619d9b14 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6894,6 +6894,31 @@ class BaseShuffleAnalysis {
 };
 } // namespace
 
+/// Returns the cost of a shuffle of \p Kind on vector type \p Tp with optional
+/// \p Mask. Two-source permutes whose mask inserts a subvector past the end of
+/// \p Tp are costed as SK_InsertSubvector into a Mask.size()-wide vector.
+static InstructionCost
+getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
+               VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
+               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+               int Index = 0, VectorType *SubTp = nullptr,
+               ArrayRef<const Value *> Args = std::nullopt) {
+  if (Kind != TTI::SK_PermuteTwoSrc)
+    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
+  int NumSubElts; // Filled in (with Index) by isInsertSubvectorMask on a match.
+  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
+                             Mask, NumSrcElts, NumSubElts, Index)) {
+    if (Index + NumSubElts > NumSrcElts &&
+        Index + NumSrcElts <= static_cast<int>(Mask.size()))
+      return TTI.getShuffleCost(
+          TTI::SK_InsertSubvector,
+          FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt,
+          CostKind, Index, Tp); // Same cost kind as the generic paths.
+  }
+  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors.
It implements lazy shuffles emission, /// when the actual shuffle instruction is generated only if this is actually @@ -7141,15 +7166,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { std::optional RegShuffleKind = CheckPerRegistersShuffle(SubMask); if (!RegShuffleKind) { - Cost += TTI.getShuffleCost( - *ShuffleKinds[Part], + Cost += ::getShuffleCost( + TTI, *ShuffleKinds[Part], FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += TTI.getShuffleCost( - *RegShuffleKind, + Cost += ::getShuffleCost( + TTI, *RegShuffleKind, FixedVectorType::get(VL.front()->getType(), EltsPerVector), SubMask); } @@ -7222,8 +7247,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { cast(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, - cast(V1->getType()), Mask); + return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, + cast(V1->getType()), Mask); } InstructionCost createShuffleVector(Value *V1, ArrayRef Mask) const { // Empty mask or identity mask are free. @@ -8101,7 +8126,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) Mask[I] = ((I >= InMask.size()) || InMask.test(I)) ? 
PoisonMaskElem : I; - Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); + Cost += + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } return Cost; @@ -8428,8 +8454,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return I->getOpcode() == E->getAltOpcode(); }, Mask); - VecCost += TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); // Patterns like [fadd,fsub] can be combined into a single instruction // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we // need to take into account their order when looking for the most used @@ -9133,7 +9159,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { auto *FTy = FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of vector node and external " "insertelement users.\n"; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll index 8e0f382222241..de1eecd98eeb3 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll @@ -8,7 +8,7 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '9' +; YAML-NEXT: - Cost: '3' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '7' @@ -19,20 +19,15 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, 
align 4 -; CHECK-NEXT: [[V9IDX:%.*]] = getelementptr i8, ptr null, i32 4 -; CHECK-NEXT: [[V14IDX:%.*]] = getelementptr i8, ptr null, i32 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: [[V0_0:%.*]] = select i1 [[TMP7]], float [[TMP0]], float 0.000000e+00 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP3]], <2 x float> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; CHECK-NEXT: [[V9_0:%.*]] = select i1 [[TMP9]], float [[TMP2]], float 0.000000e+00 -; CHECK-NEXT: store float [[V0_0]], ptr null, align 4 -; CHECK-NEXT: store float [[V9_0]], ptr [[V9IDX]], align 4 -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[V14IDX]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; entry: