@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
260
260
VF * getNumElements(ScalarTy));
261
261
}
262
262
263
+ /// Returns the number of elements of the given type \p Ty, not less than \p Sz,
264
+ /// which forms type, which splits by \p TTI into whole vector types during
265
+ /// legalization.
266
+ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
267
+ Type *Ty, unsigned Sz) {
268
+ if (!isValidElementType(Ty))
269
+ return PowerOf2Ceil(Sz);
270
+ // Find the number of elements, which forms full vectors.
271
+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
272
+ if (NumParts == 0 || NumParts == Sz)
273
+ return PowerOf2Ceil(Sz);
274
+ return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
275
+ }
276
+
263
277
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
264
278
SmallVectorImpl<int> &Mask) {
265
279
// The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1224
1238
(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1225
1239
}
1226
1240
1241
+ /// Returns true if widened type of \p Ty elements with size \p Sz represents
1242
+ /// full vector type, i.e. adding extra element results in extra parts upon type
1243
+ /// legalization.
1244
+ static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
1245
+ unsigned Sz) {
1246
+ if (Sz <= 1)
1247
+ return false;
1248
+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1249
+ return false;
1250
+ if (has_single_bit(Sz))
1251
+ return true;
1252
+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
1253
+ return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) &&
1254
+ Sz % NumParts == 0;
1255
+ }
1256
+
1227
1257
namespace slpvectorizer {
1228
1258
1229
1259
/// Bottom Up SLP Vectorizer.
@@ -2467,7 +2497,9 @@ class BoUpSLP {
2467
2497
}
2468
2498
// TODO: Check if we can remove a check for non-power-2 number of
2469
2499
// scalars after full support of non-power-2 vectorization.
2470
- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2500
+ return UniqueValues.size() != 2 &&
2501
+ hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
2502
+ UniqueValues.size());
2471
2503
};
2472
2504
2473
2505
// If the initial strategy fails for any of the operand indexes, then we
@@ -3276,8 +3308,9 @@ class BoUpSLP {
3276
3308
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3277
3309
3278
3310
/// Return true if this is a non-power-of-2 node.
3279
- bool isNonPowOf2Vec() const {
3280
- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3311
+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
3312
+ bool IsNonPowerOf2 = !hasFullVectorsOnly(
3313
+ TTI, getValueType(Scalars.front()), Scalars.size());
3281
3314
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3282
3315
"Reshuffling not supported with non-power-of-2 vectors yet.");
3283
3316
return IsNonPowerOf2;
@@ -3455,7 +3488,7 @@ class BoUpSLP {
3455
3488
3456
3489
if (UserTreeIdx.UserTE) {
3457
3490
Last->UserTreeIndices.push_back(UserTreeIdx);
3458
- assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3491
+ assert((!Last->isNonPowOf2Vec(*TTI ) || Last->ReorderIndices.empty()) &&
3459
3492
"Reordering isn't implemented for non-power-of-2 nodes yet");
3460
3493
}
3461
3494
return Last;
@@ -4361,7 +4394,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4361
4394
if (!isValidElementType(ScalarTy))
4362
4395
return std::nullopt;
4363
4396
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4364
- int NumParts = TTI->getNumberOfParts (VecTy);
4397
+ int NumParts = TTI->getRegUsageForType (VecTy);
4365
4398
if (NumParts == 0 || NumParts >= NumScalars)
4366
4399
NumParts = 1;
4367
4400
SmallVector<int> ExtractMask;
@@ -4733,7 +4766,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4733
4766
// Check the order of pointer operands or that all pointers are the same.
4734
4767
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4735
4768
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4736
- if (!Order.empty() && !has_single_bit(VL.size() )) {
4769
+ if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz )) {
4737
4770
assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4738
4771
"supported with VectorizeNonPowerOf2");
4739
4772
return LoadsState::Gather;
@@ -4787,12 +4820,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4787
4820
});
4788
4821
});
4789
4822
const unsigned AbsoluteDiff = std::abs(*Diff);
4790
- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4791
- ((Sz > MinProfitableStridedLoads ||
4792
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
4793
- has_single_bit(AbsoluteDiff))) &&
4794
- AbsoluteDiff > Sz) ||
4795
- *Diff == -(static_cast<int>(Sz) - 1))) {
4823
+ if (IsPossibleStrided &&
4824
+ (IsAnyPointerUsedOutGraph ||
4825
+ ((Sz > MinProfitableStridedLoads ||
4826
+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
4827
+ hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
4828
+ AbsoluteDiff > Sz) ||
4829
+ *Diff == -(static_cast<int>(Sz) - 1))) {
4796
4830
int Stride = *Diff / static_cast<int>(Sz - 1);
4797
4831
if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4798
4832
Align Alignment =
@@ -5197,7 +5231,7 @@ static bool areTwoInsertFromSameBuildVector(
5197
5231
std::optional<BoUpSLP::OrdersType>
5198
5232
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5199
5233
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
5200
- if (TE.isNonPowOf2Vec())
5234
+ if (TE.isNonPowOf2Vec(*TTI ))
5201
5235
return std::nullopt;
5202
5236
5203
5237
// No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5231,8 +5265,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5231
5265
}
5232
5266
}
5233
5267
if (Sz == 2 && TE.getVectorFactor() == 4 &&
5234
- TTI->getNumberOfParts (getWidenedType(TE.Scalars.front()->getType(),
5235
- 2 * TE.getVectorFactor())) == 1)
5268
+ TTI->getRegUsageForType (getWidenedType(TE.Scalars.front()->getType(),
5269
+ 2 * TE.getVectorFactor())) == 1)
5236
5270
return std::nullopt;
5237
5271
if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5238
5272
Sz)) {
@@ -5581,7 +5615,7 @@ void BoUpSLP::reorderTopToBottom() {
5581
5615
5582
5616
// Reorder the graph nodes according to their vectorization factor.
5583
5617
for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5584
- VF / = 2) {
5618
+ VF - = 2) {
5585
5619
auto It = VFToOrderedEntries.find(VF);
5586
5620
if (It == VFToOrderedEntries.end())
5587
5621
continue;
@@ -5754,7 +5788,7 @@ bool BoUpSLP::canReorderOperands(
5754
5788
ArrayRef<TreeEntry *> ReorderableGathers,
5755
5789
SmallVectorImpl<TreeEntry *> &GatherOps) {
5756
5790
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5757
- if (UserTE->isNonPowOf2Vec())
5791
+ if (UserTE->isNonPowOf2Vec(*TTI ))
5758
5792
return false;
5759
5793
5760
5794
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5929,7 +5963,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5929
5963
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5930
5964
const auto AllowsReordering = [&](const TreeEntry *TE) {
5931
5965
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5932
- if (TE->isNonPowOf2Vec())
5966
+ if (TE->isNonPowOf2Vec(*TTI ))
5933
5967
return false;
5934
5968
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5935
5969
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6575,7 +6609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6575
6609
case Instruction::ExtractElement: {
6576
6610
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6577
6611
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6578
- if (!has_single_bit( VL.size()))
6612
+ if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
6579
6613
return TreeEntry::NeedToGather;
6580
6614
if (Reuse || !CurrentOrder.empty())
6581
6615
return TreeEntry::Vectorize;
@@ -6985,7 +7019,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6985
7019
ReuseShuffleIndices.clear();
6986
7020
} else {
6987
7021
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6988
- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
7022
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI )) {
6989
7023
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6990
7024
"for nodes with padding.\n");
6991
7025
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -6998,15 +7032,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6998
7032
return isa<UndefValue>(V) ||
6999
7033
!isConstant(V);
7000
7034
})) ||
7001
- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
7035
+ !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
7036
+ NumUniqueScalarValues)) {
7002
7037
if (DoNotFail && UniquePositions.size() > 1 &&
7003
7038
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
7004
7039
all_of(UniqueValues, [=](Value *V) {
7005
7040
return isa<ExtractElementInst>(V) ||
7006
7041
areAllUsersVectorized(cast<Instruction>(V),
7007
7042
UserIgnoreList);
7008
7043
})) {
7009
- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
7044
+ // Find the number of elements, which forms full vectors.
7045
+ unsigned PWSz = getFullVectorNumberOfElements(
7046
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
7010
7047
if (PWSz == VL.size()) {
7011
7048
ReuseShuffleIndices.clear();
7012
7049
} else {
@@ -9217,7 +9254,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9217
9254
}
9218
9255
assert(!CommonMask.empty() && "Expected non-empty common mask.");
9219
9256
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9220
- unsigned NumParts = TTI.getNumberOfParts (MaskVecTy);
9257
+ unsigned NumParts = TTI.getRegUsageForType (MaskVecTy);
9221
9258
if (NumParts == 0 || NumParts >= Mask.size())
9222
9259
NumParts = 1;
9223
9260
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9234,7 +9271,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9234
9271
}
9235
9272
assert(!CommonMask.empty() && "Expected non-empty common mask.");
9236
9273
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9237
- unsigned NumParts = TTI.getNumberOfParts (MaskVecTy);
9274
+ unsigned NumParts = TTI.getRegUsageForType (MaskVecTy);
9238
9275
if (NumParts == 0 || NumParts >= Mask.size())
9239
9276
NumParts = 1;
9240
9277
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9740,7 +9777,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9740
9777
unsigned const NumElts = SrcVecTy->getNumElements();
9741
9778
unsigned const NumScalars = VL.size();
9742
9779
9743
- unsigned NumOfParts = TTI->getNumberOfParts (SrcVecTy);
9780
+ unsigned NumOfParts = TTI->getRegUsageForType (SrcVecTy);
9744
9781
9745
9782
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9746
9783
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10956,7 +10993,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10956
10993
// Keep original scalar if number of externally used instructions in
10957
10994
// the same entry is not power of 2. It may help to do some extra
10958
10995
// vectorization for now.
10959
- KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
10996
+ KeepScalar =
10997
+ ScalarUsesCount <= 1 ||
10998
+ !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
10960
10999
}
10961
11000
if (KeepScalar) {
10962
11001
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11649,13 +11688,14 @@ BoUpSLP::isGatherShuffledEntry(
11649
11688
if (TE == VectorizableTree.front().get())
11650
11689
return {};
11651
11690
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11652
- if (TE->isNonPowOf2Vec())
11691
+ if (TE->isNonPowOf2Vec(*TTI ))
11653
11692
return {};
11654
11693
Mask.assign(VL.size(), PoisonMaskElem);
11655
11694
assert(TE->UserTreeIndices.size() == 1 &&
11656
11695
"Expected only single user of the gather node.");
11657
- assert(VL.size() % NumParts == 0 &&
11658
- "Number of scalars must be divisible by NumParts.");
11696
+ // Number of scalars must be divisible by NumParts.
11697
+ if (VL.size() % NumParts != 0)
11698
+ return {};
11659
11699
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11660
11700
SmallVector<std::optional<TTI::ShuffleKind>> Res;
11661
11701
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -12794,7 +12834,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12794
12834
SmallVector<SmallVector<const TreeEntry *>> Entries;
12795
12835
Type *OrigScalarTy = GatheredScalars.front()->getType();
12796
12836
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12797
- unsigned NumParts = TTI->getNumberOfParts (VecTy);
12837
+ unsigned NumParts = TTI->getRegUsageForType (VecTy);
12798
12838
if (NumParts == 0 || NumParts >= GatheredScalars.size())
12799
12839
NumParts = 1;
12800
12840
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16040,7 +16080,7 @@ void BoUpSLP::computeMinimumValueSizes() {
16040
16080
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
16041
16081
return 0u;
16042
16082
16043
- unsigned NumParts = TTI->getNumberOfParts (
16083
+ unsigned NumParts = TTI->getRegUsageForType (
16044
16084
getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
16045
16085
16046
16086
// The maximum bit width required to represent all the values that can be
@@ -16097,7 +16137,7 @@ void BoUpSLP::computeMinimumValueSizes() {
16097
16137
// use - ignore it.
16098
16138
if (NumParts > 1 &&
16099
16139
NumParts ==
16100
- TTI->getNumberOfParts (getWidenedType(
16140
+ TTI->getRegUsageForType (getWidenedType(
16101
16141
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
16102
16142
return 0u;
16103
16143
@@ -16958,7 +16998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16958
16998
for (unsigned I = NextInst; I < MaxInst; ++I) {
16959
16999
unsigned ActualVF = std::min(MaxInst - I, VF);
16960
17000
16961
- if (!has_single_bit( ActualVF))
17001
+ if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
16962
17002
continue;
16963
17003
16964
17004
if (MaxVFOnly && ActualVF < MaxVF)
0 commit comments