Commit 6ab07d7

[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands.
Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.

Reviewers: RKSimon
Reviewed By: RKSimon

Pull Request: #106449
1 parent 688a274 commit 6ab07d7

2 files changed: +78 -42 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+73 -33)
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                               VF * getNumElements(ScalarTy));
 }
 
+/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
+/// which forms type, which splits by \p TTI into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+                                              Type *Ty, unsigned Sz) {
+  if (!isValidElementType(Ty))
+    return PowerOf2Ceil(Sz);
+  // Find the number of elements, which forms full vectors.
+  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+  if (NumParts == 0 || NumParts == Sz)
+    return PowerOf2Ceil(Sz);
+  return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
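The rounding rule above is easiest to see with concrete numbers. Below is a standalone sketch of the same arithmetic, not LLVM code: the register counts are hypothetical, and std::bit_ceil plus an inline divideCeil stand in for LLVM's PowerOf2Ceil and divideCeil.

#include <bit>
#include <cassert>

// Model of getFullVectorNumberOfElements: given Sz elements that the target
// reports as occupying NumParts registers, return the smallest element count
// >= Sz that still splits into whole registers.
unsigned fullVectorNumberOfElements(unsigned Sz, unsigned NumParts) {
  if (NumParts == 0 || NumParts == Sz)
    return std::bit_ceil(Sz); // degenerate cases: old power-of-2 behavior
  unsigned PerPart = (Sz + NumParts - 1) / NumParts; // divideCeil(Sz, NumParts)
  return std::bit_ceil(PerPart) * NumParts;
}

int main() {
  assert(fullVectorNumberOfElements(6, 3) == 6); // e.g. 2 x i64 per 128-bit reg
  assert(fullVectorNumberOfElements(5, 3) == 6); // pad 5 up to 6, not up to 8
  assert(fullVectorNumberOfElements(6, 6) == 8); // NumParts == Sz: power-of-2
}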
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
+/// Returns true if widened type of \p Ty elements with size \p Sz represents
+/// full vector type, i.e. adding extra element results in extra parts upon type
+/// legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+                               unsigned Sz) {
+  if (Sz <= 1)
+    return false;
+  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+    return false;
+  if (has_single_bit(Sz))
+    return true;
+  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+  return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) &&
+         Sz % NumParts == 0;
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
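hasFullVectorsOnly is the companion predicate: a size qualifies if it is a power of 2, or if it divides exactly into a whole number of registers with a power-of-2 element count per register. A minimal sketch of the non-power-of-2 path, again with register counts chosen purely for illustration:

#include <bit>
#include <cassert>

// Non-power-of-2 path of hasFullVectorsOnly: Sz elements in NumParts
// registers count as "full vectors" only when the split is exact and each
// register holds a power-of-2 number of elements.
bool hasFullVectorsOnlySketch(unsigned Sz, unsigned NumParts) {
  if (Sz <= 1)
    return false;
  if (std::has_single_bit(Sz))
    return true;
  return NumParts > 0 && NumParts != Sz && Sz % NumParts == 0 &&
         std::has_single_bit(Sz / NumParts);
}

int main() {
  assert(hasFullVectorsOnlySketch(6, 3));   // 3 regs x 2 elts: whole registers
  assert(!hasFullVectorsOnlySketch(6, 4));  // 6 does not split evenly into 4 regs
  assert(!hasFullVectorsOnlySketch(12, 2)); // 6 elts per reg is not a power of 2
}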
@@ -2467,7 +2497,9 @@ class BoUpSLP {
       }
       // TODO: Check if we can remove a check for non-power-2 number of
       // scalars after full support of non-power-2 vectorization.
-      return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+      return UniqueValues.size() != 2 &&
+             hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+                                UniqueValues.size());
     };
 
     // If the initial strategy fails for any of the operand indexes, then we
@@ -3276,8 +3308,9 @@ class BoUpSLP {
                 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {
-      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+      bool IsNonPowerOf2 = !hasFullVectorsOnly(
+          TTI, getValueType(Scalars.front()), Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3455,7 +3488,7 @@ class BoUpSLP {
 
     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;
@@ -4361,7 +4394,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (!isValidElementType(ScalarTy))
     return std::nullopt;
   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
-  int NumParts = TTI->getNumberOfParts(VecTy);
+  int NumParts = TTI->getRegUsageForType(VecTy);
   if (NumParts == 0 || NumParts >= NumScalars)
     NumParts = 1;
   SmallVector<int> ExtractMask;
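The recurring getNumberOfParts -> getRegUsageForType swap, first visible here, is what lets the helpers above see a type like v6i64 as whole registers: getNumberOfParts counts pieces after type legalization, which rounds vectors up toward power-of-2 shapes, while getRegUsageForType reports how many registers the type actually occupies. Both are existing TargetTransformInfo queries; the fragment below is a hedged illustration only, valid inside a pass that already has a TTI reference and an LLVMContext Ctx, and the concrete result is an assumption about a target with 128-bit vector registers.

// Hypothetical query for v6i64 (values assumed, not measured):
auto *V6I64 = FixedVectorType::get(Type::getInt64Ty(Ctx), 6);
unsigned Parts = TTI.getNumberOfParts(V6I64);  // legalized part count
unsigned Regs = TTI.getRegUsageForType(V6I64); // register usage, e.g. 3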
@@ -4733,7 +4766,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !has_single_bit(VL.size())) {
+  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;
@@ -4787,12 +4820,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
                  });
           });
       const unsigned AbsoluteDiff = std::abs(*Diff);
-      if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
-                                ((Sz > MinProfitableStridedLoads ||
-                                  (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                                   has_single_bit(AbsoluteDiff))) &&
-                                 AbsoluteDiff > Sz) ||
-                                *Diff == -(static_cast<int>(Sz) - 1))) {
+      if (IsPossibleStrided &&
+          (IsAnyPointerUsedOutGraph ||
+           ((Sz > MinProfitableStridedLoads ||
+             (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+              hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
+            AbsoluteDiff > Sz) ||
+           *Diff == -(static_cast<int>(Sz) - 1))) {
         int Stride = *Diff / static_cast<int>(Sz - 1);
         if (*Diff == Stride * static_cast<int>(Sz - 1)) {
           Align Alignment =
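A worked example of the heuristic change: with Sz = 4 loads whose first and last pointers sit 12 elements apart, the old has_single_bit(12) test rejected the strided-load candidate outright, while the new check accepts it whenever 12 elements of ScalarTy fill whole registers, yielding a candidate stride of 12 / (4 - 1) = 4 that then passes the *Diff == Stride * (Sz - 1) verification. The numbers are illustrative; MinProfitableStridedLoads and MaxProfitableLoadStride keep their existing defaults.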
@@ -5197,7 +5231,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec())
+  if (TE.isNonPowOf2Vec(*TTI))
     return std::nullopt;
 
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5231,8 +5265,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       }
     }
     if (Sz == 2 && TE.getVectorFactor() == 4 &&
-        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
-                                             2 * TE.getVectorFactor())) == 1)
+        TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(),
+                                               2 * TE.getVectorFactor())) == 1)
       return std::nullopt;
     if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                      Sz)) {
@@ -5581,7 +5615,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF /= 2) {
+       VF -= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -5754,7 +5788,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec())
+  if (UserTE->isNonPowOf2Vec(*TTI))
     return false;
 
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5929,7 +5963,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto AllowsReordering = [&](const TreeEntry *TE) {
         // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-        if (TE->isNonPowOf2Vec())
+        if (TE->isNonPowOf2Vec(*TTI))
           return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6575,7 +6609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -6985,7 +7019,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       ReuseShuffleIndices.clear();
     } else {
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -6998,15 +7032,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             return isa<UndefValue>(V) ||
                                    !isConstant(V);
                           })) ||
-          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+          !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+                              NumUniqueScalarValues)) {
         if (DoNotFail && UniquePositions.size() > 1 &&
             NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
             all_of(UniqueValues, [=](Value *V) {
               return isa<ExtractElementInst>(V) ||
                      areAllUsersVectorized(cast<Instruction>(V),
                                            UserIgnoreList);
             })) {
-          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          // Find the number of elements, which forms full vectors.
+          unsigned PWSz = getFullVectorNumberOfElements(
+              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
           if (PWSz == VL.size()) {
             ReuseShuffleIndices.clear();
           } else {
@@ -9217,7 +9254,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9234,7 +9271,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9740,7 +9777,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       unsigned const NumElts = SrcVecTy->getNumElements();
       unsigned const NumScalars = VL.size();
 
-      unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+      unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy);
 
       SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
       unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10956,7 +10993,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+          KeepScalar =
+              ScalarUsesCount <= 1 ||
+              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11649,13 +11688,14 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  if (TE->isNonPowOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  // Number of scalars must be divisible by NumParts.
+  if (VL.size() % NumParts != 0)
+    return {};
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
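Note that the divisibility assert becomes a graceful bailout here: with non-power-of-2 nodes in the tree, VL.size() % NumParts can now legitimately be non-zero, so the function returns an empty result (falling back to gathering) instead of tripping an assert in a now-reachable configuration.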
@@ -12794,7 +12834,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   SmallVector<SmallVector<const TreeEntry *>> Entries;
   Type *OrigScalarTy = GatheredScalars.front()->getType();
   auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
-  unsigned NumParts = TTI->getNumberOfParts(VecTy);
+  unsigned NumParts = TTI->getRegUsageForType(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
     NumParts = 1;
   if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16040,7 +16080,7 @@ void BoUpSLP::computeMinimumValueSizes() {
             [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
       return 0u;
 
-    unsigned NumParts = TTI->getNumberOfParts(
+    unsigned NumParts = TTI->getRegUsageForType(
         getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
 
     // The maximum bit width required to represent all the values that can be
@@ -16097,7 +16137,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // use - ignore it.
     if (NumParts > 1 &&
         NumParts ==
-            TTI->getNumberOfParts(getWidenedType(
+            TTI->getRegUsageForType(getWidenedType(
                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
       return 0u;
 
@@ -16958,7 +16998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   for (unsigned I = NextInst; I < MaxInst; ++I) {
     unsigned ActualVF = std::min(MaxInst - I, VF);
 
-    if (!has_single_bit(ActualVF))
+    if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
       continue;
 
     if (MaxVFOnly && ActualVF < MaxVF)

llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll (+5 -9)
@@ -4,15 +4,11 @@
 define i64 @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
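The updated checks show the payoff on RISC-V: the six lanes are now loaded as a single <6 x i64> and fed through one shuffle, instead of being split into <4 x i64> and <2 x i64> pieces reassembled with llvm.vector.insert calls. Presumably v6i64 occupies whole vector registers on this target, which is exactly the case hasFullVectorsOnly now admits.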
