@@ -1371,18 +1371,6 @@ class BoUpSLP {
     return MinBWs.at(VectorizableTree.front().get()).second;
   }

-  /// Returns reduction bitwidth and signedness, if it does not match the
-  /// original requested size.
-  std::optional<std::pair<unsigned, bool>> getReductionBitWidthAndSign() const {
-    if (ReductionBitWidth == 0 ||
-        ReductionBitWidth ==
-            DL->getTypeSizeInBits(
-                VectorizableTree.front()->Scalars.front()->getType()))
-      return std::nullopt;
-    return std::make_pair(ReductionBitWidth,
-                          MinBWs.at(VectorizableTree.front().get()).second);
-  }
-
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
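
The helper removed above was the only accessor exposing the narrowed reduction bitwidth and its signedness outside the minimum-bitwidth analysis. A minimal usage sketch, assuming a BoUpSLP instance V on which buildTree() and computeMinimumValueSizes() have already run (the in-tree caller was HorizontalReduction; see the cost hunk below):

```cpp
// Illustrative only, not in-tree code.
if (std::optional<std::pair<unsigned, bool>> BWSign =
        V.getReductionBitWidthAndSign()) {
  unsigned BitWidth = BWSign->first; // narrowed width, e.g. 1 for i1 masks
  bool IsSigned = BWSign->second;    // signedness recorded in MinBWs
  // ... feed both into the reduction cost model and codegen ...
}
```
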
@@ -17899,37 +17887,24 @@ void BoUpSLP::computeMinimumValueSizes() {
   // Add reduction ops sizes, if any.
   if (UserIgnoreList &&
       isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
-    // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
-    // x i1> to in)).
-    if (all_of(*UserIgnoreList,
-               [](Value *V) {
-                 return cast<Instruction>(V)->getOpcode() == Instruction::Add;
-               }) &&
-        VectorizableTree.front()->State == TreeEntry::Vectorize &&
-        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
-        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
-            Builder.getInt1Ty()) {
-      ReductionBitWidth = 1;
-    } else {
-      for (Value *V : *UserIgnoreList) {
-        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
-        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
-        unsigned BitWidth1 = NumTypeBits - NumSignBits;
-        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
-          ++BitWidth1;
-        unsigned BitWidth2 = BitWidth1;
-        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
-          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
-          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
-        }
-        ReductionBitWidth =
-            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
+    for (Value *V : *UserIgnoreList) {
+      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+        ++BitWidth1;
+      unsigned BitWidth2 = BitWidth1;
+      if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
+        auto Mask = DB->getDemandedBits(cast<Instruction>(V));
+        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
       }
-      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
-        ReductionBitWidth = 8;
-
-      ReductionBitWidth = bit_ceil(ReductionBitWidth);
+      ReductionBitWidth =
+          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
     }
+    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
+      ReductionBitWidth = 8;
+
+    ReductionBitWidth = bit_ceil(ReductionBitWidth);
   }
   bool IsTopRoot = NodeIdx == 0;
   while (NodeIdx < VectorizableTree.size() &&
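
The branch removed above recognized a boolean add-reduction and marked it for the popcount lowering. A hand-written LLVM IR sketch of the two shapes involved (assumed for illustration, not taken from the patch's tests):

```llvm
; Pattern the removed code matched: add-reduce zero-extended i1 lanes.
%z = zext <8 x i1> %mask to <8 x i32>
%r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)

; Form it steered codegen toward: bitcast the mask to i8, count the set
; bits, then extend the narrow result back to the requested type.
%b = bitcast <8 x i1> %mask to i8
%c = call i8 @llvm.ctpop.i8(i8 %b)
%r2 = zext i8 %c to i32
```
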
@@ -19785,8 +19760,8 @@ class HorizontalReduction {

     // Estimate cost.
     InstructionCost TreeCost = V.getTreeCost(VL);
-    InstructionCost ReductionCost = getReductionCost(
-        TTI, VL, IsCmpSelMinMax, RdxFMF, V.getReductionBitWidthAndSign());
+    InstructionCost ReductionCost =
+        getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
     InstructionCost Cost = TreeCost + ReductionCost;
     LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                       << " for reduction\n");
@@ -19891,12 +19866,10 @@ class HorizontalReduction {
             createStrideMask(I, ScalarTyNumElements, VL.size());
         Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
         ReducedSubTree = Builder.CreateInsertElement(
-            ReducedSubTree,
-            emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+            ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
       }
     } else {
-      ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
-                                     RdxRootInst->getType());
+      ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
     }
     if (ReducedSubTree->getType() != VL.front()->getType()) {
       assert(ReducedSubTree->getType() != VL.front()->getType() &&
@@ -20077,13 +20050,12 @@ class HorizontalReduction {

 private:
   /// Calculate the cost of a reduction.
-  InstructionCost getReductionCost(
-      TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
-      bool IsCmpSelMinMax, FastMathFlags FMF,
-      const std::optional<std::pair<unsigned, bool>> BitwidthAndSign) {
+  InstructionCost getReductionCost(TargetTransformInfo *TTI,
+                                   ArrayRef<Value *> ReducedVals,
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
-    unsigned ReduxWidth = ReducedVals.size();
     FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
     InstructionCost VectorCost = 0, ScalarCost;
     // If all of the reduced values are constant, the vector cost is 0, since
@@ -20142,22 +20114,8 @@ class HorizontalReduction {
             VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
             /*Extract*/ false, TTI::TCK_RecipThroughput);
       } else {
-        auto [Bitwidth, IsSigned] =
-            BitwidthAndSign.value_or(std::make_pair(0u, false));
-        if (RdxKind == RecurKind::Add && Bitwidth == 1) {
-          // Represent vector_reduce_add(ZExt(<n x i1>)) to
-          // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
-          auto *IntTy = IntegerType::get(ScalarTy->getContext(), ReduxWidth);
-          IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
-          VectorCost =
-              TTI->getCastInstrCost(Instruction::BitCast, IntTy,
-                                    getWidenedType(ScalarTy, ReduxWidth),
-                                    TTI::CastContextHint::None, CostKind) +
-              TTI->getIntrinsicInstrCost(ICA, CostKind);
-        } else {
-          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                       FMF, CostKind);
-        }
+        VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
+                                                     CostKind);
       }
     }
     ScalarCost = EvaluateScalarCost([&]() {
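
For context, the removed branch did not compare the two costs; it priced the ctpop form whenever the narrowed reduction width was 1 and fell back to the generic reduction cost otherwise. A condensed sketch of both queries, reusing the names in scope in getReductionCost (illustrative, not in-tree code):

```cpp
// Ctpop form: one bitcast of the <ReduxWidth x i1> vector plus a popcount.
auto *MaskIntTy = IntegerType::get(ScalarTy->getContext(), ReduxWidth);
IntrinsicCostAttributes ICA(Intrinsic::ctpop, MaskIntTy, {MaskIntTy}, FMF);
InstructionCost CtpopCost =
    TTI->getCastInstrCost(Instruction::BitCast, MaskIntTy,
                          getWidenedType(ScalarTy, ReduxWidth),
                          TTI::CastContextHint::None, CostKind) +
    TTI->getIntrinsicInstrCost(ICA, CostKind);
// Generic form, the only one kept after this revert.
InstructionCost RdxCost =
    TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
```
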
@@ -20194,22 +20152,11 @@ class HorizontalReduction {

   /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
-                       const TargetTransformInfo *TTI, Type *DestTy) {
+                       const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(RdxKind != RecurKind::FMulAdd &&
            "A call to the llvm.fmuladd intrinsic is not handled yet");

-    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
-    if (FTy->getScalarType() == Builder.getInt1Ty() &&
-        RdxKind == RecurKind::Add &&
-        DestTy->getScalarType() != FTy->getScalarType()) {
-      // Convert vector_reduce_add(ZExt(<n x i1>)) to
-      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
-      Value *V = Builder.CreateBitCast(
-          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
-      ++NumVectorInstructions;
-      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
-    }
     ++NumVectorInstructions;
     return createSimpleReduction(Builder, VectorizedValue, RdxKind);
   }
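
The deleted branch in emitReduction generated that popcount form directly. A self-contained IRBuilder sketch of the same emission, with an invented helper name and the caller assumed to ZExt/Trunc the iN result to the destination type (illustrative, not in-tree code):

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Lower an add-reduction of an <N x i1> mask as a popcount: bitcast the
// mask to an iN integer and count its set bits.
static Value *emitBoolAddReduction(IRBuilderBase &Builder, Value *Mask) {
  auto *FTy = cast<FixedVectorType>(Mask->getType());
  Value *AsInt =
      Builder.CreateBitCast(Mask, Builder.getIntNTy(FTy->getNumElements()));
  return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, AsInt);
}
```
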