Skip to content

Commit b62557a

Browse files
committed
Revert "[SLP]Model reduction_add(ext(<n x i1>)) as ext(ctpop(bitcast <n x i1> to int n))"
This reverts commit 0298c59 to fix a buildbot crash reported by https://lab.llvm.org/buildbot/#/builders/113/builds/4079.
1 parent 9ba6672 commit b62557a

File tree

3 files changed

+31
-86
lines changed

3 files changed

+31
-86
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+27-80
Original file line number | Diff line number | Diff line change
@@ -1371,18 +1371,6 @@ class BoUpSLP {
13711371
return MinBWs.at(VectorizableTree.front().get()).second;
13721372
}
13731373

1374-
/// Returns reduction bitwidth and signedness, if it does not match the
1375-
/// original requested size.
1376-
std::optional<std::pair<unsigned, bool>> getReductionBitWidthAndSign() const {
1377-
if (ReductionBitWidth == 0 ||
1378-
ReductionBitWidth ==
1379-
DL->getTypeSizeInBits(
1380-
VectorizableTree.front()->Scalars.front()->getType()))
1381-
return std::nullopt;
1382-
return std::make_pair(ReductionBitWidth,
1383-
MinBWs.at(VectorizableTree.front().get()).second);
1384-
}
1385-
13861374
/// Builds external uses of the vectorized scalars, i.e. the list of
13871375
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
13881376
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -17899,37 +17887,24 @@ void BoUpSLP::computeMinimumValueSizes() {
1789917887
// Add reduction ops sizes, if any.
1790017888
if (UserIgnoreList &&
1790117889
isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
17902-
// Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
17903-
// x i1> to in)).
17904-
if (all_of(*UserIgnoreList,
17905-
[](Value *V) {
17906-
return cast<Instruction>(V)->getOpcode() == Instruction::Add;
17907-
}) &&
17908-
VectorizableTree.front()->State == TreeEntry::Vectorize &&
17909-
VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
17910-
cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
17911-
Builder.getInt1Ty()) {
17912-
ReductionBitWidth = 1;
17913-
} else {
17914-
for (Value *V : *UserIgnoreList) {
17915-
unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17916-
TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
17917-
unsigned BitWidth1 = NumTypeBits - NumSignBits;
17918-
if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
17919-
++BitWidth1;
17920-
unsigned BitWidth2 = BitWidth1;
17921-
if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
17922-
APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
17923-
BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
17924-
}
17925-
ReductionBitWidth =
17926-
std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
17890+
for (Value *V : *UserIgnoreList) {
17891+
auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17892+
auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
17893+
unsigned BitWidth1 = NumTypeBits - NumSignBits;
17894+
if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
17895+
++BitWidth1;
17896+
unsigned BitWidth2 = BitWidth1;
17897+
if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
17898+
auto Mask = DB->getDemandedBits(cast<Instruction>(V));
17899+
BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
1792717900
}
17928-
if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
17929-
ReductionBitWidth = 8;
17930-
17931-
ReductionBitWidth = bit_ceil(ReductionBitWidth);
17901+
ReductionBitWidth =
17902+
std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
1793217903
}
17904+
if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
17905+
ReductionBitWidth = 8;
17906+
17907+
ReductionBitWidth = bit_ceil(ReductionBitWidth);
1793317908
}
1793417909
bool IsTopRoot = NodeIdx == 0;
1793517910
while (NodeIdx < VectorizableTree.size() &&
@@ -19785,8 +19760,8 @@ class HorizontalReduction {
1978519760

1978619761
// Estimate cost.
1978719762
InstructionCost TreeCost = V.getTreeCost(VL);
19788-
InstructionCost ReductionCost = getReductionCost(
19789-
TTI, VL, IsCmpSelMinMax, RdxFMF, V.getReductionBitWidthAndSign());
19763+
InstructionCost ReductionCost =
19764+
getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
1979019765
InstructionCost Cost = TreeCost + ReductionCost;
1979119766
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
1979219767
<< " for reduction\n");
@@ -19891,12 +19866,10 @@ class HorizontalReduction {
1989119866
createStrideMask(I, ScalarTyNumElements, VL.size());
1989219867
Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
1989319868
ReducedSubTree = Builder.CreateInsertElement(
19894-
ReducedSubTree,
19895-
emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
19869+
ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
1989619870
}
1989719871
} else {
19898-
ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
19899-
RdxRootInst->getType());
19872+
ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
1990019873
}
1990119874
if (ReducedSubTree->getType() != VL.front()->getType()) {
1990219875
assert(ReducedSubTree->getType() != VL.front()->getType() &&
@@ -20077,13 +20050,12 @@ class HorizontalReduction {
2007720050

2007820051
private:
2007920052
/// Calculate the cost of a reduction.
20080-
InstructionCost getReductionCost(
20081-
TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
20082-
bool IsCmpSelMinMax, FastMathFlags FMF,
20083-
const std::optional<std::pair<unsigned, bool>> BitwidthAndSign) {
20053+
InstructionCost getReductionCost(TargetTransformInfo *TTI,
20054+
ArrayRef<Value *> ReducedVals,
20055+
bool IsCmpSelMinMax, unsigned ReduxWidth,
20056+
FastMathFlags FMF) {
2008420057
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2008520058
Type *ScalarTy = ReducedVals.front()->getType();
20086-
unsigned ReduxWidth = ReducedVals.size();
2008720059
FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
2008820060
InstructionCost VectorCost = 0, ScalarCost;
2008920061
// If all of the reduced values are constant, the vector cost is 0, since
@@ -20142,22 +20114,8 @@ class HorizontalReduction {
2014220114
VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
2014320115
/*Extract*/ false, TTI::TCK_RecipThroughput);
2014420116
} else {
20145-
auto [Bitwidth, IsSigned] =
20146-
BitwidthAndSign.value_or(std::make_pair(0u, false));
20147-
if (RdxKind == RecurKind::Add && Bitwidth == 1) {
20148-
// Represent vector_reduce_add(ZExt(<n x i1>)) to
20149-
// ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
20150-
auto *IntTy = IntegerType::get(ScalarTy->getContext(), ReduxWidth);
20151-
IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
20152-
VectorCost =
20153-
TTI->getCastInstrCost(Instruction::BitCast, IntTy,
20154-
getWidenedType(ScalarTy, ReduxWidth),
20155-
TTI::CastContextHint::None, CostKind) +
20156-
TTI->getIntrinsicInstrCost(ICA, CostKind);
20157-
} else {
20158-
VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20159-
FMF, CostKind);
20160-
}
20117+
VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
20118+
CostKind);
2016120119
}
2016220120
}
2016320121
ScalarCost = EvaluateScalarCost([&]() {
@@ -20194,22 +20152,11 @@ class HorizontalReduction {
2019420152

2019520153
/// Emit a horizontal reduction of the vectorized value.
2019620154
Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20197-
const TargetTransformInfo *TTI, Type *DestTy) {
20155+
const TargetTransformInfo *TTI) {
2019820156
assert(VectorizedValue && "Need to have a vectorized tree node");
2019920157
assert(RdxKind != RecurKind::FMulAdd &&
2020020158
"A call to the llvm.fmuladd intrinsic is not handled yet");
2020120159

20202-
auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20203-
if (FTy->getScalarType() == Builder.getInt1Ty() &&
20204-
RdxKind == RecurKind::Add &&
20205-
DestTy->getScalarType() != FTy->getScalarType()) {
20206-
// Convert vector_reduce_add(ZExt(<n x i1>)) to
20207-
// ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
20208-
Value *V = Builder.CreateBitCast(
20209-
VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20210-
++NumVectorInstructions;
20211-
return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20212-
}
2021320160
++NumVectorInstructions;
2021420161
return createSimpleReduction(Builder, VectorizedValue, RdxKind);
2021520162
}

llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll

+2-3
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,8 @@ define i16 @test(i16 %call37) {
1111
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[SHUFFLE]], zeroinitializer
1212
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <8 x i16> [[SHUFFLE]], zeroinitializer
1313
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
14-
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
15-
; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.ctpop.i8(i8 [[TMP8]])
16-
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP7]] to i16
14+
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
15+
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]])
1716
; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP6]], 0
1817
; CHECK-NEXT: ret i16 [[OP_RDX]]
1918
;

llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll

+2-3
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,8 @@ define i32 @test(i32 %a, i8 %b, i8 %c) {
1414
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
1515
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP4]] to <4 x i16>
1616
; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <4 x i16> [[TMP8]], [[TMP9]]
17-
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[TMP5]] to i4
18-
; CHECK-NEXT: [[TMP11:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP10]])
19-
; CHECK-NEXT: [[TMP7:%.*]] = zext i4 [[TMP11]] to i32
17+
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP5]] to <4 x i32>
18+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
2019
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[A]]
2120
; CHECK-NEXT: ret i32 [[OP_RDX]]
2221
;

0 commit comments

Comments
 (0)