-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[RISCV][CostModel] Updates reduction and shuffle cost #77342
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v Author: Shih-Po Hung (arcbbb) Changes
Patch is 344.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77342.diff 17 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index b3916c98700519..6c143a762b0c03 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -46,6 +46,9 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
InstructionCost Cost = 0;
for (auto Op : OpCodes) {
switch (Op) {
+ case RISCV::SLT:
+ Cost += 1;
+ break;
case RISCV::VRGATHER_VI:
Cost += TLI->getVRGatherVICost(VT);
break;
@@ -84,8 +87,14 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
Cost += VL;
break;
}
+ case RISCV::VMV_X_S:
+ case RISCV::VFMV_F_S:
+ Cost += 1;
+ break;
case RISCV::VMV_S_X:
- // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1
+ case RISCV::VFMV_S_F:
+ Cost += 1;
+ break;
default:
Cost += LMULCost;
}
@@ -444,9 +453,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vmv.s.x v0, a0
// vmerge.vvm v8, v9, v8, v0
return LT.first *
- (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li
- getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
- LT.second, CostKind));
+ (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
+ LT.second, CostKind));
}
case TTI::SK_Broadcast: {
bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
@@ -459,9 +467,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vmv.v.x v8, a0
// vmsne.vi v0, v8, 0
return LT.first *
- (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
- getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
- LT.second, CostKind));
+ (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
+ LT.second, CostKind));
}
// Example sequence:
// vsetivli zero, 2, e8, mf8, ta, mu (ignored)
@@ -473,12 +480,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vmsne.vi v0, v8, 0
return LT.first *
- (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
- TLI->getLMULCost(
- LT.second) + // FIXME: vmv.x.s is the same as extractelement
- getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
- RISCV::VMV_V_X, RISCV::VMSNE_VI},
- LT.second, CostKind));
+ (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
+ RISCV::VMV_X_S, RISCV::VMV_V_X,
+ RISCV::VMSNE_VI},
+ LT.second, CostKind));
}
if (HasScalar) {
@@ -523,9 +528,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (LT.second.isFixedLengthVector())
// vrsub.vi has a 5 bit immediate field, otherwise an li suffices
LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
- // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
- InstructionCost GatherCost =
- 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
+ InstructionCost GatherCost = getRISCVInstructionCost(
+ {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}, LT.second,
+ CostKind);
// Mask operation additionally required extend and truncate
InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
return LT.first * (LenCost + GatherCost + ExtendCost);
@@ -1358,19 +1363,53 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
- if (Ty->getElementType()->isIntegerTy(1))
- // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
- // cost 2, but we don't have enough info here so we slightly over cost.
- return (LT.first - 1) + 3;
+ std::array<unsigned, 3> Opcodes;
+ if (Ty->getElementType()->isIntegerTy(1)) {
+ // vcpop sequences, see vreduction-mask.ll.
+ if ((IID == Intrinsic::umax) || (IID == Intrinsic::smin))
+ Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M, RISCV::SLT};
+ else
+ Opcodes = {RISCV::VCPOP_M, RISCV::SLT};
+ return (LT.first - 1) +
+ getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+ }
// IR Reduction is composed by two vmv and one rvv reduction instruction.
- InstructionCost BaseCost = 2;
-
- if (CostKind == TTI::TCK_CodeSize)
- return (LT.first - 1) + BaseCost;
-
- unsigned VL = getEstimatedVLFor(Ty);
- return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+ unsigned SplitOp;
+ switch (IID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic");
+ case Intrinsic::smax:
+ SplitOp = RISCV::VMAX_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
+ break;
+ case Intrinsic::smin:
+ SplitOp = RISCV::VMIN_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
+ break;
+ case Intrinsic::umax:
+ SplitOp = RISCV::VMAXU_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
+ break;
+ case Intrinsic::umin:
+ SplitOp = RISCV::VMINU_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
+ break;
+ case Intrinsic::maxnum:
+ SplitOp = RISCV::VFMAX_VV;
+ Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
+ break;
+ case Intrinsic::minnum:
+ SplitOp = RISCV::VFMIN_VV;
+ Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
+ break;
+ }
+ // Add a cost for data larger than LMUL8
+ InstructionCost SplitCost =
+ (LT.first > 1) ? (LT.first - 1) *
+ getRISCVInstructionCost(SplitOp, LT.second, CostKind)
+ : 0;
+ return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
InstructionCost
@@ -1392,20 +1431,50 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
- if (Ty->getElementType()->isIntegerTy(1))
+ std::array<unsigned, 3> Opcodes;
+ if (Ty->getElementType()->isIntegerTy(1)) {
// vcpop sequences, see vreduction-mask.ll
- return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
+ if (ISD == ISD::AND)
+ Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M, RISCV::SLT};
+ else
+ Opcodes = {RISCV::VCPOP_M, RISCV::SLT};
+ return (LT.first - 1) +
+ getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+ }
// IR Reduction is composed by two vmv and one rvv reduction instruction.
- InstructionCost BaseCost = 2;
-
- if (CostKind == TTI::TCK_CodeSize)
- return (LT.first - 1) + BaseCost;
-
- unsigned VL = getEstimatedVLFor(Ty);
- if (TTI::requiresOrderedReduction(FMF))
- return (LT.first - 1) + BaseCost + VL;
- return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+ unsigned SplitOp;
+ switch (ISD) {
+ case ISD::ADD:
+ SplitOp = RISCV::VADD_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
+ break;
+ case ISD::OR:
+ SplitOp = RISCV::VOR_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
+ break;
+ case ISD::XOR:
+ SplitOp = RISCV::VXOR_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
+ break;
+ case ISD::AND:
+ SplitOp = RISCV::VAND_VV;
+ Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
+ break;
+ case ISD::FADD:
+ SplitOp = RISCV::VFADD_VV;
+ if (TTI::requiresOrderedReduction(FMF))
+ Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDOSUM_VS, RISCV::VFMV_F_S};
+ else
+ Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
+ break;
+ }
+ // Add a cost for data larger than LMUL8
+ InstructionCost SplitCost =
+ (LT.first > 1) ? (LT.first - 1) *
+ getRISCVInstructionCost(SplitOp, LT.second, CostKind)
+ : 0;
+ return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
InstructionCost RISCVTTIImpl::getExtendedReductionCost(
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
index 6fe098628ea078..ed9d71cad0be61 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
@@ -6,25 +6,25 @@
define i32 @reduce_i1(i32 %arg) {
; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i1'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
@@ -51,14 +51,14 @@ define i32 @reduce_i8(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i8'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
@@ -85,14 +85,14 @@ define i32 @reduce_i16(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i16'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
@@ -115,18 +115,18 @@ define i32 @reduce_i32(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i32'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for i...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with nits.
Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M}; | ||
return (LT.first - 1) + | ||
getRISCVInstructionCost(Opcodes, LT.second, CostKind) + | ||
getCmpSelInstrCost(Instruction::Select, ElementTy, ElementTy, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this be Instruction::ICmp?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed. Thanks!
case ISD::FADD: | ||
SplitOp = RISCV::VFADD_VV; | ||
if (TTI::requiresOrderedReduction(FMF)) | ||
Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDOSUM_VS, RISCV::VFMV_F_S}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ordered reductions don't split the same way. They chain a series of VFREDOSUMs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing this out! Fixed.
This is inspired by llvm#77342 (review), and is split off of same with some differences in style. A select is a vmerge.vv with the additional cost of materializing the bitmask vector in a vreg. All masks fit within a single vector register (e8 + m8 is the worst case), and thus our worst case cost should be roughly 3 (2 scalar to produce the address, one vector load op). Given most shuffles are small, and the mask will be instead produced by LUI/ADDI + vmv.s.x or ADDI + vmv.s.x, using 2 as the default seems quite reasonable. At worst, we're not going to be off by much. The prior lowering scaled the cost of the bitmask with LMUL, which I don't understand. At m1 it did use the same base cost of 2.
I split one piece of this off in #77963. For this to make rapid progress, I recommend splitting it into individual pieces to the maximum extent possible. |
) This is inspired by #77342 (review), and is split off of same with some differences in style. A select is a vmerge.vv with the additional cost of materializing the bitmask vector in a vreg. All masks fit within a single vector register (e8 + m8 is the worst case), and thus our worst case cost should be roughly 3 (2 scalar to produce the address, one vector load op). Given most shuffles are small, and the mask will be instead produced by LUI/ADDI + vmv.s.x or ADDI + vmv.s.x, using 2 as the default seems quite reasonable. At worst, we're not going to be off by much. The prior lowering scaled the cost of the bitmask with LMUL, which I don't understand. At m1 it did use the same base cost of 2. (@lukel97 You wrote the original code here, anything I'm missing here?)
case RISCV::VMV_X_S: | ||
case RISCV::VFMV_F_S: | ||
case RISCV::VMV_S_X: | ||
// FIXME: VMV_S_X doesn't use LMUL, the cost should be 1 | ||
case RISCV::VFMV_S_F: | ||
Cost += 1; | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we split this off into a separate PR? I think it would be useful to be able to see if this has any effect on the current cost model tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I split it off into #78739. I will update this PR after that.
InstructionCost GatherCost = | ||
2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); | ||
InstructionCost GatherCost = getRISCVInstructionCost( | ||
{RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}, LT.second, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This bit isn't right. In particular, it ignores the code two lines above, which talks about different cases for costing the index sequence.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I made a change to query the cost of VRSUB_VX/VRSUB_VI separately. I'm not sure whether this is what you expected.
// vcpop sequences, see vreduction-mask.ll. umax, smin actually only | ||
// cost 2, but we don't have enough info here so we slightly over cost. | ||
return (LT.first - 1) + 3; | ||
if (Ty->getElementType()->isIntegerTy(1)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you separate this into its own patch, it should be easy to review.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is split off in #79401
if (CostKind == TTI::TCK_CodeSize) | ||
return (LT.first - 1) + BaseCost; | ||
|
||
unsigned VL = getEstimatedVLFor(Ty); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The net effect of this change appears to be to increase BaseCost by 1, make the reduction linear in lmul, and remove the log2(VL) term.
This bit (maybe with the other reduction change) needs to be its own review so we can clearly see the effect. For this one in particular, you should also try running it through some reasonably large codebase (llvm test-suite, SPEC, whatever) and check for unexpected interactions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK! I split this off into #79103.
This patch is split off from #77342. - Correct the CodeSize cost, which omitted one instruction: the base cost of 3 comes from {VMV.S, ReductionOp, VMV.X}. - Add SplitCost: unordered reductions chain a series of VADD/VFADD/... which scales with LMUL; ordered reductions chain a series of VFREDOSUMs. - Use MVT to estimate VL.
This patch is split off from llvm#77342, and follows llvm#79103 - Correct for CodeSize cost that 1 instruction is not included. 3 is from {VMV.S, ReductionOp, VMV.X} - Add SplitCost which chains a series of VMAX/VMIN/... which scales with LMUL. - Use MVT to estimate VL.
It is split off from #77342. InstCombine transforms min/max reductions on i1 into arithmetic reductions, so this patch reuses the cost logic in the arithmetic reduction cost function.
After #79402 lands, I think all the pieces of this have gone in right? If not, can you rebase so what's left is visible? |
- Make `andi` cost 1 in SK_Broadcast - Query the cost of VID_V, VRSUB_VX which would scale with LMUL
8c9ec5d
to
890f7da
Compare
@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
// vmsne.vi v0, v8, 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated codegen opportunity
vmv.x.s a0, v8
andi a0, a0, 1
vmv.v.x v8, a0
Could be a vrgather.vi
@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
// vmsne.vi v0, v8, 0 | |||
|
|||
return LT.first * | |||
(TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the index is non-zero, then there's a vslide needed in the current sequence which does scale with LMUL.
So, this change is wrong for non-zero index.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For SK_Broadcast, my understanding is that the index should always be the first element of the source vector.
I based this on the following code:
if (Shuffle->isZeroEltSplat())
return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy,
Shuffle->getShuffleMask(), CostKind, 0,
nullptr, Operands);
Could we clarify this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like you're correct here. There are other cases which create SK_Broadcast, but from what I can tell, all are consistent with the splat index always being zero.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To chime in here, it's defined in TargetTransformInfo.h as being a broadcast of element zero:
enum ShuffleKind {
SK_Broadcast, ///< Broadcast element 0 to all other elements.
// FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} | ||
InstructionCost GatherCost = | ||
2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); | ||
GatherCost = getRISCVInstructionCost( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please common the tail here, and undo the change which is restructuring the existing LenCost computation since that appears to be non-functional in your diff.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
Given you've now reduced the scope of this, you need to update the review description so that it matches the current patch. |
62be3d6
to
5ee789e
Compare
Updated, thanks! |
@@ -531,9 +529,12 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
if (LT.second.isFixedLengthVector()) | |||
// vrsub.vi has a 5 bit immediate field, otherwise an li suffices | |||
LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; | |||
// FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} | |||
unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV : VRGATHER_VV}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"RISCV : VRGATHER_VV" - Does this build? Shouldn't there be a second ":"?
The cost sequence here is now ignoring the LI for the VX case.
Please, don't change LenCost. It's correct. Adjust only the other bits to make your fix.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe I overlooked something; here are the three cost sequences I am considering:
- scalable vector
  LenCost: 3 from { csrr a0, vlenb; srli a0, a0, 3; addi a0, a0, -1 }
  GatherCost from { vid.v, vrsub.vx, vrgather.vv }
- fixed-length vector, imm > 5 bits
  LenCost: 1 from { li }
  GatherCost from { vid.v, vrsub.vx, vrgather.vv }
- fixed-length vector, imm <= 5 bits
  LenCost: 0 from { }
  GatherCost from { vid.v, vrsub.vi, vrgather.vv }
Could you help me correct this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; | ||
if (LT.second.isFixedLengthVector() && | ||
isInt<5>(LT.second.getVectorNumElements() - 1)) | ||
Opcodes[1] = RISCV::VRSUB_VI; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to double check, we don't cost VRSUB_VI any differently from VRSUB_VX, do we?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the cost for both is the same.
@@ -531,9 +529,12 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
if (LT.second.isFixedLengthVector()) | |||
// vrsub.vi has a 5 bit immediate field, otherwise an li suffices | |||
LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; | |||
// FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} | |||
unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV : VRGATHER_VV}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
// vmsne.vi v0, v8, 0 | |||
|
|||
return LT.first * | |||
(TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To chime in here, it's defined in TargetTransformInfo.h as being a broadcast of element zero:
enum ShuffleKind {
SK_Broadcast, ///< Broadcast element 0 to all other elements.
@preames ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
I am going to merge this if there is no further input. Thanks! |
- Make `andi` cost 1 in SK_Broadcast