Skip to content

Commit 9eb0d55

Browse files
committed
[RISCV][CostModel] Estimate cost of llvm.vector.reduce.fmaximum/fminimum
The ‘llvm.vector.reduce.fmaximum/fminimum.*’ intrinsics propagate NaNs. and if any element of the vector is a NaN. Following llvm#79402, the patch add the cost of NaN check (vmfne + vcpop)
1 parent 2a547f0 commit 9eb0d55

File tree

3 files changed

+138
-52
lines changed

3 files changed

+138
-52
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,53 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
10011001
return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
10021002
}
10031003

1004+
if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1005+
SmallVector<unsigned, 5> SplitOps;
1006+
SmallVector<unsigned, 3> Opcodes;
1007+
InstructionCost ExtraCost = 0;
1008+
switch (IID) {
1009+
case Intrinsic::maximum:
1010+
if (FMF.noNaNs()) {
1011+
SplitOps = {RISCV::VFMAX_VV};
1012+
Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1013+
} else {
1014+
SplitOps = {RISCV::VMFEQ_VV, RISCV::VMERGE_VVM, RISCV::VMFEQ_VV,
1015+
RISCV::VMERGE_VVM, RISCV::VFMAX_VV};
1016+
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1017+
RISCV::VFMV_F_S};
1018+
// Cost of Canonical Nan
1019+
// lui a0, 523264
1020+
// fmv.w.x fa0, a0
1021+
ExtraCost = 2;
1022+
}
1023+
break;
1024+
1025+
case Intrinsic::minimum:
1026+
if (FMF.noNaNs()) {
1027+
SplitOps = {RISCV::VFMIN_VV};
1028+
Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1029+
} else {
1030+
SplitOps = {RISCV::VMFEQ_VV, RISCV::VMERGE_VVM, RISCV::VMFEQ_VV,
1031+
RISCV::VMERGE_VVM, RISCV::VFMIN_VV};
1032+
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1033+
RISCV::VFMV_F_S};
1034+
// Cost of Canonical Nan
1035+
// lui a0, 523264
1036+
// fmv.w.x fa0, a0
1037+
ExtraCost = 2;
1038+
}
1039+
break;
1040+
}
1041+
// Add a cost for data larger than LMUL8
1042+
InstructionCost SplitCost =
1043+
(LT.first > 1)
1044+
? (LT.first - 1) *
1045+
getRISCVInstructionCost(SplitOps, LT.second, CostKind)
1046+
: 0;
1047+
return ExtraCost + SplitCost +
1048+
getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1049+
}
1050+
10041051
// IR Reduction is composed by two vmv and one rvv reduction instruction.
10051052
InstructionCost BaseCost = 2;
10061053

llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,37 @@
66

77
define float @reduce_fmaximum_f32(float %arg) {
88
; CHECK-LABEL: 'reduce_fmaximum_f32'
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
12-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
13-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
14-
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
15-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
1623
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float undef
1724
;
1825
; SIZE-LABEL: 'reduce_fmaximum_f32'
19-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
20-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
21-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
22-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
23-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
24-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
25-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
26+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
27+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
28+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
29+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
30+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
31+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
32+
; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
33+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
34+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
35+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
36+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
37+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
38+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
39+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
2640
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
2741
;
2842
%V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
@@ -32,6 +46,13 @@ define float @reduce_fmaximum_f32(float %arg) {
3246
%V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
3347
%V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
3448
%V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
49+
call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
50+
call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
51+
call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
52+
call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
53+
call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
54+
call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
55+
call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
3556
ret float undef
3657
}
3758
declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
@@ -44,21 +65,33 @@ declare float @llvm.vector.reduce.fmaximum.v128f32(<128 x float>)
4465

4566
define double @reduce_fmaximum_f64(double %arg) {
4667
; CHECK-LABEL: 'reduce_fmaximum_f64'
47-
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
48-
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
49-
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
50-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
51-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
52-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
68+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
69+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
70+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
71+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
72+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
73+
; CHECK-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
74+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
75+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
76+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
77+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
78+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
79+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
5380
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double undef
5481
;
5582
; SIZE-LABEL: 'reduce_fmaximum_f64'
56-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
57-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
58-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
59-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
60-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
61-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
83+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
84+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
85+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
86+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
87+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
88+
; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
89+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
90+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
91+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
92+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
93+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
94+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
6295
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
6396
;
6497
%V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
@@ -67,6 +100,12 @@ define double @reduce_fmaximum_f64(double %arg) {
67100
%V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
68101
%V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
69102
%V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
103+
call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
104+
call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
105+
call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
106+
call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
107+
call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
108+
call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
70109
ret double undef
71110
}
72111
declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)

0 commit comments

Comments
 (0)