[RISCV][CostModel] Estimate cost of llvm.vector.reduce.fmaximum/fminimum

arcbbb · arcbbb · commit 9eb0d55a4c48 · 2024-03-15T01:11:58.000-07:00
The ‘llvm.vector.reduce.fmaximum/fminimum.*’ intrinsics propagate NaNs. and if any element of the vector is a NaN. Following llvm#79402, the patch add the cost of NaN check (vmfne + vcpop)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1001,6 +1001,53 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
       return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
   }
 
+  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
+    SmallVector<unsigned, 5> SplitOps;
+    SmallVector<unsigned, 3> Opcodes;
+    InstructionCost ExtraCost = 0;
+    switch (IID) {
+    case Intrinsic::maximum:
+      if (FMF.noNaNs()) {
+        SplitOps = {RISCV::VFMAX_VV};
+        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
+      } else {
+        SplitOps = {RISCV::VMFEQ_VV, RISCV::VMERGE_VVM, RISCV::VMFEQ_VV,
+                    RISCV::VMERGE_VVM, RISCV::VFMAX_VV};
+        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
+                   RISCV::VFMV_F_S};
+        // Cost of Canonical Nan
+        // lui a0, 523264
+        // fmv.w.x fa0, a0
+        ExtraCost = 2;
+      }
+      break;
+
+    case Intrinsic::minimum:
+      if (FMF.noNaNs()) {
+        SplitOps = {RISCV::VFMIN_VV};
+        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
+      } else {
+        SplitOps = {RISCV::VMFEQ_VV, RISCV::VMERGE_VVM, RISCV::VMFEQ_VV,
+                    RISCV::VMERGE_VVM, RISCV::VFMIN_VV};
+        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
+                   RISCV::VFMV_F_S};
+        // Cost of Canonical Nan
+        // lui a0, 523264
+        // fmv.w.x fa0, a0
+        ExtraCost = 2;
+      }
+      break;
+    }
+    // Add a cost for data larger than LMUL8
+    InstructionCost SplitCost =
+        (LT.first > 1)
+            ? (LT.first - 1) *
+                  getRISCVInstructionCost(SplitOps, LT.second, CostKind)
+            : 0;
+    return ExtraCost + SplitCost +
+           getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+  }
+
   // IR Reduction is composed by two vmv and one rvv reduction instruction.
   InstructionCost BaseCost = 2;
 
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll
@@ -6,23 +6,37 @@
 
 define float @reduce_fmaximum_f32(float %arg) {
 ; CHECK-LABEL: 'reduce_fmaximum_f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float undef
 ;
 ; SIZE-LABEL: 'reduce_fmaximum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float undef
 ;
 %V2   = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
@@ -32,6 +46,13 @@ define float @reduce_fmaximum_f32(float %arg) {
 %V32   = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
 %V64   = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
 %V128   = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
+call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
 ret float undef
 }
 declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
@@ -44,21 +65,33 @@ declare float @llvm.vector.reduce.fmaximum.v128f32(<128 x float>)
 
 define double @reduce_fmaximum_f64(double %arg) {
 ; CHECK-LABEL: 'reduce_fmaximum_f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double undef
 ;
 ; SIZE-LABEL: 'reduce_fmaximum_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double undef
 ;
 %V2   = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
@@ -67,6 +100,12 @@ define double @reduce_fmaximum_f64(double %arg) {
 %V16   = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
 %V32   = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
 %V64   = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
+call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
 ret double undef
 }
 declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fminimum.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fminimum.ll