diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 42bccdc028461..140edff13a67f 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -47,6 +47,8 @@ enum class RecurKind { FMax, ///< FP max implemented in terms of select(cmp()). FMinimum, ///< FP min with llvm.minimum semantics FMaximum, ///< FP max with llvm.maximum semantics + FMinimumNum, ///< FP min with llvm.minimumnum semantics + FMaximumNum, ///< FP max with llvm.maximumnum semantics FMulAdd, ///< Sum of float products with llvm.fmuladd(a * b + sum). IAnyOf, ///< Any_of reduction with select(icmp(),x,y) where one of (x,y) is ///< loop invariant, and both x and y are integer type. @@ -239,7 +241,8 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is a floating-point min/max kind. static bool isFPMinMaxRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::FMin || Kind == RecurKind::FMax || - Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum; + Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum || + Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum; } /// Returns true if the recurrence kind is any min/max kind. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 94c347b01bbfb..e7e0bef048f71 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -788,6 +788,10 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, return InstDesc(Kind == RecurKind::FMin, I); if (match(I, m_Intrinsic(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::FMax, I); + if (match(I, m_Intrinsic(m_Value(), m_Value()))) + return InstDesc(Kind == RecurKind::FMinimumNum, I); + if (match(I, m_Intrinsic(m_Value(), m_Value()))) + return InstDesc(Kind == RecurKind::FMaximumNum, I); if (match(I, m_Intrinsic(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::FMinimum, I); if (match(I, m_Intrinsic(m_Value(), m_Value()))) @@ -892,10 +896,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( return true; if (isa(I) && I->hasNoNaNs() && I->hasNoSignedZeros()) return true; - // minimum and maximum intrinsics do not require nsz and nnan flags since - // NaN and signed zeroes are propagated in the intrinsic implementation. + // minimum/minnum and maximum/maxnum intrinsics do not require nsz and nnan + // flags since NaN and signed zeroes are propagated in the intrinsic + // implementation. return match(I, m_Intrinsic(m_Value(), m_Value())) || - match(I, m_Intrinsic(m_Value(), m_Value())); + match(I, m_Intrinsic(m_Value(), m_Value())) || + match(I, + m_Intrinsic(m_Value(), m_Value())) || + match(I, m_Intrinsic(m_Value(), m_Value())); }; if (isIntMinMaxRecurrenceKind(Kind) || (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) @@ -1035,6 +1043,19 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, LLVM_DEBUG(dbgs() << "Found a float MINIMUM reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FMaximumNum, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a float MAXIMUMNUM reduction PHI." << *Phi + << "\n"); + return true; + } + if (AddReductionVar(Phi, RecurKind::FMinimumNum, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a float MINIMUMNUM reduction PHI." << *Phi + << "\n"); + return true; + } + // Not a reduction of known type. return false; } @@ -1155,6 +1176,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMin: case RecurKind::FMaximum: case RecurKind::FMinimum: + case RecurKind::FMaximumNum: + case RecurKind::FMinimumNum: case RecurKind::FAnyOf: case RecurKind::FFindLastIV: return Instruction::FCmp; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index f57d95e7722dc..2fff9521017ff 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -958,6 +958,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { return Intrinsic::vector_reduce_fmaximum; case RecurKind::FMinimum: return Intrinsic::vector_reduce_fminimum; + case RecurKind::FMaximumNum: + return Intrinsic::vector_reduce_fmax; + case RecurKind::FMinimumNum: + return Intrinsic::vector_reduce_fmin; } } @@ -1053,6 +1057,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) { return Intrinsic::minimum; case RecurKind::FMaximum: return Intrinsic::maximum; + case RecurKind::FMinimumNum: + return Intrinsic::minimumnum; + case RecurKind::FMaximumNum: + return Intrinsic::maximumnum; } } @@ -1101,7 +1109,8 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); if (Ty->isIntOrIntVectorTy() || - (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum)) { + (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || + RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) { // TODO: Add float minnum/maxnum support when FMF nnan is set. Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK); return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr, @@ -1320,6 +1329,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::FMin: case RecurKind::FMinimum: case RecurKind::FMaximum: + case RecurKind::FMinimumNum: + case RecurKind::FMaximumNum: return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: case RecurKind::FAdd: diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 53da78ee599b7..56a3bf74814b5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21774,7 +21774,9 @@ class HorizontalReduction { case RecurKind::FMax: case RecurKind::FMin: case RecurKind::FMaximum: - case RecurKind::FMinimum: { + case RecurKind::FMinimum: + case RecurKind::FMaximumNum: + case RecurKind::FMinimumNum: { Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind); return Builder.CreateBinaryIntrinsic(Id, LHS, RHS); } @@ -23086,6 +23088,8 @@ class HorizontalReduction { case RecurKind::FAnyOf: case RecurKind::IFindLastIV: case RecurKind::FFindLastIV: + case RecurKind::FMaximumNum: + case RecurKind::FMinimumNum: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -23220,6 +23224,8 @@ class HorizontalReduction { case RecurKind::FAnyOf: case RecurKind::IFindLastIV: case RecurKind::FFindLastIV: + case RecurKind::FMaximumNum: + case RecurKind::FMinimumNum: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -23319,6 +23325,8 @@ class HorizontalReduction { case RecurKind::FAnyOf: case RecurKind::IFindLastIV: case RecurKind::FFindLastIV: + case RecurKind::FMaximumNum: + case RecurKind::FMinimumNum: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } diff --git a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll index eb6dcc72df57e..6dde2b9adc7c8 100644 --- a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll @@ -6,18 +6,42 @@ define float @maximumnum_intrinsic(ptr readonly %x) { ; CHECK-LABEL: define float @maximumnum_intrinsic( ; CHECK-SAME: ptr readonly [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.maximumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -41,18 +65,42 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-LABEL: define float @maximumnum_intrinsic_fast( ; CHECK-SAME: ptr readonly [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.maximumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -76,18 +124,42 @@ define float @minimumnum_intrinsic(ptr readonly %x) { ; CHECK-LABEL: define float @minimumnum_intrinsic( ; CHECK-SAME: ptr readonly [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.minimumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -111,18 +183,42 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-LABEL: define float @minimumnum_intrinsic_fast( ; CHECK-SAME: ptr readonly [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.minimumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -144,3 +240,15 @@ exit: declare float @llvm.minimumnum.f32(float, float) declare float @llvm.maximumnum.f32(float, float) +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll new file mode 100644 index 0000000000000..bb7695794f0b2 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-macosx -S %s | FileCheck %s + +declare float @llvm.maximumnum.f32(float, float) +declare float @llvm.minimumnum.f32(float, float) + +; TODO: Need reduction version of maximumnum/minimumnum. +define float @reduction_v4f32_maximumnum(ptr %p) { +; CHECK-LABEL: define float @reduction_v4f32_maximumnum +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[P]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[M1:%.*]] = tail call float @llvm.maximumnum.f32(float [[T1]], float [[T0]]) +; CHECK-NEXT: [[M2:%.*]] = tail call float @llvm.maximumnum.f32(float [[T2]], float [[M1]]) +; CHECK-NEXT: [[M3:%.*]] = tail call float @llvm.maximumnum.f32(float [[T3]], float [[M2]]) +; CHECK-NEXT: ret float [[M3]] +; + %g1 = getelementptr inbounds float, ptr %p, i64 1 + %g2 = getelementptr inbounds float, ptr %p, i64 2 + %g3 = getelementptr inbounds float, ptr %p, i64 3 + %t0 = load float, ptr %p, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %m1 = tail call float @llvm.maximumnum.f32(float %t1, float %t0) + %m2 = tail call float @llvm.maximumnum.f32(float %t2, float %m1) + %m3 = tail call float @llvm.maximumnum.f32(float %t3, float %m2) + ret float %m3 +} + +define float @reduction_v4f64_minimumnum(ptr %p) { +; CHECK-LABEL: define float @reduction_v4f64_minimumnum +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[P]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[M1:%.*]] = tail call float @llvm.minimumnum.f32(float [[T1]], float [[T0]]) +; CHECK-NEXT: [[M2:%.*]] = tail call float @llvm.minimumnum.f32(float [[T2]], float [[M1]]) +; CHECK-NEXT: [[M3:%.*]] = tail call float @llvm.minimumnum.f32(float [[T3]], float [[M2]]) +; CHECK-NEXT: ret float [[M3]] +; + %g1 = getelementptr inbounds float, ptr %p, i64 1 + %g2 = getelementptr inbounds float, ptr %p, i64 2 + %g3 = getelementptr inbounds float, ptr %p, i64 3 + %t0 = load float, ptr %p, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %m1 = tail call float @llvm.minimumnum.f32(float %t1, float %t0) + %m2 = tail call float @llvm.minimumnum.f32(float %t2, float %m1) + %m3 = tail call float @llvm.minimumnum.f32(float %t3, float %m2) + ret float %m3 +}