Skip to content

Commit ac65fb8

Browse files
committed
[LoopVectorize] Fix incorrect order of invariant stores when there are multiple reductions.
When a loop has multiple reductions, each with an intermediate invariant store, the order in which those reductions are processed is not considered. This can result in the invariant stores outside the loop not preserving the original order. This patch sorts VPReductionPHIRecipes by the order in which they have stores in the original loop before running `InnerLoopVectorizer::fixReduction` function, and it helps to maintain the correct order of stores. Fixes llvm#64047 Differential Revision: https://reviews.llvm.org/D157631
1 parent 8149989 commit ac65fb8

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+36-2
Original file line numberDiff line numberDiff line change
@@ -3617,10 +3617,44 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
36173617
// the incoming edges.
36183618
VPBasicBlock *Header =
36193619
State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3620+
3621+
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
3622+
// sank outside of the loop would keep the same order as they had in the
3623+
// original loop.
3624+
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
36203625
for (VPRecipeBase &R : Header->phis()) {
36213626
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3622-
fixReduction(ReductionPhi, State);
3623-
else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3627+
ReductionPHIList.emplace_back(ReductionPhi);
3628+
}
3629+
stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
3630+
const VPReductionPHIRecipe *R2) {
3631+
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
3632+
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
3633+
3634+
// If neither of the recipes has an intermediate store, keep the order the
3635+
// same.
3636+
if (!IS1 && !IS2)
3637+
return false;
3638+
3639+
// If only one of the recipes has an intermediate store, then move it
3640+
// towards the beginning of the list.
3641+
if (IS1 && !IS2)
3642+
return true;
3643+
3644+
if (!IS1 && IS2)
3645+
return false;
3646+
3647+
// If both recipes have an intermediate store, then the recipe with the
3648+
// later store should be processed earlier. So it should go to the beginning
3649+
// of the list.
3650+
return DT->dominates(IS2, IS1);
3651+
});
3652+
3653+
for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
3654+
fixReduction(ReductionPhi, State);
3655+
3656+
for (VPRecipeBase &R : Header->phis()) {
3657+
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
36243658
fixFixedOrderRecurrence(FOR, State);
36253659
}
36263660
}

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

+6-8
Original file line numberDiff line numberDiff line change
@@ -558,14 +558,13 @@ exit: ; preds = %for.body
558558
}
559559

560560
; Make sure that if there are several reductions in the loop, the order of invariant stores sank outside of the loop is preserved
561-
; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
562561
; See https://github.com/llvm/llvm-project/issues/64047
563562
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
564563
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
565564
; CHECK: middle.block:
566-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
565+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
567566
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
568-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
567+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
569568
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
570569
;
571570
entry:
@@ -619,14 +618,13 @@ exit:
619618
}
620619

621620
; Same as above but storing is done to two different pointers and they can be aliased
622-
; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
623621
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
624622
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
625623
; CHECK: middle.block:
626-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
627-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst2, align 4
628-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
629-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst1, align 4
624+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
625+
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
626+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
627+
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
630628
;
631629
entry:
632630
br label %for.body

0 commit comments

Comments
 (0)