Skip to content

Commit e970be9

Browse files
fhahn authored and yuxuanchen1997 committed
[LV] Ignore live-out users in cost model if scalar epilogue is required.
Summary: Follow-up to ba8126b. If a scalar epilogue is required, users outside the loop won't use live-outs from the vector loop but from the scalar epilogue. Ignore them if that is the case. This fixes another case where the VPlan-based cost-model more accurately computes cost. Fixes #100464. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250614
1 parent 60055ad commit e970be9

File tree

3 files changed

+119
-4
lines changed

3 files changed

+119
-4
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6678,6 +6678,15 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
66786678

66796679
SmallVector<Value *, 4> DeadInterleavePointerOps;
66806680
SmallVector<Value *, 4> DeadOps;
6681+
6682+
// If a scalar epilogue is required, users outside the loop won't use
6683+
// live-outs from the vector loop but from the scalar epilogue. Ignore them if
6684+
// that is the case.
6685+
bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6686+
auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6687+
return RequiresScalarEpilogue &&
6688+
!TheLoop->contains(cast<Instruction>(U)->getParent());
6689+
};
66816690
for (BasicBlock *BB : TheLoop->blocks())
66826691
for (Instruction &I : *BB) {
66836692
// Find all stores to invariant variables. Since they are going to sink
@@ -6693,8 +6702,9 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
66936702
// Add instructions that would be trivially dead and are only used by
66946703
// values already ignored to DeadOps to seed worklist.
66956704
if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6696-
all_of(I.users(), [this](User *U) {
6697-
return VecValuesToIgnore.contains(U) || ValuesToIgnore.contains(U);
6705+
all_of(I.users(), [this, IsLiveOutDead](User *U) {
6706+
return VecValuesToIgnore.contains(U) ||
6707+
ValuesToIgnore.contains(U) || IsLiveOutDead(U);
66986708
}))
66996709
DeadOps.push_back(&I);
67006710

@@ -6727,16 +6737,22 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
67276737

67286738
// Mark ops that would be trivially dead and are only used by ignored
67296739
// instructions as free.
6740+
BasicBlock *Header = TheLoop->getHeader();
67306741
for (unsigned I = 0; I != DeadOps.size(); ++I) {
67316742
auto *Op = dyn_cast<Instruction>(DeadOps[I]);
67326743
// Skip any op that shouldn't be considered dead.
67336744
if (!Op || !TheLoop->contains(Op) ||
6745+
(isa<PHINode>(Op) && Op->getParent() == Header) ||
67346746
!wouldInstructionBeTriviallyDead(Op, TLI) ||
6735-
any_of(Op->users(), [this](User *U) {
6736-
return !VecValuesToIgnore.contains(U) && !ValuesToIgnore.contains(U);
6747+
any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6748+
// A user keeps Op alive only if it is in neither ignore set and is not a dead live-out user.
return !VecValuesToIgnore.contains(U) && !ValuesToIgnore.contains(U) &&
6749+
!IsLiveOutDead(U);
67376750
}))
67386751
continue;
67396752

6753+
if (!TheLoop->contains(Op->getParent()))
6754+
continue;
6755+
67406756
// If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
67416757
// which applies for both scalar and vector versions. Otherwise it is only
67426758
// dead in vector versions, so only add it to VecValuesToIgnore.

llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,106 @@ loop:
8686
exit:
8787
ret void
8888
}
89+
90+
; Test case for https://github.com/llvm/llvm-project/issues/100464.
91+
; Loop with a live-out %l and scalar epilogue required due to an interleave
92+
; group. As the scalar epilogue is required the live-out is fed from the scalar
93+
; epilogue and dead in the vector loop.
94+
define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
95+
; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
96+
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
97+
; CHECK-NEXT: [[ENTRY:.*]]:
98+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
99+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
100+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
101+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
102+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
103+
; CHECK: [[VECTOR_MEMCHECK]]:
104+
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
105+
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
106+
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
107+
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
108+
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
109+
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
110+
; CHECK: [[VECTOR_PH]]:
111+
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
112+
; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
113+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
114+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
115+
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
116+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
117+
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
118+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
119+
; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 4
120+
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
121+
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP9]], zeroinitializer
122+
; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
123+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
124+
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
125+
; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4
126+
; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP13]]
127+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
128+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
129+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
130+
; CHECK: [[VECTOR_BODY]]:
131+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
132+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
133+
; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
134+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
135+
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
136+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]]
137+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
138+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
139+
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
140+
; CHECK: [[MIDDLE_BLOCK]]:
141+
; CHECK-NEXT: br label %[[SCALAR_PH]]
142+
; CHECK: [[SCALAR_PH]]:
143+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
144+
; CHECK-NEXT: br label %[[LOOP:.*]]
145+
; CHECK: [[LOOP]]:
146+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
147+
; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
148+
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
149+
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
150+
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IDXPROM]]
151+
; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
152+
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
153+
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
154+
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
155+
; CHECK: [[EXIT]]:
156+
; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
157+
; CHECK-NEXT: ret i8 [[R]]
158+
;
159+
entry:
160+
br label %loop
161+
162+
loop:
163+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
164+
%idxprom = sext i32 %iv to i64
165+
%gep.src = getelementptr i8, ptr %src, i64 %idxprom
166+
%l = load i8, ptr %gep.src, align 1
167+
%gep.dst = getelementptr i8, ptr %dst, i64 %idxprom
168+
store i8 0, ptr %gep.dst, align 1
169+
%iv.next = add i32 %iv, 4
170+
%cmp = icmp ult i32 %iv, 1001
171+
br i1 %cmp, label %loop, label %exit
172+
173+
exit:
174+
%r = phi i8 [ %l, %loop ]
175+
ret i8 %r
176+
}
177+
178+
89179
;.
90180
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
91181
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
92182
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
93183
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
184+
; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
185+
; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
186+
; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
187+
; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
188+
; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]}
189+
; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
190+
; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
94191
;.

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
1818
; CHECK-NEXT: LV: Found an induction variable.
1919
; CHECK-NEXT: LV: Did not find one integer induction var.
2020
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
21+
; CHECK-NEXT: LV: Loop does not require scalar epilogue
2122
; CHECK-NEXT: LV: Found trip count: 0
2223
; CHECK-NEXT: LV: Scalable vectorization is available
2324
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
@@ -222,6 +223,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
222223
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
223224
; CHECK-NEXT: LV: Did not find one integer induction var.
224225
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
226+
; CHECK-NEXT: LV: Loop does not require scalar epilogue
225227
; CHECK-NEXT: LV: Found trip count: 0
226228
; CHECK-NEXT: LV: Scalable vectorization is available
227229
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.

0 commit comments

Comments
 (0)