Skip to content

Commit b8b6ebb

Browse files
committed
[LV] Relax high loop trip count threshold for deciding to interleave the loop
The current loop trip count threshold to allow loop interleaving is 128, which seems arbitrarily high and uncorrelated with factors like VW, IC, register pressure, etc. A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#26), when tested on an AArch64 platform, shows that loop interleaving is beneficial even for loops with low trip counts. We have also found similar evidence in an application benchmark that, when compiled with PGO, shows a 40% regression when its hot loop, with a profile-guided trip count of 24, doesn't get interleaved because of this threshold. Therefore, it seems reasonable to eliminate this threshold and use the trip count for computing interleaving count instead (#73766).
1 parent cd28da3 commit b8b6ebb

19 files changed

+2285
-1566
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -267,11 +267,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267267
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268268
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
269269

270-
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271-
"tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272-
cl::desc("We don't interleave loops with a estimated constant trip count "
273-
"below this number"));
274-
275270
static cl::opt<unsigned> ForceTargetNumScalarRegs(
276271
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277272
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -5499,14 +5494,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
54995494

55005495
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
55015496
const bool HasReductions = !Legal->getReductionVars().empty();
5502-
// Do not interleave loops with a relatively small known or estimated trip
5503-
// count. But we will interleave when InterleaveSmallLoopScalarReduction is
5504-
// enabled, and the code has scalar reductions(HasReductions && VF = 1),
5505-
// because with the above conditions interleaving can expose ILP and break
5506-
// cross iteration dependences for reductions.
5507-
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5508-
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5509-
return 1;
55105497

55115498
// If we did not calculate the cost for VF (because the user selected the VF)
55125499
// then we calculate the cost of VF here.

llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ entry:
7979

8080

8181
; VECTORIZE: mul <4 x i32>
82+
; VECTORIZE: mul <4 x i32>
83+
; VECTORIZE-NOT: mul <4 x i32>
8284

8385
for.body: ; preds = %for.body, %entry
8486
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -326,34 +326,40 @@ define void @trunc_invariant_sdiv_result(i32 %a, i32 %b, ptr noalias %src, ptr %
326326
; CHECK: vector.body:
327327
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
328328
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
329+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16
329330
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
330-
; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
331-
; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
332-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
333-
; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP6]], align 2
334-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
335-
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
336-
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
331+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
332+
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
333+
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i16>
334+
; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i16> [[TMP2]], [[TMP5]]
335+
; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i16> [[TMP2]], [[TMP6]]
336+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
337+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i64 16
338+
; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP9]], align 2
339+
; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP10]], align 2
340+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
341+
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
342+
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
337343
; CHECK: middle.block:
338344
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
339345
; CHECK: vec.epilog.iter.check:
340346
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
341347
; CHECK: vec.epilog.ph:
342-
; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[INVAR_DIV]] to i16
343-
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[TMP8]], i64 0
344-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> poison, <4 x i32> zeroinitializer
348+
; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[INVAR_DIV]] to i16
349+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0
350+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> poison, <4 x i32> zeroinitializer
345351
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
346352
; CHECK: vec.epilog.vector.body:
347-
; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ 96, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
348-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX3]]
349-
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
350-
; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i16>
351-
; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i16> [[TMP10]], [[TMP12]]
352-
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX3]]
353-
; CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP14]], align 2
354-
; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX3]], 4
355-
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 100
356-
; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
353+
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ 96, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
354+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX4]]
355+
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
356+
; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16>
357+
; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i16> [[TMP14]], [[TMP16]]
358+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX4]]
359+
; CHECK-NEXT: store <4 x i16> [[TMP17]], ptr [[TMP18]], align 2
360+
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
361+
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 100
362+
; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
357363
; CHECK: vec.epilog.middle.block:
358364
; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
359365
; CHECK: vec.epilog.scalar.ph:

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2-
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
1+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
32

43
target triple = "aarch64-linux-gnu"
54

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2-
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
1+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
32

43
target triple = "aarch64-linux-gnu"
54

0 commit comments

Comments
 (0)