-
Notifications
You must be signed in to change notification settings - Fork 344
[MicroBenchmark,LoopInterleaving] Check performance impact of Loop Interleaving Count with varying loop iterations #26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
nilanjana87
merged 11 commits into
llvm:main
from
nilanjana87:loop_interleaving_microbenchmark
Nov 17, 2023
Merged
Changes from 1 commit
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
2b2eb4d
[MicroBenchmarks,LoopInterleaving] Check performance impact of Loop I…
nilanjana87 3a22381
As per reviewer comments, added comments explaining test cases, remov…
nilanjana87 418ec55
Merge branch 'llvm:main' into loop_interleaving_microbenchmark
nilanjana87 66ec798
Replacing functions with same functionality but different pragma-base…
nilanjana87 cb3a077
Ran clang-format
nilanjana87 379381b
Renaming functions to be more meaningful
nilanjana87 eb33e98
Added benchmarks for testing impact of loop interleaving for loops wi…
nilanjana87 9f5bda4
Fixed some of the code comments & ran clang-format
nilanjana87 107bc87
Made a separate executable for Loop Interleaving microbenchmark
nilanjana87 802ca40
Force disable loop unrolling for the auto-vectorization cases
nilanjana87 398bc8c
Ran clang-format
nilanjana87 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,313 @@ | ||
// This program measures the performance impact of the interleave count as the | ||
// loop iteration count varies, for different kinds of loops: loops with or | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// without a reduction, and loops with different vectorization widths. | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#include <iostream> | ||
#include <limits> | ||
#include <memory> | ||
#include <random> | ||
|
||
#include "benchmark/benchmark.h" | ||
|
||
// Number of int elements in each of the global arrays A, B and C. | ||
#define ELEMENTS 2048 | ||
// Requests 16-byte alignment for the declaration it is attached to. | ||
#define ALIGNED16 __attribute__((aligned(16))) | ||
|
||
// Random engine used by init_data(); default-constructed here. | ||
static std::mt19937 rng; | ||
// Accumulates the values returned by the reduction kernels (see | ||
// benchForWithReductionLoopInterleaveThreshold) -- presumably so that the | ||
// reduction results stay observable to the compiler; TODO confirm. | ||
unsigned int g_sum = 0; | ||
|
||
// Global data for the kernels: the non-reduction loops store B[J] + C[J] into | ||
// A[J], and the reduction loops sum the elements of A. | ||
int A[ELEMENTS] ALIGNED16; | ||
int B[ELEMENTS] ALIGNED16; | ||
int C[ELEMENTS] ALIGNED16; | ||
|
||
// Initialize arrays with random numbers. | ||
static void init_data(unsigned N) { | ||
std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(), | ||
std::numeric_limits<int>::max()); | ||
for (unsigned I = 0; I < N; I++) { | ||
A[I] = distrib(rng); | ||
B[I] = distrib(rng); | ||
C[I] = distrib(rng); | ||
} | ||
} | ||
|
||
static void __attribute__((noinline)) loopWithVW4IC1(int Iterations) { | ||
#pragma clang loop vectorize_width(4) interleave(disable) | ||
for (int J = 0; J < Iterations; J++) { | ||
A[J] = B[J] + C[J]; | ||
} | ||
} | ||
|
||
static void __attribute__((noinline)) loopWithVW4IC2(int Iterations) { | ||
#pragma clang loop vectorize_width(4) interleave_count(2) | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int J = 0; J < Iterations; J++) { | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
A[J] = B[J] + C[J]; | ||
} | ||
} | ||
|
||
static void __attribute__((noinline)) loopWithVW4IC4(int Iterations) { | ||
#pragma clang loop vectorize_width(4) interleave_count(4) | ||
for (int J = 0; J < Iterations; J++) { | ||
A[J] = B[J] + C[J]; | ||
} | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
loopWithReductionWithVW4IC1(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(4) interleave(disable) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
loopWithReductionWithVW4IC2(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(4) interleave_count(2) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
loopWithReductionWithVW4IC4(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(4) interleave_count(4) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
loopWithReductionWithVW1IC1(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(1) interleave_count(1) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
loopWithReductionWithVW1IC2(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(1) interleave_count(2) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static int __attribute__((noinline)) | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
loopWithReductionWithVW1IC4(int Iterations) { | ||
unsigned sum = 0; | ||
#pragma clang loop vectorize_width(1) interleave_count(4) | ||
for (int J = 0; J < Iterations; J++) { | ||
sum += A[J]; | ||
} | ||
return sum; | ||
} | ||
|
||
static void __attribute__((always_inline)) | ||
benchForLoopInterleaveThreshold(benchmark::State &state, void (*Fn)(int), | ||
int Iterations) { | ||
std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(), | ||
nilanjana87 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
std::numeric_limits<int>::max()); | ||
init_data(ELEMENTS); | ||
for (auto _ : state) { | ||
benchmark::DoNotOptimize(A); | ||
benchmark::DoNotOptimize(B); | ||
benchmark::DoNotOptimize(C); | ||
benchmark::ClobberMemory(); | ||
Fn(Iterations); | ||
} | ||
} | ||
|
||
static void __attribute__((always_inline)) | ||
benchForWithReductionLoopInterleaveThreshold(benchmark::State &state, | ||
int (*Fn)(int), int Iterations) { | ||
std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(), | ||
std::numeric_limits<int>::max()); | ||
init_data(ELEMENTS); | ||
for (auto _ : state) { | ||
benchmark::DoNotOptimize(A); | ||
benchmark::DoNotOptimize(B); | ||
benchmark::DoNotOptimize(C); | ||
benchmark::ClobberMemory(); | ||
g_sum += Fn(Iterations); | ||
} | ||
} | ||
|
||
// Helper for ADD_BENCHMARK: defines and registers one benchmark that runs the
// non-reduction kernel loopWithVW4IC<Count> with a trip count of Itr. The
// generated function is named benchForIC<Count>VW4LoopTC<Itr>.
#define ADD_PLAIN_LOOP_BENCHMARK(Count, Itr)                                   \
  void benchForIC##Count##VW4LoopTC##Itr(benchmark::State &state) {            \
    benchForLoopInterleaveThreshold(state, &loopWithVW4IC##Count, Itr);        \
  }                                                                            \
  BENCHMARK(benchForIC##Count##VW4LoopTC##Itr);

// Helper for ADD_BENCHMARK: defines and registers one benchmark that runs the
// reduction kernel loopWithReductionWithVW<Width>IC<Count> with a trip count of
// Itr. The generated function is named
// benchForIC<Count>VW<Width>LoopWithReductionTC<Itr>.
#define ADD_REDUCTION_LOOP_BENCHMARK(Count, Width, Itr)                        \
  void benchForIC##Count##VW##Width##LoopWithReductionTC##Itr(                 \
      benchmark::State &state) {                                               \
    benchForWithReductionLoopInterleaveThreshold(                              \
        state, &loopWithReductionWithVW##Width##IC##Count, Itr);               \
  }                                                                            \
  BENCHMARK(benchForIC##Count##VW##Width##LoopWithReductionTC##Itr);

// Defines and registers the nine benchmarks for one trip count Itr, in this
// order: the width-4 add kernels (interleave count 1, 2, 4), the width-4
// reduction kernels (1, 2, 4), then the width-1 reduction kernels (1, 2, 4).
#define ADD_BENCHMARK(Itr)                                                     \
  ADD_PLAIN_LOOP_BENCHMARK(1, Itr)                                             \
  ADD_PLAIN_LOOP_BENCHMARK(2, Itr)                                             \
  ADD_PLAIN_LOOP_BENCHMARK(4, Itr)                                             \
  ADD_REDUCTION_LOOP_BENCHMARK(1, 4, Itr)                                      \
  ADD_REDUCTION_LOOP_BENCHMARK(2, 4, Itr)                                      \
  ADD_REDUCTION_LOOP_BENCHMARK(4, 4, Itr)                                      \
  ADD_REDUCTION_LOOP_BENCHMARK(1, 1, Itr)                                      \
  ADD_REDUCTION_LOOP_BENCHMARK(2, 1, Itr)                                      \
  ADD_REDUCTION_LOOP_BENCHMARK(4, 1, Itr)
|
||
// Register the full benchmark set for every loop trip count from 1 to 128. | ||
// Each ADD_BENCHMARK(N) expands to nine benchmark functions whose kernels are | ||
// called with N as the loop iteration count. | ||
ADD_BENCHMARK(1) | ||
ADD_BENCHMARK(2) | ||
ADD_BENCHMARK(3) | ||
ADD_BENCHMARK(4) | ||
ADD_BENCHMARK(5) | ||
ADD_BENCHMARK(6) | ||
ADD_BENCHMARK(7) | ||
ADD_BENCHMARK(8) | ||
ADD_BENCHMARK(9) | ||
ADD_BENCHMARK(10) | ||
ADD_BENCHMARK(11) | ||
ADD_BENCHMARK(12) | ||
ADD_BENCHMARK(13) | ||
ADD_BENCHMARK(14) | ||
ADD_BENCHMARK(15) | ||
ADD_BENCHMARK(16) | ||
ADD_BENCHMARK(17) | ||
ADD_BENCHMARK(18) | ||
ADD_BENCHMARK(19) | ||
ADD_BENCHMARK(20) | ||
ADD_BENCHMARK(21) | ||
ADD_BENCHMARK(22) | ||
ADD_BENCHMARK(23) | ||
ADD_BENCHMARK(24) | ||
ADD_BENCHMARK(25) | ||
ADD_BENCHMARK(26) | ||
ADD_BENCHMARK(27) | ||
ADD_BENCHMARK(28) | ||
ADD_BENCHMARK(29) | ||
ADD_BENCHMARK(30) | ||
ADD_BENCHMARK(31) | ||
ADD_BENCHMARK(32) | ||
ADD_BENCHMARK(33) | ||
ADD_BENCHMARK(34) | ||
ADD_BENCHMARK(35) | ||
ADD_BENCHMARK(36) | ||
ADD_BENCHMARK(37) | ||
ADD_BENCHMARK(38) | ||
ADD_BENCHMARK(39) | ||
ADD_BENCHMARK(40) | ||
ADD_BENCHMARK(41) | ||
ADD_BENCHMARK(42) | ||
ADD_BENCHMARK(43) | ||
ADD_BENCHMARK(44) | ||
ADD_BENCHMARK(45) | ||
ADD_BENCHMARK(46) | ||
ADD_BENCHMARK(47) | ||
ADD_BENCHMARK(48) | ||
ADD_BENCHMARK(49) | ||
ADD_BENCHMARK(50) | ||
ADD_BENCHMARK(51) | ||
ADD_BENCHMARK(52) | ||
ADD_BENCHMARK(53) | ||
ADD_BENCHMARK(54) | ||
ADD_BENCHMARK(55) | ||
ADD_BENCHMARK(56) | ||
ADD_BENCHMARK(57) | ||
ADD_BENCHMARK(58) | ||
ADD_BENCHMARK(59) | ||
ADD_BENCHMARK(60) | ||
ADD_BENCHMARK(61) | ||
ADD_BENCHMARK(62) | ||
ADD_BENCHMARK(63) | ||
ADD_BENCHMARK(64) | ||
ADD_BENCHMARK(65) | ||
ADD_BENCHMARK(66) | ||
ADD_BENCHMARK(67) | ||
ADD_BENCHMARK(68) | ||
ADD_BENCHMARK(69) | ||
ADD_BENCHMARK(70) | ||
ADD_BENCHMARK(71) | ||
ADD_BENCHMARK(72) | ||
ADD_BENCHMARK(73) | ||
ADD_BENCHMARK(74) | ||
ADD_BENCHMARK(75) | ||
ADD_BENCHMARK(76) | ||
ADD_BENCHMARK(77) | ||
ADD_BENCHMARK(78) | ||
ADD_BENCHMARK(79) | ||
ADD_BENCHMARK(80) | ||
ADD_BENCHMARK(81) | ||
ADD_BENCHMARK(82) | ||
ADD_BENCHMARK(83) | ||
ADD_BENCHMARK(84) | ||
ADD_BENCHMARK(85) | ||
ADD_BENCHMARK(86) | ||
ADD_BENCHMARK(87) | ||
ADD_BENCHMARK(88) | ||
ADD_BENCHMARK(89) | ||
ADD_BENCHMARK(90) | ||
ADD_BENCHMARK(91) | ||
ADD_BENCHMARK(92) | ||
ADD_BENCHMARK(93) | ||
ADD_BENCHMARK(94) | ||
ADD_BENCHMARK(95) | ||
ADD_BENCHMARK(96) | ||
ADD_BENCHMARK(97) | ||
ADD_BENCHMARK(98) | ||
ADD_BENCHMARK(99) | ||
ADD_BENCHMARK(100) | ||
ADD_BENCHMARK(101) | ||
ADD_BENCHMARK(102) | ||
ADD_BENCHMARK(103) | ||
ADD_BENCHMARK(104) | ||
ADD_BENCHMARK(105) | ||
ADD_BENCHMARK(106) | ||
ADD_BENCHMARK(107) | ||
ADD_BENCHMARK(108) | ||
ADD_BENCHMARK(109) | ||
ADD_BENCHMARK(110) | ||
ADD_BENCHMARK(111) | ||
ADD_BENCHMARK(112) | ||
ADD_BENCHMARK(113) | ||
ADD_BENCHMARK(114) | ||
ADD_BENCHMARK(115) | ||
ADD_BENCHMARK(116) | ||
ADD_BENCHMARK(117) | ||
ADD_BENCHMARK(118) | ||
ADD_BENCHMARK(119) | ||
ADD_BENCHMARK(120) | ||
ADD_BENCHMARK(121) | ||
ADD_BENCHMARK(122) | ||
ADD_BENCHMARK(123) | ||
ADD_BENCHMARK(124) | ||
ADD_BENCHMARK(125) | ||
ADD_BENCHMARK(126) | ||
ADD_BENCHMARK(127) | ||
ADD_BENCHMARK(128) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.