Commit 4fd1373

[Backport 8.x] benchmark script (#17283)

This commit cherry-picks the missing benchmark script PRs. The deprecated artifacts-api is removed.

- [CI] benchmark uses the new artifacts-api (#17224)
- [CI] benchmark readme (#16783)
- Introduce a new flag to explicitly permit legacy monitoring (#16586) (only take the benchmark script)
- [ci] fix wrong queue type in benchmark marathon (#16465)
- [CI] fix benchmark marathon (#16447)
- [CI] benchmark dashboard and pipeline for testing against multiple versions (#16421)
1 parent fde903c

File tree: 13 files changed, +648 −272 lines changed

Diff for: .buildkite/benchmark_marathon_pipeline.yml (+11)

@@ -0,0 +1,11 @@
agents:
  provider: gcp
  imageProject: elastic-images-prod
  image: family/platform-ingest-logstash-ubuntu-2204
  machineType: "n2-standard-16"
  diskSizeGb: 100
  diskType: pd-ssd

steps:
  - label: "Benchmark Marathon"
    command: .buildkite/scripts/benchmark/marathon.sh

Diff for: .buildkite/scripts/benchmark/README.md (+22)

@@ -0,0 +1,22 @@
## Steps to set up GCP instance to run benchmark script
- Create an instance "n2-standard-16" with Ubuntu image
- Install docker
  - `sudo snap install docker`
  - `sudo usermod -a -G docker $USER`
- Install jq
- Install vault
  - `sudo snap install vault`
  - `vault login --method github`
  - `vault kv get -format json secret/ci/elastic-logstash/benchmark`
- Setup Elasticsearch index mapping and alias with `setup/*`
- Import Kibana dashboard with `save-objects/*`
- Run the benchmark script
  - Send data to your own Elasticsearch. Customise `VAULT_PATH="secret/ci/elastic-logstash/your/path"`
  - Run the script `main.sh`
  - or run in background `nohup bash -x main.sh > log.log 2>&1 &`

## Notes
- Benchmarks should only be compared using the same hardware setup.
- Please do not send the test metrics to the benchmark cluster. You can set `VAULT_PATH` to send data and metrics to your own server.
- Run `all.sh` as calibration, which gives you a baseline of performance across different versions.
- [#16586](https://github.com/elastic/logstash/pull/16586) allows legacy monitoring using the configuration `xpack.monitoring.allow_legacy_collection: true`, which is not recognized in version 8. To run benchmarks in version 8, use the script of the corresponding branch (e.g. `8.16`) instead of `main` in buildkite.
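The README above runs the benchmark through `main.sh`, which is not part of this excerpt; assuming it forwards its positional arguments and environment to `parse_args` in `core.sh` (shown further down in this diff), a minimal invocation sketch could look like the following. The argument order `[FB_CNT] [QTYPE] [CPU] [MEM]` and the `MULTIPLIERS`, `BATCH_SIZES`, `TAGS`, `LS_VERSION` and `VAULT_PATH` variables are taken from `core.sh`; everything else here is illustrative.

```bash
# Hypothetical invocations; main.sh itself is not shown in this diff, so the
# exact wiring to core.sh's parse_args is an assumption.

# Defaults from core.sh: 4 filebeat containers, both queue types, 4 CPUs, 4 GB memory
bash main.sh

# 2 filebeat containers, persisted queue only, 8 CPUs, 16 GB memory,
# sweeping worker multipliers 1 and 2 with batch sizes 500 and 1000,
# pinning the Logstash image, tagging the summary documents,
# and sending results to your own cluster via a custom vault path
MULTIPLIERS="1,2" BATCH_SIZES="500,1000" LS_VERSION="8.16.0" TAGS="baseline,weekly" \
  VAULT_PATH="secret/ci/elastic-logstash/your/path" \
  bash main.sh 2 persisted 8 16

# run in the background and keep a trace log, as suggested in the README
nohup bash -x main.sh > log.log 2>&1 &
```

Note that `TAGS`, `MULTIPLIERS` and `BATCH_SIZES` are all split on commas in `parse_args` (IFS is set to `,` before the `read -ra` calls), so comma-separated values are what the script expects.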

Diff for: .buildkite/scripts/benchmark/config/logstash.yml (+1)

@@ -3,6 +3,7 @@ pipeline.workers: ${WORKER}
 pipeline.batch.size: ${BATCH_SIZE}
 queue.type: ${QTYPE}
 
+xpack.monitoring.allow_legacy_collection: true
 xpack.monitoring.enabled: true
 xpack.monitoring.elasticsearch.username: ${MONITOR_ES_USER}
 xpack.monitoring.elasticsearch.password: ${MONITOR_ES_PW}

Diff for: .buildkite/scripts/benchmark/config/uuid (+1)

@@ -0,0 +1 @@
f74f1a28-25e9-494f-ba41-ca9f13d4446d

Diff for: .buildkite/scripts/benchmark/core.sh (+315)

@@ -0,0 +1,315 @@
#!/usr/bin/env bash
set -eo pipefail

SCRIPT_PATH="$(dirname "${BASH_SOURCE[0]}")"
CONFIG_PATH="$SCRIPT_PATH/config"
source "$SCRIPT_PATH/util.sh"

usage() {
  echo "Usage: $0 [FB_CNT] [QTYPE] [CPU] [MEM]"
  echo "Example: $0 4 {persisted|memory|all} 2 2"
  exit 1
}

parse_args() {
  while [[ "$#" -gt 0 ]]; do
    if [ -z "$FB_CNT" ]; then
      FB_CNT=$1
    elif [ -z "$QTYPE" ]; then
      case $1 in
        all | persisted | memory)
          QTYPE=$1
          ;;
        *)
          echo "Error: wrong queue type $1"
          usage
          ;;
      esac
    elif [ -z "$CPU" ]; then
      CPU=$1
    elif [ -z "$MEM" ]; then
      MEM=$1
    else
      echo "Error: Too many arguments"
      usage
    fi
    shift
  done

  # set default value
  # number of filebeat
  FB_CNT=${FB_CNT:-4}
  # all | persisted | memory
  QTYPE=${QTYPE:-all}
  CPU=${CPU:-4}
  MEM=${MEM:-4}
  XMX=$((MEM / 2))

  IFS=','
  # worker multiplier: 1,2,4
  MULTIPLIERS="${MULTIPLIERS:-1,2,4}"
  read -ra MULTIPLIERS <<< "$MULTIPLIERS"
  BATCH_SIZES="${BATCH_SIZES:-500}"
  read -ra BATCH_SIZES <<< "$BATCH_SIZES"
  # tags to json array
  read -ra TAG_ARRAY <<< "$TAGS"
  JSON_TAGS=$(printf '"%s",' "${TAG_ARRAY[@]}" | sed 's/,$//')
  JSON_TAGS="[$JSON_TAGS]"

  IFS=' '
  echo "filebeats: $FB_CNT, cpu: $CPU, mem: $MEM, Queue: $QTYPE, worker multiplier: ${MULTIPLIERS[@]}, batch size: ${BATCH_SIZES[@]}"
}

get_secret() {
  VAULT_PATH=${VAULT_PATH:-secret/ci/elastic-logstash/benchmark}
  VAULT_DATA=$(vault kv get -format json $VAULT_PATH)
  BENCHMARK_ES_HOST=$(echo $VAULT_DATA | jq -r '.data.es_host')
  BENCHMARK_ES_USER=$(echo $VAULT_DATA | jq -r '.data.es_user')
  BENCHMARK_ES_PW=$(echo $VAULT_DATA | jq -r '.data.es_pw')

  MONITOR_ES_HOST=$(echo $VAULT_DATA | jq -r '.data.monitor_es_host')
  MONITOR_ES_USER=$(echo $VAULT_DATA | jq -r '.data.monitor_es_user')
  MONITOR_ES_PW=$(echo $VAULT_DATA | jq -r '.data.monitor_es_pw')
}

pull_images() {
  echo "--- Pull docker images"

  if [[ -n "$LS_VERSION" ]]; then
    # pull image if it doesn't exist in local
    [[ -z $(docker images -q docker.elastic.co/logstash/logstash:$LS_VERSION) ]] && docker pull "docker.elastic.co/logstash/logstash:$LS_VERSION"
  else
    # pull the latest snapshot logstash image
    # select the SNAPSHOT artifact with the highest semantic version number
    LS_VERSION=$( curl --retry-all-errors --retry 5 --retry-delay 1 -s "https://storage.googleapis.com/artifacts-api/snapshots/main.json" | jq -r '.version' )
    BUILD_ID=$(curl --retry-all-errors --retry 5 --retry-delay 1 -s "https://storage.googleapis.com/artifacts-api/snapshots/main.json" | jq -r '.build_id')
    ARCH=$(arch)
    IMAGE_URL="https://snapshots.elastic.co/${BUILD_ID}/downloads/logstash/logstash-$LS_VERSION-docker-image-$ARCH.tar.gz"
    IMAGE_FILENAME="$LS_VERSION.tar.gz"

    echo "Download $LS_VERSION from $IMAGE_URL"
    [[ ! -e $IMAGE_FILENAME ]] && curl -fsSL --retry-max-time 60 --retry 3 --retry-delay 5 -o "$IMAGE_FILENAME" "$IMAGE_URL"
    [[ -z $(docker images -q docker.elastic.co/logstash/logstash:$LS_VERSION) ]] && docker load -i "$IMAGE_FILENAME"
  fi

  # pull filebeat image
  FB_DEFAULT_VERSION="8.13.4"
  FB_VERSION=${FB_VERSION:-$FB_DEFAULT_VERSION}
  docker pull "docker.elastic.co/beats/filebeat:$FB_VERSION"
}

generate_logs() {
  FLOG_FILE_CNT=${FLOG_FILE_CNT:-4}
  SINGLE_SIZE=524288000
  TOTAL_SIZE="$((FLOG_FILE_CNT * SINGLE_SIZE))"
  FLOG_PATH="$SCRIPT_PATH/flog"
  mkdir -p $FLOG_PATH

  if [[ ! -e "$FLOG_PATH/log${FLOG_FILE_CNT}.log" ]]; then
    echo "--- Generate logs in background. log: ${FLOG_FILE_CNT}, each size: 500mb"
    docker run -d --name=flog --rm -v $FLOG_PATH:/go/src/data mingrammer/flog -t log -w -o "/go/src/data/log.log" -b $TOTAL_SIZE -p $SINGLE_SIZE
  fi
}

check_logs() {
  echo "--- Check log generation"

  local cnt=0
  until [[ -e "$FLOG_PATH/log${FLOG_FILE_CNT}.log" || $cnt -gt 600 ]]; do
    echo "wait 30s" && sleep 30
    cnt=$((cnt + 30))
  done

  ls -lah $FLOG_PATH
}

start_logstash() {
  LS_CONFIG_PATH=$SCRIPT_PATH/ls/config
  mkdir -p $LS_CONFIG_PATH

  cp $CONFIG_PATH/pipelines.yml $LS_CONFIG_PATH/pipelines.yml
  cp $CONFIG_PATH/logstash.yml $LS_CONFIG_PATH/logstash.yml
  cp $CONFIG_PATH/uuid $LS_CONFIG_PATH/uuid

  LS_JAVA_OPTS=${LS_JAVA_OPTS:--Xmx${XMX}g}
  docker run -d --name=ls --net=host --cpus=$CPU --memory=${MEM}g -e LS_JAVA_OPTS="$LS_JAVA_OPTS" \
    -e QTYPE="$QTYPE" -e WORKER="$WORKER" -e BATCH_SIZE="$BATCH_SIZE" \
    -e BENCHMARK_ES_HOST="$BENCHMARK_ES_HOST" -e BENCHMARK_ES_USER="$BENCHMARK_ES_USER" -e BENCHMARK_ES_PW="$BENCHMARK_ES_PW" \
    -e MONITOR_ES_HOST="$MONITOR_ES_HOST" -e MONITOR_ES_USER="$MONITOR_ES_USER" -e MONITOR_ES_PW="$MONITOR_ES_PW" \
    -v $LS_CONFIG_PATH/logstash.yml:/usr/share/logstash/config/logstash.yml:ro \
    -v $LS_CONFIG_PATH/pipelines.yml:/usr/share/logstash/config/pipelines.yml:ro \
    -v $LS_CONFIG_PATH/uuid:/usr/share/logstash/data/uuid:ro \
    docker.elastic.co/logstash/logstash:$LS_VERSION
}

start_filebeat() {
  for ((i = 0; i < FB_CNT; i++)); do
    FB_PATH="$SCRIPT_PATH/fb${i}"
    mkdir -p $FB_PATH

    cp $CONFIG_PATH/filebeat.yml $FB_PATH/filebeat.yml

    docker run -d --name=fb$i --net=host --user=root \
      -v $FB_PATH/filebeat.yml:/usr/share/filebeat/filebeat.yml \
      -v $SCRIPT_PATH/flog:/usr/share/filebeat/flog \
      docker.elastic.co/beats/filebeat:$FB_VERSION filebeat -e --strict.perms=false
  done
}

capture_stats() {
  CURRENT=$(jq -r '.flow.output_throughput.current' $NS_JSON)
  local eps_1m=$(jq -r '.flow.output_throughput.last_1_minute' $NS_JSON)
  local eps_5m=$(jq -r '.flow.output_throughput.last_5_minutes' $NS_JSON)
  local worker_util=$(jq -r '.pipelines.main.flow.worker_utilization.last_1_minute' $NS_JSON)
  local worker_concurr=$(jq -r '.pipelines.main.flow.worker_concurrency.last_1_minute' $NS_JSON)
  local cpu_percent=$(jq -r '.process.cpu.percent' $NS_JSON)
  local heap=$(jq -r '.jvm.mem.heap_used_in_bytes' $NS_JSON)
  local non_heap=$(jq -r '.jvm.mem.non_heap_used_in_bytes' $NS_JSON)
  local q_event_cnt=$(jq -r '.pipelines.main.queue.events_count' $NS_JSON)
  local q_size=$(jq -r '.pipelines.main.queue.queue_size_in_bytes' $NS_JSON)
  TOTAL_EVENTS_OUT=$(jq -r '.pipelines.main.events.out' $NS_JSON)
  printf "current: %s, 1m: %s, 5m: %s, worker_utilization: %s, worker_concurrency: %s, cpu: %s, heap: %s, non-heap: %s, q_events: %s, q_size: %s, total_events_out: %s\n" \
    $CURRENT $eps_1m $eps_5m $worker_util $worker_concurr $cpu_percent $heap $non_heap $q_event_cnt $q_size $TOTAL_EVENTS_OUT
}

aggregate_stats() {
  local file_glob="$SCRIPT_PATH/$NS_DIR/${QTYPE:0:1}_w${WORKER}b${BATCH_SIZE}_*.json"
  MAX_EPS_1M=$( jqmax '.flow.output_throughput.last_1_minute' "$file_glob" )
  MAX_EPS_5M=$( jqmax '.flow.output_throughput.last_5_minutes' "$file_glob" )
  MAX_WORKER_UTIL=$( jqmax '.pipelines.main.flow.worker_utilization.last_1_minute' "$file_glob" )
  MAX_WORKER_CONCURR=$( jqmax '.pipelines.main.flow.worker_concurrency.last_1_minute' "$file_glob" )
  MAX_Q_EVENT_CNT=$( jqmax '.pipelines.main.queue.events_count' "$file_glob" )
  MAX_Q_SIZE=$( jqmax '.pipelines.main.queue.queue_size_in_bytes' "$file_glob" )

  AVG_CPU_PERCENT=$( jqavg '.process.cpu.percent' "$file_glob" )
  AVG_VIRTUAL_MEM=$( jqavg '.process.mem.total_virtual_in_bytes' "$file_glob" )
  AVG_HEAP=$( jqavg '.jvm.mem.heap_used_in_bytes' "$file_glob" )
  AVG_NON_HEAP=$( jqavg '.jvm.mem.non_heap_used_in_bytes' "$file_glob" )
}

send_summary() {
  echo "--- Send summary to Elasticsearch"

  # build json
  local timestamp
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%S")
  SUMMARY="{\"timestamp\": \"$timestamp\", \"version\": \"$LS_VERSION\", \"cpu\": \"$CPU\", \"mem\": \"$MEM\", \"workers\": \"$WORKER\", \"batch_size\": \"$BATCH_SIZE\", \"queue_type\": \"$QTYPE\""
  not_empty "$TOTAL_EVENTS_OUT" && SUMMARY="$SUMMARY, \"total_events_out\": \"$TOTAL_EVENTS_OUT\""
  not_empty "$MAX_EPS_1M" && SUMMARY="$SUMMARY, \"max_eps_1m\": \"$MAX_EPS_1M\""
  not_empty "$MAX_EPS_5M" && SUMMARY="$SUMMARY, \"max_eps_5m\": \"$MAX_EPS_5M\""
  not_empty "$MAX_WORKER_UTIL" && SUMMARY="$SUMMARY, \"max_worker_utilization\": \"$MAX_WORKER_UTIL\""
  not_empty "$MAX_WORKER_CONCURR" && SUMMARY="$SUMMARY, \"max_worker_concurrency\": \"$MAX_WORKER_CONCURR\""
  not_empty "$AVG_CPU_PERCENT" && SUMMARY="$SUMMARY, \"avg_cpu_percentage\": \"$AVG_CPU_PERCENT\""
  not_empty "$AVG_HEAP" && SUMMARY="$SUMMARY, \"avg_heap\": \"$AVG_HEAP\""
  not_empty "$AVG_NON_HEAP" && SUMMARY="$SUMMARY, \"avg_non_heap\": \"$AVG_NON_HEAP\""
  not_empty "$AVG_VIRTUAL_MEM" && SUMMARY="$SUMMARY, \"avg_virtual_memory\": \"$AVG_VIRTUAL_MEM\""
  not_empty "$MAX_Q_EVENT_CNT" && SUMMARY="$SUMMARY, \"max_queue_events\": \"$MAX_Q_EVENT_CNT\""
  not_empty "$MAX_Q_SIZE" && SUMMARY="$SUMMARY, \"max_queue_bytes_size\": \"$MAX_Q_SIZE\""
  not_empty "$TAGS" && SUMMARY="$SUMMARY, \"tags\": $JSON_TAGS"
  SUMMARY="$SUMMARY}"

  tee summary.json << EOF
{"index": {}}
$SUMMARY
EOF

  # send to ES
  local resp
  local err_status
  resp=$(curl -s -X POST -u "$BENCHMARK_ES_USER:$BENCHMARK_ES_PW" "$BENCHMARK_ES_HOST/benchmark_summary/_bulk" -H 'Content-Type: application/json' --data-binary @"summary.json")
  echo "$resp"
  err_status=$(echo "$resp" | jq -r ".errors")
  if [[ "$err_status" == "true" ]]; then
    echo "Failed to send summary"
    exit 1
  fi
}

# $1: snapshot index
node_stats() {
  NS_JSON="$SCRIPT_PATH/$NS_DIR/${QTYPE:0:1}_w${WORKER}b${BATCH_SIZE}_$1.json" # m_w8b1000_0.json

  # curl inside container because docker on mac cannot resolve localhost to host network interface
  docker exec -i ls curl localhost:9600/_node/stats > "$NS_JSON" 2> /dev/null
}

# $1: index
snapshot() {
  node_stats $1
  capture_stats
}

create_directory() {
  NS_DIR="fb${FB_CNT}c${CPU}m${MEM}" # fb4c4m4
  mkdir -p "$SCRIPT_PATH/$NS_DIR"
}

queue() {
  for QTYPE in "persisted" "memory"; do
    worker
  done
}

worker() {
  for m in "${MULTIPLIERS[@]}"; do
    WORKER=$((CPU * m))
    batch
  done
}

batch() {
  for BATCH_SIZE in "${BATCH_SIZES[@]}"; do
    run_pipeline
    stop_pipeline
  done
}

run_pipeline() {
  echo "--- Run pipeline. queue type: $QTYPE, worker: $WORKER, batch size: $BATCH_SIZE"

  start_logstash
  start_filebeat
  docker ps

  echo "(0) sleep 3m" && sleep 180
  snapshot "0"

  for i in {1..8}; do
    echo "($i) sleep 30s" && sleep 30
    snapshot "$i"

    # print docker log when ingestion rate is zero
    # remove '.' in number and return max val
    [[ $(max -g "${CURRENT/./}" "0") -eq 0 ]] &&
      docker logs fb0 &&
      docker logs ls
  done

  aggregate_stats
  send_summary
}

stop_pipeline() {
  echo "--- Stop Pipeline"

  for ((i = 0; i < FB_CNT; i++)); do
    docker stop fb$i
    docker rm fb$i
  done

  docker stop ls
  docker rm ls

  curl -u "$BENCHMARK_ES_USER:$BENCHMARK_ES_PW" -X DELETE $BENCHMARK_ES_HOST/_data_stream/logs-generic-default
  echo " data stream deleted "

  # TODO: clean page caches, reduce memory fragmentation
  # https://github.com/elastic/logstash/pull/16191#discussion_r1647050216
}

clean_up() {
  # stop log generation if it has not done yet
  [[ -n $(docker ps | grep flog) ]] && docker stop flog || true
  # remove image
  docker image rm docker.elastic.co/logstash/logstash:$LS_VERSION
}
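`core.sh` sources `util.sh` at the top and relies on its helpers (`jqavg`, `jqmax`, `not_empty`, `max`), but that file is not shown in this excerpt. Purely as an illustration of what those helpers plausibly do over the node-stats snapshot files, a rough sketch might look like the following; it is an assumption, not the actual implementation.

```bash
#!/usr/bin/env bash
# Illustrative sketch only -- the real util.sh is not included in this diff.

# jqavg <jq path> <file glob>: average of a field across node-stats snapshots,
# ignoring nulls. $2 is left unquoted so the glob expands to the files.
jqavg() {
  jq -s "[.[] | $1 | select(. != null)] | add / length" $2
}

# jqmax <jq path> <file glob>: maximum of a field across node-stats snapshots
jqmax() {
  jq -s "[.[] | $1 | select(. != null)] | max" $2
}

# not_empty <value>: succeeds when the argument is a non-empty string
not_empty() {
  [[ -n "$1" ]]
}

# max [-g] <a> <b>: numeric maximum of two values (core.sh calls `max -g ... 0`)
max() {
  if [[ "$1" == "-g" ]]; then shift; fi
  printf '%s\n%s\n' "$1" "$2" | sort -g | tail -n 1
}
```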
