Skip to content

Commit 0b2a75b

Browse files
authored
[EP Perf] Add concurrency test (#19804)
### Description
<!-- Describe your changes. -->
* Add a concurrency test to the EP Perf CI panel (implemented with `onnx_test_runner`)
* Model: the FasterRCNN-10 model within the CI image
* The `-c` param is configurable via the CI panel when kicking off CI tasks
* Test inputs/outputs are auto-replicated according to the `-c` param
* By default, the model test is executed for 100 iterations (adds ~2 min to the overall T4 CI task load)

### Motivation and Context
<!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
To monitor potential concurrency issues in ORT-TRT.
1 parent 42399df commit 0b2a75b

File tree

5 files changed

+86
-25
lines changed

5 files changed

+86
-25
lines changed

onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh

+22-1
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44

55
set -x
66

7-
while getopts p:o:l:s: parameter
7+
while getopts p:o:l:s:c: parameter
88
do case "${parameter}"
99
in
1010
p) WORKSPACE=${OPTARG};;
1111
o) ORT_BINARY_PATH=${OPTARG};;
1212
l) BUILD_ORT_LATEST=${OPTARG};;
1313
s) ORT_SOURCE=${OPTARG};;
14+
c) CONCURRENCY=${OPTARG};;
1415
esac
1516
done
1617

@@ -104,6 +105,26 @@ fi
104105

105106
mv valgrind.log result
106107

108+
# Concurrency Test
109+
FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/"
110+
111+
mkdir FasterRCNN-10/
112+
cp -r ${FRCNN_FOLDER}/test_data_set_0 ${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx ./FasterRCNN-10/
113+
114+
# replicate test inputs
115+
for (( i=1; i<CONCURRENCY; i++ )); do
116+
cp -r "./FasterRCNN-10/test_data_set_0/" "./FasterRCNN-10/test_data_set_$i/"
117+
done
118+
119+
pip install onnx requests packaging
120+
python ${ORT_SOURCE}/onnxruntime/python/tools/symbolic_shape_infer.py \
121+
--input="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
122+
--output="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
123+
--auto_merge
124+
125+
${ORT_SOURCE}/build/Linux/Release/onnx_test_runner -e tensorrt -c ${CONCURRENCY} -r 100 ./FasterRCNN-10/ > concurrency_test.log 2>&1
126+
mv concurrency_test.log result
127+
107128
# Run AddressSanitizer
108129
ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
109130

onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
set -x
44

55
# Parse Arguments
6-
while getopts w:d:p:l: parameter
6+
while getopts w:d:p:l:c: parameter
77
do case "${parameter}"
88
in
99
w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime
1010
d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine
1111
p) MEM_TEST_DIR=${OPTARG};; # mem test dir
1212
l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT
13+
c) CONCURRENCY=${OPTARG};;
1314
esac
1415
done
1516

@@ -24,4 +25,4 @@ then
2425
BUILD_ORT_LATEST="true"
2526
fi
2627

27-
docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST
28+
docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY

onnxruntime/python/tools/tensorrt/perf/post.py

+50-11
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Licensed under the MIT License.
44
# --------------------------------------------------------------------------
55
import argparse
6+
import csv
67
import datetime
78
import os
89
import sys
@@ -419,10 +420,11 @@ def main():
419420
upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
420421

421422
try:
423+
# Load EP Perf test results from /result
422424
result_file = args.report_folder
423-
424-
folders = os.listdir(result_file)
425-
os.chdir(result_file)
425+
result_perf_test_path = os.path.join(result_file, "result")
426+
folders = os.listdir(result_perf_test_path)
427+
os.chdir(result_perf_test_path)
426428

427429
tables = [
428430
fail_name,
@@ -445,26 +447,26 @@ def main():
445447
for model_group in folders:
446448
os.chdir(model_group)
447449
csv_filenames = os.listdir()
448-
for csv in csv_filenames:
449-
table = pd.read_csv(csv)
450-
if session_name in csv:
450+
for csv_file in csv_filenames:
451+
table = pd.read_csv(csv_file)
452+
if session_name in csv_file:
451453
table_results[session_name] = pd.concat(
452454
[table_results[session_name], get_session(table, model_group)], ignore_index=True
453455
)
454-
elif specs_name in csv:
456+
elif specs_name in csv_file:
455457
table_results[specs_name] = pd.concat(
456458
[
457459
table_results[specs_name],
458460
get_specs(table, args.branch, args.commit_hash, args.commit_datetime),
459461
],
460462
ignore_index=True,
461463
)
462-
elif fail_name in csv:
464+
elif fail_name in csv_file:
463465
table_results[fail_name] = pd.concat(
464466
[table_results[fail_name], get_failures(table, model_group)],
465467
ignore_index=True,
466468
)
467-
elif latency_name in csv:
469+
elif latency_name in csv_file:
468470
table_results[memory_name] = pd.concat(
469471
[table_results[memory_name], get_memory(table, model_group)],
470472
ignore_index=True,
@@ -474,11 +476,11 @@ def main():
474476
[table_results[latency_name], get_latency(table, model_group)],
475477
ignore_index=True,
476478
)
477-
elif status_name in csv:
479+
elif status_name in csv_file:
478480
table_results[status_name] = pd.concat(
479481
[table_results[status_name], get_status(table, model_group)], ignore_index=True
480482
)
481-
elif op_metrics_name in csv:
483+
elif op_metrics_name in csv_file:
482484
table = table.assign(Group=model_group)
483485
table_results[op_metrics_name] = pd.concat(
484486
[table_results[op_metrics_name], table], ignore_index=True
@@ -512,6 +514,43 @@ def main():
512514
args.commit_datetime,
513515
)
514516

517+
# Load concurrency test results
518+
result_mem_test_path = os.path.join(result_file, "result_mem_test")
519+
os.chdir(result_mem_test_path)
520+
log_path = "concurrency_test.log"
521+
if os.path.exists(log_path):
522+
print("Generating concurrency test report")
523+
with open(log_path) as log_file:
524+
log_content = log_file.read()
525+
526+
failed_cases_section = log_content.split("Failed Test Cases:")[1]
527+
528+
# passed = 1 if no failed test cases
529+
if failed_cases_section.strip() == "":
530+
passed = 1
531+
else:
532+
passed = 0
533+
534+
csv_path = "concurrency_test.csv"
535+
with open(csv_path, "w", newline="") as csv_file:
536+
csv_writer = csv.writer(csv_file)
537+
csv_writer.writerow(["Passed", "Log"])
538+
csv_writer.writerow([passed, log_content])
539+
540+
db_table_name = "ep_concurrencytest_record"
541+
table = pd.read_csv(csv_path)
542+
write_table(
543+
ingest_client,
544+
args.database,
545+
table,
546+
db_table_name,
547+
upload_time,
548+
identifier,
549+
args.branch,
550+
args.commit_hash,
551+
args.commit_datetime,
552+
)
553+
515554
except BaseException as e:
516555
print(str(e))
517556
sys.exit(1)

onnxruntime/test/onnx/main.cc

-5
Original file line numberDiff line numberDiff line change
@@ -341,11 +341,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
341341
logging_level = ORT_LOGGING_LEVEL_VERBOSE;
342342
}
343343

344-
if (concurrent_session_runs > 1 && repeat_count > 1) {
345-
fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n");
346-
usage();
347-
return -1;
348-
}
349344
argc -= optind;
350345
argv += optind;
351346
if (argc < 1) {

tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml

+11-6
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,15 @@ parameters:
2828
- "partner-models"
2929

3030
- name: MemTest
31-
displayName: Run Memory Test
31+
displayName: Run Memory Test and Concurrency Test
3232
type: boolean
3333
default: true
3434

35+
- name: ConcurrencyTest
36+
displayName: Specifies the number of concurrency model test to invoke simultaneously
37+
type: string
38+
default: 2
39+
3540
- name: TrtEPOptions
3641
displayName: TensorRT EP options
3742
type: object
@@ -107,8 +112,8 @@ jobs:
107112
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
108113

109114
- ${{ if eq(parameters.MemTest, true) }}:
110-
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
111-
displayName: 'Run Memory Test'
115+
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}'
116+
displayName: 'Run Memory Test and Concurrency Test'
112117
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'
113118

114119
- ${{ each option in parameters.ModelGroups }}:
@@ -152,16 +157,16 @@ jobs:
152157
displayName: 'Check and Install Azure CLI'
153158
154159
- task: AzureCLI@2
155-
displayName: 'Azure CLI Post to Dashboard'
160+
displayName: 'Post EP Perf Results to Dashboard'
156161
inputs:
157162
azureSubscription: AIInfraBuildOnnxRuntimeOSS
158163
scriptLocation: inlineScript
159164
scriptType: bash
160165
inlineScript: |
161166
short_hash=$(git rev-parse --short HEAD) &&
162167
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
163-
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
164-
168+
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
169+
165170
- template: templates/component-governance-component-detection-steps.yml
166171
parameters :
167172
condition : 'succeeded'

0 commit comments

Comments
 (0)