[EP Perf] Add concurrency test (#19804)

yf711 · web-flow · commit 0b2a75b274e4 · 2024-03-15T07:41:21.000-07:00
### Description
<!-- Describe your changes. -->
* Add a concurrency test to the EP Perf CI panel (implemented with onnx_test_runner)
  * Model: the FasterRCNN-10 model within the CI image
  * The `-c` param is configurable via the CI panel when kicking off CI tasks
  * Test inputs/outputs are automatically replicated according to the `-c` param
* By default, the model test runs for 100 iterations (adding ~2 min to the
  overall T4 CI task load)

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
To monitor potential concurrency issues in ORT-TRT.
diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
@@ -4,13 +4,14 @@
 
 set -x
 
-while getopts p:o:l:s: parameter
+while getopts p:o:l:s:c: parameter
 do case "${parameter}"
 in
 p) WORKSPACE=${OPTARG};;
 o) ORT_BINARY_PATH=${OPTARG};;
 l) BUILD_ORT_LATEST=${OPTARG};;
 s) ORT_SOURCE=${OPTARG};;
+c) CONCURRENCY=${OPTARG};;
 esac
 done
 
@@ -104,6 +105,26 @@ fi
 
 mv valgrind.log result
 
+# Concurrency Test
+FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/"
+
+mkdir FasterRCNN-10/
+cp -r ${FRCNN_FOLDER}/test_data_set_0 ${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx ./FasterRCNN-10/
+
+# replicate test inputs
+for (( i=1; i<CONCURRENCY; i++ )); do
+    cp -r "./FasterRCNN-10/test_data_set_0/" "./FasterRCNN-10/test_data_set_$i/"
+done
+
+pip install onnx requests packaging
+python ${ORT_SOURCE}/onnxruntime/python/tools/symbolic_shape_infer.py \
+    --input="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
+    --output="./FasterRCNN-10/faster_rcnn_R_50_FPN_1x.onnx" \
+    --auto_merge
+
+${ORT_SOURCE}/build/Linux/Release/onnx_test_runner -e tensorrt -c ${CONCURRENCY} -r 100 ./FasterRCNN-10/ > concurrency_test.log 2>&1
+mv concurrency_test.log result
+
 # Run AddressSanitizer 
 ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
 
diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh
@@ -3,13 +3,14 @@
 set -x
 
 # Parse Arguments
-while getopts w:d:p:l: parameter
+while getopts w:d:p:l:c: parameter
 do case "${parameter}"
 in 
 w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime
 d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine
 p) MEM_TEST_DIR=${OPTARG};; # mem test dir
 l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT
+c) CONCURRENCY=${OPTARG};;
 esac
 done 
 
@@ -24,4 +25,4 @@ then
     BUILD_ORT_LATEST="true"
 fi
 
-docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST
+docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY
diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,6 +3,7 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import argparse
+import csv
 import datetime
 import os
 import sys
@@ -419,10 +420,11 @@ def main():
     upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
 
     try:
+        # Load EP Perf test results from /result
         result_file = args.report_folder
-
-        folders = os.listdir(result_file)
-        os.chdir(result_file)
+        result_perf_test_path = os.path.join(result_file, "result")
+        folders = os.listdir(result_perf_test_path)
+        os.chdir(result_perf_test_path)
 
         tables = [
             fail_name,
@@ -445,26 +447,26 @@ def main():
         for model_group in folders:
             os.chdir(model_group)
             csv_filenames = os.listdir()
-            for csv in csv_filenames:
-                table = pd.read_csv(csv)
-                if session_name in csv:
+            for csv_file in csv_filenames:
+                table = pd.read_csv(csv_file)
+                if session_name in csv_file:
                     table_results[session_name] = pd.concat(
                         [table_results[session_name], get_session(table, model_group)], ignore_index=True
                     )
-                elif specs_name in csv:
+                elif specs_name in csv_file:
                     table_results[specs_name] = pd.concat(
                         [
                             table_results[specs_name],
                             get_specs(table, args.branch, args.commit_hash, args.commit_datetime),
                         ],
                         ignore_index=True,
                     )
-                elif fail_name in csv:
+                elif fail_name in csv_file:
                     table_results[fail_name] = pd.concat(
                         [table_results[fail_name], get_failures(table, model_group)],
                         ignore_index=True,
                     )
-                elif latency_name in csv:
+                elif latency_name in csv_file:
                     table_results[memory_name] = pd.concat(
                         [table_results[memory_name], get_memory(table, model_group)],
                         ignore_index=True,
@@ -474,11 +476,11 @@ def main():
                         [table_results[latency_name], get_latency(table, model_group)],
                         ignore_index=True,
                     )
-                elif status_name in csv:
+                elif status_name in csv_file:
                     table_results[status_name] = pd.concat(
                         [table_results[status_name], get_status(table, model_group)], ignore_index=True
                     )
-                elif op_metrics_name in csv:
+                elif op_metrics_name in csv_file:
                     table = table.assign(Group=model_group)
                     table_results[op_metrics_name] = pd.concat(
                         [table_results[op_metrics_name], table], ignore_index=True
@@ -512,6 +514,43 @@ def main():
                 args.commit_datetime,
             )
 
+        # Load concurrency test results
+        result_mem_test_path = os.path.join(result_file, "result_mem_test")
+        os.chdir(result_mem_test_path)
+        log_path = "concurrency_test.log"
+        if os.path.exists(log_path):
+            print("Generating concurrency test report")
+            with open(log_path) as log_file:
+                log_content = log_file.read()
+
+            failed_cases_section = log_content.split("Failed Test Cases:")[1]
+
+            # passed = 1 if no failed test cases
+            if failed_cases_section.strip() == "":
+                passed = 1
+            else:
+                passed = 0
+
+            csv_path = "concurrency_test.csv"
+            with open(csv_path, "w", newline="") as csv_file:
+                csv_writer = csv.writer(csv_file)
+                csv_writer.writerow(["Passed", "Log"])
+                csv_writer.writerow([passed, log_content])
+
+            db_table_name = "ep_concurrencytest_record"
+            table = pd.read_csv(csv_path)
+            write_table(
+                ingest_client,
+                args.database,
+                table,
+                db_table_name,
+                upload_time,
+                identifier,
+                args.branch,
+                args.commit_hash,
+                args.commit_datetime,
+            )
+
     except BaseException as e:
         print(str(e))
         sys.exit(1)
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
@@ -341,11 +341,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
     logging_level = ORT_LOGGING_LEVEL_VERBOSE;
   }
 
-  if (concurrent_session_runs > 1 && repeat_count > 1) {
-    fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n");
-    usage();
-    return -1;
-  }
   argc -= optind;
   argv += optind;
   if (argc < 1) {
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -28,10 +28,15 @@ parameters:
     - "partner-models"
 
 - name: MemTest
-  displayName: Run Memory Test
+  displayName: Run Memory Test and Concurrency Test
   type: boolean
   default: true
 
+- name: ConcurrencyTest
+  displayName: Specifies the number of concurrency model test to invoke simultaneously
+  type: string
+  default: 2
+
 - name: TrtEPOptions
   displayName: TensorRT EP options
   type: object
@@ -107,8 +112,8 @@ jobs:
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
     
     - ${{ if eq(parameters.MemTest, true) }}:
-      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
-        displayName: 'Run Memory Test'
+      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}'
+        displayName: 'Run Memory Test and Concurrency Test'
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'
 
     - ${{ each option in parameters.ModelGroups }}:
@@ -152,16 +157,16 @@ jobs:
         displayName: 'Check and Install Azure CLI'
 
       - task: AzureCLI@2
-        displayName: 'Azure CLI Post to Dashboard'
+        displayName: 'Post EP Perf Results to Dashboard'
         inputs:
           azureSubscription: AIInfraBuildOnnxRuntimeOSS
           scriptLocation: inlineScript
           scriptType: bash
           inlineScript: |
             short_hash=$(git rev-parse --short HEAD) &&
             commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
-            python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
-    
+            python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
     - template: templates/component-governance-component-detection-steps.yml
       parameters :
         condition : 'succeeded'