From c27c71797a24e8d002cccbd56ea96694fde6fd27 Mon Sep 17 00:00:00 2001 From: gs-olive <113141689+gs-olive@users.noreply.github.com> Date: Tue, 15 Nov 2022 08:11:58 -0800 Subject: [PATCH] feat: Add functionality to performance tooling - Add functionality for timing compilation in addition to inference - Add bash scripting code for concatenating all model result outputs --- tools/perf/benchmark.sh | 8 ++++++++ tools/perf/perf_run.py | 14 +++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh index b84061025d..7a92be5624 100644 --- a/tools/perf/benchmark.sh +++ b/tools/perf/benchmark.sh @@ -62,3 +62,11 @@ do --truncate \ --report "bert_base_perf_bs${bs}.txt" done + +# Collect and concatenate all results +echo "Concatenating all results" +(echo "Output of All Model Runs"; echo) >> all_outputs.txt; + +for i in $(ls *_bs*.txt); + do (echo $i; cat $i; echo; echo) >> all_outputs.txt; +done diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index fbdf3b6c40..d646781edc 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -2,6 +2,7 @@ from __future__ import absolute_import from __future__ import division +import time import timeit import numpy as np import torch.backends.cudnn as cudnn @@ -103,7 +104,10 @@ def run_torch_tensorrt( if precision == "int8": compile_settings.update({"calib": params.get("calibration_cache")}) + start_compile = time.time_ns() model = torchtrt.compile(model, **compile_settings) + end_compile = time.time_ns() + compile_time_ms = (end_compile - start_compile) / 1e6 iters = params.get("iterations", 20) # Warm up @@ -123,7 +127,7 @@ def run_torch_tensorrt( meas_time = end_time - start_time timings.append(meas_time) - recordStats("Torch-TensorRT", timings, precision, batch_size) + recordStats("Torch-TensorRT", timings, precision, batch_size, compile_time_ms) # Runs inference using FX2TRT backend @@ -136,6 +140,7 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): model.half() input_tensors = [tensor.half() for tensor in input_tensors] # Run lowering eager mode benchmark + start_compile = time.time_ns() model = compile( model, input_tensors, @@ -143,6 +148,8 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): lower_precision=precision, verbose_log=False, ) + end_compile = time.time_ns() + compile_time_ms = (end_compile - start_compile) / 1e6 iters = params.get("iterations", 20) # Warm up @@ -162,7 +169,7 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): meas_time = end_time - start_time timings.append(meas_time) - recordStats("FX-TensorRT", timings, precision, batch_size) + recordStats("FX-TensorRT", timings, precision, batch_size, compile_time_ms) def torch_dtype_from_trt(dtype): @@ -331,7 +338,7 @@ def run( # Generate report -def recordStats(backend, timings, precision, batch_size=1): +def recordStats(backend, timings, precision, batch_size=1, compile_time_ms=None): times = np.array(timings) steps = len(times) speeds = batch_size / times @@ -350,6 +357,7 @@ def recordStats(backend, timings, precision, batch_size=1): "Mean(FPS)": speed_mean, "Median-Latency(ms)": time_med * 1000, "Mean-Latency(ms)": time_mean * 1000, + "Compile Time(ms)": compile_time_ms, } results.append(stats)