Commit 588e1d1

fix: Fixed bugs and addressed review comments

Author: Anurag Dixit
Signed-off-by: Anurag Dixit <[email protected]>

1 parent a8016ff commit 588e1d1

File tree: 5 files changed, +63 -23 lines

Diff for: examples/benchmark/py/README.md (+18 -4)

@@ -8,6 +8,14 @@ This is a comprehensive Python benchmark suite to run perf runs using different
 
 Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package.
 
+## Prerequisite
+
+The benchmark scripts depend on the following Python packages in addition to the requirements.txt packages:
+
+1. Torch-TensorRT
+2. Torch
+3. TensorRT
+
 ## Structure
 
 ```

@@ -42,14 +50,20 @@ There are two sample configuration files added.
 
 | Name | Supported Values | Description |
 | --- | --- | --- |
-| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference |
+| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. |
 | input | - | Input binding names. Expected to list shapes of each input bindings |
 | model | - | Configure the model filename and name |
-| filename | - | Model file name to load from disk |
+| filename | - | Model file name to load from disk. |
 | name | - | Model name |
 | runtime | - | Runtime configurations |
 | device | 0 | Target device ID to run inference. Range depends on available GPUs |
-| precision | fp32, fp16 or half, int8 | Target precision to run inference |
+| precision | fp32, fp16 or half, int8 | Target precision to run inference. int8 cannot be used with the 'all' backend |
+| calibration_cache | - | Calibration cache file expected for the torch_tensorrt runtime in int8 precision |
+
+Note:
+1. torch runtime perf is not supported for int8 yet.
+2. A TorchScript module filename should end with .jit.pt; otherwise it will be treated as a TensorRT engine.
+
 
 
 Additional sample use case:

@@ -64,7 +78,7 @@ input:
   - 3
   - 224
   - 224
-  num_of_input: 1
+  num_inputs: 1
 model:
   filename: model.plan
   name: vgg16

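The README changes above rename num_of_input to num_inputs and document the new calibration_cache key. A minimal sketch of how such a config might be read and sanity-checked, assuming PyYAML and the schema from the table; the file path and validation logic here are illustrative, not the commit's code:

import yaml

# Load a benchmark config using the renamed num_inputs key and the new
# top-level calibration_cache entry.
with open("config/vgg16_trt.yml") as f:
    params = yaml.safe_load(f)

num_inputs = params["input"].get("num_inputs", 1)   # default mirrors perf_run.py
cache = params.get("calibration_cache")             # required only for int8 runs

for precision in params["runtime"]["precision"]:
    if precision == "int8" and cache is None:
        raise ValueError("int8 precision expects a calibration cache file")
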
Diff for: examples/benchmark/py/config/vgg16.yml (+1 -1)

@@ -7,7 +7,7 @@ input:
   - 3
   - 224
   - 224
-  num_of_input: 1
+  num_inputs: 1
 model:
   filename: vgg16_traced.jit.pt
   name: vgg16

Diff for: examples/benchmark/py/config/vgg16_trt.yml (+4 -1)

@@ -6,12 +6,15 @@ input:
   - 3
   - 224
   - 224
-  num_of_input: 1
+  num_inputs: 1
 model:
   filename: model.plan
   name: vgg16
+calibration_cache:
+  - vgg16.cache
 runtime:
   device: 0
   precision:
     - fp32
     - fp16
+    - int8

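This config keeps filename: model.plan, which per the README note above is treated as a serialized TensorRT engine rather than a TorchScript module. A sketch of that filename convention; the helper name is hypothetical:

import torch

def load_model_by_extension(filename):
    # Hypothetical helper: the README note says names ending in .jit.pt are
    # TorchScript modules; anything else is assumed to be a TensorRT engine.
    if filename.endswith(".jit.pt"):
        return torch.jit.load(filename), False   # is_trt_engine = False
    with open(filename, "rb") as f:
        return f.read(), True                    # raw engine bytes, is_trt_engine = True
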
Diff for: examples/benchmark/py/perf_run.py (+35 -17)

@@ -78,6 +78,10 @@ def run_torch_tensorrt(model, input_tensors, params, precision):
         "inputs": input_tensors,
         "enabled_precisions": {precision_to_dtype(precision)}
     }
+
+    if precision == 'int8':
+        compile_settings.update({"calib": params.get('calibration_cache')})
+
 
     model = torchtrt.compile(model, **compile_settings)

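The new branch forwards the raw params.get('calibration_cache') value into the calib compile setting. In Torch-TensorRT's 1.x Python API, calib takes a calibrator object, which the PTQ module can build from an on-disk cache; a sketch under that assumption, with an illustrative input shape and cache path:

import torch
import torch_tensorrt as torchtrt

# Build a calibrator from an existing calibration cache file
# (assumes the 1.x torch_tensorrt.ptq API).
calibrator = torchtrt.ptq.CacheCalibrator(
    "vgg16.cache",
    algo_type=torchtrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,
)

compile_settings = {
    "inputs": [torchtrt.Input((1, 3, 224, 224))],  # illustrative shape
    "enabled_precisions": {torch.int8},
    "calib": calibrator,
}
# trt_model = torchtrt.compile(model, **compile_settings)
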
@@ -166,26 +170,35 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False):
             k += 1
 
     timings = []
-    with torch.no_grad():
-        with engine.create_execution_context() as context:
-            for i in range(WARMUP_ITER):
-                context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
-            torch.cuda.synchronize()
-
-            for i in range(iters):
-                start_time = timeit.default_timer()
-                context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
-                torch.cuda.synchronize()
-                end_time = timeit.default_timer()
-                meas_time = end_time - start_time
-                timings.append(meas_time)
-                print("Iterations {}: {:.6f} s".format(i, end_time - start_time))
+    with engine.create_execution_context() as context:
+        for i in range(WARMUP_ITER):
+            context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
+        torch.cuda.synchronize()
+
+        for i in range(iters):
+            start_time = timeit.default_timer()
+            context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
+            torch.cuda.synchronize()
+            end_time = timeit.default_timer()
+            meas_time = end_time - start_time
+            timings.append(meas_time)
+            print("Iterations {}: {:.6f} s".format(i, end_time - start_time))
 
     printStats("TensorRT", timings, precision)
 
 # Deploys inference run for different backend configurations
 def run(model, input_tensors, params, precision, is_trt_engine = False):
     for backend in params.get('backend'):
+
+        if precision == 'int8':
+            if backend == 'all' or backend == 'torch':
+                print("int8 precision is not supported for torch runtime in this script yet")
+                return False
+
+            if backend == 'all' or backend == 'torch_tensorrt' or params.get('calibration_cache', None) == None:
+                print("int8 precision expects calibration cache file for inference")
+                return False
+
         if backend == 'all':
             run_torch(model, input_tensors, params, precision)
             run_torch_tensorrt(model, input_tensors, params, precision)
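Two things are worth noting in this hunk. Dropping torch.no_grad() is harmless here, presumably because the timed calls go straight to the TensorRT execution context and never touch autograd. The new cache check, however, chains its conditions with or, so it fires for every torch_tensorrt run even when a cache is configured. A sketch of the grouping the surrounding messages seem to intend; the helper name is hypothetical:

def int8_run_allowed(backend, params):
    # Hypothetical helper restating the new guard with explicit grouping:
    # int8 is rejected for the torch runtime, and torch_tensorrt additionally
    # requires a configured calibration cache.
    if backend in ("all", "torch"):
        return False
    if backend in ("all", "torch_tensorrt") and params.get("calibration_cache") is None:
        return False
    return True
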
@@ -280,20 +293,25 @@ def load_model(params):
     # Create random input tensor of certain size
     torch.manual_seed(12345)
 
-    num_input = params.get('input').get('num_of_input')
+    num_input = params.get('input').get('num_inputs')
     for precision in params.get('runtime').get('precision', 'fp32'):
         input_tensors = []
-        num_input = params.get('input').get('num_of_input', 1)
+        num_input = params.get('input').get('num_inputs', 1)
         for i in range(num_input):
            inp_tensor = params.get('input').get('input' + str(i))
            input_tensors.append(torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda())
 
+        if is_trt_engine:
+            print("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results")
+
         if not is_trt_engine and precision == "fp16" or precision == "half":
             # If model is TensorRT serialized engine then model.half will report failure
             model = model.half()
 
         # Run inference
-        run(model, input_tensors, params, precision, is_trt_engine)
+        status = run(model, input_tensors, params, precision, is_trt_engine)
+        if status == False:
+            continue
 
     # Generate report
     print('Model Summary:')

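One caveat in the hunk above: not is_trt_engine and precision == "fp16" or precision == "half" parses as (not is_trt_engine and precision == "fp16") or precision == "half", so a serialized-engine run configured with precision half would still call model.half(), which the inline comment says must be avoided. A sketch of the grouping that comment implies; the helper name is hypothetical:

def should_halve_model(is_trt_engine, precision):
    # Hypothetical helper: convert only real modules, never serialized engines.
    return not is_trt_engine and precision in ("fp16", "half")
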
Diff for: examples/benchmark/py/requirements.txt (+5 -0)

@@ -0,0 +1,5 @@
+timeit
+numpy
+argparse
+yaml
+pandas

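A caveat on this new file: timeit and argparse ship with the Python standard library, and the PyPI distribution that provides the yaml module is PyYAML, so installing this list with pip is likely to fail as written. A sketch of a pip-installable equivalent, assuming those substitutions:

numpy
pyyaml
pandas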