
Commit 05b4ea9

Automatically analyze in auto-regressive setting (#1212)
* Automatically analyze in auto-regressive setting
* Update help
* Review
* Quality
1 parent a70fee1 commit 05b4ea9

File tree

2 files changed, +119 -54 lines changed


src/deepsparse/benchmark/benchmark_model.py

+36 -27
@@ -17,31 +17,40 @@
 
 ##########
 Command help:
-usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES]
-                            [-ncores NUM_CORES] [-s {async,sync,elastic}]
-                            [-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS]
-                            [-pin {none,core,numa}]
-                            [-e {deepsparse,onnxruntime}] [-q]
+usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-seq_len SEQUENCE_LENGTH]
+                            [-input_ids_len INPUT_IDS_LENGTH]
+                            [-i INPUT_SHAPES] [-ncores NUM_CORES]
+                            [-s {async,sync,elastic}] [-t TIME]
+                            [-w WARMUP_TIME] [-nstreams NUM_STREAMS]
+                            [-pin {none,core,numa}] [-e ENGINE] [-q]
                             [-x EXPORT_PATH]
                             model_path
 
 Benchmark ONNX models in the DeepSparse Engine
 
 positional arguments:
-  model_path            Path to an ONNX model file or SparseZoo model stub.
+  model_path            Path to an ONNX model file or SparseZoo model stub
 
 optional arguments:
-  -h, --help            show this help message and exit.
+  -h, --help            show this help message and exit
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         The batch size to run the analysis for. Must be
-                        greater than 0.
-  -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
+                        greater than 0
+  -seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
+                        The sequence length to run the KV cache supported
+                        model benchmarks for. Must be greater than 0, default
+                        is 2048
+  -input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
+                        The input ids length to run the KV cache supported
+                        model benchmarks for. Must be greater than 0, default
+                        is 1
+  -i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
                         Override the shapes of the inputs, i.e. -shapes
                         "[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
-                        input1=[4,5,6] input2=[7,8,9].
+                        input1=[4,5,6] input2=[7,8,9]
   -ncores NUM_CORES, --num_cores NUM_CORES
                         The number of physical cores to run the analysis on,
-                        defaults to all physical cores available on the system.
+                        defaults to all physical cores available on the system
   -s {async,sync,elastic}, --scenario {async,sync,elastic}
                         Choose between using the async, sync and elastic
                         scenarios. Sync and async are similar to the single-
@@ -62,13 +71,18 @@
   -pin {none,core,numa}, --thread_pinning {none,core,numa}
                         Enable binding threads to cores ('core' the default),
                         threads to cores on sockets ('numa'), or disable
-                        ('none').
-  -e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime}
+                        ('none')
+  -e ENGINE, --engine ENGINE
                         Inference engine backend to run eval on. Choices are
                         'deepsparse', 'onnxruntime'. Default is 'deepsparse'.
-  -q, --quiet           Lower logging verbosity.
+                        Can also specify a user defined engine class by giving
+                        the script and class name in the following format
+                        <path to python script>:<Engine Class name>. This
+                        engine class will be dynamically imported during
+                        runtime
+  -q, --quiet           Lower logging verbosity
   -x EXPORT_PATH, --export_path EXPORT_PATH
-                        Store results into a JSON file.
+                        Store results into a JSON file
 
 ##########
 Example on a BERT from SparseZoo:
@@ -85,8 +99,7 @@
 Example on a CodeGen (model with KV cache support)
 from SparseZoo with input_ids_length 10 and sequence length 256:
 deepsparse.benchmark \
-   zoo:nlg/text_generation/codegen_mono-350m/pytorch/
-   huggingface/bigpython_bigquery_thepile/pruned50-none
+   zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none \
    --input_ids_length 10 --sequence_length 256
 
 ##########
@@ -97,7 +110,7 @@
 Example on local ONNX model at batch size 32 with synchronous (singlestream) execution:
 deepsparse.benchmark /PATH/TO/model.onnx --batch_size 32 --scenario sync
 
-"""
+""" # noqa E501
 
 import argparse
 import importlib
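
The updated -e/--engine help above says a user-defined engine class can be supplied as <path to python script>:<Engine Class name> and imported at runtime. As a reading aid, a minimal sketch of such a script follows; the constructor and run() signature are assumptions, since this commit documents only the import format, not the class contract:

    # my_engine.py -- hypothetical user-defined engine (names are illustrative).
    import numpy

    class MyEngine:
        # Assumed constructor: the commit does not specify the expected signature.
        def __init__(self, model_path: str, batch_size: int = 1, **kwargs):
            self.model_path = model_path
            self.batch_size = batch_size

        def run(self, inputs: list) -> list:
            # Pass-through "inference" so the sketch stays self-contained.
            return [numpy.asarray(x) for x in inputs]

    # Per the help text above, it would be selected with:
    #   deepsparse.benchmark /PATH/TO/model.onnx -e my_engine.py:MyEngine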
@@ -153,25 +166,21 @@ def parse_args():
         default=1,
         help="The batch size to run the analysis for. Must be greater than 0",
     )
-
     parser.add_argument(
         "-seq_len",
         "--sequence_length",
         type=int,
-        default=2048,
-        help="The sequence length to run the "
-        "KV cache supported model benchmarks for. "
-        "Must be greater than 0, default is 2048",
+        default=512,
+        help="The sequence length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= seq_len, default is 512",
     )
-
     parser.add_argument(
         "-input_ids_len",
         "--input_ids_length",
         type=int,
         default=1,
-        help="The input ids length to run the "
-        "KV cache supported model benchmarks for. "
-        "Must be greater than 0, default is 1",
+        help="The input ids length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
     )
     parser.add_argument(
         "-i",

src/deepsparse/debug_analysis.py

+83 -27
@@ -18,12 +18,17 @@
 ##########
 Command help:
 usage: deepsparse.debug_analysis [-h] [-wi NUM_WARMUP_ITERATIONS]
-                                 [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
-                                 [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
-                                 [-ksf KERNEL_SPARSITY_FILE]
-                                 [--optimization OPTIMIZATION] [-i INPUT_SHAPES] [-q]
-                                 [-x EXPORT_PATH]
-                                 model_path
+                                 [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
+                                 [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
+                                 [-ksf KERNEL_SPARSITY_FILE]
+                                 [--optimization OPTIMIZATION]
+                                 [-seq_len SEQUENCE_LENGTH]
+                                 [-input_ids_len INPUT_IDS_LENGTH]
+                                 [-i INPUT_SHAPES] [--use-internal-kvcache]
+                                 [--kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS]
+                                 [--kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS]
+                                 [-q] [-x EXPORT_PATH]
+                                 model_path
 
 Analyze ONNX models in the DeepSparse Engine
 
@@ -49,14 +54,31 @@
                         Filepath to per-layer kernel sparsities JSON
   --optimization OPTIMIZATION
                         To enable or disable optimizations (Tensor Columns)
-  -i INPUT_SHAPES, --input_shapes INPUT_SHAPES
+  -seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
+                        The sequence length to run the KV cache supported
+                        model benchmarks for. Must be seq_len >= 1, default is
+                        512
+  -input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
+                        The input ids length to run the KV cache supported
+                        model benchmarks for. Must be 1 <= input_ids_len <=
+                        seq_len, default is 1
+  -i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
                         Override the shapes of the inputs, i.e. -shapes
                         "[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
                         input1=[4,5,6] input2=[7,8,9]
+  --use-internal-kvcache
+                        Enable internal KVCache
+  --kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS
+                        Internal KVCache: The amount of previous tokens that
+                        will be read from the external KV cache on the first
+                        inference
+  --kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS
+                        Internal KVCache: The amount of first tokens that we
+                        want to keep permanently in the KV cache
   -q, --quiet           Lower logging verbosity
   -x EXPORT_PATH, --export_path EXPORT_PATH
                         Store results into a JSON file
-"""
+""" # noqa E501
 
 import argparse
 import json
@@ -66,8 +88,10 @@
 from deepsparse.utils import (
     default_cached_outputs,
     generate_random_inputs,
+    has_model_kv_cache,
     model_to_path,
     override_onnx_input_shapes,
+    overwrite_cache_model_inputs,
     parse_input_shapes,
 )
 
@@ -132,8 +156,25 @@ def parse_args():
         type=bool,
         default=True,
     )
+    parser.add_argument(
+        "-seq_len",
+        "--sequence_length",
+        type=int,
+        default=512,
+        help="The sequence length to run the KV cache supported model "
+        "benchmarks for. Must be seq_len >= 1, default is 512",
+    )
+    parser.add_argument(
+        "-input_ids_len",
+        "--input_ids_length",
+        type=int,
+        default=1,
+        help="The input ids length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
+    )
     parser.add_argument(
         "-i",
+        "-shapes",
         "--input_shapes",
         help="Override the shapes of the inputs, "
         'i.e. -shapes "[1,2,3],[4,5,6],[7,8,9]" results in '
@@ -142,21 +183,24 @@ def parse_args():
         default="",
     )
     parser.add_argument(
-        "--use-kvcache", help="Enable KVCache", action="store_true", default=False
+        "--use-internal-kvcache",
+        help="Enable internal KVCache",
+        action="store_true",
+        default=False,
     )
     parser.add_argument(
         "--kv-cache-prev-num-tokens",
-        help="KVCache: The amount of previous tokens that will be read"
+        help="Internal KVCache: The amount of previous tokens that will be read"
         " from the external KV cache on the first inference",
         type=int,
-        default=None,
+        default=0,
     )
     parser.add_argument(
         "--kv-cache-num-frozen-tokens",
-        help="KVCache: The amount of first tokens that we want to keep"
+        help="Internal KVCache: The amount of first tokens that we want to keep"
         " permanently in the KV cache",
         type=int,
-        default=None,
+        default=0,
     )
     parser.add_argument(
         "-q",
@@ -307,10 +351,31 @@ def main():
     orig_model_path = args.model_path
     model_path = model_to_path(args.model_path)
 
-    print("Analyzing model: {}".format(orig_model_path))
+    print(f"Analyzing model: {orig_model_path}")
 
     batch_size = args.batch_size
 
+    if has_model_kv_cache(model_path):
+        if batch_size != 1:
+            raise ValueError(
+                "Unable to run models with KV cache support "
+                "for batch size different than one. "
+                "Please set batch size to 1 and try again"
+            )
+
+        print(
+            "Found model with KV cache support. "
+            "Benchmarking the autoregressive model with "
+            f"input_ids_length: {args.input_ids_length} and "
+            f"sequence length: {args.sequence_length}."
+        )
+
+        model_path, _, _ = overwrite_cache_model_inputs(
+            model_path=model_path,
+            input_ids_length=args.input_ids_length,
+            sequence_length=args.sequence_length,
+        )
+
     if input_shapes:
         with override_onnx_input_shapes(model_path, input_shapes) as tmp_path:
             input_list = generate_random_inputs(tmp_path, batch_size)
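
Outside of this script, the same two helpers imported above can prepare a KV cache model for a fixed-shape run. A minimal sketch mirroring the call in main(); the two discarded return values are not named in this diff, so they are left ignored here:

    from deepsparse.utils import (
        has_model_kv_cache,
        model_to_path,
        overwrite_cache_model_inputs,
    )

    model_path = model_to_path("/PATH/TO/model.onnx")
    if has_model_kv_cache(model_path):
        # Bind the model's dynamic sequence dims before analysis, as main() does above.
        model_path, _, _ = overwrite_cache_model_inputs(
            model_path=model_path,
            input_ids_length=1,
            sequence_length=512,
        )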
@@ -319,24 +384,15 @@ def main():
 
     kv_cache_params = None
     if args.use_kvcache:
-        kv_cache_prev_num_tokens = 0
-        if args.kv_cache_prev_num_tokens is not None:
-            kv_cache_prev_num_tokens = args.kv_cache_prev_num_tokens
-
-        kv_cache_num_frozen_tokens = 0
-        if args.kv_cache_num_frozen_tokens is not None:
-            kv_cache_num_frozen_tokens = args.kv_cache_num_frozen_tokens
-
         kv_cache_params = KVCacheParams(
             default_cached_outputs(model_path),
-            kv_cache_prev_num_tokens,
-            kv_cache_num_frozen_tokens,
+            args.kv_cache_prev_num_tokens,
+            args.kv_cache_num_frozen_tokens,
         )
 
         print(
-            "Enable KVCache: prev_num_tokens = {}, num_frozen_tokens = {}".format(
-                kv_cache_params.prev_num_tokens, kv_cache_params.num_frozen_tokens
-            )
+            f"Enable KVCache: prev_num_tokens = {kv_cache_params.prev_num_tokens}, "
+            f"num_frozen_tokens = {kv_cache_params.num_frozen_tokens}"
         )
 
     result = model_debug_analysis(
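
Putting the new flags together, a debug-analysis run on a KV cache model might look like the following; the model stub is borrowed from the benchmark docstring above, and the flag values are only examples:

    deepsparse.debug_analysis \
        zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none \
        --sequence_length 512 --input_ids_length 1 \
        --use-internal-kvcache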
