18
18
##########
19
19
Command help:
20
20
usage: deepsparse.debug_analysis [-h] [-wi NUM_WARMUP_ITERATIONS]
21
- [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
22
- [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
23
- [-ksf KERNEL_SPARSITY_FILE]
24
- [--optimization OPTIMIZATION] [-i INPUT_SHAPES] [-q]
25
- [-x EXPORT_PATH]
26
- model_path
21
+ [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
22
+ [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
23
+ [-ksf KERNEL_SPARSITY_FILE]
24
+ [--optimization OPTIMIZATION]
25
+ [-seq_len SEQUENCE_LENGTH]
26
+ [-input_ids_len INPUT_IDS_LENGTH]
27
+ [-i INPUT_SHAPES] [--use-internal-kvcache]
28
+ [--kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS]
29
+ [--kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS]
30
+ [-q] [-x EXPORT_PATH]
31
+ model_path
27
32
28
33
Analyze ONNX models in the DeepSparse Engine
29
34
49
54
Filepath to per-layer kernel sparsities JSON
50
55
--optimization OPTIMIZATION
51
56
To enable or disable optimizations (Tensor Columns)
52
- -i INPUT_SHAPES, --input_shapes INPUT_SHAPES
57
+ -seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
58
+ The sequence length to run the KV cache supported
59
+ model benchmarks for. Must be seq_len >= 1, default is
60
+ 512
61
+ -input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
62
+ The input ids length to run the KV cache supported
63
+ model benchmarks for. Must be 1 <= input_ids_len <=
64
+ seq_len, default is 1
65
+ -i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
53
66
Override the shapes of the inputs, i.e. -shapes
54
67
"[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
55
68
input1=[4,5,6] input2=[7,8,9]
69
+ --use-internal-kvcache
70
+ Enable internal KVCache
71
+ --kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS
72
+ Internal KVCache: The amount of previous tokens that
73
+ will be read from the external KV cache on the first
74
+ inference
75
+ --kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS
76
+ Internal KVCache: The amount of first tokens that we
77
+ want to keep permanently in the KV cache
56
78
-q, --quiet Lower logging verbosity
57
79
-x EXPORT_PATH, --export_path EXPORT_PATH
58
80
Store results into a JSON file
59
- """
81
+ """ # noqa E501
60
82
61
83
import argparse
62
84
import json
66
88
from deepsparse .utils import (
67
89
default_cached_outputs ,
68
90
generate_random_inputs ,
91
+ has_model_kv_cache ,
69
92
model_to_path ,
70
93
override_onnx_input_shapes ,
94
+ overwrite_cache_model_inputs ,
71
95
parse_input_shapes ,
72
96
)
73
97
@@ -132,8 +156,25 @@ def parse_args():
132
156
type = bool ,
133
157
default = True ,
134
158
)
159
+ parser .add_argument (
160
+ "-seq_len" ,
161
+ "--sequence_length" ,
162
+ type = int ,
163
+ default = 512 ,
164
+ help = "The sequence length to run the KV cache supported model "
165
+ "benchmarks for. Must be seq_len >= 1, default is 512" ,
166
+ )
167
+ parser .add_argument (
168
+ "-input_ids_len" ,
169
+ "--input_ids_length" ,
170
+ type = int ,
171
+ default = 1 ,
172
+ help = "The input ids length to run the KV cache supported model "
173
+ "benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1" ,
174
+ )
135
175
parser .add_argument (
136
176
"-i" ,
177
+ "-shapes" ,
137
178
"--input_shapes" ,
138
179
help = "Override the shapes of the inputs, "
139
180
'i.e. -shapes "[1,2,3],[4,5,6],[7,8,9]" results in '
@@ -142,21 +183,24 @@ def parse_args():
142
183
default = "" ,
143
184
)
144
185
parser .add_argument (
145
- "--use-kvcache" , help = "Enable KVCache" , action = "store_true" , default = False
186
+ "--use-internal-kvcache" ,
187
+ help = "Enable internal KVCache" ,
188
+ action = "store_true" ,
189
+ default = False ,
146
190
)
147
191
parser .add_argument (
148
192
"--kv-cache-prev-num-tokens" ,
149
- help = "KVCache: The amount of previous tokens that will be read"
193
+ help = "Internal KVCache: The amount of previous tokens that will be read"
150
194
" from the external KV cache on the first inference" ,
151
195
type = int ,
152
- default = None ,
196
+ default = 0 ,
153
197
)
154
198
parser .add_argument (
155
199
"--kv-cache-num-frozen-tokens" ,
156
- help = "KVCache: The amount of first tokens that we want to keep"
200
+ help = "Internal KVCache: The amount of first tokens that we want to keep"
157
201
" permanently in the KV cache" ,
158
202
type = int ,
159
- default = None ,
203
+ default = 0 ,
160
204
)
161
205
parser .add_argument (
162
206
"-q" ,
@@ -307,10 +351,31 @@ def main():
307
351
orig_model_path = args .model_path
308
352
model_path = model_to_path (args .model_path )
309
353
310
- print ("Analyzing model: {}" . format ( orig_model_path ) )
354
+ print (f "Analyzing model: { orig_model_path } " )
311
355
312
356
batch_size = args .batch_size
313
357
358
+ if has_model_kv_cache (model_path ):
359
+ if batch_size != 1 :
360
+ raise ValueError (
361
+ "Unable to run models with KV cache support "
362
+ "for batch size different than one. "
363
+ "Please set batch size to 1 and try again"
364
+ )
365
+
366
+ print (
367
+ "Found model with KV cache support. "
368
+ "Benchmarking the autoregressive model with "
369
+ f"input_ids_length: { args .input_ids_length } and "
370
+ f"sequence length: { args .sequence_length } ."
371
+ )
372
+
373
+ model_path , _ , _ = overwrite_cache_model_inputs (
374
+ model_path = model_path ,
375
+ input_ids_length = args .input_ids_length ,
376
+ sequence_length = args .sequence_length ,
377
+ )
378
+
314
379
if input_shapes :
315
380
with override_onnx_input_shapes (model_path , input_shapes ) as tmp_path :
316
381
input_list = generate_random_inputs (tmp_path , batch_size )
@@ -319,24 +384,15 @@ def main():
319
384
320
385
kv_cache_params = None
321
386
if args .use_internal_kvcache :
322
- kv_cache_prev_num_tokens = 0
323
- if args .kv_cache_prev_num_tokens is not None :
324
- kv_cache_prev_num_tokens = args .kv_cache_prev_num_tokens
325
-
326
- kv_cache_num_frozen_tokens = 0
327
- if args .kv_cache_num_frozen_tokens is not None :
328
- kv_cache_num_frozen_tokens = args .kv_cache_num_frozen_tokens
329
-
330
387
kv_cache_params = KVCacheParams (
331
388
default_cached_outputs (model_path ),
332
- kv_cache_prev_num_tokens ,
333
- kv_cache_num_frozen_tokens ,
389
+ args . kv_cache_prev_num_tokens ,
390
+ args . kv_cache_num_frozen_tokens ,
334
391
)
335
392
336
393
print (
337
- "Enable KVCache: prev_num_tokens = {}, num_frozen_tokens = {}" .format (
338
- kv_cache_params .prev_num_tokens , kv_cache_params .num_frozen_tokens
339
- )
394
+ f"Enable KVCache: prev_num_tokens = { kv_cache_params .prev_num_tokens } , "
395
+ f"num_frozen_tokens = { kv_cache_params .num_frozen_tokens } "
340
396
)
341
397
342
398
result = model_debug_analysis (
0 commit comments