28
28
from deepsparse .benchmark import BenchmarkResults
29
29
from deepsparse .utils import (
30
30
generate_random_inputs ,
31
+ get_output_names ,
31
32
model_to_path ,
32
33
override_onnx_input_shapes ,
33
34
)
53
54
"Scheduler" ,
54
55
"Context" ,
55
56
"MultiModelEngine" ,
57
+ "KVCacheEngine" ,
58
+ "BaseEngine" ,
56
59
]
57
60
58
61
_LOGGER = logging .getLogger (__name__ )
@@ -152,7 +155,95 @@ def _validate_scheduler(scheduler: Union[None, str, Scheduler]) -> Scheduler:
152
155
return scheduler
153
156
154
157
155
- class Engine (object ):
158
class Context(object):
    """
    Contexts can be used to run multiple instances of the MultiModelEngine with the same
    scheduler. This allows one scheduler to manage the resources of the system
    effectively, keeping engines that are running different models from fighting over system
    resources.

    :param num_cores: The number of physical cores to run the model on. If more
        cores are requested than are available on a single socket, the engine
        will try to distribute them evenly across as few sockets as possible.
    :param num_streams: The max number of requests the model can handle
        concurrently.
    """

    def __init__(
        self,
        num_cores: int = None,
        num_streams: int = None,
    ):
        validated_cores = _validate_num_cores(num_cores)
        validated_streams = _validate_num_streams(num_streams, validated_cores)
        elastic = Scheduler.from_str("elastic")

        self._num_cores = validated_cores
        self._scheduler = elastic
        self._deepsparse_context = LIB.deepsparse_context(
            validated_cores,
            validated_streams,
            elastic.value,
        )
        # num_streams can be adjusted by how we map optimally to the hardware
        # topology, so use the context as the source of truth to be transparent
        self._num_streams = self._deepsparse_context.num_streams()

    @property
    def value(self):
        # The underlying LIB context object, consumed by MultiModelEngine
        return self._deepsparse_context

    @property
    def num_cores(self):
        return self._num_cores

    @property
    def num_streams(self):
        # Reflects the stream count actually chosen by the native context,
        # which may differ from the requested value
        return self._num_streams

    @property
    def scheduler(self):
        return self._scheduler

    def __repr__(self) -> str:
        return (
            f"Context(num_cores={self.num_cores}, "
            f"num_streams={self.num_streams}, scheduler={self.scheduler})"
        )
208
class BaseEngine(object):
    """
    Shared construction logic for all engine variants.

    Subclasses call :meth:`construct` (validating each setting individually)
    or :meth:`construct_with_context` (inheriting cores/streams/scheduler from
    an existing :class:`Context`) at the top of their ``__init__``.
    """

    def construct(
        self,
        model: Union[str, "Model", "File"],
        batch_size: int = 1,
        num_cores: int = None,
        num_streams: int = None,
        scheduler: Scheduler = None,
        input_shapes: List[List[int]] = None,
    ):
        """Validate and store engine settings from individual arguments."""
        self._record_model(model, batch_size)
        self._num_cores = _validate_num_cores(num_cores)
        self._num_streams = _validate_num_streams(num_streams, self._num_cores)
        self._scheduler = _validate_scheduler(scheduler)
        self._record_cpu_info(input_shapes)

    def construct_with_context(
        self,
        model: Union[str, "Model", "File"],
        batch_size: int,
        context: Context,
        input_shapes: List[List[int]] = None,
    ):
        """Store engine settings, taking cores/streams/scheduler from *context*."""
        self._record_model(model, batch_size)
        self._num_cores = context.num_cores
        self._num_streams = context.num_streams
        self._scheduler = _validate_scheduler(context.scheduler)
        self._record_cpu_info(input_shapes)

    def _record_model(self, model, batch_size):
        # Common head of both construction paths: telemetry, model resolution,
        # and batch-size validation
        _analytics.send_event("python__engine__init")
        self._model_path = model_to_path(model)
        self._batch_size = _validate_batch_size(batch_size)

    def _record_cpu_info(self, input_shapes):
        # Common tail of both construction paths: input-shape overrides and
        # detected CPU capabilities
        self._input_shapes = input_shapes
        self._cpu_avx_type = AVX_TYPE
        self._cpu_vnni = VNNI
246
+ class Engine (BaseEngine ):
156
247
"""
157
248
Create a new DeepSparse Engine that compiles the given onnx file
158
249
for GPU class performance on commodity CPUs.
@@ -186,16 +277,10 @@ def __init__(
186
277
scheduler : Scheduler = None ,
187
278
input_shapes : List [List [int ]] = None ,
188
279
):
189
- _analytics .send_event ("python__engine__init" )
190
- self ._model_path = model_to_path (model )
191
- self ._batch_size = _validate_batch_size (batch_size )
192
- self ._num_cores = _validate_num_cores (num_cores )
193
- self ._scheduler = _validate_scheduler (scheduler )
194
- self ._input_shapes = input_shapes
195
- self ._cpu_avx_type = AVX_TYPE
196
- self ._cpu_vnni = VNNI
280
+ BaseEngine .construct (
281
+ self , model , batch_size , num_cores , num_streams , scheduler , input_shapes
282
+ )
197
283
198
- num_streams = _validate_num_streams (num_streams , self ._num_cores )
199
284
if self ._input_shapes :
200
285
with override_onnx_input_shapes (
201
286
self ._model_path , self ._input_shapes
@@ -204,7 +289,7 @@ def __init__(
204
289
model_path ,
205
290
self ._batch_size ,
206
291
self ._num_cores ,
207
- num_streams ,
292
+ self . _num_streams ,
208
293
self ._scheduler .value ,
209
294
None ,
210
295
)
@@ -213,7 +298,7 @@ def __init__(
213
298
self ._model_path ,
214
299
self ._batch_size ,
215
300
self ._num_cores ,
216
- num_streams ,
301
+ self . _num_streams ,
217
302
self ._scheduler .value ,
218
303
None ,
219
304
)
@@ -645,15 +730,10 @@ def __init__(
645
730
imposed_as : Optional [float ] = None ,
646
731
imposed_ks : Optional [float ] = None ,
647
732
):
648
- self ._model_path = model_to_path (model )
649
- self ._batch_size = _validate_batch_size (batch_size )
650
- self ._num_cores = _validate_num_cores (num_cores )
651
- self ._scheduler = _validate_scheduler (scheduler )
652
- self ._input_shapes = input_shapes
653
- self ._cpu_avx_type = AVX_TYPE
654
- self ._cpu_vnni = VNNI
733
+ BaseEngine .construct (
734
+ self , model , batch_size , num_cores , None , scheduler , input_shapes
735
+ )
655
736
656
- num_streams = _validate_num_streams (None , self ._num_cores )
657
737
if self ._input_shapes :
658
738
with override_onnx_input_shapes (
659
739
self ._model_path , self ._input_shapes
@@ -662,7 +742,7 @@ def __init__(
662
742
model_path ,
663
743
self ._batch_size ,
664
744
self ._num_cores ,
665
- num_streams ,
745
+ self . _num_streams ,
666
746
self ._scheduler .value ,
667
747
None ,
668
748
"external" ,
@@ -677,7 +757,7 @@ def __init__(
677
757
self ._model_path ,
678
758
self ._batch_size ,
679
759
self ._num_cores ,
680
- num_streams ,
760
+ self . _num_streams ,
681
761
self ._scheduler .value ,
682
762
None ,
683
763
"external" ,
@@ -712,53 +792,6 @@ def analyze(
712
792
return bench_info
713
793
714
794
715
- class Context (object ):
716
- """
717
- Contexts can be used to run multiple instances of the MultiModelEngine with the same
718
- scheduler. This allows one scheduler to manage the resources of the system
719
- effectively, keeping engines that are running different models from fighting over system
720
- resources.
721
-
722
- :param num_cores: The number of physical cores to run the model on. If more
723
- cores are requested than are available on a single socket, the engine
724
- will try to distribute them evenly across as few sockets as possible.
725
- :param num_streams: The max number of requests the model can handle
726
- concurrently.
727
- """
728
-
729
- def __init__ (
730
- self ,
731
- num_cores : int = None ,
732
- num_streams : int = None ,
733
- ):
734
- self ._num_cores = _validate_num_cores (num_cores )
735
- self ._scheduler = Scheduler .from_str ("elastic" )
736
- self ._deepsparse_context = LIB .deepsparse_context (
737
- self ._num_cores ,
738
- _validate_num_streams (num_streams , self ._num_cores ),
739
- self ._scheduler .value ,
740
- )
741
-
742
- @property
743
- def value (self ):
744
- return self ._deepsparse_context
745
-
746
- @property
747
- def num_cores (self ):
748
- return self ._num_cores
749
-
750
- @property
751
- def num_streams (self ):
752
- return self ._deepsparse_context .num_streams ()
753
-
754
- @property
755
- def scheduler (self ):
756
- return self ._scheduler
757
-
758
- def __repr__ (self ) -> str :
759
- return f"Context(num_cores={ self .num_cores } , num_streams={ self .num_streams } , scheduler={ self .scheduler } )"
760
-
761
-
762
795
class MultiModelEngine (Engine ):
763
796
"""
764
797
The MultiModelEngine, together with the Context class, can be used to run multiple models
@@ -785,14 +818,9 @@ def __init__(
785
818
context : Context ,
786
819
input_shapes : List [List [int ]] = None ,
787
820
):
788
- self ._model_path = model_to_path (model )
789
- self ._batch_size = _validate_batch_size (batch_size )
790
- self ._num_cores = context .num_cores
791
- self ._num_streams = context .num_streams
792
- self ._scheduler = _validate_scheduler (context .scheduler )
793
- self ._input_shapes = input_shapes
794
- self ._cpu_avx_type = AVX_TYPE
795
- self ._cpu_vnni = VNNI
821
+ BaseEngine .construct_with_context (
822
+ self , model , batch_size , context , input_shapes
823
+ )
796
824
797
825
if self ._input_shapes :
798
826
with override_onnx_input_shapes (
@@ -817,6 +845,52 @@ def __init__(
817
845
)
818
846
819
847
848
class KVCacheEngine(Engine):
    """
    Engine that can do kv caching.

    :param model: Either a path to the model's onnx file, a SparseZoo model stub
        prefixed by 'zoo:', a SparseZoo Model object, or a SparseZoo ONNX File
        object that defines the neural network
    :param batch_size: The batch size of the inputs to be used with the engine
    :param num_cores: The number of physical cores to run the model on
    :param num_streams: The max number of requests the model can handle
        concurrently
    :param scheduler: The kind of scheduler to execute with
    :param input_shapes: The list of shapes to set the inputs to
        (not yet supported for this engine)
    :param kv_cache_bools: Optional per-output flags marking which outputs are
        KV caches; defaults to every output except the first (assumed logits)
    :param prev_cache_length: cache length carried over from a previous run
        (forwarded to the native engine; assumed 0 for a fresh cache —
        confirm against the LIB API)
    """

    def __init__(
        self,
        model: Union[str, "Model", "File"],
        batch_size: int = 1,
        num_cores: int = None,
        num_streams: int = None,
        scheduler: Scheduler = None,
        input_shapes: List[List[int]] = None,
        kv_cache_bools: List[bool] = None,
        prev_cache_length: int = 0,
    ):
        BaseEngine.construct(
            self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
        )

        if kv_cache_bools is None:
            # If no list was provided, assume all outputs except for the first
            # are KV caches.
            # Note: In the future we can look at the names of outputs to be more sure
            output_names = get_output_names(self._model_path)
            kv_cache_bools = [True] * len(output_names)
            # The first output is assumed to be logits, which ought not be cached
            if kv_cache_bools:
                kv_cache_bools[0] = False

        if self._input_shapes:
            raise NotImplementedError(
                "Overriding input shapes is not yet supported for KVCacheEngine"
            )
        else:
            self._eng_net = LIB.deepsparse_engine(
                self._model_path,
                self._batch_size,
                self._num_cores,
                # num_streams was already validated by BaseEngine.construct;
                # re-validating here (as before) discarded that result
                self._num_streams,
                self._scheduler.value,
                None,
                kv_cache_bools,
                prev_cache_length,
            )
820
894
def compile_model (
821
895
model : Union [str , "Model" , "File" ],
822
896
batch_size : int = 1 ,
0 commit comments