
Commit 3eaa563

Refactor engine.py to pull out some common functionality (#1035)
* Refactor engine.py to pull out some common functionality
* Add line at the end
* Review comments
* Style
1 parent 0d43a20 commit 3eaa563

1 file changed: +151 −77 lines changed
src/deepsparse/engine.py

+151-77
@@ -28,6 +28,7 @@
 from deepsparse.benchmark import BenchmarkResults
 from deepsparse.utils import (
     generate_random_inputs,
+    get_output_names,
     model_to_path,
     override_onnx_input_shapes,
 )
@@ -53,6 +54,8 @@
     "Scheduler",
     "Context",
     "MultiModelEngine",
+    "KVCacheEngine",
+    "BaseEngine",
 ]

 _LOGGER = logging.getLogger(__name__)
@@ -152,7 +155,95 @@ def _validate_scheduler(scheduler: Union[None, str, Scheduler]) -> Scheduler:
     return scheduler


-class Engine(object):
+class Context(object):
+    """
+    Contexts can be used to run multiple instances of the MultiModelEngine with the same
+    scheduler. This allows one scheduler to manage the resources of the system
+    effectively, keeping engines that are running different models from fighting over system
+    resources.
+
+    :param num_cores: The number of physical cores to run the model on. If more
+        cores are requested than are available on a single socket, the engine
+        will try to distribute them evenly across as few sockets as possible.
+    :param num_streams: The max number of requests the model can handle
+        concurrently.
+    """
+
+    def __init__(
+        self,
+        num_cores: int = None,
+        num_streams: int = None,
+    ):
+        self._num_cores = _validate_num_cores(num_cores)
+        self._scheduler = Scheduler.from_str("elastic")
+        self._deepsparse_context = LIB.deepsparse_context(
+            self._num_cores,
+            _validate_num_streams(num_streams, self._num_cores),
+            self._scheduler.value,
+        )
+        # num_streams can be adjusted by how we map optimally to the hardware topology,
+        # so let's use the context as the source of truth to be transparent
+        self._num_streams = self._deepsparse_context.num_streams()
+
+    @property
+    def value(self):
+        return self._deepsparse_context
+
+    @property
+    def num_cores(self):
+        return self._num_cores
+
+    @property
+    def num_streams(self):
+        return self._num_streams
+
+    @property
+    def scheduler(self):
+        return self._scheduler
+
+    def __repr__(self) -> str:
+        return f"Context(num_cores={self.num_cores}, num_streams={self.num_streams}, scheduler={self.scheduler})"
+
+
+class BaseEngine(object):
+    def construct(
+        self,
+        model: Union[str, "Model", "File"],
+        batch_size: int = 1,
+        num_cores: int = None,
+        num_streams: int = None,
+        scheduler: Scheduler = None,
+        input_shapes: List[List[int]] = None,
+    ):
+        _analytics.send_event("python__engine__init")
+        self._model_path = model_to_path(model)
+        self._batch_size = _validate_batch_size(batch_size)
+        self._num_cores = _validate_num_cores(num_cores)
+        self._num_streams = _validate_num_streams(num_streams, self._num_cores)
+        self._scheduler = _validate_scheduler(scheduler)
+        self._input_shapes = input_shapes
+        self._cpu_avx_type = AVX_TYPE
+        self._cpu_vnni = VNNI
+
+    def construct_with_context(
+        self,
+        model: Union[str, "Model", "File"],
+        batch_size: int,
+        context: Context,
+        input_shapes: List[List[int]] = None,
+    ):
+        _analytics.send_event("python__engine__init")
+        self._model_path = model_to_path(model)
+        self._batch_size = _validate_batch_size(batch_size)
+        self._num_cores = context.num_cores
+        self._num_streams = context.num_streams
+        self._scheduler = _validate_scheduler(context.scheduler)
+        self._input_shapes = input_shapes
+        self._cpu_avx_type = AVX_TYPE
+        self._cpu_vnni = VNNI
+
+
+class Engine(BaseEngine):
     """
     Create a new DeepSparse Engine that compiles the given onnx file
     for GPU class performance on commodity CPUs.
@@ -186,16 +277,10 @@ def __init__(
         scheduler: Scheduler = None,
         input_shapes: List[List[int]] = None,
     ):
-        _analytics.send_event("python__engine__init")
-        self._model_path = model_to_path(model)
-        self._batch_size = _validate_batch_size(batch_size)
-        self._num_cores = _validate_num_cores(num_cores)
-        self._scheduler = _validate_scheduler(scheduler)
-        self._input_shapes = input_shapes
-        self._cpu_avx_type = AVX_TYPE
-        self._cpu_vnni = VNNI
+        BaseEngine.construct(
+            self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
+        )

-        num_streams = _validate_num_streams(num_streams, self._num_cores)
         if self._input_shapes:
             with override_onnx_input_shapes(
                 self._model_path, self._input_shapes
@@ -204,7 +289,7 @@ def __init__(
                 model_path,
                 self._batch_size,
                 self._num_cores,
-                num_streams,
+                self._num_streams,
                 self._scheduler.value,
                 None,
             )
@@ -213,7 +298,7 @@ def __init__(
                 self._model_path,
                 self._batch_size,
                 self._num_cores,
-                num_streams,
+                self._num_streams,
                 self._scheduler.value,
                 None,
             )
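
The hunks above replace Engine's inline field setup with a call to the shared BaseEngine.construct, which now also validates num_streams and stores it as self._num_streams. A minimal sketch of that delegation pattern (not the DeepSparse source; the simplified validation stands in for _validate_batch_size and friends):

```python
class BaseEngine(object):
    def construct(self, model, batch_size=1, num_cores=None):
        # hypothetical simplified validation, standing in for the
        # _validate_* helpers used in engine.py
        self._model_path = str(model)
        self._batch_size = max(1, int(batch_size))
        self._num_cores = num_cores


class Engine(BaseEngine):
    def __init__(self, model, batch_size=1, num_cores=None):
        # unbound call, as in the diff: construct() stays reusable by
        # subclasses whose __init__ signatures differ from Engine's
        BaseEngine.construct(self, model, batch_size, num_cores)
        # engine compilation (LIB.deepsparse_engine) would follow here


engine = Engine("model.onnx", batch_size=4)
assert engine._batch_size == 4
```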
@@ -645,15 +730,10 @@ def __init__(
         imposed_as: Optional[float] = None,
         imposed_ks: Optional[float] = None,
     ):
-        self._model_path = model_to_path(model)
-        self._batch_size = _validate_batch_size(batch_size)
-        self._num_cores = _validate_num_cores(num_cores)
-        self._scheduler = _validate_scheduler(scheduler)
-        self._input_shapes = input_shapes
-        self._cpu_avx_type = AVX_TYPE
-        self._cpu_vnni = VNNI
+        BaseEngine.construct(
+            self, model, batch_size, num_cores, None, scheduler, input_shapes
+        )

-        num_streams = _validate_num_streams(None, self._num_cores)
         if self._input_shapes:
             with override_onnx_input_shapes(
                 self._model_path, self._input_shapes
@@ -662,7 +742,7 @@ def __init__(
                 model_path,
                 self._batch_size,
                 self._num_cores,
-                num_streams,
+                self._num_streams,
                 self._scheduler.value,
                 None,
                 "external",
@@ -677,7 +757,7 @@ def __init__(
                 self._model_path,
                 self._batch_size,
                 self._num_cores,
-                num_streams,
+                self._num_streams,
                 self._scheduler.value,
                 None,
                 "external",
@@ -712,53 +792,6 @@ def analyze(
         return bench_info


-class Context(object):
-    """
-    Contexts can be used to run multiple instances of the MultiModelEngine with the same
-    scheduler. This allows one scheduler to manage the resources of the system
-    effectively, keeping engines that are running different models from fighting over system
-    resources.
-
-    :param num_cores: The number of physical cores to run the model on. If more
-        cores are requested than are available on a single socket, the engine
-        will try to distribute them evenly across as few sockets as possible.
-    :param num_streams: The max number of requests the model can handle
-        concurrently.
-    """
-
-    def __init__(
-        self,
-        num_cores: int = None,
-        num_streams: int = None,
-    ):
-        self._num_cores = _validate_num_cores(num_cores)
-        self._scheduler = Scheduler.from_str("elastic")
-        self._deepsparse_context = LIB.deepsparse_context(
-            self._num_cores,
-            _validate_num_streams(num_streams, self._num_cores),
-            self._scheduler.value,
-        )
-
-    @property
-    def value(self):
-        return self._deepsparse_context
-
-    @property
-    def num_cores(self):
-        return self._num_cores
-
-    @property
-    def num_streams(self):
-        return self._deepsparse_context.num_streams()
-
-    @property
-    def scheduler(self):
-        return self._scheduler
-
-    def __repr__(self) -> str:
-        return f"Context(num_cores={self.num_cores}, num_streams={self.num_streams}, scheduler={self.scheduler})"
-
-
 class MultiModelEngine(Engine):
     """
     The MultiModelEngine, together with the Context class, can be used to run multiple models
@@ -785,14 +818,9 @@ def __init__(
         context: Context,
         input_shapes: List[List[int]] = None,
     ):
-        self._model_path = model_to_path(model)
-        self._batch_size = _validate_batch_size(batch_size)
-        self._num_cores = context.num_cores
-        self._num_streams = context.num_streams
-        self._scheduler = _validate_scheduler(context.scheduler)
-        self._input_shapes = input_shapes
-        self._cpu_avx_type = AVX_TYPE
-        self._cpu_vnni = VNNI
+        BaseEngine.construct_with_context(
+            self, model, batch_size, context, input_shapes
+        )

         if self._input_shapes:
             with override_onnx_input_shapes(
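
MultiModelEngine now funnels through BaseEngine.construct_with_context, which copies num_cores, num_streams, and the scheduler from the shared Context. A usage sketch, assuming the package re-exports these classes at the top level; the model paths are placeholders:

```python
from deepsparse import Context, MultiModelEngine

# one Context owns the elastic scheduler and the stream pool; engines
# compiled against it share those resources instead of contending for cores
context = Context(num_cores=4, num_streams=2)

# model_a.onnx / model_b.onnx are hypothetical paths to ONNX model files
engine_a = MultiModelEngine(model="model_a.onnx", batch_size=1, context=context)
engine_b = MultiModelEngine(model="model_b.onnx", batch_size=1, context=context)
```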
@@ -817,6 +845,52 @@ def __init__(
         )


+class KVCacheEngine(Engine):
+    """
+    Engine that can do kv caching.
+    """
+
+    def __init__(
+        self,
+        model: Union[str, "Model", "File"],
+        batch_size: int = 1,
+        num_cores: int = None,
+        num_streams: int = None,
+        scheduler: Scheduler = None,
+        input_shapes: List[List[int]] = None,
+        kv_cache_bools: List[bool] = None,
+        prev_cache_length: int = 0,
+    ):
+        BaseEngine.construct(
+            self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
+        )
+
+        if kv_cache_bools is None:
+            # If no list was provided, then we assume all outputs except for the first are KV caches
+            # Note: In the future we can look at the names of outputs to be more sure
+            #
+            # Create a boolean list of every output of the model
+            output_names = get_output_names(self._model_path)
+            kv_cache_bools = [True for i in range(len(output_names))]
+            # Assume the first output is logits, and logits ought not to be cached
+            kv_cache_bools[0] = False
+
+        num_streams = _validate_num_streams(num_streams, self._num_cores)
+        if self._input_shapes:
+            raise NotImplementedError("Don't do this yet :)")
+        else:
+            self._eng_net = LIB.deepsparse_engine(
+                self._model_path,
+                self._batch_size,
+                self._num_cores,
+                num_streams,
+                self._scheduler.value,
+                None,
+                kv_cache_bools,
+                prev_cache_length,
+            )
+
+
 def compile_model(
     model: Union[str, "Model", "File"],
     batch_size: int = 1,
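
The new KVCacheEngine defaults kv_cache_bools by marking every model output except the first (assumed to be the logits) as a KV cache. A construction sketch, assuming a decoder-style ONNX export whose first output is logits; the path is a placeholder:

```python
from deepsparse.engine import KVCacheEngine

# decoder_model.onnx is a hypothetical autoregressive decoder export whose
# outputs are [logits, present_key_0, present_value_0, ...]
engine = KVCacheEngine(
    model="decoder_model.onnx",
    batch_size=1,
    prev_cache_length=0,  # no previously cached tokens on the first call
)

# equivalent to the default derivation: cache every output except the logits
# engine = KVCacheEngine(
#     model="decoder_model.onnx",
#     kv_cache_bools=[False, True, True],  # one bool per model output
# )
```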
