[Text Generation] Support for causal masks, internal KV cache, and initial testing framework #1172

Merged: 16 commits, Aug 10, 2023
3 changes: 3 additions & 0 deletions src/deepsparse/engine.py
@@ -300,6 +300,7 @@ def __init__(
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
cached_outputs: List[bool] = None,
):
BaseEngine.construct(
self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
@@ -316,6 +317,7 @@
self._num_streams,
self._scheduler.value,
None,
cached_outputs,
)
else:
self._eng_net = LIB.deepsparse_engine(
@@ -325,6 +327,7 @@
self._num_streams,
self._scheduler.value,
None,
cached_outputs,
)

def __call__(
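The new `cached_outputs` argument threads a per-output boolean list through to the native `LIB.deepsparse_engine` constructor. A minimal sketch of how a caller might use it; the model path and the assumption that the decoder exposes one logits output followed by 24 KV cache outputs are illustrative, not taken from this diff:

# A minimal sketch, assuming a decoder ONNX export whose first output is
# logits and whose remaining 24 outputs are KV cache tensors; the model
# path is a placeholder.
from deepsparse import Engine

cached_outputs = [False] + [True] * 24  # keep cache outputs inside the engine
engine = Engine(
    model="decoder_model.onnx",
    batch_size=1,
    cached_outputs=cached_outputs,
)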
126 changes: 44 additions & 82 deletions src/deepsparse/transformers/engines/nl_decoder_engine.py
@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy
import onnx
from transformers import AutoTokenizer

from deepsparse.engine import Context
from deepsparse.pipeline import DEEPSPARSE_ENGINE, create_engine
from deepsparse.transformers.utils.decoder_kv_cache import DecoderKVCache
from deepsparse.transformers.utils.helpers import generate_session_id
from deepsparse.transformers.utils.helpers import (
generate_session_id,
overwrite_onnx_model_inputs,
)
from deepsparse.utils.data import numpy_softmax
from deepsparse.utils.onnx import translate_onnx_type_to_numpy
from sparsezoo.utils.onnx import save_onnx


_LOGGER = logging.getLogger(__name__)
@@ -71,7 +70,11 @@ def __init__(
# flag to indicate if the model is quantized or not
self.kv_cache_data_type = None

onnx_file_path, output_indices_to_be_cached = self.overwrite_onnx_model_inputs(
(
onnx_file_path,
output_indices_to_be_cached,
kv_cache_data_type,
) = overwrite_onnx_model_inputs(
onnx_file_path=onnx_file_path,
batch_size=engine_args.get("batch_size", 1),
sequence_length=sequence_length,
@@ -80,9 +83,10 @@
kv_cache_enabled = False
if sum(output_indices_to_be_cached):
kv_cache_enabled = True
self.kv_cache_data_type = kv_cache_data_type
if use_deepsparse_cache and engine_type == DEEPSPARSE_ENGINE:
# inform the engine that we are using the kv cache
engine_args["cache_output_bools"] = output_indices_to_be_cached
engine_args["cached_outputs"] = output_indices_to_be_cached

self.engine = create_engine(
onnx_file_path=onnx_file_path,
@@ -100,6 +104,7 @@ def __init__(
)
self._freeze_first_position = self._should_freeze_first_position(tokenizer)
self._session_id = generate_session_id()
self._engine_type = engine_type
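For reference, the shape-overwriting logic now lives in `deepsparse.transformers.utils.helpers`. A sketch of the refactored call, mirroring the unpacking above; the concrete path and shape values are placeholder assumptions:

from deepsparse.transformers.utils.helpers import overwrite_onnx_model_inputs

(
    onnx_file_path,
    output_indices_to_be_cached,
    kv_cache_data_type,
) = overwrite_onnx_model_inputs(
    onnx_file_path="deployment/model.onnx",  # placeholder path
    batch_size=1,
    sequence_length=128,
    input_ids_length=1,
)
# Any truthy index marks a "present.*" output that should be cached, which
# is also the signal that the model supports a KV cache at all.
kv_cache_enabled = bool(sum(output_indices_to_be_cached))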

@property
def session_id(self) -> str:
@@ -135,6 +140,32 @@ def num_non_blank_cache_entries(self) -> int:
"""
return self.kv_cache.num_non_blank_entries

def run(self, inputs: List[numpy.ndarray], val_inp: bool) -> List[numpy.ndarray]:
"""
Run the engine with the given inputs.
If the internal deepsparse kv cache management is enabled,
the LIB.kv_cache object will be passed to the engine
call as well.
:param inputs: The inputs to run the engine with
:param val_inp: Whether to validate the inputs
:return: The output of the engine
"""

if self.kv_cache is not None:
if self.kv_cache._kv_cache is not None:
if val_inp:
self.engine._validate_inputs(inputs)
# model has kv cache support, as well as deepsparse
# internal management of the kv cache
return self.engine._eng_net.execute_list_out(
inputs, self.kv_cache._kv_cache
)

return self.engine.run(inputs, val_inp)
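A condensed restatement of the dispatch above, with the branches commented; `engine` and `kv_cache` stand in for the instance attributes:

def run_sketch(engine, kv_cache, inputs, val_inp):
    # Internal KV cache path: the native LIB.kv_cache handle is passed to
    # the engine so cache reads/writes stay inside the runtime instead of
    # round-tripping through numpy arrays.
    if kv_cache is not None and kv_cache._kv_cache is not None:
        if val_inp:
            engine._validate_inputs(inputs)
        return engine._eng_net.execute_list_out(inputs, kv_cache._kv_cache)
    # Otherwise fall back to ordinary execution, with cache tensors handled
    # as regular inputs and outputs.
    return engine.run(inputs, val_inp)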

def __call__(
self,
inp: List[numpy.ndarray],
@@ -154,7 +185,7 @@ def __call__(
# to the input
inp = self.add_kv_cache_to_input(inp)

out = self.engine.run(inp, val_inp)
out = self.run(inp, val_inp)

if self.kv_cache:
logits, *kv_cache_state = out
@@ -187,78 +218,9 @@ def transfer_cache_state(self, cache: DecoderKVCache):
:param cache: The `DecoderKVCache` object to transfer to the engine
from
"""
cache_to_copy = copy.deepcopy(cache)
target_cache_capacity = self.sequence_length - self.input_ids_length
cache_to_copy.set_capacity(target_cache_capacity)
self.kv_cache = cache_to_copy

def overwrite_onnx_model_inputs(
self,
onnx_file_path: str,
sequence_length: int,
input_ids_length: int,
batch_size: int = 1,
) -> Tuple[str, List[int]]:
"""
Enforces the appropriate input shapes for the onnx model, as well as
checks whether kv cache is enabled or not.
:param onnx_file_path: The path to the onnx model file that will be
overwritten with the new input shapes
:param batch_size: The batch size to use for the input
:param sequence_length: The sequence length to use for the input
:param input_ids_length: The length of input_ids
:return: The path to the onnx model file that has been overwritten
with the new input shapes, as well as the indices of the inputs
that should be cached
"""
model = onnx.load(onnx_file_path, load_external_data=False)
initializer_input_names = set(node.name for node in model.graph.initializer)
external_inputs = [
inp for inp in model.graph.input if inp.name not in initializer_input_names
]
for external_input in external_inputs:
# overwrite the batch size for all the inputs
external_input.type.tensor_type.shape.dim[0].dim_value = batch_size

if external_input.name in ["input_ids", "positions"]:
external_input.type.tensor_type.shape.dim[
1
].dim_value = input_ids_length
elif external_input.name == "attention_mask":
external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length
elif external_input.name.startswith(_CACHE_INPUT_NAME):
external_input.type.tensor_type.shape.dim[2].dim_value = (
sequence_length - input_ids_length
)
elif external_input.name.startswith("causal_mask"):
external_input.type.tensor_type.shape.dim[
2
].dim_value = input_ids_length
external_input.type.tensor_type.shape.dim[3].dim_value = sequence_length
else:
raise ValueError(
f"Unexpected external input name: {external_input.name}"
)

_LOGGER.info(
"Overwriting in-place the input shapes "
f"of the transformer model at {onnx_file_path}"
)
save_onnx(model, onnx_file_path)

output_indices_to_be_cached = [
1 if inp.name.startswith("present") else 0 for inp in model.graph.output
]
if any(output_indices_to_be_cached):
kv_cache_elem_type = next(
inp
for inp in model.graph.input
if inp.name.startswith(_CACHE_INPUT_NAME)
).type.tensor_type.elem_type
self.kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type)

return onnx_file_path, output_indices_to_be_cached
cache.set_capacity(target_cache_capacity)
self.kv_cache = cache
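Note that the method now resizes and adopts the incoming cache directly instead of deep-copying it first. A hypothetical usage, assuming the two-engine setup used elsewhere in the text generation pipeline; the engine names are illustrative:

# Move a live decoding session from the multi-token (prompt) engine to the
# single-token (autoregressive) engine; names here are illustrative.
session = multitoken_engine.kv_cache          # a DecoderKVCache instance
single_token_engine.transfer_cache_state(cache=session)
# The target engine resizes the cache to sequence_length - input_ids_length
# and takes ownership of the same object; no deep copy is made anymore.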

def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray:
"""
@@ -283,7 +245,7 @@ def reset_kv_cache(self):
kv_cache_state = self._initialize_kv_cache_state(
self.sequence_length - self.input_ids_length
)
self.kv_cache.setup_session(
self.kv_cache.setup(
session_id=self._session_id,
state=kv_cache_state,
num_processed_tokens=0,
@@ -328,7 +290,7 @@ def update_kv_cache(
name: array for name, array in zip(cache_onnx_names, kv_cache_state)
}

self.kv_cache.update_session(
self.kv_cache.update(
state=kv_cache_state,
input_ids_len=input_ids_len,
)
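The mapping built above pairs each cache input name with the corresponding `present.*` output array. A small sketch with made-up names and shapes, assuming the `past_key_values` prefix that `_CACHE_INPUT_NAME` denotes elsewhere in this file:

import numpy

# One transformer layer's key/value cache inputs (names are illustrative).
cache_onnx_names = ["past_key_values.0.key", "past_key_values.0.value"]
kv_cache_state = [
    numpy.zeros((1, 16, 127, 64), dtype=numpy.float32),
    numpy.zeros((1, 16, 127, 64), dtype=numpy.float32),
]
state = {name: array for name, array in zip(cache_onnx_names, kv_cache_state)}
# kv_cache.update(state=state, input_ids_len=1) then advances the cache
# window by the number of freshly processed tokens.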
@@ -364,6 +326,6 @@ def _should_freeze_first_position(tokenizer) -> bool:
# (True if tokenizer has a prefix for a BOS token)
if tokenizer is None:
return False
if hasattr(tokenizer, "bos_token"):
if hasattr(tokenizer, "add_bos_token"):
return True
return False
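The attribute check was corrected from `bos_token`, which most Hugging Face tokenizers define (making the old check nearly always true), to `add_bos_token`. An illustrative check, assuming an OPT-style tokenizer that exposes `add_bos_token`; the model name is an example, not taken from this PR:

from transformers import AutoTokenizer

# Illustrative: OPT-style tokenizers expose add_bos_token, so the first
# cache position (holding the BOS entry) should stay frozen during decoding.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
freeze_first = tokenizer is not None and hasattr(tokenizer, "add_bos_token")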