From 9b22a25d0379b58733fe415079b61d673744c073 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Wed, 8 Mar 2023 16:06:20 +0000
Subject: [PATCH 1/8] initial commit

---
 src/deepsparse/tasks.py                            |  2 +
 src/deepsparse/transformers/helpers.py             | 10 ++-
 .../transformers/pipelines/__init__.py             |  1 +
 .../transformers/pipelines/helper/__init__.py      |  1 +
 .../pipelines/helper/autoregressive.py             | 63 +++++++++++++++++++
 .../transformers/pipelines/text_generation.py      | 41 ++++++++++++
 6 files changed, 115 insertions(+), 3 deletions(-)
 create mode 100644 src/deepsparse/transformers/pipelines/helper/__init__.py
 create mode 100644 src/deepsparse/transformers/pipelines/helper/autoregressive.py

diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py
index aa6c349eb6..90bece573f 100644
--- a/src/deepsparse/tasks.py
+++ b/src/deepsparse/tasks.py
@@ -82,6 +82,7 @@ class SupportedTasks:
             "token_classification",
             "zero_shot_text_classification",
             "transformers_embedding_extraction",
+            "text_generation"
         ],
     )(
         question_answering=AliasedTask("question_answering", ["qa"]),
@@ -93,6 +94,7 @@ class SupportedTasks:
         transformers_embedding_extraction=AliasedTask(
             "transformers_embedding_extraction", []
         ),
+        text_generation=AliasedTask("text_generation", ["codegen"]),
     )
 
     image_classification = namedtuple("image_classification", ["image_classification"])(
diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py
index e204839e73..bebf9a9195 100644
--- a/src/deepsparse/transformers/helpers.py
+++ b/src/deepsparse/transformers/helpers.py
@@ -159,13 +159,17 @@ def overwrite_transformer_onnx_model_inputs(
     ]
     input_names = []
     for external_input in external_inputs:
-        external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
-        external_input.type.tensor_type.shape.dim[1].dim_value = max_length
+        # this is a hack for now, we will need to think a smart way to implement
+        # this in the future
+        if external_input.name.startswith("input_ids"):
+            external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+            external_input.type.tensor_type.shape.dim[1].dim_value = max_length
         input_names.append(external_input.name)
 
     # Save modified model
     if output_path is None:
-        tmp_file = NamedTemporaryFile()  # file will be deleted after program exit
+        # hack for now, otherwise temporary files dissappear for some reason
+        tmp_file = NamedTemporaryFile(delete=False)  # file will be deleted after program exit
         onnx.save(model, tmp_file.name)
 
         return tmp_file.name, input_names, tmp_file
diff --git a/src/deepsparse/transformers/pipelines/__init__.py b/src/deepsparse/transformers/pipelines/__init__.py
index 3e2e88381d..e30215182a 100644
--- a/src/deepsparse/transformers/pipelines/__init__.py
+++ b/src/deepsparse/transformers/pipelines/__init__.py
@@ -21,3 +21,4 @@
 from .token_classification import *
 from .zero_shot_text_classification import *
 from .embedding_extraction import *
+from .text_generation import *
diff --git a/src/deepsparse/transformers/pipelines/helper/__init__.py b/src/deepsparse/transformers/pipelines/helper/__init__.py
new file mode 100644
index 0000000000..dddccb5767
--- /dev/null
+++ b/src/deepsparse/transformers/pipelines/helper/__init__.py
@@ -0,0 +1 @@
+from .autoregressive import *
\ No newline at end of file
diff --git a/src/deepsparse/transformers/pipelines/helper/autoregressive.py b/src/deepsparse/transformers/pipelines/helper/autoregressive.py
new file mode 100644
index 0000000000..915ce37e00
--- /dev/null
+++ b/src/deepsparse/transformers/pipelines/helper/autoregressive.py
@@ -0,0 +1,63 @@
+import os
+from deepsparse import Context, MultiModelEngine
+from deepsparse.pipeline import DEEPSPARSE_ENGINE, ORT_ENGINE, SUPPORTED_PIPELINE_ENGINES, ORTEngine, Engine
+from deepsparse.transformers.pipelines import TransformersPipeline
+from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs
+
+_MODEL_DIR_ONNX_DECODER_NAME = "decoder_model.onnx"
+
+
+class AutoregressivePipeline(TransformersPipeline):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def setup_onnx_file_path(self) -> str:
+        """
+        Performs any setup to unwrap and process the given `model_path` and other
+        class properties into an inference ready onnx file to be compiled by the
+        engine of the pipeline
+
+        :return: file path to the ONNX file for the engine to compile
+        """
+        onnx_path = super().setup_onnx_file_path()
+        onnx_decoder_path = self.setup_decoder_onnx_file_path()
+        return onnx_path, onnx_decoder_path
+
+    def setup_decoder_onnx_file_path(self):
+        decoder_onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_DECODER_NAME)
+
+        decoder_onnx_path, self.decoder_onnx_input_names, self._temp_model_directory = overwrite_transformer_onnx_model_inputs(
+            decoder_onnx_path, max_length=self.sequence_length
+        )
+
+        return decoder_onnx_path
+
+    def _initialize_engine(self):
+        assert len(self.onnx_file_path) == 2, "Expected two onnx files for encoder and decoder"
+        #assert os.path.exists(self.onnx_file_path[0]), f"Encoder onnx file does not exist at {self.onnx_file_path[0]}"
+        assert os.path.exists(self.onnx_file_path[1]), f"Decoder onnx file does not exist at {self.onnx_file_path[1]}"
+        self.engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[0])
+        self.decoder_engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[1])
+
+
+    def _initialize_single_engine(self, onnx_file_path):
+        engine_type = self.engine_type.lower()
+
+        if engine_type == DEEPSPARSE_ENGINE:
+            if self.context is not None and isinstance(self.context, Context):
+                self._engine_args.pop("num_cores", None)
+                self._engine_args.pop("scheduler", None)
+                self._engine_args["context"] = self.context
+                return MultiModelEngine(
+                    model=onnx_file_path,
+                    **self._engine_args,
+                )
+            return Engine(onnx_file_path, **self._engine_args)
+        elif engine_type == ORT_ENGINE:
+            return ORTEngine(onnx_file_path, **self._engine_args)
+        else:
+            raise ValueError(
+                f"Unknown engine_type {self.engine_type}. Supported values include: "
+                f"{SUPPORTED_PIPELINE_ENGINES}"
+            )
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index e69de29bb2..905bf9a996 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -0,0 +1,41 @@
+from typing import Type, List, Union, Tuple, Dict, Any
+
+import numpy
+from pydantic import BaseModel
+
+from deepsparse import Pipeline
+from deepsparse.transformers.pipelines.helper import AutoregressivePipeline
+
+
+
+__all__ = ["TextGenerationPipeline"]
+
+@Pipeline.register(
+    task="text_generation",
+    task_aliases=["codegen"],
+)
+class TextGenerationPipeline(AutoregressivePipeline):
+
+    @staticmethod
+    def route_input_to_bucket(*args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs) -> Pipeline:
+        pass
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def process_inputs(self, inputs: BaseModel) -> Union[
+        List[numpy.ndarray], Tuple[List[numpy.ndarray], Dict[str, Any]]]:
+        pass
+
+    def process_engine_outputs(self, engine_outputs: List[numpy.ndarray], **kwargs) -> BaseModel:
+        pass
+
+    @property
+    def input_schema(self) -> Type[BaseModel]:
+        pass
+
+    @property
+    def output_schema(self) -> Type[BaseModel]:
+        pass
+
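Reader's note: at this point the task is registered but the pipeline methods are still stubs. Once the later commits land, the registration above is what makes the following invocation possible — a minimal sketch, assuming a local directory containing the exported ONNX decoder models (the path is hypothetical):

```python
from deepsparse import Pipeline

# "codegen" is the alias registered for "text_generation" in tasks.py above
pipeline = Pipeline.create(
    task="codegen",
    model_path="/path/to/codegen-onnx-model-dir",  # hypothetical model directory
    sequence_length=128,
)
```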
From 5cd038a067fac5311a1919db60eb476900219c33 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 10 Mar 2023 11:10:23 +0000
Subject: [PATCH 2/8] Corey's simplifications

---
 src/deepsparse/pipeline.py                         |  2 +-
 src/deepsparse/transformers/helpers.py             |  2 +-
 .../pipelines/helper/autoregressive.py             | 75 ++++++++++++++++++-
 .../transformers/pipelines/pipeline.py             |  9 ++-
 .../transformers/pipelines/text_generation.py      | 67 ++++++++++++++---
 5 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py
index 88d5414992..48b423337c 100644
--- a/src/deepsparse/pipeline.py
+++ b/src/deepsparse/pipeline.py
@@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel:
         batches = self.split_engine_inputs(engine_inputs, self._batch_size)
 
         # submit split batches to engine threadpool
-        batch_outputs = list(self.executor.map(self.engine_forward, batches))
+        batch_outputs = [self.engine_forward(b, **postprocess_kwargs) for b in batches]
 
         # join together the batches of size `self._batch_size`
         engine_outputs = self.join_engine_outputs(batch_outputs)
diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py
index bebf9a9195..7a0719faf9 100644
--- a/src/deepsparse/transformers/helpers.py
+++ b/src/deepsparse/transformers/helpers.py
@@ -168,7 +168,7 @@ def overwrite_transformer_onnx_model_inputs(
 
     # Save modified model
     if output_path is None:
-        # hack for now, otherwise temporary files dissappear for some reason
+        # hack for now, otherwise temporary files disappear for some reason
         tmp_file = NamedTemporaryFile(delete=False)  # file will be deleted after program exit
         onnx.save(model, tmp_file.name)
 
diff --git a/src/deepsparse/transformers/pipelines/helper/autoregressive.py b/src/deepsparse/transformers/pipelines/helper/autoregressive.py
index 915ce37e00..d2dd90bcb3 100644
--- a/src/deepsparse/transformers/pipelines/helper/autoregressive.py
+++ b/src/deepsparse/transformers/pipelines/helper/autoregressive.py
@@ -1,8 +1,12 @@
 import os
+import numpy
 from deepsparse import Context, MultiModelEngine
 from deepsparse.pipeline import DEEPSPARSE_ENGINE, ORT_ENGINE, SUPPORTED_PIPELINE_ENGINES, ORTEngine, Engine
 from deepsparse.transformers.pipelines import TransformersPipeline
 from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs
+from typing import Type, List, Mapping, Any
+from pydantic import BaseModel
+from abc import abstractmethod
 
 _MODEL_DIR_ONNX_DECODER_NAME = "decoder_model.onnx"
 
@@ -11,6 +15,73 @@ class AutoregressivePipeline(TransformersPipeline):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self._initialize_engine()
+
+
+
+    def engine_forward(self, engine_inputs: List[numpy.ndarray], **kwargs) -> List[numpy.ndarray]:
+        """
+        :param engine_inputs: list of numpy inputs to Pipeline engine forward
+            pass
+        :return: result of forward pass to Pipeline engine
+        """
+        batch_num = engine_inputs[0].shape[0]
+        eos_token_found = [False] * batch_num
+        new_tokens = []
+        valid_tokens_mask = kwargs.get("valid_tokens_mask", None)
+        if valid_tokens_mask is None:
+            raise ValueError
+
+        logits, *kv_cache = self.decoder_engine(engine_inputs)
+
+        # Using the mask to keep the valid tokens only
+        valid_tokens = numpy.ma.masked_array(engine_inputs[0], valid_tokens_mask)
+        for batch_idx, valid_tokens_sequence in enumerate(valid_tokens):
+            # by counting the number of valid tokens,
+            # we can get the index of the last valid token
+            # Is this assumption always valid?
+            last_valid_token_idx = numpy.ma.count(valid_tokens_sequence)
+            # get the logits that emerge after processing the last valid token
+            last_logits = logits[batch_idx, last_valid_token_idx - 1, :]
+            next_token = numpy.argmax(last_logits)
+            eos_token_found[batch_idx] = next_token == self.tokenizer.eos_token_id
+            if last_valid_token_idx >= self.sequence_length:
+                raise ValueError("Sequence length exceeded")
+            new_tokens.append(next_token)
+            engine_inputs[1][batch_idx, last_valid_token_idx] = 1
+
+
+        input_dict = {}
+        input_dict['input_ids'] = numpy.array([[next_token]])
+        input_dict['attention_mask'] = engine_inputs[1]
+
+        kv_cache_names = [name.replace('present', 'past_key_values') for name in self.decoder_engine._output_names if name.startswith('present')]
+        for name, array in zip(kv_cache_names, kv_cache):
+            input_dict[name] = array
+
+        engine_inputs = [input_dict[name] for name in self.onnx_input_names]
+
+        while last_valid_token_idx < self.sequence_length:
+            if all(eos_token_found):
+                return engine_inputs[0]
+            logits, *kv_cache = self.engine(engine_inputs)
+            # Using the mask to keep the valid tokens only
+            valid_tokens = numpy.ma.masked_array(engine_inputs[0], valid_tokens_mask)
+            for batch_idx, valid_tokens_sequence in enumerate(valid_tokens):
+                # by counting the number of valid tokens,
+                # we can get the index of the last valid token
+                # Is this assumption always valid?
+                last_valid_token_idx = numpy.ma.count(valid_tokens_sequence)
+                # get the logits that emerge after processing the last valid token
+                last_logits = logits[batch_idx, last_valid_token_idx - 1, :]
+                next_token = numpy.argmax(last_logits)
+                eos_token_found = next_token == self.tokenizer.eos_token_id
+                engine_inputs[0][batch_idx, last_valid_token_idx] = next_token
+                engine_inputs[1][batch_idx, last_valid_token_idx] = 1
+
+        return engine_inputs[0]
+
+
 
     def setup_onnx_file_path(self) -> str:
         """
@@ -24,6 +95,7 @@ class properties into an inference ready onnx file to be compiled by the
         onnx_decoder_path = self.setup_decoder_onnx_file_path()
         return onnx_path, onnx_decoder_path
 
+
     def setup_decoder_onnx_file_path(self):
         decoder_onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_DECODER_NAME)
 
@@ -35,7 +107,7 @@ def setup_decoder_onnx_file_path(self):
 
     def _initialize_engine(self):
         assert len(self.onnx_file_path) == 2, "Expected two onnx files for encoder and decoder"
-        #assert os.path.exists(self.onnx_file_path[0]), f"Encoder onnx file does not exist at {self.onnx_file_path[0]}"
+        assert os.path.exists(self.onnx_file_path[0]), f"Encoder onnx file does not exist at {self.onnx_file_path[0]}"
         assert os.path.exists(self.onnx_file_path[1]), f"Decoder onnx file does not exist at {self.onnx_file_path[1]}"
         self.engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[0])
         self.decoder_engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[1])
@@ -61,3 +133,4 @@ def _initialize_single_engine(self, onnx_file_path):
                 f"Unknown engine_type {self.engine_type}. Supported values include: "
                 f"{SUPPORTED_PIPELINE_ENGINES}"
             )
+
diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py
index 38073e260f..a8ef694b77 100644
--- a/src/deepsparse/transformers/pipelines/pipeline.py
+++ b/src/deepsparse/transformers/pipelines/pipeline.py
@@ -126,19 +126,20 @@ def setup_onnx_file_path(self) -> str:
         return onnx_path
 
     def tokens_to_engine_input(
-        self, tokens: Mapping[Any, numpy.ndarray]
+        self, tokens: Mapping[Any, numpy.ndarray], onnx_input_names: Optional[List[str]] = None
     ) -> List[numpy.ndarray]:
         """
         :param tokens: outputs of the pipeline tokenizer
         :return: list of numpy arrays in expected order for model input
         """
-        if not all(name in tokens for name in self.onnx_input_names):
+        onnx_input_names = onnx_input_names or self.onnx_input_names
+        if not all(name in tokens for name in onnx_input_names):
             raise ValueError(
-                f"pipeline expected arrays with names {self.onnx_input_names}, "
+                f"pipeline expected arrays with names {onnx_input_names}, "
                 f"received inputs: {list(tokens.keys())}"
             )
 
-        return [tokens[name] for name in self.onnx_input_names]
+        return [tokens[name] for name in onnx_input_names]
 
     @staticmethod
     def should_bucket(*args, **kwargs) -> bool:
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index 905bf9a996..ff2ba3af1f 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -1,8 +1,8 @@
-from typing import Type, List, Union, Tuple, Dict, Any
+from typing import Type, List, Union, Tuple, Dict, Any, Optional
 
 import numpy
-from pydantic import BaseModel
-
+from pydantic import BaseModel, Field
+from transformers import BatchEncoding
 from deepsparse import Pipeline
 from deepsparse.transformers.pipelines.helper import AutoregressivePipeline
 
@@ -10,6 +10,14 @@
 
 __all__ = ["TextGenerationPipeline"]
 
+class InputSchema(BaseModel):
+    sequences: Union[str, List[str]]
+
+class OutputSchema(BaseModel):
+    sequences: Union[str, List[str]]
+
+
+
 @Pipeline.register(
     task="text_generation",
     task_aliases=["codegen"],
@@ -23,19 +31,58 @@ def route_input_to_bucket(*args, input_schema: BaseModel, pipelines: List[Pipeli
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def process_inputs(self, inputs: BaseModel) -> Union[
-        List[numpy.ndarray], Tuple[List[numpy.ndarray], Dict[str, Any]]]:
-        pass
 
-    def process_engine_outputs(self, engine_outputs: List[numpy.ndarray], **kwargs) -> BaseModel:
-        pass
+    def process_engine_outputs(self, engine_outputs: List[numpy.ndarray], **kwargs) -> BaseModel:
+
+        return None
 
     @property
     def input_schema(self) -> Type[BaseModel]:
-        pass
+        """
+        :return: pydantic model class that inputs to this pipeline must comply to
+        """
+        return InputSchema
 
+
     @property
     def output_schema(self) -> Type[BaseModel]:
-        pass
+        return OutputSchema
+
+
+    def process_inputs(
+            self,
+            inputs: BaseModel):
+
+        sequences = inputs.sequences
+        if isinstance(sequences, List) and all(
+                isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences
+        ):
+            # if batch items contain only one sequence but are wrapped in lists, unwrap
+            # for use as tokenizer input
+            sequences = [sequence[0] for sequence in sequences]
+
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        input_tokens = self.tokenizer(
+            sequences,
+            add_special_tokens=True,
+            return_tensors="np",
+            max_length=self.sequence_length,
+            padding="max_length")
+
+
+        onnx_input_names = [input_name for input_name in self.onnx_input_names if not input_name.startswith("past_key_values")]
+        engine_input = self.tokens_to_engine_input(input_tokens, onnx_input_names = onnx_input_names)
+
+        # a boolean mask that indicates which tokens are valid (are non-padding tokens)
+        valid_tokens_mask = numpy.where(
+            engine_input[0] == self.tokenizer.pad_token_id, 1, 0
+        )
+
+        preprocessing_kwargs = dict(
+            input_sequence=engine_input[0], valid_tokens_mask=valid_tokens_mask
+        )
+
+        return engine_input, preprocessing_kwargs
+
+
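Reader's note: the masked-array bookkeeping that `engine_forward` introduces in this commit is compact but easy to misread. A toy illustration of just the mechanics, with invented shapes and values (only the numpy calls match the patch):

```python
import numpy

sequence_length = 8
pad_token_id = 0
input_ids = numpy.array([[5, 7, 3, 0, 0, 0, 0, 0]])  # right-padded sequence

# mask is 1 where the token is padding, exactly as built in process_inputs
valid_tokens_mask = numpy.where(input_ids == pad_token_id, 1, 0)
valid_tokens = numpy.ma.masked_array(input_ids, valid_tokens_mask)

# counting the unmasked entries yields the index right after the last valid token
last_valid_token_idx = numpy.ma.count(valid_tokens[0])  # -> 3

# fake logits of shape (batch, sequence, vocab); the argmax over the logits
# produced at the last valid position picks the next token greedily
logits = numpy.random.rand(1, sequence_length, 16)
next_token = numpy.argmax(logits[0, last_valid_token_idx - 1, :])
```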
From 2a28e4f659af2700af3d57bd45b86bf740150937 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 10 Mar 2023 13:44:05 +0000
Subject: [PATCH 3/8] finishing the second model static

---
 src/deepsparse/pipeline.py                         |   3 +-
 src/deepsparse/tasks.py                            |   2 +-
 src/deepsparse/transformers/helpers.py             |  16 +-
 .../transformers/pipelines/helper/__init__.py      |   1 -
 .../pipelines/helper/autoregressive.py             | 136 --------------
 .../transformers/pipelines/pipeline.py             |   4 +-
 .../transformers/pipelines/text_generation.py      | 169 +++++++++++++---
 7 files changed, 159 insertions(+), 172 deletions(-)
 delete mode 100644 src/deepsparse/transformers/pipelines/helper/__init__.py
 delete mode 100644 src/deepsparse/transformers/pipelines/helper/autoregressive.py

diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py
index 48b423337c..ea3d87b879 100644
--- a/src/deepsparse/pipeline.py
+++ b/src/deepsparse/pipeline.py
@@ -263,7 +263,8 @@ def __call__(self, *args, **kwargs) -> BaseModel:
         batches = self.split_engine_inputs(engine_inputs, self._batch_size)
 
         # submit split batches to engine threadpool
-        batch_outputs = [self.engine_forward(b, **postprocess_kwargs) for b in batches]
+        # batch_outputs = list(self.executor.map(self.engine_forward, batches))
+        batch_outputs = [self.engine_forward(b) for b in batches]
 
         # join together the batches of size `self._batch_size`
         engine_outputs = self.join_engine_outputs(batch_outputs)
diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py
index 90bece573f..394406a19b 100644
--- a/src/deepsparse/tasks.py
+++ b/src/deepsparse/tasks.py
@@ -82,7 +82,7 @@ class SupportedTasks:
             "token_classification",
             "zero_shot_text_classification",
             "transformers_embedding_extraction",
-            "text_generation"
+            "text_generation",
         ],
     )(
         question_answering=AliasedTask("question_answering", ["qa"]),
diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py
index 7a0719faf9..efece7f259 100644
--- a/src/deepsparse/transformers/helpers.py
+++ b/src/deepsparse/transformers/helpers.py
@@ -135,6 +135,7 @@ def overwrite_transformer_onnx_model_inputs(
     batch_size: int = 1,
     max_length: int = 128,
     output_path: Optional[str] = None,
+    input_names_to_overwrite: Optional[List[str]] = ["input_ids"],
 ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]:
     """
     Overrides an ONNX model's inputs to have the given batch size and sequence lengths.
@@ -159,17 +160,18 @@ def overwrite_transformer_onnx_model_inputs(
     ]
     input_names = []
     for external_input in external_inputs:
-        # this is a hack for now, we will need to think a smart way to implement
-        # this in the future
-        if external_input.name.startswith("input_ids"):
-            external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
-            external_input.type.tensor_type.shape.dim[1].dim_value = max_length
+        if input_names_to_overwrite is not None:
+            if external_input.name not in input_names_to_overwrite:
+                input_names.append(external_input.name)
+                continue
+
+        external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+        external_input.type.tensor_type.shape.dim[1].dim_value = max_length
         input_names.append(external_input.name)
 
     # Save modified model
     if output_path is None:
-        # hack for now, otherwise temporary files disappear for some reason
-        tmp_file = NamedTemporaryFile(delete=False)  # file will be deleted after program exit
+        tmp_file = NamedTemporaryFile()  # file will be deleted after program exit
         onnx.save(model, tmp_file.name)
 
         return tmp_file.name, input_names, tmp_file
diff --git a/src/deepsparse/transformers/pipelines/helper/__init__.py b/src/deepsparse/transformers/pipelines/helper/__init__.py
deleted file mode 100644
index dddccb5767..0000000000
--- a/src/deepsparse/transformers/pipelines/helper/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .autoregressive import *
\ No newline at end of file
diff --git a/src/deepsparse/transformers/pipelines/helper/autoregressive.py b/src/deepsparse/transformers/pipelines/helper/autoregressive.py
deleted file mode 100644
index d2dd90bcb3..0000000000
--- a/src/deepsparse/transformers/pipelines/helper/autoregressive.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-import numpy
-from deepsparse import Context, MultiModelEngine
-from deepsparse.pipeline import DEEPSPARSE_ENGINE, ORT_ENGINE, SUPPORTED_PIPELINE_ENGINES, ORTEngine, Engine
-from deepsparse.transformers.pipelines import TransformersPipeline
-from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs
-from typing import Type, List, Mapping, Any
-from pydantic import BaseModel
-from abc import abstractmethod
-
-_MODEL_DIR_ONNX_DECODER_NAME = "decoder_model.onnx"
-
-
-class AutoregressivePipeline(TransformersPipeline):
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self._initialize_engine()
-
-
-
-    def engine_forward(self, engine_inputs: List[numpy.ndarray], **kwargs) -> List[numpy.ndarray]:
-        """
-        :param engine_inputs: list of numpy inputs to Pipeline engine forward
-            pass
-        :return: result of forward pass to Pipeline engine
-        """
-        batch_num = engine_inputs[0].shape[0]
-        eos_token_found = [False] * batch_num
-        new_tokens = []
-        valid_tokens_mask = kwargs.get("valid_tokens_mask", None)
-        if valid_tokens_mask is None:
-            raise ValueError
-
-        logits, *kv_cache = self.decoder_engine(engine_inputs)
-
-        # Using the mask to keep the valid tokens only
-        valid_tokens = numpy.ma.masked_array(engine_inputs[0], valid_tokens_mask)
-        for batch_idx, valid_tokens_sequence in enumerate(valid_tokens):
-            # by counting the number of valid tokens,
-            # we can get the index of the last valid token
-            # Is this assumption always valid?
-            last_valid_token_idx = numpy.ma.count(valid_tokens_sequence)
-            # get the logits that emerge after processing the last valid token
-            last_logits = logits[batch_idx, last_valid_token_idx - 1, :]
-            next_token = numpy.argmax(last_logits)
-            eos_token_found[batch_idx] = next_token == self.tokenizer.eos_token_id
-            if last_valid_token_idx >= self.sequence_length:
-                raise ValueError("Sequence length exceeded")
-            new_tokens.append(next_token)
-            engine_inputs[1][batch_idx, last_valid_token_idx] = 1
-
-
-        input_dict = {}
-        input_dict['input_ids'] = numpy.array([[next_token]])
-        input_dict['attention_mask'] = engine_inputs[1]
-
-        kv_cache_names = [name.replace('present', 'past_key_values') for name in self.decoder_engine._output_names if name.startswith('present')]
-        for name, array in zip(kv_cache_names, kv_cache):
-            input_dict[name] = array
-
-        engine_inputs = [input_dict[name] for name in self.onnx_input_names]
-
-        while last_valid_token_idx < self.sequence_length:
-            if all(eos_token_found):
-                return engine_inputs[0]
-            logits, *kv_cache = self.engine(engine_inputs)
-            # Using the mask to keep the valid tokens only
-            valid_tokens = numpy.ma.masked_array(engine_inputs[0], valid_tokens_mask)
-            for batch_idx, valid_tokens_sequence in enumerate(valid_tokens):
-                # by counting the number of valid tokens,
-                # we can get the index of the last valid token
-                # Is this assumption always valid?
-                last_valid_token_idx = numpy.ma.count(valid_tokens_sequence)
-                # get the logits that emerge after processing the last valid token
-                last_logits = logits[batch_idx, last_valid_token_idx - 1, :]
-                next_token = numpy.argmax(last_logits)
-                eos_token_found = next_token == self.tokenizer.eos_token_id
-                engine_inputs[0][batch_idx, last_valid_token_idx] = next_token
-                engine_inputs[1][batch_idx, last_valid_token_idx] = 1
-
-        return engine_inputs[0]
-
-
-
-    def setup_onnx_file_path(self) -> str:
-        """
-        Performs any setup to unwrap and process the given `model_path` and other
-        class properties into an inference ready onnx file to be compiled by the
-        engine of the pipeline
-
-        :return: file path to the ONNX file for the engine to compile
-        """
-        onnx_path = super().setup_onnx_file_path()
-        onnx_decoder_path = self.setup_decoder_onnx_file_path()
-        return onnx_path, onnx_decoder_path
-
-
-    def setup_decoder_onnx_file_path(self):
-        decoder_onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_DECODER_NAME)
-
-        decoder_onnx_path, self.decoder_onnx_input_names, self._temp_model_directory = overwrite_transformer_onnx_model_inputs(
-            decoder_onnx_path, max_length=self.sequence_length
-        )
-
-        return decoder_onnx_path
-
-    def _initialize_engine(self):
-        assert len(self.onnx_file_path) == 2, "Expected two onnx files for encoder and decoder"
-        assert os.path.exists(self.onnx_file_path[0]), f"Encoder onnx file does not exist at {self.onnx_file_path[0]}"
-        assert os.path.exists(self.onnx_file_path[1]), f"Decoder onnx file does not exist at {self.onnx_file_path[1]}"
-        self.engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[0])
-        self.decoder_engine = self._initialize_single_engine(onnx_file_path = self.onnx_file_path[1])
-
-
-    def _initialize_single_engine(self, onnx_file_path):
-        engine_type = self.engine_type.lower()
-
-        if engine_type == DEEPSPARSE_ENGINE:
-            if self.context is not None and isinstance(self.context, Context):
-                self._engine_args.pop("num_cores", None)
-                self._engine_args.pop("scheduler", None)
-                self._engine_args["context"] = self.context
-                return MultiModelEngine(
-                    model=onnx_file_path,
-                    **self._engine_args,
-                )
-            return Engine(onnx_file_path, **self._engine_args)
-        elif engine_type == ORT_ENGINE:
-            return ORTEngine(onnx_file_path, **self._engine_args)
-        else:
-            raise ValueError(
-                f"Unknown engine_type {self.engine_type}. Supported values include: "
-                f"{SUPPORTED_PIPELINE_ENGINES}"
-            )
-
diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py
index a8ef694b77..0e58b57c9b 100644
--- a/src/deepsparse/transformers/pipelines/pipeline.py
+++ b/src/deepsparse/transformers/pipelines/pipeline.py
@@ -126,7 +126,9 @@ def setup_onnx_file_path(self) -> str:
         return onnx_path
 
     def tokens_to_engine_input(
-        self, tokens: Mapping[Any, numpy.ndarray], onnx_input_names: Optional[List[str]] = None
+        self,
+        tokens: Mapping[Any, numpy.ndarray],
+        onnx_input_names: Optional[List[str]] = None,
     ) -> List[numpy.ndarray]:
         """
         :param tokens: outputs of the pipeline tokenizer
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index ff2ba3af1f..6862d26795 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -1,60 +1,87 @@
-from typing import Type, List, Union, Tuple, Dict, Any, Optional
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union
 
 import numpy
 from pydantic import BaseModel, Field
+
+from deepsparse import Context, MultiModelEngine
+from deepsparse.pipeline import (
+    DEEPSPARSE_ENGINE,
+    ORT_ENGINE,
+    SUPPORTED_PIPELINE_ENGINES,
+    Engine,
+    ORTEngine,
+)
+from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs
+from deepsparse.transformers.pipelines import TransformersPipeline
+
+
+_MODEL_DIR_ONNX_DECODER_NAME = "decoder_model.onnx"
 from transformers import BatchEncoding
-from deepsparse import Pipeline
-from deepsparse.transformers.pipelines.helper import AutoregressivePipeline
 
+from deepsparse import Pipeline
 
 
 __all__ = ["TextGenerationPipeline"]
 
+
 class InputSchema(BaseModel):
     sequences: Union[str, List[str]]
 
+
 class OutputSchema(BaseModel):
     sequences: Union[str, List[str]]
 
 
-
 @Pipeline.register(
     task="text_generation",
     task_aliases=["codegen"],
 )
-class TextGenerationPipeline(AutoregressivePipeline):
-
+class TextGenerationPipeline(TransformersPipeline):
     @staticmethod
-    def route_input_to_bucket(*args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs) -> Pipeline:
+    def route_input_to_bucket(
+        *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs
+    ) -> Pipeline:
         pass
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-
-    def process_engine_outputs(self, engine_outputs: List[numpy.ndarray], **kwargs) -> BaseModel:
-
-        return None
+        self.onnx_decoder_path = self.setup_decoder_onnx_file_path()
+        self.decoder_engine = self._initialize_decoder_engine()
 
     @property
     def input_schema(self) -> Type[BaseModel]:
-        """
-        :return: pydantic model class that inputs to this pipeline must comply to
-        """
         return InputSchema
 
-
     @property
     def output_schema(self) -> Type[BaseModel]:
         return OutputSchema
 
+    def process_engine_outputs(
+        self, engine_outputs: List[numpy.ndarray], **kwargs
+    ) -> BaseModel:
+        return None
 
-    def process_inputs(
-            self,
-            inputs: BaseModel):
-
+    def process_inputs(self, inputs: BaseModel):
         sequences = inputs.sequences
         if isinstance(sequences, List) and all(
-                isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences
+            isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences
         ):
             # if batch items contain only one sequence but are wrapped in lists, unwrap
             # for use as tokenizer input
@@ -66,11 +93,17 @@ def process_inputs(
             add_special_tokens=True,
             return_tensors="np",
             max_length=self.sequence_length,
-            padding="max_length")
-
+            padding="max_length",
+        )
 
-        onnx_input_names = [input_name for input_name in self.onnx_input_names if not input_name.startswith("past_key_values")]
-        engine_input = self.tokens_to_engine_input(input_tokens, onnx_input_names = onnx_input_names)
+        onnx_input_names = [
+            input_name
+            for input_name in self.onnx_input_names
+            if not input_name.startswith("past_key_values")
+        ]
+        engine_input = self.tokens_to_engine_input(
+            input_tokens, onnx_input_names=onnx_input_names
+        )
 
         # a boolean mask that indicates which tokens are valid (are non-padding tokens)
         valid_tokens_mask = numpy.where(
@@ -83,6 +116,92 @@ def process_inputs(
 
         return engine_input, preprocessing_kwargs
 
+    def engine_forward(
+        self, engine_inputs: List[numpy.ndarray], **kwargs
+    ) -> List[numpy.ndarray]:
+        """
+        :param engine_inputs: list of numpy inputs to Pipeline engine forward
+            pass
+        :return: result of forward pass to Pipeline engine
+        """
+        assert self._batch_size == 1
+
+        eos_token_found = False
+        generated_tokens = []
+        valid_tokens = [
+            t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id
+        ]
+
+        past_logits, *new_kvs = self.decoder_engine(engine_inputs)
+
+        new_token = numpy.argmax(past_logits[0, -1, :])
+        generated_tokens.append(new_token)
+
+        kv_output_names = [
+            name
+            for name in self.decoder_engine._output_names
+            if name.startswith("present")
+        ]
+        kv_cache = dict(zip(kv_output_names, new_kvs))
+        kv_cache = {
+            k.replace("present", "past_key_values"): v[:, :, :-1]
+            for k, v in kv_cache.items()
+        }
+        for k, v in kv_cache.items():
+            v[:, :, len(valid_tokens) :] = 0.0
+
+        for iter in range(self.sequence_length - len(valid_tokens)):
+            if eos_token_found:
+                return valid_tokens
+
+            attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
+            attention_mask[:, : len(valid_tokens)] = 1
+            attention_mask[:, -1] = 1
+            assert attention_mask.sum() == len(valid_tokens) + 1
+
+            engine_inputs_dict = {
+                "input_ids": numpy.array([[new_token]]),
+                "attention_mask": attention_mask,
+            }
+            engine_inputs_dict.update(kv_cache)
+            engine_inputs = [
+                numpy.ascontiguousarray(engine_inputs_dict[name])
+                for name in self.onnx_input_names
+            ]
+
+            new_logits, *new_kvs = self.engine(engine_inputs)
+
+        return engine_inputs[0]
+
+    def setup_decoder_onnx_file_path(self):
+        decoder_onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_DECODER_NAME)
+        (
+            decoder_onnx_path,
+            self.decoder_onnx_input_names,
+            self._temp_model_directory,
+        ) = overwrite_transformer_onnx_model_inputs(
+            decoder_onnx_path, max_length=self.sequence_length
         )
-
-
+        return decoder_onnx_path
+
+    def _initialize_decoder_engine(self) -> Union[Engine, ORTEngine]:
+        engine_type = self.engine_type.lower()
+
+        if engine_type == DEEPSPARSE_ENGINE:
+            if self.context is not None and isinstance(self.context, Context):
+                self._engine_args.pop("num_cores", None)
+                self._engine_args.pop("scheduler", None)
+                self._engine_args["context"] = self.context
+                return MultiModelEngine(
+                    model=self.onnx_decoder_path,
+                    **self._engine_args,
+                )
+            return Engine(self.onnx_decoder_path, **self._engine_args)
+        elif engine_type == ORT_ENGINE:
+            return ORTEngine(self.onnx_decoder_path, **self._engine_args)
+        else:
+            raise ValueError(
+                f"Unknown engine_type {self.engine_type}. Supported values include: "
+                f"{SUPPORTED_PIPELINE_ENGINES}"
+            )
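Reader's note: this commit is where the ONNX cache-name plumbing first appears — decoder outputs are named `present.*` while the inputs of the next pass expect `past_key_values.*`. A sketch of that renaming in isolation, with invented output names and shapes (batch, heads*dim, sequence):

```python
import numpy

output_names = ["logits", "present.0.key", "present.0.value"]
new_kvs = [numpy.zeros((1, 16, 8)), numpy.zeros((1, 16, 8))]

# keep only the cache outputs and rename them for the next forward pass
kv_output_names = [n for n in output_names if n.startswith("present")]
kv_cache = dict(
    zip((n.replace("present", "past_key_values") for n in kv_output_names), new_kvs)
)
# kv_cache -> {"past_key_values.0.key": ..., "past_key_values.0.value": ...}
```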
From 56f4f28bdc321142b9a4772714249ab9baa28b49 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 10 Mar 2023 17:08:48 +0000
Subject: [PATCH 4/8] ready, time for beautification

---
 src/deepsparse/transformers/helpers.py             |  12 +-
 .../transformers/pipelines/pipeline.py             |   2 +-
 .../transformers/pipelines/text_generation.py      | 289 +++++++++++++-----
 3 files changed, 213 insertions(+), 90 deletions(-)

diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py
index efece7f259..18869060de 100644
--- a/src/deepsparse/transformers/helpers.py
+++ b/src/deepsparse/transformers/helpers.py
@@ -135,7 +135,6 @@ def overwrite_transformer_onnx_model_inputs(
     batch_size: int = 1,
     max_length: int = 128,
     output_path: Optional[str] = None,
-    input_names_to_overwrite: Optional[List[str]] = ["input_ids"],
 ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]:
     """
     Overrides an ONNX model's inputs to have the given batch size and sequence lengths.
@@ -160,13 +159,10 @@ def overwrite_transformer_onnx_model_inputs(
     ]
     input_names = []
     for external_input in external_inputs:
-        if input_names_to_overwrite is not None:
-            if external_input.name not in input_names_to_overwrite:
-                input_names.append(external_input.name)
-                continue
-
-        external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
-        external_input.type.tensor_type.shape.dim[1].dim_value = max_length
+        # Commenting this out for now, as it is not needed for the ORT backend
+        # Will be crucial for DeepSparse backend
+        # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+        # external_input.type.tensor_type.shape.dim[1].dim_value = max_length
         input_names.append(external_input.name)
 
     # Save modified model
diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py
index 0e58b57c9b..d28818da73 100644
--- a/src/deepsparse/transformers/pipelines/pipeline.py
+++ b/src/deepsparse/transformers/pipelines/pipeline.py
@@ -109,7 +109,7 @@ def setup_onnx_file_path(self) -> str:
             config_path, finetuning_task=self.task if hasattr(self, "task") else None
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_path, model_max_length=self.sequence_length
+            tokenizer_path, model_max_length=self.sequence_length, padding_side="right"
         )
         self.config_path = os.path.join(config_path, "config.json")
         self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json")
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index 6862d26795..b09100e0c3 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -13,13 +13,12 @@
 import os
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union
+from typing import Dict, List, Tuple, Type, Union
 
 import numpy
 from pydantic import BaseModel, Field
 
-from deepsparse import Context, MultiModelEngine
+from deepsparse import Context, MultiModelEngine, Pipeline
 from deepsparse.pipeline import (
     DEEPSPARSE_ENGINE,
     ORT_ENGINE,
@@ -31,21 +30,31 @@
 from deepsparse.transformers.pipelines import TransformersPipeline
 
 
-_MODEL_DIR_ONNX_DECODER_NAME = "decoder_model.onnx"
-from transformers import BatchEncoding
-
-from deepsparse import Pipeline
-
+_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx"
 
 __all__ = ["TextGenerationPipeline"]
 
 
-class InputSchema(BaseModel):
-    sequences: Union[str, List[str]]
+class TextGenerationInput(BaseModel):
+    sequences: Union[str, List[str]] = Field(
+        description="The input sequence(s) to generate "
+        "text from. If a string is provided, "
+        "the model will generate text from the "
+        "provided sequence. If a list of strings "
+        "is provided, the model will "
+        "generate text from each sequence in the list.",
+    )
 
 
-class OutputSchema(BaseModel):
-    sequences: Union[str, List[str]]
+class TextGenerationOutput(BaseModel):
+    sequences: Union[str, List[str]] = Field(
+        description="The input text sequence(s) appended with "
+        "the generated text sequence(s). "
+        "If a string was provided as input, "
+        "a string will be returned. "
+        "If a list of strings was provided as "
+        "input, a list of strings will be returned.",
+    )
 
 
 @Pipeline.register(
@@ -53,32 +62,71 @@ class OutputSchema(BaseModel):
     task_aliases=["codegen"],
 )
 class TextGenerationPipeline(TransformersPipeline):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path()
+        self.multitoken_engine = self._initialize_multitoken_engine()
+        if self._batch_size != 1:
+            raise ValueError(
+                "For the sake of simplicity, only dynamic"
+                "batch shape is supported for now. "
+                "Set `batch_size` to 1 or None."
+            )
+
     @staticmethod
     def route_input_to_bucket(
         *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs
     ) -> Pipeline:
-        pass
+        """
+        This method is used to route the input to the correct pipeline.
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.onnx_decoder_path = self.setup_decoder_onnx_file_path()
-        self.decoder_engine = self._initialize_decoder_engine()
+        :param args: args to pass to the pipeline
+        :param input_schema: the input schema for the pipeline
+        :param pipelines: the list of pipelines to route the input to
+        :param kwargs: kwargs to pass to the pipeline
+        :return: the pipeline to route the input to
+        """
+        raise NotImplemented()
 
     @property
     def input_schema(self) -> Type[BaseModel]:
-        return InputSchema
+        """
+        Property to return the input schema for the pipeline.
+
+        :return: the input schema for the pipeline
+        """
+        return TextGenerationInput
 
     @property
     def output_schema(self) -> Type[BaseModel]:
-        return OutputSchema
+        """
+        Property to return the output schema for the pipeline.
+
+        :return: the output schema for the pipeline
+        """
+        return TextGenerationOutput
 
     def process_engine_outputs(
         self, engine_outputs: List[numpy.ndarray], **kwargs
     ) -> BaseModel:
-        return None
+        """
+        Convert the engine outputs to the output schema for the pipeline.
+
+        :param engine_outputs: the outputs from the engine
+        :return: the output schema for the pipeline
+        """
+        sequences = self.tokenizer.batch_decode(
+            engine_outputs[0], skip_special_tokens=True
+        )
+        return TextGenerationOutput(sequences=sequences)
 
-    def process_inputs(self, inputs: BaseModel):
+    def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]:
+        """
+        Convert the input schema for the pipeline to the inputs for the engine.
+        :param inputs: the input schema for the pipeline
+        :return: the inputs for the engine
+        """
         sequences = inputs.sequences
         if isinstance(sequences, List) and all(
             isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences
@@ -90,31 +138,16 @@ def process_inputs(self, inputs: BaseModel):
         self.tokenizer.pad_token = self.tokenizer.eos_token
         input_tokens = self.tokenizer(
             sequences,
-            add_special_tokens=True,
             return_tensors="np",
             max_length=self.sequence_length,
             padding="max_length",
         )
 
-        onnx_input_names = [
-            input_name
-            for input_name in self.onnx_input_names
-            if not input_name.startswith("past_key_values")
-        ]
         engine_input = self.tokens_to_engine_input(
-            input_tokens, onnx_input_names=onnx_input_names
+            input_tokens, onnx_input_names=self.multitoken_engine._input_names
         )
 
-        # a boolean mask that indicates which tokens are valid (are non-padding tokens)
-        valid_tokens_mask = numpy.where(
-            engine_input[0] == self.tokenizer.pad_token_id, 1, 0
-        )
-
-        preprocessing_kwargs = dict(
-            input_sequence=engine_input[0], valid_tokens_mask=valid_tokens_mask
-        )
-
-        return engine_input, preprocessing_kwargs
+        return engine_input
 
     def engine_forward(
@@ -124,68 +157,162 @@ def engine_forward(
             pass
         :return: result of forward pass to Pipeline engine
         """
-        assert self._batch_size == 1
 
+        # flag to indicate if the end of the sequence has been reached
         eos_token_found = False
-        generated_tokens = []
-        valid_tokens = [
-            t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id
-        ]
+        # list of the meaningful tokens in the sequence
+        tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id]
 
-        past_logits, *new_kvs = self.decoder_engine(engine_inputs)
+        tokens, kv_cache = self.initial_autoregressive_pass(
+            engine=self.multitoken_engine, tokens=tokens, engine_inputs=engine_inputs
+        )
 
-        new_token = numpy.argmax(past_logits[0, -1, :])
-        generated_tokens.append(new_token)
+        # perform the remaining autoregressive passes
+        num_iterations = self.sequence_length - len(tokens)
+        for iter in range(num_iterations):
+            if eos_token_found:
+                return numpy.array([[tokens]])
+
+            tokens, kv_cache = self.autoregressive_pass(
+                self.engine, tokens, kv_cache, self.sequence_length
+            )
+
+        return numpy.array([[tokens]])
+
+    @staticmethod
+    def autoregressive_pass(
+        engine: Union[Engine, ORTEngine],
+        tokens: List[int],
+        kv_cache: Dict[str, numpy.ndarray],
+        sequence_length: int,
+    ) -> Tuple[List[int], Dict[str, numpy.ndarray]]:
+        """
+        Performs an autoregressive pass to generate the next token in the sequence
+        and update the kv_cache with the new key/value pairs.
+
+        1) Set the attention mask to 1 for the tokens that are already in the sequence
+        and 1 for the `new_token`. This is because the `new_token`'s key/value will be
+        added to the set of keys/values at the last position (before being fed to an attention
+        block)
+        2) Setup the engine inputs
+        3) Run the engine forward pas
+        4) Preprocesses the kv cache so that it can be used as input to the next
+        autoregressive pass.
+        5) Returns the new token sequence and the updated kv cache.
+
+        :param engine: the engine to use for the autoregressive pass
+        :param tokens: the current token sequence
+        :param kv_cache: the current kv_cache
+        :param sequence_length: the maximum sequence length
+        :return: the new token sequence and the updated kv cache
+        """
+        new_token = tokens[-1]
+
+        attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
+        attention_mask[:, : len(tokens)] = 1
+        # setting the last token's attention weight
+        # to 1 because the `new_token’s`
+        # key/value gets concatenated to the kv_cache.
+        # The k,v values for the `new_token`
+        # will be located at the last position
+        attention_mask[:, -1] = 1
+
+        engine_inputs_dict = {
+            "input_ids": numpy.array([[new_token]]),
+            "attention_mask": attention_mask,
+        }
+        engine_inputs_dict.update(kv_cache)
+
+        engine_inputs = [engine_inputs_dict[name] for name in engine._input_names]
+
+        new_logits, *new_kvs = engine(engine_inputs)
 
+        # rename the output names to match the names expected
+        # in the next autoregressive pass
         kv_output_names = [
-            name
-            for name in self.decoder_engine._output_names
+            name.replace("present", "past_key_values")
+            for name in engine._output_names
             if name.startswith("present")
         ]
         kv_cache = dict(zip(kv_output_names, new_kvs))
-        kv_cache = {
-            k.replace("present", "past_key_values"): v[:, :, :-1]
-            for k, v in kv_cache.items()
-        }
         for k, v in kv_cache.items():
-            v[:, :, len(valid_tokens) :] = 0.0
+            v[:, :, len(tokens) - 1] = v[:, :, -1]
+            kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1])
 
-        for iter in range(self.sequence_length - len(valid_tokens)):
-            if eos_token_found:
-                return valid_tokens
+        # Obtain the next token from the logits
+        new_token = numpy.argmax(new_logits[0, -1, :])
+        tokens.append(new_token)
+
+        return tokens, kv_cache
+
+    @staticmethod
+    def initial_autoregressive_pass(
+        engine: Union[Engine, ORTEngine],
+        tokens: List[int],
+        engine_inputs: List[numpy.ndarray],
+    ) -> Tuple[List[int], Dict[str, numpy.ndarray]]:
+        """
+        Performs a single autoregressive pass to initialize the key, value cache.
+
+        1) Obtains logits and kv cache for the input sequence.
+        From logits, obtains the next token.
+        2) Preprocesses the kv cache so that it can be used as input to the next
+        autoregressive pass.
+        3) Returns the new token sequence and the updated kv cache.
+
+        :param engine_inputs: list of numpy inputs to Pipeline
+            engine forward pass
+        :param engine: the engine to use for the forward pass
+        :param tokens: input tokens provided by the user
+        :return: the extended token sequence and the kv cache
+        """
+
+        past_logits, *new_kvs = engine(engine_inputs)
 
-            attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
-            attention_mask[:, : len(valid_tokens)] = 1
-            attention_mask[:, -1] = 1
-            assert attention_mask.sum() == len(valid_tokens) + 1
+        # rename the output names to match the names expected
+        # in the next autoregressive pass
+        kv_output_names = [
+            name.replace("present", "past_key_values")
+            for name in engine._output_names
+            if name.startswith("present")
+        ]
+        kv_cache = dict(zip(kv_output_names, new_kvs))
+        for k, v in kv_cache.items():
+            # remove the information about the `new_token` from the cache
+            v = v[:, :, :-1]
+            # zero out all the info that does not pertain to the
+            # "seen" `token` sequence
+            v[:, :, len(tokens) :] = 0.0
+            kv_cache[k] = numpy.ascontiguousarray(v)
 
-            engine_inputs_dict = {
-                "input_ids": numpy.array([[new_token]]),
-                "attention_mask": attention_mask,
-            }
-            engine_inputs_dict.update(kv_cache)
-            engine_inputs = [
-                numpy.ascontiguousarray(engine_inputs_dict[name])
-                for name in self.onnx_input_names
-            ]
+        # Obtain the next token from the logits
+        new_token = numpy.argmax(past_logits[0, len(tokens) - 1])
+        tokens.append(new_token)
 
-            new_logits, *new_kvs = self.engine(engine_inputs)
+        return tokens, kv_cache
 
-        return engine_inputs[0]
+    def _setup_multitoken_onnx_file_path(self) -> str:
+        # `setup_onnx_file_path` function rewritten
+        # to setup the multitoken_onnx_file_path
 
-    def setup_decoder_onnx_file_path(self):
-        decoder_onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_DECODER_NAME)
+        multitoken_onnx_path = os.path.join(
+            self.model_path, _MODEL_DIR_ONNX_MULTI_TOKEN_NAME
+        )
         (
-            decoder_onnx_path,
-            self.decoder_onnx_input_names,
+            multitoken_onnx_path,
+            self.multitoken_onnx_input_names,
             self._temp_model_directory,
         ) = overwrite_transformer_onnx_model_inputs(
-            decoder_onnx_path, max_length=self.sequence_length
+            multitoken_onnx_path, max_length=self.sequence_length
         )
 
-        return decoder_onnx_path
+        return multitoken_onnx_path
+
+    def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]:
+        # `_initialize_engine` function rewritten
+        # to initialize the multitoken_engine
 
-    def _initialize_decoder_engine(self) -> Union[Engine, ORTEngine]:
         engine_type = self.engine_type.lower()
 
         if engine_type == DEEPSPARSE_ENGINE:
@@ -194,12 +321,12 @@ def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]:
                 self._engine_args.pop("scheduler", None)
                 self._engine_args["context"] = self.context
                 return MultiModelEngine(
-                    model=self.onnx_decoder_path,
+                    model=self.onnx_multitoken_path,
                     **self._engine_args,
                 )
-            return Engine(self.onnx_decoder_path, **self._engine_args)
+            return Engine(self.onnx_multitoken_path, **self._engine_args)
         elif engine_type == ORT_ENGINE:
-            return ORTEngine(self.onnx_decoder_path, **self._engine_args)
+            return ORTEngine(self.onnx_multitoken_path, **self._engine_args)
         else:
             raise ValueError(
                 f"Unknown engine_type {self.engine_type}. Supported values include: "
                 f"{SUPPORTED_PIPELINE_ENGINES}"
             )
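Reader's note: the attention-mask trick in `autoregressive_pass` above — ones over the tokens seen so far plus a trailing one for the single new input token whose key/value lands at the last cache position — can be seen in isolation below. Values are invented; the numpy calls mirror the patch:

```python
import numpy

sequence_length = 8
tokens = [5, 7, 3]  # tokens generated/processed so far

attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
attention_mask[:, : len(tokens)] = 1
attention_mask[:, -1] = 1  # slot for the single new input token

# one attention weight per seen token, plus one for the new token
assert attention_mask.sum() == len(tokens) + 1
```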
From f43f05a783d8747c6716357955fc2d1af3d11092 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 10 Mar 2023 17:24:34 +0000
Subject: [PATCH 5/8] ready for review

---
 src/deepsparse/pipeline.py                         |  3 +-
 .../transformers/pipelines/pipeline.py             |  3 +-
 .../transformers/pipelines/text_generation.py      | 38 ++++++++++---------
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py
index ea3d87b879..88d5414992 100644
--- a/src/deepsparse/pipeline.py
+++ b/src/deepsparse/pipeline.py
@@ -263,8 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel:
         batches = self.split_engine_inputs(engine_inputs, self._batch_size)
 
         # submit split batches to engine threadpool
-        # batch_outputs = list(self.executor.map(self.engine_forward, batches))
-        batch_outputs = [self.engine_forward(b) for b in batches]
+        batch_outputs = list(self.executor.map(self.engine_forward, batches))
 
         # join together the batches of size `self._batch_size`
         engine_outputs = self.join_engine_outputs(batch_outputs)
diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py
index d28818da73..16603b6950 100644
--- a/src/deepsparse/transformers/pipelines/pipeline.py
+++ b/src/deepsparse/transformers/pipelines/pipeline.py
@@ -109,7 +109,8 @@ def setup_onnx_file_path(self) -> str:
             config_path, finetuning_task=self.task if hasattr(self, "task") else None
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_path, model_max_length=self.sequence_length, padding_side="right"
+            tokenizer_path,
+            model_max_length=self.sequence_length,
         )
         self.config_path = os.path.join(config_path, "config.json")
         self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json")
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index b09100e0c3..8f0d26edeb 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -64,8 +64,11 @@ class TextGenerationOutput(BaseModel):
 class TextGenerationPipeline(TransformersPipeline):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        # setup the auxiliary multitoken model
         self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path()
+        # initialize the auxiliary multitoken engine
         self.multitoken_engine = self._initialize_multitoken_engine()
+
         if self._batch_size != 1:
             raise ValueError(
                 "For the sake of simplicity, only dynamic"
@@ -86,7 +89,7 @@ def route_input_to_bucket(
         :param kwargs: kwargs to pass to the pipeline
         :return: the pipeline to route the input to
         """
-        raise NotImplemented()
+        raise NotImplementedError
 
     @property
     def input_schema(self) -> Type[BaseModel]:
@@ -128,6 +131,7 @@ def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]:
         :return: the inputs for the engine
         """
         sequences = inputs.sequences
+
         if isinstance(sequences, List) and all(
             isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences
         ):
@@ -136,6 +140,7 @@ def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]:
             sequences = [sequence[0] for sequence in sequences]
 
         self.tokenizer.pad_token = self.tokenizer.eos_token
+
         input_tokens = self.tokenizer(
             sequences,
             return_tensors="np",
@@ -151,14 +156,13 @@ def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]:
 
     def engine_forward(
         self, engine_inputs: List[numpy.ndarray], **kwargs
-    ) -> List[numpy.ndarray]:
+    ) -> numpy.ndarray:
         """
-        :param engine_inputs: list of numpy inputs to Pipeline engine forward
-            pass
-        :return: result of forward pass to Pipeline engine
+        :param engine_inputs: list of numpy inputs to
+            Pipeline engine forward pass
+        :return: A numpy array that contains the tokens generated by the model
         """
-
         # flag to indicate if the end of the sequence has been reached
         eos_token_found = False
         # list of the meaningful tokens in the sequence
         tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id]
@@ -174,7 +178,10 @@ def engine_forward(
                 return numpy.array([[tokens]])
 
             tokens, kv_cache = self.autoregressive_pass(
-                self.engine, tokens, kv_cache, self.sequence_length
+                engine=self.engine,
+                tokens=tokens,
+                kv_cache=kv_cache,
+                sequence_length=self.sequence_length,
             )
 
         return numpy.array([[tokens]])
@@ -190,12 +197,12 @@ def autoregressive_pass(
         Performs an autoregressive pass to generate the next token in the sequence
         and update the kv_cache with the new key/value pairs.
 
-        1) Set the attention mask to 1 for the tokens that are already in the sequence
-        and 1 for the `new_token`. This is because the `new_token`'s key/value will be
-        added to the set of keys/values at the last position (before being fed to an attention
-        block)
-        2) Setup the engine inputs
-        3) Run the engine forward pas
+        1) Set the attention mask to 1 for the tokens that are already in the sequence
+        and 1 for the `new_token` - at the last position. This is because the
+        `new_token`'s key/value will be added to the set of keys/values
+        at the last position (before being fed to an attention block)
+        2) Set up the engine inputs
+        3) Run the engine forward pass
         4) Preprocesses the kv cache so that it can be used as input to the next
         autoregressive pass.
         5) Returns the new token sequence and the updated kv cache.
@@ -211,11 +218,6 @@ def autoregressive_pass(
 
         attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
         attention_mask[:, : len(tokens)] = 1
-        # setting the last token's attention weight
-        # to 1 because the `new_token’s`
-        # key/value gets concatenated to the kv_cache.
-        # The k,v values for the `new_token`
-        # will be located at the last position
         attention_mask[:, -1] = 1
 
         engine_inputs_dict = {
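Reader's note: `process_inputs` relies on the tokenizer padding every prompt out to a fixed `sequence_length`, which is what keeps the engine inputs statically shaped. A sketch of that step on its own, using a public CodeGen tokenizer as a stand-in (the exact checkpoint used in this PR may differ):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi")
tokenizer.pad_token = tokenizer.eos_token  # as set in process_inputs above

encoded = tokenizer(
    "def hello_world():",
    return_tensors="np",
    max_length=128,
    padding="max_length",
)
# encoded["input_ids"] and encoded["attention_mask"] both have shape (1, 128)
```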
- # The k,v values for the `new_token` - # will be located at the last position attention_mask[:, -1] = 1 engine_inputs_dict = { From 28b5131849c1547f16ba4c3d7c600b4146920a50 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 14 Mar 2023 12:54:08 +0000 Subject: [PATCH 6/8] moved the code to examples --- examples/codegen/README.md | 30 +++ .../codegen}/text_generation.py | 180 +++++++++++++++--- src/deepsparse/tasks.py | 2 - src/deepsparse/transformers/helpers.py | 6 +- .../transformers/pipelines/__init__.py | 1 - 5 files changed, 188 insertions(+), 31 deletions(-) create mode 100644 examples/codegen/README.md rename {src/deepsparse/transformers/pipelines => examples/codegen}/text_generation.py (64%) diff --git a/examples/codegen/README.md b/examples/codegen/README.md new file mode 100644 index 0000000000..d855a5e075 --- /dev/null +++ b/examples/codegen/README.md @@ -0,0 +1,30 @@ + + +Example of how to run the pipeline: + +```python +from examples.codegen.text_generation import TextGenerationPipeline + +codegen = TextGenerationPipeline( + model_path="/network/damian/static-codegen-350M-multi", + engine_type="onnxruntime", + sequence_length=128, ) + +out = codegen(sequences=["def hello_world():", "def fibonacci(x):"]) +for seq in out.sequences: + print(seq) +``` \ No newline at end of file diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/examples/codegen/text_generation.py similarity index 64% rename from src/deepsparse/transformers/pipelines/text_generation.py rename to examples/codegen/text_generation.py index 8f0d26edeb..dab643c171 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/examples/codegen/text_generation.py @@ -13,9 +13,11 @@ # limitations under the License. import os -from typing import Dict, List, Tuple, Type, Union +from tempfile import NamedTemporaryFile +from typing import Dict, List, Optional, Tuple, Type, Union import numpy +import onnx from pydantic import BaseModel, Field from deepsparse import Context, MultiModelEngine, Pipeline @@ -26,15 +28,62 @@ Engine, ORTEngine, ) -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.transformers.pipelines import TransformersPipeline +from scipy.special import softmax _MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" __all__ = ["TextGenerationPipeline"] +def overwrite_transformer_onnx_model_inputs( + path: str, + batch_size: int = 1, + max_length: int = 128, + output_path: Optional[str] = None, +) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: + """ + Overrides an ONNX model's inputs to have the given batch size and sequence lengths. 
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/examples/codegen/text_generation.py
similarity index 64%
rename from src/deepsparse/transformers/pipelines/text_generation.py
rename to examples/codegen/text_generation.py
index 8f0d26edeb..dab643c171 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/examples/codegen/text_generation.py
@@ -13,9 +13,11 @@
 # limitations under the License.

 import os
-from typing import Dict, List, Tuple, Type, Union
+from tempfile import NamedTemporaryFile
+from typing import Dict, List, Optional, Tuple, Type, Union

 import numpy
+import onnx
 from pydantic import BaseModel, Field

 from deepsparse import Context, MultiModelEngine, Pipeline
@@ -26,15 +28,62 @@
     Engine,
     ORTEngine,
 )
-from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs
 from deepsparse.transformers.pipelines import TransformersPipeline
+from scipy.special import softmax


 _MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx"
+_MODEL_DIR_ONNX_NAME = "model.onnx"

 __all__ = ["TextGenerationPipeline"]


+def overwrite_transformer_onnx_model_inputs(
+    path: str,
+    batch_size: int = 1,
+    max_length: int = 128,
+    output_path: Optional[str] = None,
+) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]:
+    """
+    Overrides an ONNX model's inputs to have the given batch size and sequence
+    lengths. Assumes that these are the first and second shape indices of the
+    given model inputs, respectively.
+
+    :param path: path to the ONNX model to override
+    :param batch_size: batch size to set
+    :param max_length: max sequence length to set
+    :param output_path: if provided, the model will be saved to the given path,
+        otherwise, the model will be saved to a named temporary file that will
+        be deleted after the program exits
+    :return: if no output path, a tuple of the saved path to the model, list of
+        model input names, and reference to the tempfile object will be returned;
+        otherwise, only the model input names will be returned
+    """
+    # overwrite input shapes
+    model = onnx.load(path)
+    initializer_input_names = set([node.name for node in model.graph.initializer])
+    external_inputs = [
+        inp for inp in model.graph.input if inp.name not in initializer_input_names
+    ]
+    input_names = []
+    for external_input in external_inputs:
+        # this is removed for now (will need to be accounted for when we start
+        # supporting the deepsparse engine)
+        # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+        # external_input.type.tensor_type.shape.dim[1].dim_value = max_length
+        input_names.append(external_input.name)
+
+    # Save modified model
+    if output_path is None:
+        tmp_file = NamedTemporaryFile()  # file will be deleted after program exit
+        onnx.save(model, tmp_file.name)
+
+        return tmp_file.name, input_names, tmp_file
+    else:
+        onnx.save(model, output_path)
+        return input_names
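For reference, a sketch of how this helper might be invoked on its own, outside the pipeline; the model path is hypothetical:

```python
from examples.codegen.text_generation import overwrite_transformer_onnx_model_inputs

# hypothetical path to a local decoder export
onnx_path = "deployment/model.onnx"

new_path, input_names, tmp_file = overwrite_transformer_onnx_model_inputs(
    onnx_path, batch_size=1, max_length=128
)

# keep a reference to `tmp_file` for as long as `new_path` is in use:
# the NamedTemporaryFile is removed once the object is garbage collected
print(input_names)
```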
+
+
 class TextGenerationInput(BaseModel):
     sequences: Union[str, List[str]] = Field(
         description="The input sequence(s) to generate "
@@ -62,13 +111,36 @@ class TextGenerationOutput(BaseModel):
     task_aliases=["codegen"],
 )
 class TextGenerationPipeline(TransformersPipeline):
-    def __init__(self, **kwargs):
+    """
+    Pipeline for text generation tasks.
+
+    :param deterministic: if True, the pipeline picks the next token by
+        applying an argmax function to the logits. If False, the pipeline
+        samples the next token from the probability distribution computed
+        from the logits.
+    :param sampling_temperature: the temperature to use when sampling
+        from the probability distribution computed from the logits.
+        Higher values will result in more random samples.
+    :param kwargs: kwargs to pass to the TransformersPipeline
+    """
+
+    def __init__(
+        self, deterministic: bool = True, sampling_temperature: float = 1.0, **kwargs
+    ):
         super().__init__(**kwargs)

-        # setup the auxiliary multitoken model
+        self.deterministic = deterministic
+        self.sampling_temperature = sampling_temperature
+
+        # set up the auxiliary multitoken model
         self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path()
         # initialize the auxiliary multitoken engine
         self.multitoken_engine = self._initialize_multitoken_engine()

+        # re-initialize the target model
+        # this will be removed once codegen is productionized
+        self.onnx_path = self._setup_onnx_file_path()
+        self.engine = self._reinitialize_engine()
+
         if self._batch_size != 1:
             raise ValueError(
                 "For the sake of simplicity, only dynamic"
@@ -168,7 +240,7 @@ def engine_forward(
         tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id]

         tokens, kv_cache = self.initial_autoregressive_pass(
-            engine=self.multitoken_engine, tokens=tokens, engine_inputs=engine_inputs
+            tokens=tokens, engine_inputs=engine_inputs
         )

         # perform the remaining autoregressive passes
@@ -178,20 +250,16 @@ def engine_forward(
                 return numpy.array([[tokens]])

             tokens, kv_cache = self.autoregressive_pass(
-                engine=self.engine,
                 tokens=tokens,
                 kv_cache=kv_cache,
-                sequence_length=self.sequence_length,
             )

         return numpy.array([[tokens]])

-    @staticmethod
     def autoregressive_pass(
-        engine: Union[Engine, ORTEngine],
+        self,
         tokens: List[int],
         kv_cache: Dict[str, numpy.ndarray],
-        sequence_length: int,
     ) -> Tuple[List[int], Dict[str, numpy.ndarray]]:
         """
         Performs an autoregressive pass to generate the next token in the sequence
         and update the kv_cache with the new key/value pairs.

         1) Set the attention mask to 1 for the tokens that are already in the sequence
            and 1 for the `new_token`, at the last position. This is because the
            `new_token`'s key/value will be added to the set of keys/values
            at the last position (before being fed to an attention block)
         2) Set up the engine inputs
         3) Run the engine forward pass
         4) Preprocesses the kv cache so that it can be used as input to the next
            autoregressive pass.
         5) Returns the new token sequence and the updated kv cache.
-        :param engine: the engine to use for the autoregressive pass
         :param tokens: the current token sequence
         :param kv_cache: the current kv_cache
-        :param sequence_length: the maximum sequence length
         :return: the new token sequence and the updated kv cache
         """

         new_token = tokens[-1]

-        attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
+        attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
         attention_mask[:, : len(tokens)] = 1
         attention_mask[:, -1] = 1

@@ -226,15 +292,15 @@ def autoregressive_pass(
         }
         engine_inputs_dict.update(kv_cache)

-        engine_inputs = [engine_inputs_dict[name] for name in engine._input_names]
+        engine_inputs = [engine_inputs_dict[name] for name in self.engine._input_names]

-        new_logits, *new_kvs = engine(engine_inputs)
+        new_logits, *new_kvs = self.engine(engine_inputs)

         # rename the output names to match the names expected
         # in the next autoregressive pass
         kv_output_names = [
             name.replace("present", "past_key_values")
-            for name in engine._output_names
+            for name in self.engine._output_names
             if name.startswith("present")
         ]
         kv_cache = dict(zip(kv_output_names, new_kvs))
@@ -243,14 +309,17 @@ def autoregressive_pass(
             kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1])

         # Obtain the next token from the logits
-        new_token = numpy.argmax(new_logits[0, -1, :])
+        new_token = TextGenerationPipeline.sample_new_token(
+            logits=new_logits[0, -1, :],
+            deterministic=self.deterministic,
+            temperature=self.sampling_temperature,
+        )
         tokens.append(new_token)

         return tokens, kv_cache

-    @staticmethod
     def initial_autoregressive_pass(
-        engine: Union[Engine, ORTEngine],
+        self,
         tokens: List[int],
         engine_inputs: List[numpy.ndarray],
     ) -> Tuple[List[int], Dict[str, numpy.ndarray]]:
@@ -263,20 +332,19 @@ def initial_autoregressive_pass(
            autoregressive pass.
         3) Returns the new token sequence and the updated kv cache.

+        :param tokens: input tokens provided by the user
         :param engine_inputs: list of numpy inputs to Pipeline engine forward
             pass
-        :param engine: the engine to use for the forward pass
-        :param tokens: input tokens provided by the user
         :return: the extended token sequence and the kv cache
         """

-        past_logits, *new_kvs = engine(engine_inputs)
+        past_logits, *new_kvs = self.multitoken_engine(engine_inputs)

         # rename the output names to match the names expected
         # in the next autoregressive pass
         kv_output_names = [
             name.replace("present", "past_key_values")
-            for name in engine._output_names
+            for name in self.multitoken_engine._output_names
             if name.startswith("present")
         ]
         kv_cache = dict(zip(kv_output_names, new_kvs))
@@ -289,11 +357,35 @@ def initial_autoregressive_pass(
             kv_cache[k] = numpy.ascontiguousarray(v)

         # Obtain the next token from the logits
-        new_token = numpy.argmax(past_logits[0, len(tokens) - 1])
+        new_token = TextGenerationPipeline.sample_new_token(
+            logits=past_logits[0, len(tokens) - 1],
+            deterministic=self.deterministic,
+            temperature=self.sampling_temperature,
+        )

         tokens.append(new_token)

         return tokens, kv_cache

+    @staticmethod
+    def sample_new_token(
+        logits: numpy.ndarray, deterministic: bool, temperature: float
+    ) -> int:
+        """
+        Samples a token from the logits using the sampling temperature.
+
+        :param logits: the logits from the model
+        :param deterministic: whether to sample from the softmax or take the argmax
+        :param temperature: the sampling temperature
+
+        :return: the sampled token
+        """
+        if deterministic:
+            return numpy.argmax(logits)
+        else:
+            logits /= temperature
+            probs = softmax(logits)
+            return numpy.random.choice(len(probs), p=probs)
+
     def _setup_multitoken_onnx_file_path(self) -> str:
         # `setup_onnx_file_path` function rewritten
         # to setup the multitoken_onnx_file_path
@@ -334,3 +426,43 @@ def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]:
                 f"Unknown engine_type {self.engine_type}. Supported values include: "
                 f"{SUPPORTED_PIPELINE_ENGINES}"
             )
+
+    def _setup_onnx_file_path(self) -> str:
+        # `setup_onnx_file_path` function rewritten
+
+        onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME)
+        (
+            onnx_path,
+            self.onnx_input_names,
+            self._temp_model_directory,
+        ) = overwrite_transformer_onnx_model_inputs(
+            onnx_path, max_length=self.sequence_length
+        )
+
+        return onnx_path
+
+    def _initialize_engine(self):
+        return None
+
+    def _reinitialize_engine(self) -> Union[Engine, ORTEngine]:
+        # `_initialize_engine` function rewritten
+
+        engine_type = self.engine_type.lower()
+
+        if engine_type == DEEPSPARSE_ENGINE:
+            if self.context is not None and isinstance(self.context, Context):
+                self._engine_args.pop("num_cores", None)
+                self._engine_args.pop("scheduler", None)
+                self._engine_args["context"] = self.context
+                return MultiModelEngine(
+                    model=self.onnx_path,
+                    **self._engine_args,
+                )
+            return Engine(self.onnx_path, **self._engine_args)
+        elif engine_type == ORT_ENGINE:
+            return ORTEngine(self.onnx_path, **self._engine_args)
+        else:
+            raise ValueError(
+                f"Unknown engine_type {self.engine_type}. Supported values include: "
+                f"{SUPPORTED_PIPELINE_ENGINES}"
+            )
diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py
index 394406a19b..aa6c349eb6 100644
--- a/src/deepsparse/tasks.py
+++ b/src/deepsparse/tasks.py
@@ -82,7 +82,6 @@ class SupportedTasks:
         "token_classification",
         "zero_shot_text_classification",
         "transformers_embedding_extraction",
-        "text_generation",
     ],
 )(
     question_answering=AliasedTask("question_answering", ["qa"]),
@@ -94,7 +93,6 @@ class SupportedTasks:
     transformers_embedding_extraction=AliasedTask(
         "transformers_embedding_extraction", []
     ),
-    text_generation=AliasedTask("text_generation", ["codegen"]),
 )

 image_classification = namedtuple("image_classification", ["image_classification"])(
diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py
index 18869060de..e204839e73 100644
--- a/src/deepsparse/transformers/helpers.py
+++ b/src/deepsparse/transformers/helpers.py
@@ -159,10 +159,8 @@ def overwrite_transformer_onnx_model_inputs(
     ]
     input_names = []
     for external_input in external_inputs:
-        # Commenting this out for now, as it is not needed for the ORT backend
-        # Will be crucial for DeepSparse backend
-        # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
-        # external_input.type.tensor_type.shape.dim[1].dim_value = max_length
+        external_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+        external_input.type.tensor_type.shape.dim[1].dim_value = max_length
         input_names.append(external_input.name)

     # Save modified model
diff --git a/src/deepsparse/transformers/pipelines/__init__.py b/src/deepsparse/transformers/pipelines/__init__.py
index e30215182a..3e2e88381d 100644
--- a/src/deepsparse/transformers/pipelines/__init__.py
+++ b/src/deepsparse/transformers/pipelines/__init__.py
@@ -21,4 +21,3 @@
 from .token_classification import *
 from .zero_shot_text_classification import *
 from .embedding_extraction import *
-from .text_generation import *
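`sample_new_token` above is the core decoding decision of this patch. Here is a standalone sketch of the same argmax-versus-temperature-sampling logic, runnable without the pipeline; it scales a copy of the logits rather than dividing in place, which avoids mutating the caller's array:

```python
import numpy
from scipy.special import softmax


def sample_new_token(
    logits: numpy.ndarray, deterministic: bool, temperature: float
) -> int:
    # greedy decoding: take the highest-probability token
    if deterministic:
        return int(numpy.argmax(logits))
    # stochastic decoding: rescale by temperature, then sample
    probs = softmax(logits / temperature)
    return int(numpy.random.choice(len(probs), p=probs))


logits = numpy.array([2.0, 1.0, 0.5, -1.0])  # illustrative vocabulary of 4 tokens
print(sample_new_token(logits, deterministic=True, temperature=1.0))   # always 0
print(sample_new_token(logits, deterministic=False, temperature=0.7))  # usually 0 or 1
```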
From 4f9e804a9015c168c3f76d84afada95c262436ea Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Tue, 14 Mar 2023 13:24:56 +0000
Subject: [PATCH 7/8] fix eos logic

---
 examples/codegen/text_generation.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py
index dab643c171..f7a38be266 100644
--- a/examples/codegen/text_generation.py
+++ b/examples/codegen/text_generation.py
@@ -246,7 +246,12 @@ def engine_forward(
         # perform the remaining autoregressive passes
         num_iterations = self.sequence_length - len(tokens)
         for iter in range(num_iterations):
+            eos_token_found = self.tokenizer.eos_token_id == tokens[-1]
             if eos_token_found:
+                # fill the token list so that it has the correct sequence length
+                tokens = tokens + [self.tokenizer.pad_token_id] * (
+                    self.sequence_length - len(tokens)
+                )
                 return numpy.array([[tokens]])

             tokens, kv_cache = self.autoregressive_pass(
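Patch 7's fill exists because `engine_forward` always returns exactly `sequence_length` tokens, so an early EOS leaves a gap that must be right-padded. A toy sketch of the same padding arithmetic, with illustrative token ids:

```python
# illustrative values mirroring the pipeline's fields
sequence_length = 8
pad_token_id = 0
tokens = [5, 17, 3, 2]  # generation stopped early at an EOS token (2)

# same fill as the patch: right-pad up to the fixed sequence length
tokens = tokens + [pad_token_id] * (sequence_length - len(tokens))
assert len(tokens) == sequence_length
print(tokens)  # [5, 17, 3, 2, 0, 0, 0, 0]
```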
+ ) + num_iterations = self.num_tokens_to_generate + + # perform the remaining autoregressive passes for iter in range(num_iterations): eos_token_found = self.tokenizer.eos_token_id == tokens[-1] if eos_token_found: @@ -259,6 +276,10 @@ def engine_forward( kv_cache=kv_cache, ) + # fill the token list so that it has the correct sequence length + tokens = tokens + [self.tokenizer.pad_token_id] * ( + self.sequence_length - len(tokens) + ) return numpy.array([[tokens]]) def autoregressive_pass(