Merge branch 'main' into feature_readme

dsikka · web-flow · commit 86ff638e3fcc · 2023-09-26T13:27:01.000-04:00
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -138,14 +138,15 @@ class Config:
         description="GenerationConfig file consisting of parameters used to control "
         "sequences generated for each prompt. The current supported parameters are: "
         "max_length, max_new_tokens, num_return_sequences, output_scores, top_p, "
-        "top_k, repetition_penalty, do_sample, temperature",
+        "top_k, repetition_penalty, do_sample, temperature. If None is provided, "
+        "deepsparse defaults will be used. For all other input types, HuggingFace "
+        "defaults for GenerationConfig will be used. ",
     )
 
-    kwargs: Optional[Dict] = Field(
+    generation_kwargs: Optional[Dict] = Field(
         default=None,
         description="Any arguments to override generation_config arguments. Refer to "
-        "the generation_config argument for a full list of supported variables. Only "
-        "valid when generation_config is not None.",
+        "the generation_config argument for a full list of supported variables.",
     )
 
 
@@ -201,6 +202,12 @@ class TextGenerationPipeline(TransformersPipeline):
         of tokens supplied even if the stop token is reached.
     :param internal_kv_cache: if True, the pipeline will use the deepsparse kv cache
         for caching the model outputs.
+    :param generation_config: config file consisting of parameters used to control
+        sequences generated for each prompt. The current supported parameters are:
+        max_length, max_new_tokens, num_return_sequences, output_scores, top_p,
+        top_k, repetition_penalty, do_sample, temperature. If None is provided,
+        deepsparse defaults will be used. For all other input types, HuggingFace
+        defaults for GenerationConfig will be used.
     :param kwargs: kwargs to pass to the TransformersPipeline
     """
 
@@ -409,6 +416,7 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput:
         if "sequences" in kwargs and "prompt" not in kwargs:
             # support prompt and sequences interchangeably
             kwargs["prompt"] = kwargs["sequences"]
+
         if (
             args
             and not isinstance(args[0], TextGenerationInput)
@@ -419,6 +427,14 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput:
             kwargs["prompt"] = args[0]
             args = args[1:]
 
+        if kwargs:
+            generation_kwargs = kwargs.get("generation_kwargs", {})
+            for k, v in kwargs.items():
+                if not generation_kwargs.get(k) and hasattr(GenerationDefaults, k):
+                    generation_kwargs[k] = v
+
+            kwargs["generation_kwargs"] = generation_kwargs
+
         return super().parse_inputs(*args, **kwargs)
 
     def process_inputs(
@@ -434,7 +450,7 @@ def process_inputs(
             self.generation_config, inputs.generation_config, GenerationDefaults()
         )
 
-        generation_config = override_config(inputs.kwargs, generation_config)
+        generation_config = override_config(inputs.generation_kwargs, generation_config)
 
         self.streaming = inputs.streaming
         if not self.cache_support_enabled and generation_config.max_length > 1:
@@ -527,10 +543,10 @@ def _create_generated_text_output(
             finished=False,
         )
 
-    def _stream_engine_outputs(self, engine_outputs, prompts, kwargs):
+    def _stream_engine_outputs(self, engine_outputs, prompts, generation_config):
         for output in engine_outputs:
             generated_tokens, generated_logits, finished_reason = output
-            logits = generated_logits if kwargs.get("return_logits") else None
+            logits = generated_logits if generation_config.output_scores else None
             generation = self._create_generated_text_output(
                 self.tokenizer.batch_decode(generated_tokens)[0],
                 finished_reason[0],
@@ -557,7 +573,9 @@ def process_engine_outputs(
         streaming = kwargs.get("streaming")
 
         if streaming:
-            return self._stream_engine_outputs(engine_outputs, prompts, kwargs)
+            return self._stream_engine_outputs(
+                engine_outputs, prompts, generation_config
+            )
 
         if self._debug:
             (
diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py
@@ -246,15 +246,14 @@ def override_config(
         return generation_config
 
     for k, v in overrides.items():
-        try:
-            if getattr(generation_config, k):
-                setattr(generation_config, k, v)
-                _LOGGER.debug(f"Overriding attribute {k} in the generation config")
-        except AttributeError as exception:
+        if hasattr(generation_config, k):
+            setattr(generation_config, k, v)
+            _LOGGER.debug(f"Overriding attribute {k} in the generation config")
+        else:
             raise AttributeError(
-                "Argument provided for GenerationConfig is not "
+                f"Argument {k} provided for GenerationConfig is not "
                 "valid. Refer to the TextGenerationInput for supported attributes. "
-            ) from exception
+            )
 
     return generation_config