From 1d3c54f1c59286298b6fb1ae11ce24b0261ec58c Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Tue, 26 Sep 2023 11:37:04 -0400
Subject: [PATCH 1/8] Add changes needed on parent class side

---
 src/deepsparse/transformers/pipelines/text_generation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
index d0844afc30..06c1d9750c 100644
--- a/src/deepsparse/transformers/pipelines/text_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -329,7 +329,6 @@ def initialize_engines(
         if (
             self.cache_support_enabled and self.enable_multitoken_prefill
         ) or not self.cache_support_enabled:
-
             # input_ids_length for the multitoken engine is either:
             # - the prompt_sequence_length if the cache support is enabled
             #   (the prompt is processed sequentially at predefined processing length)

From 3b9a00aa215e6f375997ca1e63034f751e234f5d Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Tue, 26 Sep 2023 12:15:34 -0400
Subject: [PATCH 2/8] Add stream mode to chatbot

Clean functions
---
 .../transformers/inference/infer.py | 90 ++++++++++---------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index 460b8499c4..823d47228e 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -43,7 +43,10 @@
   --task TEXT                     The task to use for the pipeline. Choose any
                                   of `chat`, `codegen`, `text-generation`
                                   [default: chat]
-  --help                          Show this message and exit.
+  --stream / --no_stream          Whether to stream output as generated or not
+                                  [default: no_stream]
+  --help                          Show this message and exit.  [default:
+                                  False]

 Installation: pip install deepsparse[transformers]
 Examples:

 4) Disable history
 deepsparse.infer models/llama/deployment \
     --task text-generation
+
+5) Stream output
+deepsparse.infer models/llama/deployment \
+    --stream
 """
 from typing import Optional
@@ -122,6 +129,12 @@
     help="The task to use for the pipeline. Choose any of "
     "`chat`, `codegen`, `text-generation`",
 )
+@click.option(
+    "--stream/--no_stream",
+    is_flag=True,
+    default=False,
+    help="Whether to stream output as generated or not",
+)
 def main(
     model_path: str,
     data: Optional[str],
@@ -130,6 +143,7 @@ def main(
     prompt_sequence_length: int,
     show_tokens_per_sec: bool,
     task: str,
+    stream: bool,
 ):
     """
     Command Line utility to interact with a text genration LLM in a chatbot style
@@ -166,52 +180,46 @@ def main(
         return

     # continue prompts until a keyboard interrupt
-    while data is None:  # always True in interactive Mode
-        prompt = input(">>> ")
-        _run_inference(
-            pipeline,
-            sampling_temperature,
-            task,
-            session_ids,
-            show_tokens_per_sec,
-            prompt_sequence_length,
-            prompt,
+    while True:
+        input_text = input("User: ")
+        pipeline_inputs = dict(
+            prompt=[input_text],
+            sampling_temperature=sampling_temperature,
         )

+        if SupportedTasks.is_chat(task):
+            pipeline_inputs["session_ids"] = session_ids

-def _run_inference(
-    pipeline,
-    sampling_temperature,
-    task,
-    session_ids,
-    show_tokens_per_sec,
-    prompt_sequence_length,
-    prompt,
-    **kwargs,
-):
-    pipeline_inputs = dict(
-        prompt=[prompt],
-        temperature=sampling_temperature,
-        **kwargs,
+        response = pipeline(**pipeline_inputs, streaming=stream)
+        _display_bot_response(stream, response)
+
+        if show_tokens_per_sec:
+            _display_generation_speed(prompt_sequence_length, pipeline)
+
+
+def _display_generation_speed(prompt_sequence_length, pipeline):
+    # display prefill and generation speed(s) in tokens/sec
+    times = pipeline.timer_manager.times
+    prefill_speed = 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
+    generation_speed = 1.0 / times["engine_token_generation_single"]
+    print(
+        f"[prefill: {prefill_speed:.2f} tokens/sec]",
+        f"[decode: {generation_speed:.2f} tokens/sec]",
+        sep="\n",
     )
-    if SupportedTasks.is_chat(task):
-        pipeline_inputs["session_ids"] = session_ids

-    response = pipeline(**pipeline_inputs)
-    print("\n", response.generations[0].text)
-
-    if show_tokens_per_sec:
-        times = pipeline.timer_manager.times
-        prefill_speed = (
-            1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
-        )
-        generation_speed = 1.0 / times["engine_token_generation_single"]
-        print(
-            f"[prefill: {prefill_speed:.2f} tokens/sec]",
-            f"[decode: {generation_speed:.2f} tokens/sec]",
-            sep="\n",
-        )
+def _display_bot_response(stream: bool, response):
+    # print response from pipeline, streaming or not
+
+    print("Bot:", end=" ")
+    if stream:
+        for generation in response:
+            print(generation.generations[0].text, end=" ")
+        print()
+    else:
+        print(response.generations[0].text)


-if __name__ == "__main__":
+if "__main__" == __name__:
     main()

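[Illustrative note, not part of the patch series] With the `--stream` flag added above, the pipeline call returns either a single response object or an iterable of partial responses, and `_display_bot_response` handles both shapes. The sketch below mirrors that logic with stand-in classes so it runs without deepsparse installed; only the attribute layout (`.generations[0].text`) is taken from the patch, everything else is a made-up stub.

# Stand-in types that mimic the response shape used in the patch;
# they are NOT the deepsparse classes.
from dataclasses import dataclass
from typing import Iterable, List, Union


@dataclass
class FakeGeneration:
    text: str


@dataclass
class FakeResponse:
    generations: List[FakeGeneration]


def display_bot_response(stream: bool, response: Union[FakeResponse, Iterable[FakeResponse]]):
    # mirrors _display_bot_response from the patch above
    print("Bot:", end=" ")
    if stream:
        # streaming: iterate partial responses as they are generated
        for generation in response:
            print(generation.generations[0].text, end=" ")
        print()
    else:
        # non-streaming: a single response holding the full text
        print(response.generations[0].text)


# usage with fake data
display_bot_response(False, FakeResponse([FakeGeneration("Hello there!")]))
display_bot_response(True, (FakeResponse([FakeGeneration(t)]) for t in ("Hello", "there", "!")))
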
From e4fb27c3da5d91ca0468f357e7a952b20e5d12a1 Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Wed, 27 Sep 2023 09:25:44 -0400
Subject: [PATCH 3/8] Update sampling temperature to temperature

---
 src/deepsparse/transformers/inference/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index 823d47228e..c9bcbb5c19 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -184,7 +184,7 @@ def main(
         input_text = input("User: ")
         pipeline_inputs = dict(
             prompt=[input_text],
-            sampling_temperature=sampling_temperature,
+            temperature=sampling_temperature,
         )

         if SupportedTasks.is_chat(task):

From 98865746fc9b5bd155a2a3cc7b75389ee547c75b Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 6 Oct 2023 18:52:51 -0600
Subject: [PATCH 4/8] Update src/deepsparse/transformers/infer.py

---
 src/deepsparse/transformers/inference/infer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index c9bcbb5c19..348d899268 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -212,10 +212,10 @@ def _display_generation_speed(prompt_sequence_length, pipeline):
 def _display_bot_response(stream: bool, response):
     # print response from pipeline, streaming or not

-    print("Bot:", end=" ")
+    print("Bot:", end="")
     if stream:
         for generation in response:
-            print(generation.generations[0].text, end=" ")
+            print(generation.generations[0].text, end="")
         print()
     else:
         print(response.generations[0].text)

From 8c2e74fd7a5c6549c3adeb5e24fe204001bf05bb Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 11 Oct 2023 19:01:32 +0000
Subject: [PATCH 5/8] Rebase and fix streaming

---
 src/deepsparse/transformers/inference/infer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index 348d899268..81d3ad5441 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -212,10 +212,10 @@ def _display_generation_speed(prompt_sequence_length, pipeline):
 def _display_bot_response(stream: bool, response):
     # print response from pipeline, streaming or not

-    print("Bot:", end="")
+    print("Bot:", end="", flush=True)
     if stream:
         for generation in response:
-            print(generation.generations[0].text, end="")
+            print(generation.generations[0].text, end="", flush=True)
         print()
     else:
         print(response.generations[0].text)

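[Illustrative note, not part of the patch series] The two patches above change the bot's reply to be printed with `end=""` and, after the rebase fix, `flush=True`. The sketch below shows why the flush matters for token-by-token output: with `end=""` and no newline, partial text can sit in Python's stdout buffer instead of appearing immediately, which defeats the streaming effect. The token list and delay are invented for the demo.

# Simulated token stream; tokens and delay are made up for illustration.
import time

tokens = ["Deep", "Sparse", " streams", " tokens", " as", " they", " arrive", "."]

print("Bot:", end="", flush=True)
for token in tokens:
    # flush=True pushes each token to the terminal right away; without it,
    # output printed with end="" may be held back until a newline is printed
    print(token, end="", flush=True)
    time.sleep(0.1)
print()
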
From a29478f4a57343401ff4a6e140cb5dfec8f6c036 Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Wed, 11 Oct 2023 16:17:16 -0400
Subject: [PATCH 6/8] Fix broken data pathway

---
 .../transformers/inference/infer.py | 40 ++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index 81d3ad5441..ee9d652953 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -182,19 +182,41 @@ def main(
     # continue prompts until a keyboard interrupt
     while True:
         input_text = input("User: ")
-        pipeline_inputs = dict(
-            prompt=[input_text],
-            temperature=sampling_temperature,
+        _run_inference(
+            pipeline=pipeline,
+            sampling_temperature=sampling_temperature,
+            task=task,
+            session_ids=session_ids,
+            show_tokens_per_sec=show_tokens_per_sec,
+            prompt_sequence_length=prompt_sequence_length,
+            stream=stream,
+            input_text=input_text,
         )

-        if SupportedTasks.is_chat(task):
-            pipeline_inputs["session_ids"] = session_ids

-        response = pipeline(**pipeline_inputs, streaming=stream)
-        _display_bot_response(stream, response)
+def _run_inference(
+    pipeline: Pipeline,
+    sampling_temperature: float,
+    task: str,
+    session_ids: str,
+    show_tokens_per_sec: bool,
+    prompt_sequence_length: int,
+    input_text: str,
+    stream: bool = False,
+):
+    pipeline_inputs = dict(
+        prompt=[input_text],
+        temperature=sampling_temperature,
+    )
+
+    if SupportedTasks.is_chat(task):
+        pipeline_inputs["session_ids"] = session_ids
+
+    response = pipeline(**pipeline_inputs, streaming=stream)
+    _display_bot_response(stream, response)

-        if show_tokens_per_sec:
-            _display_generation_speed(prompt_sequence_length, pipeline)
+    if show_tokens_per_sec:
+        _display_generation_speed(prompt_sequence_length, pipeline)


 def _display_generation_speed(prompt_sequence_length, pipeline):

From 86379d110b5438b1697f43d18883e024016c8442 Mon Sep 17 00:00:00 2001
From: rahul-tuli
Date: Wed, 11 Oct 2023 16:23:40 -0400
Subject: [PATCH 7/8] Add stream to data mode

update arg name to prompt
---
 src/deepsparse/transformers/inference/infer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index ee9d652953..f15a724dc6 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -175,6 +175,7 @@ def main(
             task=task,
             pipeline=pipeline,
             session_ids=session_ids,
+            stream=stream,
             **prompt_kwargs,
         )
         return
@@ -190,7 +191,7 @@ def main(
             show_tokens_per_sec=show_tokens_per_sec,
             prompt_sequence_length=prompt_sequence_length,
             stream=stream,
-            input_text=input_text,
+            prompt=input_text,
         )

@@ -201,11 +202,11 @@ def _run_inference(
     session_ids: str,
     show_tokens_per_sec: bool,
     prompt_sequence_length: int,
-    input_text: str,
+    prompt: str,
     stream: bool = False,
 ):
     pipeline_inputs = dict(
-        prompt=[input_text],
+        prompt=[prompt],
         temperature=sampling_temperature,
     )

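[Illustrative note, not part of the patch series] The patches so far report prefill and decode speed from `pipeline.timer_manager.times`; the following patch switches the prefill figure from the fixed chunk size (`prompt_sequence_length`) divided by the per-chunk time to the full tokenized prompt length divided by the total prefill time. The arithmetic below works through both formulas on invented timer values; the dictionary keys are the ones used in the patches, every number is made up for illustration.

# Invented timer values in the shape of pipeline.timer_manager.times
times = {
    "engine_prompt_prefill": 0.20,           # total prompt prefill time (s)
    "engine_prompt_prefill_single": 0.05,    # time for one prompt chunk (s)
    "engine_token_generation_single": 0.04,  # time to decode one token (s)
}
prompt_sequence_length = 16  # prompt chunk size
num_prompt_tokens = 48       # e.g. len(pipeline.tokenizer(prompt)["input_ids"])

# chunk-based formula used up to this point in the series
old_prefill_speed = 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
# prompt-based formula introduced in the next patch
new_prefill_speed = num_prompt_tokens / times["engine_prompt_prefill"]
decode_speed = 1.0 / times["engine_token_generation_single"]

print(f"[prefill, chunk-based: {old_prefill_speed:.2f} tokens/sec]")   # 320.00
print(f"[prefill, prompt-based: {new_prefill_speed:.2f} tokens/sec]")  # 240.00
print(f"[decode: {decode_speed:.2f} tokens/sec]")                      # 25.00
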
From 68d7c5965920760d0bcbf3e1652aa3197b6e28cf Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 12 Oct 2023 17:27:09 +0000
Subject: [PATCH 8/8] Update prompt prefill tok/s

---
 src/deepsparse/transformers/inference/infer.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index f15a724dc6..0dbfa92a62 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -113,7 +113,7 @@
 @click.option(
     "--prompt_sequence_length",
     type=int,
-    default=64,
+    default=16,
     help="Processed prompt in chunks of this length. "
     "This is to maximize the inference speed",
 )
@@ -124,7 +124,7 @@
 )
 @click.option(
     "--task",
-    default="chat",
+    default="text-generation",
     type=str,
     help="The task to use for the pipeline. Choose any of "
     "`chat`, `codegen`, `text-generation`",
 )
@@ -166,7 +166,6 @@ def main(
     default_prompt_kwargs = {
         "sequence_length": sequence_length,
        "sampling_temperature": sampling_temperature,
-        "prompt_sequence_length": prompt_sequence_length,
         "show_tokens_per_sec": show_tokens_per_sec,
     }

@@ -189,7 +188,6 @@ def main(
             task=task,
             session_ids=session_ids,
             show_tokens_per_sec=show_tokens_per_sec,
-            prompt_sequence_length=prompt_sequence_length,
             stream=stream,
             prompt=input_text,
         )
@@ -201,7 +199,6 @@ def _run_inference(
     task: str,
     session_ids: str,
     show_tokens_per_sec: bool,
-    prompt_sequence_length: int,
     prompt: str,
     stream: bool = False,
 ):
@@ -214,13 +211,15 @@ def _run_inference(
     _display_bot_response(stream, response)

     if show_tokens_per_sec:
-        _display_generation_speed(prompt_sequence_length, pipeline)
+        _display_generation_speed(prompt, pipeline)


-def _display_generation_speed(prompt_sequence_length, pipeline):
+def _display_generation_speed(prompt, pipeline):
     # display prefill and generation speed(s) in tokens/sec
     times = pipeline.timer_manager.times
-    prefill_speed = 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
+    prefill_speed = (
+        len(pipeline.tokenizer(prompt)["input_ids"]) / times["engine_prompt_prefill"]
+    )
     generation_speed = 1.0 / times["engine_token_generation_single"]
     print(
         f"[prefill: {prefill_speed:.2f} tokens/sec]",