From a81e32d4712c9d40bf347d47907a2308a4aeda9c Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 31 Aug 2023 02:55:05 +0000
Subject: [PATCH 1/2] Fixes for openai server example

---
 examples/openai-server/client.py | 5 ++++-
 examples/openai-server/server.py | 8 ++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/openai-server/client.py b/examples/openai-server/client.py
index bf9f4d6b81..9e387cd99f 100644
--- a/examples/openai-server/client.py
+++ b/examples/openai-server/client.py
@@ -28,12 +28,15 @@
 # Completion API
 stream = True
 completion = openai.Completion.create(
-    model=model, prompt="def fib():", stream=stream, max_tokens=16
+    model=model, prompt="def fib():", stream=stream, max_tokens=32
 )
 
 print("Completion results:")
 if stream:
+    text = ""
     for c in completion:
         print(c)
+        text += c["choices"][0]["text"]
+    print(text)
 else:
     print(completion)
 
diff --git a/examples/openai-server/server.py b/examples/openai-server/server.py
index 6bd14d971f..d3a8005f4b 100644
--- a/examples/openai-server/server.py
+++ b/examples/openai-server/server.py
@@ -117,14 +117,18 @@ async def generate(
         thread.start()
 
         # stream out the text
+        concat_text = ""
+        concat_token_ids = []
         for new_text in streamer:
+            concat_text += new_text
+            concat_token_ids.extend(self.tokenize(new_text))
             yield RequestOutput(
                 request_id=request_id,
                 prompt=prompt,
                 prompt_token_ids=prompt_token_ids,
                 outputs=[
                     CompletionOutput(
-                        index=0, text=new_text, token_ids=self.tokenize(new_text)
+                        index=0, text=concat_text, token_ids=concat_token_ids
                     )
                 ],
                 finished=False,
@@ -135,7 +139,7 @@
             request_id=request_id,
             prompt=prompt,
             prompt_token_ids=prompt_token_ids,
-            outputs=[CompletionOutput(index=0, text="", token_ids=[0])],
+            outputs=[CompletionOutput(index=0, text="", token_ids=[0], finish_reason="stop")],
             finished=True,
         )
 

From 752c5f044de866350f011f6a19f8e5370465080a Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 31 Aug 2023 03:13:57 +0000
Subject: [PATCH 2/2] Refresh README

---
 examples/openai-server/README.md | 363 ++++++++-----------------------
 examples/openai-server/client.py |   4 +-
 examples/openai-server/server.py |  74 +++++--
 examples/openai-server/test.py   |  12 +-
 4 files changed, 151 insertions(+), 302 deletions(-)

diff --git a/examples/openai-server/README.md b/examples/openai-server/README.md
index 3e03a629c6..fd4cba13eb 100644
--- a/examples/openai-server/README.md
+++ b/examples/openai-server/README.md
@@ -14,15 +14,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+# OpenAI-compatible Completions Server
+
 Goal: Make a text-generation server that is compatible with the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/introduction) so it can plug in readily to applications that use the interface.
 
-## Install requirements
+### Install requirements
 `pip install -r requirements.txt`
 
 ## Simple CLI usage
 
 Set up the server:
 ```
-python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none --prompt-processing-sequence-length 1
+python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none
 None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
 2023-08-07 17:18:32 __main__ INFO args: Namespace(model='zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none', max_model_len=512, prompt_processing_sequence_length=1, use_deepsparse_cache=False, host='localhost', port=8000, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], served_model_name=None)
 2023-08-07 17:18:32 deepsparse.transformers WARNING The neuralmagic fork of transformers may not be installed. It can be installed via `pip install nm_transformers`

@@ -42,7 +44,22 @@ curl http://localhost:8000/v1/models
 {"object":"list","data":[{"id":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","object":"model","created":1691444523,"owned_by":"neuralmagic","root":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","parent":null,"permission":[{"id":"modelperm-d0d9f0bb6a5c48458848e6b9a8cb8aca","object":"model_permission","created":1691444523,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
 ```
 
-Then you can hit the [Completions API](https://platform.openai.com/docs/api-reference/completions) with a `curl` command and see the streaming output:
+Then you can hit the [Completions API](https://platform.openai.com/docs/api-reference/completions) with a `curl` command and see the output:
+
+```
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
+        "prompt": "def fib():",
+        "max_tokens": 30
+    }'
+
+{"id":"cmpl-4d7c32ea65e14468bbe93c63d1687ba9","object":"text_completion","created":1693451394,"model":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","choices":[{"index":0,"text":"\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b","logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":2,"total_tokens":4,"completion_tokens":2}}
+```
+
+Streaming output can also be enabled by passing `"stream": true`:
+
 ```
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
         "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
         "prompt": "def fib():",
-        "max_tokens": 16,
+        "max_tokens": 30,
         "stream": true
     }'

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "def fib():\n", "logprobs": null, "finish_reason": null}]}
-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "def fib():\n", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "= ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "0, ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "1\n", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "while ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "True:\n", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "        ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "yield ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a\n", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "0, ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "        ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "while ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "= ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "True:\n", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b, ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "+ ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": "stop"}]}

 data: [DONE]
 ```
@@ -101,6 +149,7 @@
 ```python
 import openai
+
 # Modify OpenAI's API values to use the DeepSparse API server.
 openai.api_key = "EMPTY"
 openai.api_base = "http://localhost:8000/v1"
@@ -112,17 +161,18 @@ print("Models:", models)
 model = models["data"][0]["id"]
 
 # Completion API
-stream = True
+stream = False
 completion = openai.Completion.create(
-    model=model,
-    prompt="def fib():",
-    stream=stream,
-    max_tokens=16)
+    model=model, prompt="def fib():", stream=stream, max_tokens=30
+)
 
 print("Completion results:")
 if stream:
+    text = ""
     for c in completion:
         print(c)
+        text += c["choices"][0]["text"]
+    print(text)
 else:
     print(completion)
 ```
@@ -137,15 +187,15 @@ Models: {
     {
       "id": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
       "object": "model",
-      "created": 1692040552,
+      "created": 1693451467,
       "owned_by": "neuralmagic",
       "root": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
       "parent": null,
       "permission": [
         {
-          "id": "modelperm-23ab758b9a9a43b6ba9584146508f9eb",
+          "id": "modelperm-611e8298e6974b389e2da6e93b7b576b",
           "object": "model_permission",
-          "created": 1692040552,
+          "created": 1693451467,
           "allow_create_engine": false,
           "allow_sampling": true,
           "allow_logprobs": true,
@@ -162,256 +212,23 @@
 }
 Completion results:
 {
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
+  "id": "cmpl-caca545954ad4c169b607e36f6a967e4",
   "object": "text_completion",
-  "created": 1692040552,
+  "created": 1693451467,
   "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
   "choices": [
     {
       "index": 0,
-      "text": "def fib():\n",
+      "text": "\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b",
       "logprobs": null,
-      "finish_reason": null
+      "finish_reason": "stop"
     }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "a, ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "0, ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "    ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "while ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "True:\n",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "    ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
+  ],
+  "usage": {
+    "prompt_tokens": 2,
+    "total_tokens": 4,
+    "completion_tokens": 2
+  }
 }
 ```
diff --git a/examples/openai-server/client.py b/examples/openai-server/client.py
index 9e387cd99f..b97f0fbb83 100644
--- a/examples/openai-server/client.py
+++ b/examples/openai-server/client.py
@@ -26,9 +26,9 @@
 model = models["data"][0]["id"]
 
 # Completion API
-stream = True
+stream = False
 completion = openai.Completion.create(
-    model=model, prompt="def fib():", stream=stream, max_tokens=32
+    model=model, prompt="def fib():", stream=stream, max_tokens=30
 )
 
 print("Completion results:")

diff --git a/examples/openai-server/server.py b/examples/openai-server/server.py
index d3a8005f4b..a572fbebdd 100644
--- a/examples/openai-server/server.py
+++ b/examples/openai-server/server.py
@@ -101,47 +101,76 @@ async def generate(
         temperature: float = 0.80,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
+        stream: bool = True,
         **kwargs,
     ) -> AsyncGenerator[RequestOutput, None]:
         request_id = random_uuid()
         prompt_token_ids = self.tokenize(prompt)
 
-        streamer = TextIteratorStreamer(self.engine.tokenizer)
-
         self.engine.max_generated_tokens = max_tokens
         self.engine.sampling_temperature = temperature
 
-        generation_kwargs = dict(sequences=prompt, streamer=streamer)
+        if not stream:
+            # Non-streaming response
+            output = self.engine(sequences=prompt)
 
-        thread = Thread(target=self.engine, kwargs=generation_kwargs)
-        thread.start()
+            new_text = output.sequences[0]
 
-        # stream out the text
-        concat_text = ""
-        concat_token_ids = []
-        for new_text in streamer:
-            concat_text += new_text
-            concat_token_ids.extend(self.tokenize(new_text))
             yield RequestOutput(
                 request_id=request_id,
                 prompt=prompt,
                 prompt_token_ids=prompt_token_ids,
                 outputs=[
                     CompletionOutput(
-                        index=0, text=concat_text, token_ids=concat_token_ids
+                        index=0,
+                        text=new_text,
+                        token_ids=self.tokenize(new_text),
+                        finish_reason="stop",
                     )
                 ],
-                finished=False,
+                finished=True,
             )
 
-        # finished
-        yield RequestOutput(
-            request_id=request_id,
-            prompt=prompt,
-            prompt_token_ids=prompt_token_ids,
-            outputs=[CompletionOutput(index=0, text="", token_ids=[0], finish_reason="stop")],
-            finished=True,
-        )
+        else:
+            # Streaming response
+            streamer = TextIteratorStreamer(self.engine.tokenizer)
+
+            generation_kwargs = dict(sequences=prompt, streamer=streamer)
+
+            thread = Thread(target=self.engine, kwargs=generation_kwargs)
+            thread.start()
+
+            # stream out the text
+            concat_text = ""
+            concat_token_ids = []
+            for new_text in streamer:
+                concat_text += new_text
+                concat_token_ids.extend(self.tokenize(new_text))
+                yield RequestOutput(
+                    request_id=request_id,
+                    prompt=prompt,
+                    prompt_token_ids=prompt_token_ids,
+                    outputs=[
+                        CompletionOutput(
+                            index=0, text=concat_text, token_ids=concat_token_ids
+                        )
+                    ],
+                    finished=False,
+                )
+
+            # finished
+            yield RequestOutput(
+                request_id=request_id,
+                prompt=prompt,
+                prompt_token_ids=prompt_token_ids,
+                outputs=[
+                    CompletionOutput(
+                        index=0, text="", token_ids=[0], finish_reason="stop"
+                    )
+                ],
+                finished=True,
+            )
 
     async def abort(self, session_id):
         pass
@@ -500,6 +529,7 @@ async def create_completion(raw_request: Request):
             max_tokens=request.max_tokens,
             logprobs=request.logprobs,
             use_beam_search=request.use_beam_search,
+            stream=request.stream,
         )
     except ValueError as e:
         return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
@@ -661,10 +691,10 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
 parser.add_argument(
     "--prompt-processing-sequence-length",
     type=int,
-    default=64,
+    default=16,
     help=(
         "For large prompts, the prompt is processed in chunks of this length. "
-        "This is to maximize the inference speed. By default, this is set to 64."
+        "This is to maximize the inference speed. By default, this is set to 16."
     ),
 )
 parser.add_argument(

diff --git a/examples/openai-server/test.py b/examples/openai-server/test.py
index 156a5767e1..556055f0f1 100644
--- a/examples/openai-server/test.py
+++ b/examples/openai-server/test.py
@@ -14,9 +14,7 @@
 # Make sure to start the server first:
 """
-python examples/openai-server/server.py \
-    --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none \
-    --prompt-processing-sequence-length 1
+python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none
 """  # noqa: E501
 
 import unittest
@@ -41,11 +39,15 @@ def test_model_completion(self):
         for model in models:
             response = openai.Completion.create(
-                model=model, prompt="def fib():", max_tokens=16
+                model=model, prompt="def fib():", max_tokens=30
             )
             self.assertIsNotNone(response)
             self.assertIn("choices", response)
             self.assertTrue(len(response["choices"]) > 0)
+            self.assertTrue(
+                response["choices"][0]["text"]
+                == "\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b"  # noqa: E501
+            )
             print(response["choices"])
 
     def test_streaming_output(self):
@@ -53,7 +55,7 @@
         for model in models:
             responses = openai.Completion.create(
-                model=model, prompt="def fib():", max_tokens=16, stream=True
+                model=model, prompt="def fib():", max_tokens=30, stream=True
             )
             for response in responses:
                 self.assertIn("choices", response)