From a81e32d4712c9d40bf347d47907a2308a4aeda9c Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 31 Aug 2023 02:55:05 +0000
Subject: [PATCH 1/2] Fixes for openai server example

---
 examples/openai-server/client.py | 5 ++++-
 examples/openai-server/server.py | 8 ++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/openai-server/client.py b/examples/openai-server/client.py
index bf9f4d6b81..9e387cd99f 100644
--- a/examples/openai-server/client.py
+++ b/examples/openai-server/client.py
@@ -28,12 +28,15 @@
 # Completion API
 stream = True
 completion = openai.Completion.create(
-    model=model, prompt="def fib():", stream=stream, max_tokens=16
+    model=model, prompt="def fib():", stream=stream, max_tokens=32
 )
 
 print("Completion results:")
 if stream:
+    text = ""
     for c in completion:
         print(c)
+        text += c["choices"][0]["text"]
+    print(text)
 else:
     print(completion)
 
diff --git a/examples/openai-server/server.py b/examples/openai-server/server.py
index 6bd14d971f..d3a8005f4b 100644
--- a/examples/openai-server/server.py
+++ b/examples/openai-server/server.py
@@ -117,14 +117,18 @@ async def generate(
         thread.start()
 
         # stream out the text
+        concat_text = ""
+        concat_token_ids = []
         for new_text in streamer:
+            concat_text += new_text
+            concat_token_ids.extend(self.tokenize(new_text))
             yield RequestOutput(
                 request_id=request_id,
                 prompt=prompt,
                 prompt_token_ids=prompt_token_ids,
                 outputs=[
                     CompletionOutput(
-                        index=0, text=new_text, token_ids=self.tokenize(new_text)
+                        index=0, text=concat_text, token_ids=concat_token_ids
                     )
                 ],
                 finished=False,
@@ -135,7 +139,7 @@
             request_id=request_id,
             prompt=prompt,
             prompt_token_ids=prompt_token_ids,
-            outputs=[CompletionOutput(index=0, text="", token_ids=[0])],
+            outputs=[CompletionOutput(index=0, text="", token_ids=[0], finish_reason="stop")],
             finished=True,
         )
 

From 752c5f044de866350f011f6a19f8e5370465080a Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 31 Aug 2023 03:13:57 +0000
Subject: [PATCH 2/2] Refresh README

---
 examples/openai-server/README.md | 363 ++++++++-----------------------
 examples/openai-server/client.py |   4 +-
 examples/openai-server/server.py |  74 +++++--
 examples/openai-server/test.py   |  12 +-
 4 files changed, 151 insertions(+), 302 deletions(-)

diff --git a/examples/openai-server/README.md b/examples/openai-server/README.md
index 3e03a629c6..fd4cba13eb 100644
--- a/examples/openai-server/README.md
+++ b/examples/openai-server/README.md
@@ -14,15 +14,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+# OpenAI-compatible Completions Server
+
 Goal: Make a text-generation server that is compatible with the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/introduction) so it can plug in readily to applications that use the interface.
 
-## Install requirements
+### Install requirements
 `pip install -r requirements.txt`
 
 ## Simple CLI usage
 
 Set up the server:
 ```
-python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none --prompt-processing-sequence-length 1
+python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none
 None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
 2023-08-07 17:18:32 __main__ INFO args: Namespace(model='zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none', max_model_len=512, prompt_processing_sequence_length=1, use_deepsparse_cache=False, host='localhost', port=8000, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], served_model_name=None)
 2023-08-07 17:18:32 deepsparse.transformers WARNING The neuralmagic fork of transformers may not be installed. It can be installed via `pip install nm_transformers`

@@ -42,7 +44,22 @@ curl http://localhost:8000/v1/models
 {"object":"list","data":[{"id":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","object":"model","created":1691444523,"owned_by":"neuralmagic","root":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","parent":null,"permission":[{"id":"modelperm-d0d9f0bb6a5c48458848e6b9a8cb8aca","object":"model_permission","created":1691444523,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
 ```
 
-Then you can hit the [Completions API](https://platform.openai.com/docs/api-reference/completions) with a `curl` command and see the streaming output:
+Then you can hit the [Completions API](https://platform.openai.com/docs/api-reference/completions) with a `curl` command and see the output:
+
+```
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
+        "prompt": "def fib():",
+        "max_tokens": 30
+    }'
+
+{"id":"cmpl-4d7c32ea65e14468bbe93c63d1687ba9","object":"text_completion","created":1693451394,"model":"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none","choices":[{"index":0,"text":"\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b","logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":2,"total_tokens":4,"completion_tokens":2}}
+```
+
+Streaming output can also be enabled by passing `"stream": true`:
+
 ```
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
         "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
         "prompt": "def fib():",
-        "max_tokens": 16,
+        "max_tokens": 30,
         "stream": true
     }'

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "def fib():\n", "logprobs": null, "finish_reason": null}]}
-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "def fib():\n", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "= ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "0, ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "1\n", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "while ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "True:\n", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "        ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "yield ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a\n", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "0, ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "        ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a, ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "while ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "= ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "True:\n", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "    ", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b, ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "a ", "logprobs": null, "finish_reason": null}]}

-data: {"id": "cmpl-473d4978ecc64a61a5eb6c442505aeba", "object": "text_completion", "created": 1691444444, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "+ ", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "b", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": null}]}
+
+data: {"id": "cmpl-14fcb54b0716430bb4f155ffd8882c8f", "object": "text_completion", "created": 1693451416, "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none", "choices": [{"index": 0, "text": "", "logprobs": null, "finish_reason": "stop"}]}

 data: [DONE]
 ```
@@ -101,6 +149,7 @@
 ```python
 import openai
+
 # Modify OpenAI's API values to use the DeepSparse API server.
 openai.api_key = "EMPTY"
 openai.api_base = "http://localhost:8000/v1"
@@ -112,17 +161,18 @@ print("Models:", models)
 model = models["data"][0]["id"]
 
 # Completion API
-stream = True
+stream = False
 completion = openai.Completion.create(
-    model=model,
-    prompt="def fib():",
-    stream=stream,
-    max_tokens=16)
+    model=model, prompt="def fib():", stream=stream, max_tokens=30
+)
 
 print("Completion results:")
 if stream:
+    text = ""
     for c in completion:
         print(c)
+        text += c["choices"][0]["text"]
+    print(text)
 else:
     print(completion)
 ```
@@ -137,15 +187,15 @@ Models: {
     {
       "id": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
       "object": "model",
-      "created": 1692040552,
+      "created": 1693451467,
       "owned_by": "neuralmagic",
       "root": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
       "parent": null,
       "permission": [
         {
-          "id": "modelperm-23ab758b9a9a43b6ba9584146508f9eb",
+          "id": "modelperm-611e8298e6974b389e2da6e93b7b576b",
           "object": "model_permission",
-          "created": 1692040552,
+          "created": 1693451467,
           "allow_create_engine": false,
           "allow_sampling": true,
           "allow_logprobs": true,
@@ -162,256 +212,23 @@
 }
 Completion results:
 {
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
+  "id": "cmpl-caca545954ad4c169b607e36f6a967e4",
   "object": "text_completion",
-  "created": 1692040552,
+  "created": 1693451467,
   "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
   "choices": [
     {
       "index": 0,
-      "text": "def fib():\n",
+      "text": "\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b",
       "logprobs": null,
-      "finish_reason": null
+      "finish_reason": "stop"
     }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "a, ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "0, ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "    ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "while ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "True:\n",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "    ",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
-}
-{
-  "id": "cmpl-98386417df264683bc558aaf8a060dd1",
-  "object": "text_completion",
-  "created": 1692040552,
-  "model": "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
-  "choices": [
-    {
-      "index": 0,
-      "text": "",
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ]
+  ],
+  "usage": {
+    "prompt_tokens": 2,
+    "total_tokens": 4,
+    "completion_tokens": 2
+  }
 }
 ```
diff --git a/examples/openai-server/client.py b/examples/openai-server/client.py
index 9e387cd99f..b97f0fbb83 100644
--- a/examples/openai-server/client.py
+++ b/examples/openai-server/client.py
@@ -26,9 +26,9 @@
 model = models["data"][0]["id"]
 
 # Completion API
-stream = True
+stream = False
 completion = openai.Completion.create(
-    model=model, prompt="def fib():", stream=stream, max_tokens=32
+    model=model, prompt="def fib():", stream=stream, max_tokens=30
 )
 
 print("Completion results:")

diff --git a/examples/openai-server/server.py b/examples/openai-server/server.py
index d3a8005f4b..a572fbebdd 100644
--- a/examples/openai-server/server.py
+++ b/examples/openai-server/server.py
@@ -101,47 +101,76 @@ async def generate(
         temperature: float = 0.80,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
+        stream: bool = True,
         **kwargs,
     ) -> AsyncGenerator[RequestOutput, None]:
         request_id = random_uuid()
         prompt_token_ids = self.tokenize(prompt)
 
-        streamer = TextIteratorStreamer(self.engine.tokenizer)
-
         self.engine.max_generated_tokens = max_tokens
         self.engine.sampling_temperature = temperature
 
-        generation_kwargs = dict(sequences=prompt, streamer=streamer)
+        if not stream:
+            # Non-streaming response
+            output = self.engine(sequences=prompt)
 
-        thread = Thread(target=self.engine, kwargs=generation_kwargs)
-        thread.start()
+            new_text = output.sequences[0]
 
-        # stream out the text
-        concat_text = ""
-        concat_token_ids = []
-        for new_text in streamer:
-            concat_text += new_text
-            concat_token_ids.extend(self.tokenize(new_text))
             yield RequestOutput(
                 request_id=request_id,
                 prompt=prompt,
                 prompt_token_ids=prompt_token_ids,
                 outputs=[
                     CompletionOutput(
-                        index=0, text=concat_text, token_ids=concat_token_ids
+                        index=0,
+                        text=new_text,
+                        token_ids=self.tokenize(new_text),
+                        finish_reason="stop",
                     )
                 ],
-                finished=False,
+                finished=True,
             )
 
-        # finished
-        yield RequestOutput(
-            request_id=request_id,
-            prompt=prompt,
-            prompt_token_ids=prompt_token_ids,
-            outputs=[CompletionOutput(index=0, text="", token_ids=[0], finish_reason="stop")],
-            finished=True,
-        )
+        else:
+            # Streaming response
+            streamer = TextIteratorStreamer(self.engine.tokenizer)
+
+            generation_kwargs = dict(sequences=prompt, streamer=streamer)
+
+            thread = Thread(target=self.engine, kwargs=generation_kwargs)
+            thread.start()
+
+            # stream out the text
+            concat_text = ""
+            concat_token_ids = []
+            for new_text in streamer:
+                concat_text += new_text
+                concat_token_ids.extend(self.tokenize(new_text))
+                yield RequestOutput(
+                    request_id=request_id,
+                    prompt=prompt,
+                    prompt_token_ids=prompt_token_ids,
+                    outputs=[
+                        CompletionOutput(
+                            index=0, text=concat_text, token_ids=concat_token_ids
+                        )
+                    ],
+                    finished=False,
+                )
+
+            # finished
+            yield RequestOutput(
+                request_id=request_id,
+                prompt=prompt,
+                prompt_token_ids=prompt_token_ids,
+                outputs=[
+                    CompletionOutput(
+                        index=0, text="", token_ids=[0], finish_reason="stop"
+                    )
+                ],
+                finished=True,
+            )
 
     async def abort(self, session_id):
         pass
@@ -500,6 +529,7 @@ async def create_completion(raw_request: Request):
             max_tokens=request.max_tokens,
             logprobs=request.logprobs,
             use_beam_search=request.use_beam_search,
+            stream=request.stream,
         )
     except ValueError as e:
         return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
@@ -661,10 +691,10 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
 parser.add_argument(
     "--prompt-processing-sequence-length",
     type=int,
-    default=64,
+    default=16,
     help=(
         "For large prompts, the prompt is processed in chunks of this length. "
-        "This is to maximize the inference speed. By default, this is set to 64."
+        "This is to maximize the inference speed. By default, this is set to 16."
     ),
 )
 parser.add_argument(

diff --git a/examples/openai-server/test.py b/examples/openai-server/test.py
index 156a5767e1..556055f0f1 100644
--- a/examples/openai-server/test.py
+++ b/examples/openai-server/test.py
@@ -14,9 +14,7 @@
 # Make sure to start the server first:
 """
-python examples/openai-server/server.py \
-    --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none \
-    --prompt-processing-sequence-length 1
+python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none
 """  # noqa: E501
 
 import unittest
@@ -41,11 +39,15 @@ def test_model_completion(self):
         for model in models:
             response = openai.Completion.create(
-                model=model, prompt="def fib():", max_tokens=16
+                model=model, prompt="def fib():", max_tokens=30
             )
             self.assertIsNotNone(response)
             self.assertIn("choices", response)
             self.assertTrue(len(response["choices"]) > 0)
+            self.assertTrue(
+                response["choices"][0]["text"]
+                == "\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b"  # noqa: E501
+            )
             print(response["choices"])
 
     def test_streaming_output(self):
@@ -53,7 +55,7 @@
         for model in models:
             responses = openai.Completion.create(
-                model=model, prompt="def fib():", max_tokens=16, stream=True
+                model=model, prompt="def fib():", max_tokens=30, stream=True
             )
             for response in responses:
                 self.assertIn("choices", response)