neuralmagic · mgoin · Sep 6, 2023 · Sep 6, 2023
diff --git a/examples/openai-server/README.md b/examples/openai-server/README.md
@@ -26,7 +26,7 @@ Set up the server:
 ```
 python examples/openai-server/server.py --model zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none
 None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
-2023-08-07 17:18:32 __main__     INFO     args: Namespace(model='zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none', max_model_len=512, prompt_sequence_length=1, internal_kv_cache=False, host='localhost', port=8000, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], served_model_name=None)
+2023-08-07 17:18:32 __main__     INFO     args: Namespace(model='zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none', max_model_len=512, prompt_sequence_length=16, host='localhost', port=8000, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], served_model_name=None)
 2023-08-07 17:18:32 deepsparse.transformers WARNING  The neuralmagic fork of transformers may not be installed. It can be installed via `pip install nm_transformers`
 Using pad_token, but it is not set yet.
 2023-08-07 17:18:34 deepsparse.transformers.engines.nl_decoder_engine INFO     Overwriting in-place the input shapes of the transformer model at /home/mgoin/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx

diff --git a/examples/openai-server/server.py b/examples/openai-server/server.py
@@ -79,14 +79,12 @@ def __init__(
         model: str,
         sequence_length: int = 512,
         prompt_sequence_length: int = 64,
-        internal_kv_cache: bool = False,
     ):
         self.engine = deepsparse.Pipeline.create(
             task="text-generation",
             model_path=model,
             sequence_length=sequence_length,
             prompt_sequence_length=prompt_sequence_length,
-            internal_kv_cache=internal_kv_cache,
         )
 
     def tokenize(self, text: str) -> List[int]:
@@ -689,22 +687,14 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
         help="maximum number of input+output tokens the model will use",
     )
     parser.add_argument(
-        "--prompt-processing-sequence-length",
+        "--prompt-sequence-length",
         type=int,
         default=16,
         help=(
             "For large prompts, the prompt is processed in chunks of this length. "
             "This is to maximize the inference speed. By default, this is set to 16."
         ),
     )
-    parser.add_argument(
-        "--use-deepsparse-cache",
-        action="store_true",
-        help=(
-            "If True, the pipeline will use the deepsparse kv cache for caching the "
-            "model outputs."
-        ),
-    )
 
     parser.add_argument("--host", type=str, default="localhost", help="host name")
     parser.add_argument("--port", type=int, default=8000, help="port number")
@@ -752,7 +742,6 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
         model=args.model,
         sequence_length=max_model_len,
         prompt_sequence_length=args.prompt_sequence_length,
-        internal_kv_cache=args.internal_kv_cache,
     )
     tokenizer = engine.engine.tokenizer