
[LLM Tests] Config support for LLM test suite / enable test suite in GHA #1324


Merged: 15 commits, merged Oct 23, 2023. Showing changes from 6 commits.
3 changes: 3 additions & 0 deletions src/deepsparse/transformers/pipelines/text_generation.py
@@ -636,6 +636,9 @@ def process_engine_outputs(
created=datetime.datetime.now(), prompts=prompts, generations=generations
)

if "session_ids" in kwargs:
outputs["session_ids"] = kwargs["session_ids"]

if self._debug:
debug_params = dict(
kv_cache_state=kv_cache_state,
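A minimal usage sketch of the new passthrough (not part of the diff): it assumes the chat pipeline forwards session_ids through kwargs to process_engine_outputs, and it borrows the model stub and call shape from the config and tests further down in this PR.

from deepsparse import Pipeline

# Hedged sketch: session_ids supplied by the caller are echoed back on the
# output object by the change above. Model stub taken from gpt_neo.yaml below.
pipeline = Pipeline.create(task="chat", model_path="hf:mgoin/TinyStories-1M-deepsparse")
output = pipeline(
    prompt="hello", session_ids="my-session", generation_config=dict(max_new_tokens=1)
)
assert output.session_ids == "my-session"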
13 changes: 13 additions & 0 deletions tests/deepsparse/transformers/pipelines/configs/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml
@@ -0,0 +1,13 @@
model_path: "hf:mgoin/TinyStories-1M-deepsparse"
model_name: "roneneldan/TinyStories-1M"
pipeline_type: ["text-generation", "chat"]
num_tokens_generate: 128
prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio\n Some cat was layin' down some rock 'n' roll\n \"Lotta soul,\" he said\n Then the loud sound did seem to fade\n Came back like a slow voice on a wave of phase\n That weren't no DJ, that was hazy cosmic jive"
has_bos_token: False
logits_threshold: 24.7
precision: 0.001
cache_management_type:
- "internal"
- "external"
run_helper_tests: True
cadence: "commit"
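A rough illustration of how this config gates a run (the path and CADENCE value are assumptions for the example): parse_params from helpers.py below returns the parsed config when the CADENCE environment variable matches the cadence field, and a skip reason otherwise.

import os

from tests.deepsparse.transformers.pipelines.helpers import parse_params

# Assumes the repository root as the working directory.
os.environ["CADENCE"] = "commit"
config, skip_reason = parse_params(
    "tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml"
)
assert skip_reason is None  # cadence matches, so the suite would run
assert config["model_name"] == "roneneldan/TinyStories-1M"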
57 changes: 56 additions & 1 deletion tests/deepsparse/transformers/pipelines/helpers.py
@@ -12,11 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import os
from typing import Dict, List, Optional, Tuple, Union

import numpy
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

import pytest


class TorchGroundTruthSource:
"""
Expand Down Expand Up @@ -82,3 +87,53 @@ def _create_tokenizer(model_name):
tokenizer.pad_token = tokenizer.eos_token

return tokenizer


def parse_params(config_path: str) -> Tuple[Optional[Dict], Optional[str]]:
# parses the config file provided
assert os.path.isfile(config_path), f"config_path {config_path} is not a file"
# reads the yaml file
with open(config_path, "r") as f:
config = yaml.safe_load(f)

cadence = os.environ.get("CADENCE", "commit")
expected_cadence = config["cadence"]

if not isinstance(expected_cadence, list):
expected_cadence = [expected_cadence]
if cadence in expected_cadence:
return config, None
return None, "Skipping test for cadence: {}".format(config["cadence"])


def validate_cache_management_type(
internal_kv_cache, cache_management_type: Union[str, List[str]]
) -> bool:
if internal_kv_cache and "internal" not in cache_management_type:
pytest.skip(
"The tests for running the pipeline with "
"internal kv cache management are disabled."
)
if not internal_kv_cache and "external" not in cache_management_type:
pytest.skip(
"The tests for running the pipeline with "
"external kv cache management are disabled."
)
return internal_kv_cache


def helper_test(test_method):
@functools.wraps(test_method)
def wrapper(self, setup):
if not self.run_helper_tests:
raise pytest.skip(
"Skipping the helper test. Set run_helper_tests to True to run it."
)

return test_method(self, setup)

return wrapper


def find_closest_number_divisible_by_four(number):
return number - (number % 4)
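A condensed, hypothetical sketch of how these helpers compose inside a config-driven test class; the real wiring lives in test_text_generation.py, which is outside this slice of the diff, so the class, fixture, and test names here are placeholders.

import pytest

from tests.deepsparse.transformers.pipelines.helpers import (
    helper_test,
    parse_params,
    validate_cache_management_type,
)


class ExampleSuite:  # placeholder for the real config-driven test class
    run_helper_tests = True  # read via `self` by the helper_test decorator

    @pytest.fixture
    def setup(self):
        config, skip_reason = parse_params(
            "tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml"
        )
        if skip_reason:
            pytest.skip(skip_reason)
        # skip when the requested kv-cache mode is not listed in the config
        validate_cache_management_type(
            internal_kv_cache=True,
            cache_management_type=config["cache_management_type"],
        )
        return config

    @helper_test  # skipped entirely unless run_helper_tests is True
    def test_auxiliary_behavior(self, setup):
        assert setup["num_tokens_generate"] == 128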
173 changes: 148 additions & 25 deletions tests/deepsparse/transformers/pipelines/test_chat.py
@@ -12,34 +12,157 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from transformers import GenerationConfig

import pytest
from deepsparse import Pipeline
from tests.deepsparse.transformers.pipelines.helpers import helper_test
from tests.deepsparse.transformers.pipelines.test_text_generation import (
    TestTextGenerationPipeline,
)


@pytest.fixture
def config(request):
return request.param


class TestChatPipeline(TestTextGenerationPipeline):
@pytest.fixture
def pipeline_type(self):
return "chat"

@helper_test
def test_chat_pipeline_session_manager(self, setup):
pipeline = self.get_pipeline()

with pipeline.session():
output_1 = pipeline(
prompt="first", generation_config=dict(max_new_tokens=1)
)
output_2 = pipeline(
prompt="second", generation_config=dict(max_new_tokens=1)
)
# assert inferences in the same context share a session id
assert output_1.session_ids == output_2.session_ids

# test that follow-up inference has a different session id
output_3 = pipeline(prompt="third", generation_config=dict(max_new_tokens=1))
assert output_3.session_ids != output_1.session_ids

@helper_test
def test_run_with_same_session_ids(self, setup):
# Test the scenario where the same session ids are used for multiple
# inference runs. There are two conditions that must be fulfilled:
# 1. The information regarding the prompt does not leak between sessions
# 2. Running two prompts one after another is identical to running
# a composition of those prompts i.e.
# generated_text = pipeline(prompt_1)
# generated_text_2 = pipeline(prompt_2)
# generated_text_2 == pipeline(prompt_1 + generated_text + prompt_2)

prompt_1 = "This prompt is used for testing purposes. To this to make sure that"
prompt_2 = "still this prompt should not"
num_generated_tokens = 32

self._test_run_with_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill=False,
)
self._test_run_with_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill=True,
)

def _test_run_with_same_session_ids(
self,
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill,
):
pipeline = self.get_pipeline(
prompt_sequence_length=self.prompt_sequence_length
if multi_token_prefill
else 1,
)

# make sure information does not leak between sessions
self._test_composition_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1="test_1",
session_id_2="test_2",
)

self._test_composition_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1="test_3",
session_id_2="test_4",
)

def _test_composition_same_session_ids(
self,
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1,
session_id_2,
):

tokenizer = pipeline.tokenizer
config = GenerationConfig(
output_scores=True, max_length=num_generated_tokens, top_k=0, top_p=0.0
)

# make sure that running two prompts one after another
# is identical to running a composition of those prompts
out_1_ = pipeline(
sequences=prompt_1,
force_max_tokens=True,
session_ids=session_id_1,
generation_config=config,
include_prompt_logits=True,
)
        prompt_1_ = out_1_.generations[0].text
        out_1 = pipeline(
            sequences=prompt_2,
            force_max_tokens=True,
            session_ids=session_id_1,
            generation_config=config,
            include_prompt_logits=True,
        )
        cache_state_1 = pipeline.storage_kv_cache.get(session_id_1).cached_inputs[
            "past_key_values.0.key"
        ]

prompt_composition = tokenizer.decode(
tokenizer(prompt_1).input_ids
+ tokenizer(prompt_1_).input_ids
+ tokenizer(prompt_2).input_ids,
skip_special_tokens=True,
)
out_2 = pipeline(
sequences=prompt_composition,
session_ids=session_id_2,
generation_config=config,
include_prompt_logits=True,
)
cache_state_2 = pipeline.storage_kv_cache.get(session_id_2).cached_inputs[
"past_key_values.0.key"
]
if cache_state_1.shape[0]:
# if cache state is not empty, i.e. we are managing kv cache
# externally, make sure that the cache state is the same
            assert np.allclose(cache_state_1, cache_state_2, atol=self.precision)
assert out_1.generations[0].text == out_2.generations[0].text