
[LLM Tests] Config support for LLM test suite / enable test suite in GHA #1324


Merged: 15 commits, merged Oct 23, 2023. Showing changes from 6 commits.
3 changes: 3 additions & 0 deletions src/deepsparse/transformers/pipelines/text_generation.py
@@ -636,6 +636,9 @@ def process_engine_outputs(
created=datetime.datetime.now(), prompts=prompts, generations=generations
)

if "session_ids" in kwargs:
outputs["session_ids"] = kwargs["session_ids"]

if self._debug:
debug_params = dict(
kv_cache_state=kv_cache_state,
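A minimal usage sketch of the new passthrough (not part of the diff): it assumes the chat pipeline forwards session_ids through kwargs to process_engine_outputs, and it borrows the model stub and call shape from the config and tests further down in this PR.

from deepsparse import Pipeline

# Hedged sketch: session_ids supplied by the caller are echoed back on the
# output object by the change above. Model stub taken from gpt_neo.yaml below.
pipeline = Pipeline.create(task="chat", model_path="hf:mgoin/TinyStories-1M-deepsparse")
output = pipeline(
    prompt="hello", session_ids="my-session", generation_config=dict(max_new_tokens=1)
)
assert output.session_ids == "my-session"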
13 changes: 13 additions & 0 deletions tests/deepsparse/transformers/pipelines/configs/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml
@@ -0,0 +1,13 @@
model_path: "hf:mgoin/TinyStories-1M-deepsparse"
model_name: "roneneldan/TinyStories-1M"
pipeline_type: ["text-generation", "chat"]
num_tokens_generate: 128
prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio\n Some cat was layin' down some rock 'n' roll\n \"Lotta soul,\" he said\n Then the loud sound did seem to fade\n Came back like a slow voice on a wave of phase\n That weren't no DJ, that was hazy cosmic jive"
has_bos_token: False
logits_threshold: 24.7
precision: 0.001
cache_management_type:
- "internal"
- "external"
run_helper_tests: True
cadence: "commit"
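A rough illustration of how this config gates a run (the path and CADENCE value are assumptions for the example): parse_params from helpers.py below returns the parsed config when the CADENCE environment variable matches the cadence field, and a skip reason otherwise.

import os

from tests.deepsparse.transformers.pipelines.helpers import parse_params

# Assumes the repository root as the working directory.
os.environ["CADENCE"] = "commit"
config, skip_reason = parse_params(
    "tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml"
)
assert skip_reason is None  # cadence matches, so the suite would run
assert config["model_name"] == "roneneldan/TinyStories-1M"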
57 changes: 56 additions & 1 deletion tests/deepsparse/transformers/pipelines/helpers.py
@@ -12,11 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import os
from typing import Dict, List, Optional, Tuple, Union

import numpy
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

import pytest


class TorchGroundTruthSource:
"""
Expand Down Expand Up @@ -82,3 +87,53 @@ def _create_tokenizer(model_name):
tokenizer.pad_token = tokenizer.eos_token

return tokenizer


def parse_params(config_path: str) -> Tuple[Optional[Dict], Optional[str]]:
# parses the config file provided
assert os.path.isfile(config_path), f"config_path {config_path} is not a file"
# reads the yaml file
with open(config_path, "r") as f:
config = yaml.safe_load(f)

cadence = os.environ.get("CADENCE", "commit")
expected_cadence = config["cadence"]

if not isinstance(expected_cadence, list):
expected_cadence = [expected_cadence]
if cadence in expected_cadence:
return config, None
return None, "Skipping test for cadence: {}".format(config["cadence"])


def validate_cache_management_type(
internal_kv_cache, cache_management_type: Union[str, List[str]]
) -> bool:
if internal_kv_cache and "internal" not in cache_management_type:
pytest.skip(
"The tests for running the pipeline with "
"internal kv cache management are disabled."
)
if not internal_kv_cache and "external" not in cache_management_type:
pytest.skip(
"The tests for running the pipeline with "
"external kv cache management are disabled."
)
return internal_kv_cache


def helper_test(test_method):
@functools.wraps(test_method)
def wrapper(self, setup):
if not self.run_helper_tests:
raise pytest.skip(
"Skipping the helper test. Set run_helper_tests to True to run it."
)

return test_method(self, setup)

return wrapper


def find_closest_number_divisible_by_four(number):
return number - (number % 4)
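A condensed, hypothetical sketch of how these helpers compose inside a config-driven test class; the real wiring lives in test_text_generation.py, which is outside this slice of the diff, so the class, fixture, and test names here are placeholders.

import pytest

from tests.deepsparse.transformers.pipelines.helpers import (
    helper_test,
    parse_params,
    validate_cache_management_type,
)


class ExampleSuite:  # placeholder for the real config-driven test class
    run_helper_tests = True  # read via `self` by the helper_test decorator

    @pytest.fixture
    def setup(self):
        config, skip_reason = parse_params(
            "tests/deepsparse/transformers/pipelines/configs/gpt_neo.yaml"
        )
        if skip_reason:
            pytest.skip(skip_reason)
        # skip when the requested kv-cache mode is not listed in the config
        validate_cache_management_type(
            internal_kv_cache=True,
            cache_management_type=config["cache_management_type"],
        )
        return config

    @helper_test  # skipped entirely unless run_helper_tests is True
    def test_auxiliary_behavior(self, setup):
        assert setup["num_tokens_generate"] == 128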
173 changes: 148 additions & 25 deletions tests/deepsparse/transformers/pipelines/test_chat.py
@@ -12,34 +12,157 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from transformers import GenerationConfig

import pytest
from deepsparse import Pipeline
from tests.deepsparse.transformers.pipelines.helpers import helper_test
from tests.deepsparse.transformers.pipelines.test_text_generation import (
    TestTextGenerationPipeline,
)


@pytest.fixture
def config(request):
return request.param


class TestChatPipeline(TestTextGenerationPipeline):
@pytest.fixture
def pipeline_type(self):
return "chat"

@helper_test
def test_chat_pipeline_session_manager(self, setup):
pipeline = self.get_pipeline()

with pipeline.session():
output_1 = pipeline(
prompt="first", generation_config=dict(max_new_tokens=1)
)
output_2 = pipeline(
prompt="second", generation_config=dict(max_new_tokens=1)
)
# assert inferences in the same context share a session id
assert output_1.session_ids == output_2.session_ids

# test that follow-up inference has a different session id
output_3 = pipeline(prompt="third", generation_config=dict(max_new_tokens=1))
assert output_3.session_ids != output_1.session_ids

@helper_test
def test_run_with_same_session_ids(self, setup):
# Test the scenario where the same session ids are used for multiple
# inference runs. There are two conditions that must be fulfilled:
# 1. The information regarding the prompt does not leak between sessions
# 2. Running two prompts one after another is identical to running
# a composition of those prompts i.e.
# generated_text = pipeline(prompt_1)
# generated_text_2 = pipeline(prompt_2)
# generated_text_2 == pipeline(prompt_1 + generated_text + prompt_2)

prompt_1 = "This prompt is used for testing purposes. To this to make sure that"
prompt_2 = "still this prompt should not"
num_generated_tokens = 32

self._test_run_with_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill=False,
)
self._test_run_with_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill=True,
)

def _test_run_with_same_session_ids(
self,
prompt_1,
prompt_2,
num_generated_tokens,
multi_token_prefill,
):
pipeline = self.get_pipeline(
prompt_sequence_length=self.prompt_sequence_length
if multi_token_prefill
else 1,
)

# make sure information does not leak between sessions
self._test_composition_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1="test_1",
session_id_2="test_2",
)

self._test_composition_same_session_ids(
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1="test_3",
session_id_2="test_4",
)

def _test_composition_same_session_ids(
self,
prompt_1,
prompt_2,
num_generated_tokens,
pipeline,
session_id_1,
session_id_2,
):

tokenizer = pipeline.tokenizer
config = GenerationConfig(
output_scores=True, max_length=num_generated_tokens, top_k=0, top_p=0.0
)

# make sure that running two prompts one after another
# is identical to running a composition of those prompts
out_1_ = pipeline(
sequences=prompt_1,
force_max_tokens=True,
session_ids=session_id_1,
generation_config=config,
include_prompt_logits=True,
)
        prompt_1_ = out_1_.generations[0].text
        out_1 = pipeline(
            sequences=prompt_2,
            force_max_tokens=True,
            session_ids=session_id_1,
            generation_config=config,
            include_prompt_logits=True,
        )
        cache_state_1 = pipeline.storage_kv_cache.get(session_id_1).cached_inputs[
            "past_key_values.0.key"
        ]

prompt_composition = tokenizer.decode(
tokenizer(prompt_1).input_ids
+ tokenizer(prompt_1_).input_ids
+ tokenizer(prompt_2).input_ids,
skip_special_tokens=True,
)
out_2 = pipeline(
sequences=prompt_composition,
session_ids=session_id_2,
generation_config=config,
include_prompt_logits=True,
)
cache_state_2 = pipeline.storage_kv_cache.get(session_id_2).cached_inputs[
"past_key_values.0.key"
]
if cache_state_1.shape[0]:
# if cache state is not empty, i.e. we are managing kv cache
# externally, make sure that the cache state is the same
            assert np.allclose(cache_state_1, cache_state_2, atol=self.precision)
assert out_1.generations[0].text == out_2.generations[0].text