
Commit 10c804a

dbogunowicz, corey-nm, markurtz, Benjamin, and rahul-tuli authored
Perplexity Eval for Text Generation Models (#1073)
* initial commit * Update src/deepsparse/license.py * limit to 150mb * ready to review * initial commit * [Codegen][ORT][Static Seq Length] TextGenerationPipeline (#946) * initial commit * coreys simplifications * finishing the second model static * ready, time for beautification * ready for review * moved the code to examples * fix eos logic * add argument num_tokens_to_generate * [CodeGen][Documentation] (#956) * initial commit * coreys simplifications * finishing the second model static * ready, time for beautification * ready for review * moved the code to examples * fix eos logic * add argument num_tokens_to_generate * initial commit * change order * Update examples/codegen/README.md Co-authored-by: corey-nm <[email protected]> --------- Co-authored-by: corey-nm <[email protected]> * reimplementation for generative pipelines * restore text generation from examples * [CodeGen] ONNX model loading to support >2Gb models / two engines (#991) * refactor sucessfull * Pipeline fully refactored, time to test engine support. Note: Sliding window not yet implemented! * First iteration with Sage * Apply suggestions from code review * ORT agrees with the Engine. But they both give not entirely correct result. Hey, this is good news still * dynamic ORT vs static DS * pipeline handles OPT multitoken pass * fixes to get static pipeline a little further along * adjust shapes and slicing to enable static autoregressive pass - ISSUE: tokens past the base seq len are repeated * migrate from cache_length to positions input * got if working for multitoken + single token scenario * cleanup the pipeline * further cleanup post merge * Pipeline working for single-token inference only * do not load the onnx model with external files twice * pipeline never redundantly saves the external data + more robust tokenizer * Stop saving tmp files, otherwise the engine looks for external files in the wrong place * Left pad support * cleanup * cleanup2 * Add in pipeline timing * add in force tokens logic * remove input validation for text generation pipelines * remove multitoken support for now * remove kv cache engine and other fixes * nest input shape override * comment out input shape override * add non batch override for ORT * clean up generation pipeline * initial commit * Update src/deepsparse/license.py * limit to 150mb * ready to review * fix the erronous Makefile * perhaps fixed GHA * take into consideration that GHA creates four files * initial commit * tested with actual model * remove val_inp argument * Update README.md * Apply suggestions from code review * Update README.md * [BugFix] Update deepsparse dockerfile (#1069) * Remove autoinstall triggering commands * Fix typo * initial implementation * working implementation for pipeline input * [Fix] Fix CLI benchmark errors (#1071) * initial commit * ready for review * Update src/deepsparse/utils/onnx.py * Clean a typo in the pipeline code * cleanup the old files * Update src/deepsparse/transformers/engines/nl_decoder_engine.py * ready for review * ready for testing * assert proper padding on pipeline init * now also supporting kv cache perplexity. time for cleanup * ready for review * correctly print engine info * work with left padding of the tokenizer * quality * fix the multitoken inference --------- Co-authored-by: corey-nm <[email protected]> Co-authored-by: Mark Kurtz <[email protected]> Co-authored-by: Benjamin <[email protected]> Co-authored-by: Rahul Tuli <[email protected]>
1 parent 0809aea commit 10c804a

File tree

4 files changed (+193, -30 lines)

src/deepsparse/transformers/engines/nl_decoder_engine.py (+4, -4)

```diff
@@ -154,10 +154,7 @@ def __call__(
         else:
             logits = out[0]
 
-        B, S, V = logits.shape  # batch, sequence, vocab
-        logits = logits[:, -1, :].reshape(B, 1, V)  # only take the last token
-
-        token = self.generate_token(logits=logits)
+        token = self.generate_token(logits=logits[:, -1, :])
 
         return token, logits
 
@@ -253,6 +250,9 @@ def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray:
 
         return numpy.random.choice(len(probs), p=probs)
 
+    def __str__(self):
+        return f"{self.__class__.__name__}: {self.engine}"
+
     def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]:
         # initialize empty kv cache of size
         # (batch_size, num_attention_heads, length, hidden_dims)
```
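The first hunk collapses the slice-then-reshape into a single slice: `generate_token` now receives the logits of the final position as a 2-D array instead of a 3-D array with a singleton sequence axis. A minimal sketch of the shape difference (the sizes below are illustrative assumptions, not values from the commit):

```python
import numpy

# Assumed toy shapes: batch=2, sequence=5, vocab=10
logits = numpy.random.rand(2, 5, 10)

# Old code: slice the last position, then reshape back to 3-D
B, S, V = logits.shape
old_last = logits[:, -1, :].reshape(B, 1, V)  # shape (2, 1, 10)

# New code: pass the 2-D slice straight through
new_last = logits[:, -1, :]  # shape (2, 10)

assert old_last.shape == (2, 1, 10)
assert new_last.shape == (2, 10)
```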

src/deepsparse/transformers/eval_downstream.py (+37, -5)

```diff
@@ -68,14 +68,43 @@
 import numpy
 from tqdm.auto import tqdm
 
-from deepsparse import Pipeline
-from deepsparse.transformers.metrics import PrecisionRecallF1
+from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline
+from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1
 
 
 from datasets import load_dataset, load_metric  # isort: skip
 
-DEEPSPARSE_ENGINE = "deepsparse"
-ORT_ENGINE = "onnxruntime"
+
+def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
+    dataset = load_dataset(dataset_name)["test"]
+
+    text_generation = Pipeline.create(
+        task="text-generation",
+        model_path=args.model_path,
+        engine_type=args.engine,
+        num_cores=args.num_cores,
+        sequence_length=args.max_sequence_length,
+        prompt_processing_sequence_length=args.max_sequence_length,
+        max_generated_tokens=1,
+        remove_special_tokens_from_prompt=False,
+    )
+    perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size)
+    active_engines = [
+        engine
+        for engine in [text_generation.engine, text_generation.multitoken_engine]
+        if engine
+    ]
+    print("Engine info: ")
+    [print(f"{engine}\n") for engine in active_engines]
+    predictions = []
+    for idx, sample in _enumerate_progress(dataset, args.max_samples):
+        predictions.append(sample["prompt"] + sample["canonical_solution"])
+        if len(predictions) == batch_size:
+            perplexity_metrics.add_batch(predictions)
+            predictions = []
+        if args.max_samples and idx >= args.max_samples:
+            break
+    return perplexity_metrics
 
 
 def qa_eval(args, dataset_name="squad"):
@@ -443,11 +472,14 @@ def _split_train_val(train_dataset, val_ratio, seed=42):
     "imdb": imdb_eval,
     "conll2003": conll2003_eval,
     "go_emotions": go_emotions_eval,
+    "openai_humaneval": perplexity_eval,
 }
 
 
 def parse_args():
     parser = argparse.ArgumentParser(
+        # TODO: It is not BERT anymore, should we
+        # have another script or modify the existing one?
        description="Evaluate a BERT ONNX model on a downstream dataset"
     )
     parser.add_argument(
@@ -461,9 +493,9 @@ def parse_args():
     parser.add_argument(
         "-d",
         "--dataset",
-        type=str,
         choices=list(SUPPORTED_DATASETS.keys()),
         required=True,
+        type=str,
     )
     parser.add_argument(
         "-v",
```

src/deepsparse/transformers/metrics.py (+102, -1)

```diff
@@ -17,18 +17,119 @@
 """
 
 
-from typing import Dict, Optional
+from typing import Any, Dict, List, Optional
 
 import numpy
+from tqdm import tqdm
 
+import torch
+from deepsparse import Pipeline
+from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline
 from sklearn.metrics import precision_recall_fscore_support
 
 
 __all__ = [
     "PrecisionRecallF1",
+    "Perplexity",
 ]
 
 
+class Perplexity:
+    def __init__(self, pipeline: Pipeline, batch_size: int = 16):
+        """
+        Given the pipeline, compute the perplexity of the model
+        on the given text input.
+
+        Code adapted from:
+        https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501
+
+        :param pipeline: The pipeline to use for text generation
+        :param batch_size: The batch size to split the input text into
+            non-overlapping batches
+        """
+        if not isinstance(pipeline, TextGenerationPipeline):
+            raise ValueError(
+                "Perplexity can only be computed for text generation pipelines"
+            )
+        self._pipeline = pipeline
+        self._batch_size = batch_size
+        self._sequence_length = pipeline.sequence_length
+        self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+
+        self.perplexities = []
+
+    def add_batch(self, predictions: List[str]):
+        """
+        Run the model on the given input sequences and compute the perplexity.
+        The resulting perplexity is appended to the list of perplexities.
+
+        :param predictions: The predictions to compute perplexity on
+        """
+        # tokenize the input text
+        encodings = self._pipeline.tokenizer(
+            predictions,
+            return_attention_mask=True,
+            max_length=self._sequence_length,
+            truncation=True,
+            padding="max_length",
+        )
+
+        encoded_texts = encodings["input_ids"]
+        attention_masks = encodings["attention_mask"]
+
+        for start_index in tqdm(range(0, len(encoded_texts), self._batch_size)):
+            end_index = min(start_index + self._batch_size, len(encoded_texts))
+            encoded_batch = encoded_texts[start_index:end_index]
+            attention_mask = attention_masks[start_index:end_index]
+
+            out = self._pipeline(
+                sequences=predictions, return_logits=True, truncate=True
+            )
+            logits = out.logits
+
+            labels = encoded_batch
+            labels = numpy.stack(labels)
+            attention_mask = numpy.stack(attention_mask)
+
+            # because the tokenizer is left padded, we need to move the meaningful
+            # part of the logits and labels to the right
+            num_padded_entries = attention_mask.sum(axis=1)
+
+            # shift the values at num_paddings to the top of the array using roll
+            for i, num_padded in enumerate(num_padded_entries):
+                logits[i] = numpy.roll(logits[i], num_padded, axis=0)
+                labels[i] = numpy.roll(labels[i], num_padded, axis=0)
+                attention_mask[i] = numpy.roll(attention_mask[i], num_padded, axis=0)
+
+            # shift logits and labels to create the input and target for the loss function
+            shift_logits = logits[:, :-1, :]
+            shift_labels = labels[:, 1:]
+            shift_attention_mask_batch = attention_mask[:, 1:]
+
+            # compute perplexity for this batch
+            perplexity_batch = torch.exp(
+                (
+                    self._loss_fct(
+                        torch.tensor(shift_logits.transpose(0, 2, 1)),
+                        torch.tensor(shift_labels),
+                    )
+                    * torch.tensor(shift_attention_mask_batch)
+                ).sum(1)
+                / torch.tensor(shift_attention_mask_batch).sum(1)
+            )
+            self.perplexities.extend(perplexity_batch.numpy().tolist())
+
+    def compute(self) -> Dict[str, Any]:
+        """
+        :return: A dictionary containing the mean perplexity
+            and the list of perplexities
+        """
+        return {
+            "mean_perplexity": numpy.mean(self.perplexities),
+            "perplexities": self.perplexities,
+        }
+
+
 class PrecisionRecallF1:
     def __init__(self, id_to_label: Optional[Dict[int, str]] = None):
         self._id_to_label = id_to_label
```
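To make the masked perplexity computation in `add_batch` concrete, here is a self-contained toy version of the same math. Every shape and token id below is made up for illustration; only the sequence of operations (roll, shift, masked cross-entropy, exponentiate) mirrors the code above:

```python
import numpy
import torch

# One left-padded sequence: 4 positions, vocab of 3
logits = numpy.random.rand(1, 4, 3).astype(numpy.float32)
labels = numpy.array([[0, 0, 1, 2]])          # two pads, then real token ids
attention_mask = numpy.array([[0, 0, 1, 1]])  # 1 marks real tokens

# Mirror the numpy.roll step: rotate each row so real tokens sit at the front
num_real = attention_mask.sum(axis=1)
for i, n in enumerate(num_real):
    logits[i] = numpy.roll(logits[i], n, axis=0)
    labels[i] = numpy.roll(labels[i], n, axis=0)
    attention_mask[i] = numpy.roll(attention_mask[i], n, axis=0)

# Next-token objective: the logits at position t score the token at t + 1
shift_logits = torch.tensor(logits[:, :-1, :].transpose(0, 2, 1))  # (N, vocab, seq-1)
shift_labels = torch.tensor(labels[:, 1:])
shift_mask = torch.tensor(attention_mask[:, 1:], dtype=torch.float32)

# Perplexity = exp of the mean negative log-likelihood over real tokens
loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
nll = loss_fct(shift_logits, shift_labels) * shift_mask
perplexity = torch.exp(nll.sum(1) / shift_mask.sum(1))
print(perplexity)  # tensor with one perplexity per sequence
```

The mask does double duty: it zeroes the loss on padded positions and normalizes by the count of real next-token predictions, matching the `sum(1) / ... .sum(1)` structure in `add_batch`.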
