
Commit e94dcac

[Cherry-Pick][Text Generation] Terminate the inference when kv cache is full (#1447)
* [Fix] Remove erroneous LIB.kv_cache input when using external kv cache management (#1337)
* initial commit
* initial commit
* cleanup
* cleanup2
* initial commit
* initial commit
* Needs to be >=
1 parent 39e21d3 commit e94dcac

File tree

1 file changed: +5 −0 lines changed


src/deepsparse/transformers/pipelines/text_generation.py

@@ -829,6 +829,11 @@ def engine_forward(
             generated_tokens.append(token)
             generated_logits.append(logits)
 
+            if session.total_num_processed_tokens >= session.capacity:
+                # if the kv cache is full, stop generation
+                finished_reason.append(FinishReason.CAPACITY)
+                break
+
             if (
                 token == self.tokenizer.eos_token_id
                 and not self.force_max_tokens
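
For context, here is a minimal sketch of the decode loop this check protects. The names session.total_num_processed_tokens, session.capacity, and FinishReason.CAPACITY come from the diff above; the KVCacheSession class, the generate helper, and the loop structure are simplified assumptions for illustration, not DeepSparse's actual implementation.

from dataclasses import dataclass
from enum import Enum
from typing import List, Tuple


class FinishReason(Enum):
    # CAPACITY is the reason appended by this commit when the kv cache fills up
    EOS = "eos"
    MAX_TOKENS = "max_tokens"
    CAPACITY = "capacity"


@dataclass
class KVCacheSession:
    # hypothetical stand-in for the engine's kv cache session object
    capacity: int                        # maximum number of tokens the kv cache can hold
    total_num_processed_tokens: int = 0

    def process(self, num_tokens: int) -> None:
        self.total_num_processed_tokens += num_tokens


def generate(session: KVCacheSession, max_new_tokens: int) -> Tuple[List[str], List[FinishReason]]:
    generated_tokens: List[str] = []
    finished_reason: List[FinishReason] = []
    for _ in range(max_new_tokens):
        token = "<tok>"      # placeholder for one engine decode step
        session.process(1)   # each generated token consumes one kv cache slot
        generated_tokens.append(token)

        if session.total_num_processed_tokens >= session.capacity:
            # if the kv cache is full, stop generation instead of overflowing it
            finished_reason.append(FinishReason.CAPACITY)
            break
    return generated_tokens, finished_reason

The ">=" comparison (the "Needs to be >=" note in the commit message) presumably makes the loop stop as soon as the cache is exactly full, rather than attempting one more decode step that would have no room for its kv entries.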
