|
20 | 20 | METRIC_TOTAL_INPUT_TOKENS = "total_input_tokens"
|
21 | 21 | import tensorrt_llm.logger as logger
|
22 | 22 |
|
| 23 | +# From https://github.com/pytorch/pytorch/blob/39425feac799905402abe4d15667fa47c344f2d7/torch/testing/_internal/common_utils.py#L1761 |
| 24 | +# Dict of NumPy dtype -> torch dtype (when the correspondence exists) |
| 25 | +numpy_to_torch_dtype_dict = { |
| 26 | + np.bool_: torch.bool, |
| 27 | + np.uint8: torch.uint8, |
| 28 | + np.uint16: torch.uint16, |
| 29 | + np.uint32: torch.uint32, |
| 30 | + np.uint64: torch.uint64, |
| 31 | + np.int8: torch.int8, |
| 32 | + np.int16: torch.int16, |
| 33 | + np.int32: torch.int32, |
| 34 | + np.int64: torch.int64, |
| 35 | + np.float16: torch.float16, |
| 36 | + np.float32: torch.float32, |
| 37 | + np.float64: torch.float64, |
| 38 | + np.complex64: torch.complex64, |
| 39 | + np.complex128: torch.complex128 |
| 40 | +} |
| 41 | + |
| 42 | +# Dict of torch dtype -> NumPy dtype |
| 43 | +torch_to_numpy_dtype_dict = { |
| 44 | + value: key |
| 45 | + for (key, value) in numpy_to_torch_dtype_dict.items() |
| 46 | +} |
| 47 | +torch_to_numpy_dtype_dict.update({ |
| 48 | + torch.bfloat16: np.float32, |
| 49 | + torch.complex32: np.complex64 |
| 50 | +}) |
| 51 | + |
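
As a quick sanity check, here is a minimal sketch of how the two lookup tables above round-trip (plain Python, outside the diff; torch >= 2.3 is assumed for the unsigned 16/32/64-bit dtypes):

    import numpy as np
    import torch

    # A NumPy dtype maps to its torch twin and back again.
    t = torch.zeros(2, dtype=numpy_to_torch_dtype_dict[np.float16])
    assert torch_to_numpy_dtype_dict[t.dtype] is np.float16

    # bfloat16 has no NumPy equivalent, so the reverse table widens it to float32.
    assert torch_to_numpy_dtype_dict[torch.bfloat16] is np.float32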
23 | 52 |
|
24 | 53 | @dataclass
|
25 | 54 | class RequestData:
|
@@ -224,7 +253,7 @@ def get_external_draft_tokens_config_from_request(request,
|
224 | 253 | draft_logits = get_input_tensor_by_name(request, 'draft_logits',
|
225 | 254 | batch_size, batch_index)
|
226 | 255 | if draft_logits is not None:
|
227 | | - kwargs['logits'] = from_numpy(draft_logits).squeeze() |
| 256 | + kwargs['logits'] = from_numpy(draft_logits).squeeze(dim=0) |
228 | 257 | kwargs['acceptance_threshold'] = get_input_scalar_by_name(
|
229 | 258 | request, 'draft_acceptance_threshold', batch_size, batch_index)
|
230 | 259 | kwargs = {k: v for k, v in kwargs.items() if v is not None}
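
The switch from squeeze() to squeeze(dim=0) matters when a non-batch axis also happens to have size 1: a bare squeeze() drops every size-1 dimension, not just the batch one. A small illustration (shapes are illustrative only):

    import torch

    draft_logits = torch.zeros(1, 1, 32000)                  # (batch, draft_len, vocab), draft_len == 1
    assert draft_logits.squeeze().shape == (32000,)          # old code: draft axis lost too
    assert draft_logits.squeeze(dim=0).shape == (1, 32000)   # new code: only the batch axis removed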
|
@@ -279,6 +308,93 @@ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
|
279 | 308 | return None
|
280 | 309 |
|
281 | 310 |
|
| 311 | +def get_kv_cache_retention_config_from_request(request, |
| 312 | + batch_size=1, |
| 313 | + batch_index=0): |
| 314 | + |
| 315 | + def get_tensor_and_check_length(name: str, expected_length: int): |
| 316 | + tensor = get_input_tensor_by_name(request, name, batch_size, |
| 317 | + batch_index) |
| 318 | + |
| 319 | + if tensor is None: |
| 320 | + raise RuntimeError(f"{name} must be provided.") |
| 321 | + |
| 322 | + tensor = np.squeeze(tensor, axis=0) |
| 323 | + |
| 324 | + if len(tensor) != expected_length: |
| 325 | + raise RuntimeError( |
| 326 | + f"Invalid {name} length. Expected length {expected_length}, got length {len(tensor)}" |
| 327 | + ) |
| 328 | + |
| 329 | + return tensor |
| 330 | + |
| 331 | + token_range_starts = get_input_tensor_by_name( |
| 332 | + request, "retention_token_range_starts", batch_size, batch_index) |
| 333 | + |
| 334 | + if token_range_starts is not None: |
| 335 | + token_range_starts = np.squeeze(token_range_starts, axis=0) |
| 336 | + |
| 337 | + token_range_ends = get_tensor_and_check_length( |
| 338 | + "retention_token_range_ends", len(token_range_starts)) |
| 339 | + token_range_ends = [ |
| 340 | + None if end == -1 else end for end in token_range_ends |
| 341 | + ] |
| 342 | + |
| 343 | + token_range_priorities = get_tensor_and_check_length( |
| 344 | + "retention_token_range_priorities", len(token_range_starts)) |
| 345 | + |
| 346 | + token_range_durations_ms = get_input_tensor_by_name( |
| 347 | + request, "retention_token_range_durations_ms", batch_size, |
| 348 | + batch_index) |
| 349 | + |
| 350 | + if token_range_durations_ms is None: |
| 351 | + token_range_durations_ms = [None] * len(token_range_starts) |
| 352 | + else: |
| 353 | + token_range_durations_ms = np.squeeze(token_range_durations_ms, |
| 354 | + axis=0) |
| 355 | + token_range_durations_ms = [ |
| 356 | + None if duration == -1 else duration |
| 357 | + for duration in token_range_durations_ms |
| 358 | + ] |
| 359 | + |
| 360 | + if len(token_range_durations_ms) != len(token_range_starts): |
| 361 | + raise RuntimeError( |
| 362 | + f"Invalid retention_token_range_durations_ms length. Expected length {len(token_range_starts)}, got length {len(token_range_durations_ms)}" |
| 363 | + ) |
| 364 | + |
| 365 | + ranges = [] |
| 366 | + |
| 367 | + for start, end, priority, duration_ms in zip(token_range_starts, |
| 368 | + token_range_ends, |
| 369 | + token_range_priorities, |
| 370 | + token_range_durations_ms): |
| 371 | + ranges.append( |
| 372 | + trtllm.KvCacheRetentionConfig.TokenRangeRetentionConfig( |
| 373 | + token_start=start, |
| 374 | + token_end=end, |
| 375 | + priority=priority.item(), |
| 376 | + duration_ms=None if duration_ms is None else |
| 377 | + datetime.timedelta(milliseconds=duration_ms.item()))) |
| 378 | + |
| 379 | + decode_args = {} |
| 380 | + |
| 381 | + decode_priority = get_input_scalar_by_name( |
| 382 | + request, "retention_decode_priority", batch_size, batch_index) |
| 383 | + if decode_priority is not None: |
| 384 | + decode_args['decode_retention_priority'] = decode_priority |
| 385 | + |
| 386 | + decode_duration_ms = get_input_scalar_by_name( |
| 387 | + request, "retention_decode_duration_ms", batch_size, batch_index) |
| 388 | + if decode_duration_ms is not None: |
| 389 | + decode_args[ |
| 390 | + 'decode_duration_ms'] = decode_duration_ms if decode_duration_ms != -1 else None |
| 391 | + |
| 392 | + return trtllm.KvCacheRetentionConfig( |
| 393 | + token_range_retention_configs=ranges, **decode_args) |
| 394 | + |
| 395 | + return None |
| 396 | + |
| 397 | + |
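
For reference, a hedged sketch of the per-request inputs this helper expects (tensor names match the reads above; shapes and values are purely illustrative). Each tensor carries a leading batch axis that gets squeezed away, and -1 serves as the "unset" sentinel for ends and durations:

    import numpy as np

    # Two token ranges for one request; the second is open-ended and never expires.
    retention_token_range_starts = np.array([[0, 100]], dtype=np.int32)           # shape (1, num_ranges)
    retention_token_range_ends = np.array([[100, -1]], dtype=np.int32)            # -1 -> None (open-ended)
    retention_token_range_priorities = np.array([[80, 10]], dtype=np.int32)
    retention_token_range_durations_ms = np.array([[30000, -1]], dtype=np.int32)  # -1 -> None (no expiry)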
282 | 398 | def build_1_2_5_buckets(max_value: int) -> List[int]:
|
283 | 399 | """
|
284 | 400 | Builds a list of buckets with increasing powers of 10 multiplied by
|
@@ -375,6 +491,8 @@ def convert_request(request, exclude_input_from_output, decoupled):
|
375 | 491 | request, batch_size, batch_index, input_length)
|
376 | 492 | lora_config = get_lora_config_from_request(request, batch_size,
|
377 | 493 | batch_index)
|
| 494 | + kv_cache_retention_config = get_kv_cache_retention_config_from_request( |
| 495 | + request, batch_size, batch_index) |
378 | 496 |
|
379 | 497 | # Inputs for mllama support
|
380 | 498 | encoder_input_features = get_input_tensor_by_name(
|
@@ -412,11 +530,15 @@ def convert_request(request, exclude_input_from_output, decoupled):
|
412 | 530 | external_draft_tokens_config=external_draft_tokens_config,
|
413 | 531 | prompt_tuning_config=prompt_tuning_config,
|
414 | 532 | lora_config=lora_config,
|
415 | | - )) |
| 533 | + kv_cache_retention_config=kv_cache_retention_config)) |
416 | 534 | return requests
|
417 | 535 |
|
418 | 536 |
|
419 | | -def convert_response(response, batch_index, batch_size, num_return_sequences): |
| 537 | +def convert_response(response, |
| 538 | + batch_index, |
| 539 | + batch_size, |
| 540 | + num_return_sequences, |
| 541 | + expected_logits_dtype=torch.float32): |
420 | 542 |
|
421 | 543 | if response.has_error():
|
422 | 544 | return pb_utils.InferenceResponse(output_tensors=[],
|
@@ -450,18 +572,24 @@ def convert_response(response, batch_index, batch_size, num_return_sequences):
|
450 | 572 | np.expand_dims(np.array(result.log_probs, np.float32), 0)))
|
451 | 573 |
|
452 | 574 | if result.context_logits is not None:
|
| 575 | + assert (result.context_logits.dtype is expected_logits_dtype) |
453 | 576 | output_tensors.append(
|
454 | 577 | pb_utils.Tensor(
|
455 | 578 | "context_logits",
|
456 | | - np.expand_dims(np.array(result.context_logits, np.float32), |
457 | | - 0))) |
| 579 | + np.expand_dims( |
| 580 | + np.array( |
| 581 | + result.context_logits, torch_to_numpy_dtype_dict[ |
| 582 | + result.context_logits.dtype]), 0))) |
458 | 583 |
|
459 | 584 | if result.generation_logits is not None:
|
| 585 | + assert (result.generation_logits.dtype is expected_logits_dtype) |
460 | 586 | output_tensors.append(
|
461 | 587 | pb_utils.Tensor(
|
462 | 588 | "generation_logits",
|
463 | | - np.expand_dims(np.array(result.generation_logits, np.float32), |
464 | | - 0))) |
| 589 | + np.expand_dims( |
| 590 | + np.array( |
| 591 | + result.generation_logits, torch_to_numpy_dtype_dict[ |
| 592 | + result.generation_logits.dtype]), 0))) |
465 | 593 |
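
A minimal sketch of the dtype-preserving output path the new code takes, using float16 as an example (shapes illustrative; assumes a CPU tensor):

    import numpy as np
    import torch

    logits = torch.zeros(4, 32000, dtype=torch.float16)
    np_dtype = torch_to_numpy_dtype_dict[logits.dtype]   # np.float16
    out = np.expand_dims(np.array(logits, np_dtype), 0)  # shape (1, 4, 32000)
    assert out.dtype == np.float16                       # no longer forced to float32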
|
466 | 594 | if batch_size > 1:
|
467 | 595 | output_tensors.append(
|
@@ -555,6 +683,22 @@ def convert_timestamp_to_seconds(timestamp: str):
|
555 | 683 | "%m-%d-%Y %H:%M:%S.%f").timestamp())
|
556 | 684 |
|
557 | 685 |
|
| 686 | +def triton_string_to_torch(dtype): |
| 687 | + type_map = { |
| 688 | + "TYPE_BOOL": torch.bool, |
| 689 | + "TYPE_UINT8": torch.uint8, |
| 690 | + "TYPE_INT8": torch.int8, |
| 691 | + "TYPE_INT16": torch.int16, |
| 692 | + "TYPE_INT32": torch.int32, |
| 693 | + "TYPE_INT64": torch.int64, |
| 694 | + "TYPE_FP16": torch.float16, |
| 695 | + "TYPE_FP32": torch.float32, |
| 696 | + "TYPE_FP64": torch.float64, |
| 697 | + "TYPE_BF16": torch.bfloat16 |
| 698 | + } |
| 699 | + return type_map[dtype] |
| 700 | + |
| 701 | + |
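
Usage is straightforward; any Triton type string outside the table raises a KeyError (torch is already imported at file level):

    assert triton_string_to_torch("TYPE_BF16") is torch.bfloat16
    assert triton_string_to_torch("TYPE_FP32") is torch.float32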
558 | 702 | class TritonPythonModel:
|
559 | 703 | """Your Python model must use the same class name. Every Python model
|
560 | 704 | that is created must have "TritonPythonModel" as the class name.
|
@@ -916,6 +1060,12 @@ def initialize(self, args):
|
916 | 1060 | self.stats_check_period_ms = get_parameter(
|
917 | 1061 | model_config, "stats_check_period_ms", int) or 100
|
918 | 1062 |
|
| 1063 | + self.logits_dtype = None |
| 1064 | + for output in model_config['output']: |
| 1065 | + if output['name'] == 'context_logits' or output[ |
| 1066 | + 'name'] == 'generation_logits': |
| 1067 | + self.logits_dtype = triton_string_to_torch(output['data_type']) |
| 1068 | + |
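
The loop takes the data_type of whichever of context_logits / generation_logits appears last in the output list (in practice the two are expected to agree). A hypothetical model_config fragment it would consume:

    model_config = {
        "output": [
            {"name": "output_ids", "data_type": "TYPE_INT32"},
            {"name": "context_logits", "data_type": "TYPE_FP16"},
            {"name": "generation_logits", "data_type": "TYPE_FP16"},
        ]
    }
    # After the loop: self.logits_dtype == torch.float16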
919 | 1069 | self.create_metrics(args["model_name"],
|
920 | 1070 | args["model_version"],
|
921 | 1071 | is_v1_model=executor_config.batching_type ==
|
@@ -1059,7 +1209,8 @@ def awaiter_loop(self):
|
1059 | 1209 |
|
1060 | 1210 | triton_response, is_final, output_length = convert_response(
|
1061 | 1211 | response, request_data.batch_index,
|
1062 | | - request_data.batch_size, request_data.num_return_sequences) |
| 1212 | + request_data.batch_size, request_data.num_return_sequences, |
| 1213 | + self.logits_dtype) |
1063 | 1214 | with self.lock:
|
1064 | 1215 | self.req_id_to_request_data[
|
1065 | 1216 | req_id].num_output_tokens += output_length
|
|