[feat]: Allow for a settable end-of-sequence/padding token in max throughput benchmark. (NVIDIA#3776)

FrankD412 · nvpohanh · commit e61cac9c2096 · 2025-05-05T01:40:42.000-07:00
* Move world options to a different group for clarity.

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>

* Add eos_id option.

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>

---------

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
@@ -92,6 +92,14 @@
     required=False,
     help="Pass in a dataset file for parsing instead of stdin.",
 )
+@optgroup.option(
+    "--eos_id",
+    type=int,
+    default=-1,
+    required=False,
+    help=
+    "Set the end-of-sequence token for the benchmark. Set to -1 to disable EOS.",
+)
 @optgroup.option(
     "--num_requests",
     type=int,
@@ -106,6 +114,22 @@
     default=2,
     help="Number of requests warm up benchmark.",
 )
+@optgroup.option(
+    "--target_input_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) input length for tuning heuristics.",
+)
+@optgroup.option(
+    "--target_output_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) sequence length for tuning heuristics.",
+)
+@optgroup.group(
+    "World Configuration",
+    help="Options for configuring the backend multi-GPU world.",
+)
 @optgroup.option(
     "--tp",
     type=int,
@@ -124,18 +148,6 @@
     default=None,
     help="expert parallelism size",
 )
-@optgroup.option(
-    "--target_input_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) input length for tuning heuristics.",
-)
-@optgroup.option(
-    "--target_output_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) sequence length for tuning heuristics.",
-)
 @optgroup.group("Request Load Control Options",
                 cls=MutuallyExclusiveOptionGroup,
                 help="Limits how requests are loaded.")
@@ -198,6 +210,7 @@ def throughput_command(
     # Parameters from CLI
     # Model, experiment, and engine params
     dataset_path: Path = params.pop("dataset")
+    eos_id: int = params.pop("eos_id")
     warmup: int = params.get("warmup")
     num_requests: int = params.pop("num_requests")
     max_seq_len: int = params.pop("max_seq_len")
@@ -299,8 +312,8 @@ def throughput_command(
         else:
             llm = LLM(**kwargs)
 
-        sampling_params = SamplingParams(end_id=-1,
-                                         pad_id=-1,
+        sampling_params = SamplingParams(end_id=eos_id,
+                                         pad_id=eos_id,
                                          beam_width=beam_width)
 
         # Perform warmup if requested.