move the rest of the models into examples/models/core directory #3555

Merged
merged 44 commits into main from move_core on Apr 20, 2025
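In short, the example folders touched by this PR move from `examples/<model>` to `examples/models/core/<model>`, and the tests, scripts, and READMEs that referenced the old layout are updated to match. A minimal before/after sketch of what a caller has to change (LLaMA shown; the paths and flag values are illustrative, taken from the diffs below):

```bash
# Before this PR: model examples lived directly under examples/
python examples/llama/convert_checkpoint.py \
    --model_dir /path/to/hf-llama --output_dir /tmp/ckpt --dtype float16

# After this PR: the same script lives under examples/models/core/
python examples/models/core/llama/convert_checkpoint.py \
    --model_dir /path/to/hf-llama --output_dir /tmp/ckpt --dtype float16
```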
Changes from all commits (44 commits):
- 567465e  move rest models to examples/models/core directory (QiJune, Apr 14, 2025)
- 47f9fc7  rebase (QiJune, Apr 15, 2025)
- 0fa4984  rebase (QiJune, Apr 16, 2025)
- e0e485f  update multimodal readme (QiJune, Apr 16, 2025)
- 0b3683d  fix example path (QiJune, Apr 16, 2025)
- d1aaaf0  Merge branch 'main' into move_core (QiJune, Apr 16, 2025)
- 1b8ceae  Merge branch 'main' into move_core (QiJune, Apr 16, 2025)
- 2cbbf8e  rebase (QiJune, Apr 16, 2025)
- fb310ff  rebase (QiJune, Apr 16, 2025)
- 6f392be  fix ci (QiJune, Apr 16, 2025)
- 20f9821  fix ci (QiJune, Apr 16, 2025)
- da263b7  fix cpp test (QiJune, Apr 16, 2025)
- 4dfbb71  fix tensorrt test (QiJune, Apr 16, 2025)
- 5c91809  fix ci (QiJune, Apr 16, 2025)
- f5d4980  rebase (QiJune, Apr 16, 2025)
- 7388df3  fix ci (QiJune, Apr 16, 2025)
- fc6fdab  fix ci (QiJune, Apr 16, 2025)
- 81490b5  fix ci (QiJune, Apr 16, 2025)
- cb80491  fix ci (QiJune, Apr 16, 2025)
- 5bf7e1b  fix ci (QiJune, Apr 16, 2025)
- 01c395a  fix ci (QiJune, Apr 16, 2025)
- 98b843c  fix ci (QiJune, Apr 16, 2025)
- 608ad3d  fix ci (QiJune, Apr 16, 2025)
- 2cdf6ba  fix ci (QiJune, Apr 16, 2025)
- 6e7c230  Merge branch 'main' into move_core (QiJune, Apr 16, 2025)
- 99ab902  fix ci (QiJune, Apr 16, 2025)
- a3366da  rebase (QiJune, Apr 16, 2025)
- 4b4da5f  fix ci (QiJune, Apr 16, 2025)
- e21d560  fix ci (QiJune, Apr 17, 2025)
- 0c882f2  rebase (QiJune, Apr 17, 2025)
- 4f8f3cf  fix ci (QiJune, Apr 17, 2025)
- 93c1de1  rebase (QiJune, Apr 17, 2025)
- f8e165a  fix ci (QiJune, Apr 17, 2025)
- 86f2036  rebase (QiJune, Apr 17, 2025)
- 795373a  rebase (QiJune, Apr 17, 2025)
- e195a05  fix ci (QiJune, Apr 17, 2025)
- c4b63c2  fix ci (QiJune, Apr 17, 2025)
- b2f4dd8  fix ci (QiJune, Apr 17, 2025)
- 310f290  rebase (QiJune, Apr 18, 2025)
- 8e7f7a1  rebase (QiJune, Apr 18, 2025)
- 59afef9  fix ci (QiJune, Apr 18, 2025)
- f671aef  fix ci (QiJune, Apr 19, 2025)
- ac5091d  rebase (QiJune, Apr 19, 2025)
- 58d2134  Merge branch 'main' into move_core (QiJune, Apr 20, 2025)

2 changes: 1 addition & 1 deletion cpp/tests/resources/scripts/build_eagle_engines.py
@@ -31,7 +31,7 @@ def build_engine(base_model_dir: _pl.Path, eagle_model_dir: _pl.Path,
                  engine_dir: _pl.Path, build_base_model: bool, *args):

     if build_base_model:
-        checkpoint_path = "examples/llama/convert_checkpoint.py"
+        checkpoint_path = "examples/models/core/llama/convert_checkpoint.py"
     else:
         checkpoint_path = "examples/eagle/convert_checkpoint.py"

2 changes: 1 addition & 1 deletion cpp/tests/resources/scripts/build_enc_dec_engines.py
@@ -119,7 +119,7 @@ class Convert(RunCMDMixin):
     def command(self):
         args = self.args
         return [
-            f'python examples/enc_dec/convert_checkpoint.py',
+            f'python examples/models/core/enc_dec/convert_checkpoint.py',
             f'--model_type {args.model_type}',
             f'--model_dir {args.hf_models_dir}',
             f'--output_dir {args.trt_models_dir}',
2 changes: 1 addition & 1 deletion cpp/tests/resources/scripts/build_gpt_engines.py
@@ -37,7 +37,7 @@ def convert_ckpt(model_dir: str,
                  world_size: int = 1,
                  dtype: str = 'float16'):
     convert_cmd = [
-        sys.executable, "examples/gpt/convert_checkpoint.py",
+        sys.executable, "examples/models/core/gpt/convert_checkpoint.py",
         f"--model_dir={model_dir}", f"--output_dir={output_dir}",
         f"--dtype={dtype}", f"--tp_size={world_size}"
     ] + list(args)
11 changes: 6 additions & 5 deletions cpp/tests/resources/scripts/build_llama_engines.py
@@ -32,11 +32,12 @@ def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, convert_extra_args,

     ckpt_dir = engine_dir / 'ckpt'

-    convert_cmd = [_sys.executable, "examples/llama/convert_checkpoint.py"
-                   ] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
-                       f'--output_dir={ckpt_dir}',
-                       '--dtype=float16',
-                   ] + convert_extra_args
+    convert_cmd = [
+        _sys.executable, "examples/models/core/llama/convert_checkpoint.py"
+    ] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
+        f'--output_dir={ckpt_dir}',
+        '--dtype=float16',
+    ] + convert_extra_args

     run_command(convert_cmd)

13 changes: 7 additions & 6 deletions cpp/tests/resources/scripts/build_mamba_engines.py
@@ -31,12 +31,13 @@

 def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
                  *args):
-    convert_args = [_sys.executable, "examples/mamba/convert_checkpoint.py"] + (
-        ['--model_dir', str(weight_dir)] if weight_dir else []) + [
-            '--output_dir',
-            str(ckpt_dir),
-            '--dtype=float16',
-        ]
+    convert_args = [
+        _sys.executable, "examples/models/core/mamba/convert_checkpoint.py"
+    ] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
+        '--output_dir',
+        str(ckpt_dir),
+        '--dtype=float16',
+    ]
     run_command(convert_args)
     build_args = ["trtllm-build"] + ['--checkpoint_dir',
                                      str(ckpt_dir)] + [
3 changes: 2 additions & 1 deletion cpp/tests/resources/scripts/build_recurrentgemma_engines.py
@@ -32,7 +32,8 @@
 def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
                  *args):
     convert_args = [
-        _sys.executable, "examples/recurrentgemma/convert_checkpoint.py"
+        _sys.executable,
+        "examples/models/core/recurrentgemma/convert_checkpoint.py"
     ] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
         '--output_dir',
         str(ckpt_dir),
@@ -11,7 +11,7 @@ def command(self):
         for beam in args.beams_tuple:
             ret.append((
                 mpi_run,
-                f'python3 examples/enc_dec/run.py --engine_dir {args.engines_dir}',
+                f'python3 examples/models/core/enc_dec/run.py --engine_dir {args.engines_dir}',
                 f'--engine_name {args.ckpt}',
                 f'--model_name "{args.hf_models_dir}"',
                 f'--max_new_tokens={args.max_new_tokens}',
22 changes: 0 additions & 22 deletions examples/bert/base_benchmark/config.json

This file was deleted.

22 changes: 0 additions & 22 deletions examples/bert/base_with_attention_plugin_benchmark/config.json

This file was deleted.

22 changes: 0 additions & 22 deletions examples/bert/large_benchmark/config.json

This file was deleted.

22 changes: 0 additions & 22 deletions examples/bert/large_with_attention_plugin_benchmark/config.json

This file was deleted.

2 changes: 1 addition & 1 deletion examples/draft_target_model/README.md
@@ -25,7 +25,7 @@ We provide two styles of running DTM now: using TensorRT-LLM-BLS in Triton Infer
 + `--max_batch_size` more than 1 is acceptable in general usage, but we use 1 in this example.

 ```bash
-cd examples/llama
+cd examples/models/core/llama
 export DRAFT_CKPT_PATH=/workspace/ckpt-draft
 export TARGET_CKPT_PATH=/workspace/ckpt-target
 export DRAFT_ENGINE_PATH=/workspace/engine-draft
File renamed without changes.
@@ -4,7 +4,7 @@ This document explains how to build the BERT family, specifically [BERT](https:/

 ## Overview

-The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../tensorrt_llm/models/bert/model.py).
+The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../../../tensorrt_llm/models/bert/model.py).
 The TensorRT-LLM BERT family example code is located in [`examples/bert`](./). There are two main files in that folder:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the BERT model into tensorrt-llm checkpoint format.
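For orientation, the BERT example keeps its usual convert-then-build flow after the move; only its directory changes. A rough sketch, assuming the example lands under `examples/models/core/bert` (per the PR title) and that its `convert_checkpoint.py` accepts the common `--model_dir`/`--output_dir`/`--dtype` options seen in the other examples; check the relocated README for the exact flags:

```bash
# Assumed new location and illustrative flags, not verbatim from the README.
cd examples/models/core/bert
python convert_checkpoint.py --model_dir /path/to/hf-bert \
    --output_dir ./trt_ckpt/bert/fp16 --dtype float16
trtllm-build --checkpoint_dir ./trt_ckpt/bert/fp16 \
    --output_dir ./trt_engines/bert/fp16
```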
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -18,15 +18,15 @@ This document explains how to build the [C4AI Command-R](https://huggingface.co/

 ## Overview

-The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../tensorrt_llm/models/commandr/model.py).
+The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../../../tensorrt_llm/models/commandr/model.py).
 The TensorRT-LLM Command-R example code is located in [`examples/commandr`](./). There is one main file:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.

-In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:

-* [`../run.py`](../run.py) to run the inference on an input text;
-* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+* [`run.py`](../../../run.py) to run the inference on an input text;
+* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.

 ## Support Matrix

@@ -122,23 +122,23 @@ If the engines are built successfully, you will see output like (Command-R as th

 ```bash
 # Run the default engine of Command-R on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu

 # Run the default engine of Command-R on single GPU, using streaming output.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu \
     --streaming

 # Run the default engine of Aya-23-8B on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir aya_23_8B \
     --engine_dir trt_engines/aya_23_8B/fp16/1-gpu

 # Run the default engine of Aya-23-35B on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir aya_23_35B \
     --engine_dir trt_engines/aya_23_35B/fp16/1-gpu
 ```
@@ -148,7 +148,7 @@ python3 ../run.py --max_output_len 50 \
 ```bash
 # Run the Tensor Parallel 4 engine of Command-R+ on 4 GPUs.
 mpirun -n 4 \
-    python ../run.py --max_output_len 50 \
+    python ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_plus \
     --engine_dir trt_engines/command_r_plus/fp16/4-gpu
 ```
@@ -165,7 +165,7 @@ Output [Text 0 Beam 0]: " chef in Paris and worked in the kitchens of the French

 ```bash
 # Run the summarization of Command-R task.
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --hf_model_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu
 ```
@@ -201,7 +201,7 @@ trtllm-build --checkpoint_dir trt_ckpt/command_r_v01/int8_wo/1-gpu \
     --output_dir trt_engines/command_r_v01/int8_wo/1-gpu

 # Run inference.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/int8_wo/1-gpu
 ```
@@ -1,4 +1,4 @@
--c ../constraints.txt
+-c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate
@@ -27,7 +27,7 @@ This document shows how to build and run an Encoder-Decoder (Enc-Dec) model in T

 ## Overview

-The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):
+The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):

 * `trtllm-build` to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Enc-Dec model,
 * [`run.py`](./run.py) to run the inference on an example input text.
@@ -202,7 +202,7 @@ Different types of runtime are provided for encoder-decoder models. Following an
 - Python runtime w/ Static Batching
 - (NEW) C++ runtime w/ Paged KV Cache and Inflight Batching

-Please refer to the documentation for the details of [paged kv cache](../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../docs/source/advanced/gpt-attention.md#inflight-batching).
+Please refer to the documentation for the details of [paged kv cache](../../../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../../../docs/source/advanced/gpt-attention.md#inflight-batching).

 #### Run C++ runtime
 **Note: to use inflight batching and paged kv cache features in C++ runtime, please make sure you have set `--paged_kv_cache enable` (which is by default enabled) in the `trtllm-build` command of the decoder. Meanwhile, if using Python runtime, it is recommended to disable this flag by `--paged_kv_cache disable` to avoid any unnecessary overhead.**
@@ -213,12 +213,12 @@ For good usability, Python binding of the C++ runtime is provided. You can use t

 ```python
 # Inferencing via python binding of C++ runtime with inflight batching (IFB)
-python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
+python3 ../../../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
 ```

 You can specify `--kv_cache_free_gpu_memory_fraction` to control the percentage of free GPU memory to be used by KV cache (by default 0.9), and `--cross_kv_cache_fraction` to control the percentage of KV cache to be used by cross attention (by default 0.5, and rest of the KV cache will be used by self attention).

-For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).
+For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).

 #### Run with Triton Backend
 [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/encoder_decoder.md) contains the tutorial on how to run encoder-decoder engines with Tritonserver.
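The hunk above mentions `--kv_cache_free_gpu_memory_fraction` and `--cross_kv_cache_fraction` but does not show them in a command. A sketch of adding them to the same inflight-batching run, assuming you launch from the repository root so the shared script resolves to `examples/run.py` (the fractions are illustrative values, not recommendations):

```bash
# Same IFB run as in the README, with explicit KV-cache sizing.
python3 examples/run.py \
    --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
    --tokenizer_dir tmp/hf_models/${MODEL_NAME} \
    --max_output_len 64 --num_beams=1 \
    --kv_cache_free_gpu_memory_fraction 0.8 \
    --cross_kv_cache_fraction 0.5 \
    --input_text "translate English to German: The house is wonderful."
```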
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -2,7 +2,7 @@

 This document shows how to build and run a [EXAONE](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) model in TensorRT-LLM.

-The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../tensorrt_llm/models/llama/model.py).
+The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
 See the LLaMA example [`examples/llama`](../llama) for details.

 - [EXAONE](#exaone)
@@ -113,7 +113,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply FP8 quantization.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat fp8 \
@@ -134,7 +134,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply INT8 SmoothQuant.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat int8_sq \
@@ -154,7 +154,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply INT4 AWQ.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat int4_awq \
@@ -173,7 +173,7 @@ Please make sure your system contains a Hopper GPU before trying the commands be

 ```bash
 # Build the EXAONE model using a single GPU and and apply W4A8 AWQ.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat w4a8_awq \
@@ -190,21 +190,21 @@ trtllm-build \
 Test your engine with the [run.py](../run.py) script:

 ```bash
-python3 ../run.py \
+python3 ../../../run.py \
     --input_text "When did the first world war end?" \
     --max_output_len=100 \
     --tokenizer_dir $HF_MODEL_DIR \
     --engine_dir trt_engines/exaone/fp16/1-gpu

 # Run with 2 GPUs
 mpirun -n 2 --allow-run-as-root \
-    python3 ../run.py \
+    python3 ../../../run.py \
         --input_text "When did the first world war end?" \
         --max_output_len=100 \
         --tokenizer_dir $HF_MODEL_DIR \
         --engine_dir trt_engines/exaone/fp16/2-gpu

-python ../summarize.py \
+python ../../../summarize.py \
     --test_trt_llm \
     --data_type fp16 \
     --hf_model_dir $HF_MODEL_DIR \