
Commit 329937a

Update TensorRT-LLM backend (#60)
* Update src
* Update .gitmodules
* Update .pre-commit-config.yaml
* Update submodule
1 parent 06f63fe commit 329937a

12 files changed: +40 −20 lines

Diff for: .gitmodules

+1 −1

@@ -1,3 +1,3 @@
 [submodule "tensorrt_llm"]
     path = tensorrt_llm
-    url = git@github.com:NVIDIA/TensorRT-LLM.git
+    url = https://github.com/NVIDIA/TensorRT-LLM.git
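Since the submodule URL moves from SSH to HTTPS, existing clones keep the old URL in .git/config until it is re-synced. A minimal sketch of the standard git commands for picking up the change (not part of this commit):

```bash
# Copy the updated URL from .gitmodules into .git/config,
# then re-fetch the submodule from the new remote.
git submodule sync --recursive
git submodule update --init --recursive
```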

Diff for: .pre-commit-config.yaml

+6 −0

@@ -40,3 +40,9 @@ repos:
     rev: v0.6.10
     hooks:
       - id: cmake-format
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.4
+    hooks:
+      - id: codespell
+        args:
+          - --skip=".git,tensorrt_llm"
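The new codespell hook can be exercised locally before pushing; a usage sketch, assuming pre-commit itself is already installed (pip install pre-commit):

```bash
# Run only the codespell hook over the whole tree; .git and the
# tensorrt_llm submodule are skipped via the args configured above.
pre-commit run codespell --all-files
```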

Diff for: README.md

+1 −1

@@ -363,7 +363,7 @@ You might have to contact your cluster's administrator to help you customize the
 ### Kill the Triton server
 
 ```bash
-pgrep tritonserver | xargs kill -9
+pkill tritonserver
 ```
 
 ## Testing the TensorRT-LLM Backend
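For context: `pkill` sends SIGTERM by default, giving tritonserver a chance to shut down cleanly, whereas the old `pgrep | xargs kill -9` delivered an uncatchable SIGKILL. A generic escalation sketch (not from the README) for the case where the server ignores SIGTERM:

```bash
# Ask tritonserver to exit gracefully first (SIGTERM is pkill's default) ...
pkill tritonserver
# ... then force-kill anything still running after a short grace period.
sleep 5
pkill -9 tritonserver || true
```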

Diff for: all_models/gpt/tensorrt_llm/1/model.py

+1 −1

@@ -242,7 +242,7 @@ def execute(self, requests):
             # response:
             #
             # pb_utils.InferenceResponse(
-            #     output_tensors=..., TritonError("An error occured"))
+            #     output_tensors=..., TritonError("An error occurred"))
 
             inference_response = pb_utils.InferenceResponse(output_tensors)
         else:

Diff for: dockerfile/Dockerfile.trt_llm_backend

+1 −1

@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev p
 COPY requirements.txt /tmp/
 RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
 
-# Remove prevous TRT installation
+# Remove previous TRT installation
 # We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
 RUN apt-get remove --purge -y tensorrt*
 RUN pip uninstall -y tensorrt

Diff for: inflight_batcher_llm/CMakeLists.txt

+1 −1

@@ -30,7 +30,7 @@ set(TRITON_BUILD
 
 if(TRITON_BUILD)
   set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
-  # Install build time dependencies. This section is excuted during cmake
+  # Install build time dependencies. This section is executed during cmake
   # configure time.
   execute_process(
     COMMAND bash -x ./tools/environment_setup.sh

Diff for: inflight_batcher_llm/src/libtensorrtllm.cc

+6 −4

@@ -858,7 +858,6 @@ class ModelInstanceState
             packed.insert(
                 packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
         }
-        int64_t nWords1 = static_cast<int64_t>(packed.size());
         bcast(packed, 0, COMM_WORLD);
     }
 }
@@ -1128,7 +1127,7 @@ class ModelInstanceState
         TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size");
     }
 
-    std::optional<bool> enableTrtOverlap = std::nullopt;
+    bool enableTrtOverlap = true;
     try
     {
         enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
@@ -1139,8 +1138,11 @@ class ModelInstanceState
         TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
     }
 
-    TrtGptModelOptionalParams optionalParams(
-        maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
+    TrtGptModelOptionalParams optionalParams;
+    optionalParams.maxNumSequences = maxNumSequences;
+    optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
+    optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
+    optionalParams.enableTrtOverlap = enableTrtOverlap;
 
     mBatchManager = std::make_shared<GptManager>(
         mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,

Diff for: scripts/launch_triton_server.py

+19 −7

@@ -1,5 +1,6 @@
 import argparse
 import subprocess
+import sys
 from pathlib import Path
 
 
@@ -9,9 +10,18 @@ def parse_arguments():
                         type=int,
                         default=1,
                         help='world size, only support tensor parallelism now')
-    parser.add_argument('--tritonserver',
-                        type=str,
-                        default='/opt/tritonserver/bin/tritonserver')
+    parser.add_argument(
+        '--tritonserver',
+        type=str,
+        help='path to the tritonserver exe',
+        default='/opt/tritonserver/bin/tritonserver',
+    )
+    parser.add_argument(
+        '--force',
+        '-f',
+        action='store_true',
+        help='launch tritonserver regardless of other instances running')
+
     path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt'
     parser.add_argument('--model_repo', type=str, default=path)
     return parser.parse_args()
@@ -30,13 +40,15 @@ def get_cmd(world_size, tritonserver, model_repo):
 
 if __name__ == '__main__':
     args = parse_arguments()
-    res = subprocess.run(['pgrep', 'tritonserver'],
+    res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
                          capture_output=True,
                          encoding='utf-8')
     if res.stdout:
         pids = res.stdout.replace('\n', ' ').rstrip()
-        raise RuntimeError(
-            f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
-        )
+        msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
+        if args.force:
+            print(msg, file=sys.stderr)
+        else:
+            raise RuntimeError(msg + ' Or use --force.')
     cmd = get_cmd(int(args.world_size), args.tritonserver, args.model_repo)
     subprocess.Popen(cmd)
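
With the new flag, the launcher can proceed even when pgrep still reports a running tritonserver; a usage sketch with illustrative values:

```bash
# Launch one rank against the default model repo, ignoring any
# tritonserver processes that are still listed as running.
python3 scripts/launch_triton_server.py --world_size 1 --force
```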

Diff for: tensorrt_llm

Submodule tensorrt_llm updated 43 files

Diff for: tools/environment_setup.sh

+1 −1

@@ -33,7 +33,7 @@ git lfs install
 
 pip3 install -r requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
 
-# Remove prevous TRT installation
+# Remove previous TRT installation
 apt-get remove --purge -y tensorrt* libnvinfer*
 pip uninstall -y tensorrt
Diff for: tools/fill_template.py

+1 −1

@@ -27,7 +27,7 @@ def main(file_path, substitutions, in_place):
     parser.add_argument(
         "substitutions",
         help=
-        "substitions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
+        "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
     )
     parser.add_argument("--in_place",
                         "-i",

Diff for: tools/gen_trtllm_dockerfile.py

+1 −1

@@ -33,7 +33,7 @@
 
 def install_new_version_of_TRT(clone_repo=False, trtllm_be_repo_tag="main"):
     df = """
-# Remove prevous TRT installation
+# Remove previous TRT installation
 RUN apt-get remove --purge -y tensorrt* libnvinfer*
 RUN pip uninstall -y tensorrt