
Commit 8ffb174

Update TensorRT-LLM backend (triton-inference-server#602)
1 parent f738432 commit 8ffb174


12 files changed: +77, -17 lines


all_models/gpt/postprocessing/config.pbtxt

+1

@@ -1,6 +1,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: 1024
+dynamic_batching {}
 input [
   {
     name: "TOKENS_BATCH"

all_models/inflight_batcher_llm/postprocessing/config.pbtxt

+1

@@ -27,6 +27,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: ${triton_max_batch_size}
+dynamic_batching {}
 input [
   {
     name: "TOKENS_BATCH"

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

+13, -7

@@ -175,18 +175,14 @@ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
     return trtllm.SamplingConfig(**kwargs)


-def get_output_config_from_request(request,
-                                   exclude_input_from_output,
-                                   batch_size=1,
-                                   batch_index=0):
+def get_output_config_from_request(request, batch_size=1, batch_index=0):
     kwargs = {}
     kwargs["return_log_probs"] = get_input_scalar_by_name(
         request, 'return_log_probs', batch_size, batch_index)
     kwargs["return_context_logits"] = get_input_scalar_by_name(
         request, 'return_context_logits', batch_size, batch_index)
     kwargs["return_generation_logits"] = get_input_scalar_by_name(
         request, 'return_generation_logits', batch_size, batch_index)
-    kwargs["exclude_input_from_output"] = exclude_input_from_output
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
     return trtllm.OutputConfig(**kwargs)


@@ -312,8 +308,18 @@ def convert_request(request, exclude_input_from_output, decoupled):

         sampling_config = get_sampling_config_from_request(
             request, batch_size, batch_index)
-        output_config = get_output_config_from_request(
-            request, exclude_input_from_output, batch_size, batch_index)
+        output_config = get_output_config_from_request(request, batch_size,
+                                                       batch_index)
+        req_exclude_input_from_output = get_input_scalar_by_name(
+            request, 'exclude_input_in_output', batch_size, batch_index)
+        if req_exclude_input_from_output is None:
+            # If the request doesn't specify exclude_input_from_output, fall back to the backend parameter
+            output_config.exclude_input_from_output = (
+                exclude_input_from_output
+                if exclude_input_from_output is not None else False)
+        else:
+            output_config.exclude_input_from_output = req_exclude_input_from_output
+
         external_draft_tokens_config = get_external_draft_tokens_config_from_request(
             request, batch_size, batch_index)
         prompt_tuning_config = get_prompt_tuning_config_from_request(
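
In effect, exclude_input_from_output is now resolved per request: a value supplied in the request's exclude_input_in_output tensor takes precedence, otherwise the model-level parameter applies, and False is the final fallback. A minimal standalone sketch of that resolution order (the helper name below is illustrative, not part of the commit):

def resolve_exclude_input_from_output(request_value, parameter_value):
    # Precedence mirrored from the hunk above:
    # 1. per-request 'exclude_input_in_output' input, if present
    # 2. the backend parameter, if configured
    # 3. False otherwise
    if request_value is not None:
        return request_value
    return parameter_value if parameter_value is not None else False

# Request value overrides the parameter:
assert resolve_exclude_input_from_output(True, False) is True
# No request value, so the parameter applies:
assert resolve_exclude_input_from_output(None, True) is True
# Neither is set, so the default is False:
assert resolve_exclude_input_from_output(None, None) is False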

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

+7

@@ -253,6 +253,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "exclude_input_in_output"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "stop"
     data_type: TYPE_BOOL
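
Because the new exclude_input_in_output input is optional (dims [ 1 ], reshaped to a scalar), a client only sends it when it wants to override the backend parameter for that request. A minimal sketch with the Triton gRPC client, assuming a server at localhost:8001 and a model named tensorrt_llm; only the new tensor is shown, and a real request also needs input_ids, request_output_len, and the other required inputs:

import numpy as np
import tritonclient.grpc as grpcclient

# Shape [1, 1]: a batch of one request, dims [ 1 ] from the config above.
exclude_input = np.array([[True]], dtype=bool)

tensor = grpcclient.InferInput("exclude_input_in_output", [1, 1], "BOOL")
tensor.set_data_from_numpy(exclude_input)

client = grpcclient.InferenceServerClient("localhost:8001")
# result = client.infer("tensorrt_llm", inputs=[tensor, ...])  # plus the mandatory inputs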

build.sh

+1

@@ -42,6 +42,7 @@ PYTHON_BACKEND_REPO_TAG=${PYTHON_BACKEND_REPO_TAG:-r24.08}
     --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
     --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
     --backend=ensemble --enable-gpu --no-container-pull \
+    --repoagent=checksum --cache=local --cache=redis \
     --image=base,${TRTLLM_BASE_IMAGE} \
     --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
     --backend=python:${PYTHON_BACKEND_REPO_TAG}

dockerfile/Dockerfile.triton.trt_llm_backend

+20, -2

@@ -8,6 +8,20 @@ ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-l
 FROM ${PYTORCH_IMAGE} as pytorch_image
 FROM ${BASE_IMAGE} as install_dependencies

+ARG CCACHE_REMOTE_STORAGE
+ARG CCACHE_URL
+ENV CCACHE_DEBUG=1
+
+RUN if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+        curl -k -L ${CCACHE_URL} -o ccache.tar.gz ; \
+        tar -xzf ccache.tar.gz -C /usr/local --strip-components=1 ; \
+        rm ccache.tar.gz ; \
+        ccache --set-config=remote_only=true ; \
+        ccache --set-config=remote_storage=${CCACHE_REMOTE_STORAGE} ; \
+        ccache --set-config=log_file=/tmp/ccache.log ; \
+        ccache -p ; \
+    fi
+
 # Copy PyTorch package from PyTorch image
 COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torch /usr/local/lib/python3.10/dist-packages/torch

@@ -20,7 +34,6 @@ RUN apt-get update -q=2 && \
     apt-get install -y --no-install-recommends \
         python3-dev \
         python3-pip \
-        ccache \
         git-lfs && \
     # Remove previous TRT installation
     apt-get remove -y tensorrt* libnvinfer* && \

@@ -76,7 +89,12 @@ RUN pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 cmake==3.30.2

 COPY scripts scripts
 COPY tensorrt_llm tensorrt_llm
-RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean
+RUN cd tensorrt_llm && \
+    if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+        python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --use_ccache ; \
+    else \
+        python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean ; \
+    fi

 # Final stage to build the TRT-LLM container
 FROM ${BASE_IMAGE} as final_stage

inflight_batcher_llm/client/inflight_batcher_llm_client.py

+12, -2

@@ -123,7 +123,8 @@ def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data,
                    lora_weights_data, lora_config_data, return_log_probs_data,
                    top_k_data, top_p_data, draft_ids_data,
                    return_context_logits_data, return_generation_logits_data,
-                   decoder_input_ids_data, prompt_table_extra_id_data):
+                   decoder_input_ids_data, prompt_table_extra_id_data,
+                   exclude_input_in_output):
     inputs = [
         prepare_tensor("input_ids", input_ids_data),
         prepare_tensor("input_lengths", input_lengths_data),

@@ -185,6 +186,10 @@
             prepare_tensor("prompt_table_extra_ids",
                            prompt_table_extra_id_data),
         ]
+    if exclude_input_in_output is not None:
+        inputs += [
+            prepare_tensor("exclude_input_in_output", exclude_input_in_output),
+        ]
     return inputs


@@ -665,6 +670,11 @@ def callback(user_data, result, error):
     if decoder_input_ids is not None:
         decoder_input_ids_data = np.array(decoder_input_ids, dtype=np.int32)

+    exclude_input_in_output = None
+    if FLAGS.exclude_input_in_output:
+        exclude_input_in_output = np.array([[FLAGS.exclude_input_in_output]],
+                                           dtype=bool)
+
     if not FLAGS.vocab_size and tokenizer:
         FLAGS.vocab_size = tokenizer.vocab_size
     prompt_table_extra_id_data = None

@@ -690,7 +700,7 @@
         lora_config_data, return_log_probs_data, top_k_data, top_p_data,
         draft_ids_data, return_context_logits_data,
         return_generation_logits_data, decoder_input_ids_data,
-        prompt_table_extra_id_data)
+        prompt_table_extra_id_data, exclude_input_in_output)

     if FLAGS.requested_outputs:
         # Must have at least output_ids in requested outputs
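
The hunks above read FLAGS.exclude_input_in_output, so the client script also needs a matching command-line option; that argparse change is outside the hunks shown here. A hedged sketch of what such a flag could look like (the flag name and help text are assumptions, not quoted from the commit):

import argparse

parser = argparse.ArgumentParser()
# Assumed flag definition; the actual option added by the commit may differ.
parser.add_argument("--exclude-input-in-output",
                    action="store_true",
                    default=False,
                    help="Ask the backend to strip the prompt tokens from "
                         "output_ids for this request")
FLAGS = parser.parse_args([])  # pass ["--exclude-input-in-output"] to enable it
print(FLAGS.exclude_input_in_output)  # False unless the flag is given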

inflight_batcher_llm/src/model_instance_state.cc

+4, -1

@@ -211,6 +211,9 @@ executor::ParallelConfig ModelInstanceState::getParallelConfigFromParams()
     if (useOrchestratorMode && std::atoi(useOrchestratorMode) != 0)
     {
         parallelConfig.setCommunicationMode(executor::CommunicationMode::kORCHESTRATOR);
+
+        tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE);
+
         auto const workerExecutablePath = model_state_->GetExecutorWorkerPath();
         auto const spawnProcessesEnvVar = std::getenv("TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES");
         auto const spawnProcesses = !spawnProcessesEnvVar || std::atoi(spawnProcessesEnvVar);

@@ -978,7 +981,7 @@ std::tuple<TRITONBACKEND_Response*, bool, TRITONSERVER_Error*> ModelInstanceStat
     {
         size_t contextPhaseParamsSize
             = executor::Serialization::serializedSize(response.getResult().contextPhaseParams.value());
-        std::vector<int64_t> contextPhaseParamsShape{1, contextPhaseParamsSize};
+        std::vector<int64_t> contextPhaseParamsShape{1, static_cast<int64_t>(contextPhaseParamsSize)};
         TRITONSERVER_DataType contextPhaseParamsType = TRITONSERVER_TYPE_UINT8;
         auto contextPhaseParamsBuffer = utils::getResponseBuffer<uint8_t>(tritonResponse,
             contextPhaseParamsShape, contextPhaseParamsType, OutputFieldsNames::contextPhaseParams);

inflight_batcher_llm/src/utils.cc

+15, -3

@@ -535,7 +535,6 @@ executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTens
     bool returnContextLogits{false};
     extractSingleton<bool>(inputsTensors, InputFieldsNames::returnContextLogits, returnContextLogits);

-    // Note that currently excludeInputFromOutput is set from the backend parameters.
     return executor::OutputConfig(returnLogProbs, returnContextLogits, returnGenerationLogits);
 }

@@ -628,7 +627,7 @@ std::optional<executor::LoraConfig> getLoraConfigFromTensors(InputTensors const&
 }

 std::vector<executor::Request> createRequestsFromInputTensors(std::vector<InputTensors> const& inputsTensors,
-    bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
+    bool paramExcludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
     executor::RequestType requestType)
 {
     if (!isDecoupled && inputsTensors.size() > 1)

@@ -644,7 +643,20 @@
     for (auto const& inputTensors : inputsTensors)
     {
         executor::OutputConfig outConfig = utils::getOutputConfigFromTensors(inputTensors);
-        outConfig.excludeInputFromOutput = excludeInputFromOutput;
+
+        std::optional<bool> reqExcludeInputFromOutput{std::nullopt};
+        extractOptionalSingleton<bool>(
+            inputTensors, InputFieldsNames::excludeInputFromOutput, reqExcludeInputFromOutput);
+
+        // If specified in request, set from request
+        if (reqExcludeInputFromOutput != std::nullopt)
+        {
+            outConfig.excludeInputFromOutput = reqExcludeInputFromOutput.value();
+        }
+        else // Set from parameter
+        {
+            outConfig.excludeInputFromOutput = paramExcludeInputFromOutput;
+        }

         executor::VecTokens inputTokens;
         if (!utils::extractVector<int32_t>(inputTensors, InputFieldsNames::inputTokens, inputTokens))

inflight_batcher_llm/src/utils.h

+1

@@ -62,6 +62,7 @@ struct InputFieldsNames
     static constexpr char const* returnLogProbs = "return_log_probs";
     static constexpr char const* returnGenerationLogits = "return_generation_logits";
    static constexpr char const* returnContextLogits = "return_context_logits";
+    static constexpr char const* excludeInputFromOutput = "exclude_input_in_output";

     // SamplingConfig
     static constexpr char const* beamWidth = "beam_width";

tensorrt_llm

Submodule tensorrt_llm updated 98 files

tools/version.txt

+1, -1

@@ -1 +1 @@
-9f42c546baf991a2e69cb605595b6484d5388709
+7a776af38eccd9c94ccc23ff069959f2f629745e
