
Update TensorRT-LLM backend #602


Merged 2 commits on Sep 24, 2024
1 change: 1 addition & 0 deletions all_models/gpt/postprocessing/config.pbtxt
@@ -1,6 +1,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: 1024
+dynamic_batching {}
 input [
     {
         name: "TOKENS_BATCH"

@@ -27,6 +27,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: ${triton_max_batch_size}
+dynamic_batching {}
 input [
     {
         name: "TOKENS_BATCH"
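An empty `dynamic_batching {}` block turns on Triton's dynamic batcher with default settings, so concurrently arriving postprocessing requests can be coalesced server-side into batches up to `max_batch_size`. A rough client-side sketch of the access pattern that benefits, assuming a running server; any additional required inputs of the postprocessing model are elided and the token values are arbitrary:

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient

# With dynamic_batching enabled, requests issued concurrently can be
# coalesced server-side up to max_batch_size.
client = grpcclient.InferenceServerClient(url="localhost:8001")
results = queue.Queue()


def on_done(result, error):  # invoked once per request
    results.put(error if error else result)


# TOKENS_BATCH shape assumed as [batch=1, beams=1, num_tokens].
for tokens in ([101, 2023], [101, 2003], [101, 1996]):
    data = np.array([[tokens]], dtype=np.int32)
    inp = grpcclient.InferInput("TOKENS_BATCH", list(data.shape), "INT32")
    inp.set_data_from_numpy(data)
    client.async_infer("postprocessing", [inp], callback=on_done)
```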
20 changes: 13 additions & 7 deletions all_models/inflight_batcher_llm/tensorrt_llm/1/model.py
@@ -175,18 +175,14 @@ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
     return trtllm.SamplingConfig(**kwargs)


-def get_output_config_from_request(request,
-                                   exclude_input_from_output,
-                                   batch_size=1,
-                                   batch_index=0):
+def get_output_config_from_request(request, batch_size=1, batch_index=0):
     kwargs = {}
     kwargs["return_log_probs"] = get_input_scalar_by_name(
         request, 'return_log_probs', batch_size, batch_index)
     kwargs["return_context_logits"] = get_input_scalar_by_name(
         request, 'return_context_logits', batch_size, batch_index)
     kwargs["return_generation_logits"] = get_input_scalar_by_name(
         request, 'return_generation_logits', batch_size, batch_index)
-    kwargs["exclude_input_from_output"] = exclude_input_from_output
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
     return trtllm.OutputConfig(**kwargs)
@@ -312,8 +308,18 @@ def convert_request(request, exclude_input_from_output, decoupled):
         sampling_config = get_sampling_config_from_request(
             request, batch_size, batch_index)
-        output_config = get_output_config_from_request(
-            request, exclude_input_from_output, batch_size, batch_index)
+        output_config = get_output_config_from_request(request, batch_size,
+                                                       batch_index)
+        req_exclude_input_from_output = get_input_scalar_by_name(
+            request, 'exclude_input_in_output', batch_size, batch_index)
+        if req_exclude_input_from_output is None:
+            # If the request doesn't specify exclude_input_in_output, fall
+            # back to the exclude_input_from_output backend parameter
+            output_config.exclude_input_from_output = (
+                exclude_input_from_output
+                if exclude_input_from_output is not None else False)
+        else:
+            output_config.exclude_input_from_output = req_exclude_input_from_output

         external_draft_tokens_config = get_external_draft_tokens_config_from_request(
             request, batch_size, batch_index)
         prompt_tuning_config = get_prompt_tuning_config_from_request(
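Net effect of the model.py change: `exclude_input_from_output` is no longer baked into the `OutputConfig` from the backend parameter alone; a per-request `exclude_input_in_output` tensor takes precedence, and the parameter is only a fallback. A minimal standalone sketch of the resolution order; `request_value` and `param_value` are hypothetical stand-ins, not names from the PR:

```python
from typing import Optional


def resolve_exclude_input(request_value: Optional[bool],
                          param_value: Optional[bool]) -> bool:
    """Precedence: per-request tensor, then backend parameter, then False."""
    if request_value is not None:  # request explicitly set the flag
        return request_value
    if param_value is not None:  # fall back to the model config parameter
        return param_value
    return False  # default: prompt tokens stay in the output


assert resolve_exclude_input(True, False) is True   # request wins
assert resolve_exclude_input(None, True) is True    # parameter fallback
assert resolve_exclude_input(None, None) is False   # default
```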
7 changes: 7 additions & 0 deletions all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -253,6 +253,13 @@ input [
         reshape: { shape: [ ] }
         optional: true
     },
+    {
+        name: "exclude_input_in_output"
+        data_type: TYPE_BOOL
+        dims: [ 1 ]
+        reshape: { shape: [ ] }
+        optional: true
+    },
     {
         name: "stop"
         data_type: TYPE_BOOL
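With the optional input declared in config.pbtxt, a client can set the flag per request. A minimal sketch using the tritonclient gRPC API; it assumes a server at localhost:8001 serving the `tensorrt_llm` model and elides the other required tensors such as `input_ids`:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Optional BOOL input: shape [1, 1] is a batch of one times dims [ 1 ];
# reshape { shape: [ ] } squeezes it to a scalar per request.
exclude = grpcclient.InferInput("exclude_input_in_output", [1, 1], "BOOL")
exclude.set_data_from_numpy(np.array([[True]], dtype=bool))

# Append `exclude` to the usual required tensors, then call
# client.infer(model_name="tensorrt_llm", inputs=inputs).
```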
1 change: 1 addition & 0 deletions build.sh
@@ -42,6 +42,7 @@ PYTHON_BACKEND_REPO_TAG=${PYTHON_BACKEND_REPO_TAG:-r24.08}
     --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
     --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
     --backend=ensemble --enable-gpu --no-container-pull \
+    --repoagent=checksum --cache=local --cache=redis \
     --image=base,${TRTLLM_BASE_IMAGE} \
     --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
     --backend=python:${PYTHON_BACKEND_REPO_TAG}
22 changes: 20 additions & 2 deletions dockerfile/Dockerfile.triton.trt_llm_backend
@@ -8,6 +8,20 @@ ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-l
 FROM ${PYTORCH_IMAGE} as pytorch_image
 FROM ${BASE_IMAGE} as install_dependencies

+ARG CCACHE_REMOTE_STORAGE
+ARG CCACHE_URL
+ENV CCACHE_DEBUG=1
+
+RUN if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+      curl -k -L ${CCACHE_URL} -o ccache.tar.gz ; \
+      tar -xzf ccache.tar.gz -C /usr/local --strip-components=1 ; \
+      rm ccache.tar.gz ; \
+      ccache --set-config=remote_only=true ; \
+      ccache --set-config=remote_storage=${CCACHE_REMOTE_STORAGE} ; \
+      ccache --set-config=log_file=/tmp/ccache.log ; \
+      ccache -p ; \
+    fi
+
 # Copy PyTorch package from PyTorch image
 COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torch /usr/local/lib/python3.10/dist-packages/torch
@@ -20,7 +34,6 @@ RUN apt-get update -q=2 && \
     apt-get install -y --no-install-recommends \
         python3-dev \
         python3-pip \
-        ccache \
         git-lfs && \
     # Remove previous TRT installation
     apt-get remove -y tensorrt* libnvinfer* && \
@@ -76,7 +89,12 @@ RUN pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 cmake==3.30.2

 COPY scripts scripts
 COPY tensorrt_llm tensorrt_llm
-RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean
+RUN cd tensorrt_llm && \
+    if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+      python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --use_ccache ; \
+    else \
+      python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean ; \
+    fi

 # Final stage to build the TRT-LLM container
 FROM ${BASE_IMAGE} as final_stage
14 changes: 12 additions & 2 deletions inflight_batcher_llm/client/inflight_batcher_llm_client.py
@@ -123,7 +123,8 @@ def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data,
                    lora_weights_data, lora_config_data, return_log_probs_data,
                    top_k_data, top_p_data, draft_ids_data,
                    return_context_logits_data, return_generation_logits_data,
-                   decoder_input_ids_data, prompt_table_extra_id_data):
+                   decoder_input_ids_data, prompt_table_extra_id_data,
+                   exclude_input_in_output):
     inputs = [
         prepare_tensor("input_ids", input_ids_data),
         prepare_tensor("input_lengths", input_lengths_data),
@@ -185,6 +186,10 @@
             prepare_tensor("prompt_table_extra_ids",
                            prompt_table_extra_id_data),
         ]
+    if exclude_input_in_output is not None:
+        inputs += [
+            prepare_tensor("exclude_input_in_output", exclude_input_in_output),
+        ]
     return inputs


@@ -665,6 +670,11 @@ def callback(user_data, result, error):
     if decoder_input_ids is not None:
         decoder_input_ids_data = np.array(decoder_input_ids, dtype=np.int32)

+    exclude_input_in_output = None
+    if FLAGS.exclude_input_in_output:
+        exclude_input_in_output = np.array([[FLAGS.exclude_input_in_output]],
+                                           dtype=bool)
+
     if not FLAGS.vocab_size and tokenizer:
         FLAGS.vocab_size = tokenizer.vocab_size
     prompt_table_extra_id_data = None
@@ -690,7 +700,7 @@ def callback(user_data, result, error):
         lora_config_data, return_log_probs_data, top_k_data, top_p_data,
         draft_ids_data, return_context_logits_data,
         return_generation_logits_data, decoder_input_ids_data,
-        prompt_table_extra_id_data)
+        prompt_table_extra_id_data, exclude_input_in_output)

     if FLAGS.requested_outputs:
         # Must have at least output_ids in requested outputs
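Note that the client only builds the tensor when the flag is set, so an unset flag sends nothing and the server's `exclude_input_from_output` parameter applies. The flag definition itself is outside this diff; the sketch below assumes a conventional argparse store_true flag and shows the flag-to-tensor mapping:

```python
import argparse

import numpy as np

# Hypothetical reconstruction of the flag; its definition is not in this diff.
parser = argparse.ArgumentParser()
parser.add_argument("--exclude-input-in-output",
                    action="store_true",
                    help="Strip prompt tokens from output_ids")
FLAGS = parser.parse_args(["--exclude-input-in-output"])

exclude_input_in_output = None  # None: tensor omitted, server parameter applies
if FLAGS.exclude_input_in_output:
    exclude_input_in_output = np.array([[FLAGS.exclude_input_in_output]],
                                       dtype=bool)

print(exclude_input_in_output.shape)  # (1, 1)
```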
5 changes: 4 additions & 1 deletion inflight_batcher_llm/src/model_instance_state.cc
@@ -211,6 +211,9 @@ executor::ParallelConfig ModelInstanceState::getParallelConfigFromParams()
     if (useOrchestratorMode && std::atoi(useOrchestratorMode) != 0)
     {
         parallelConfig.setCommunicationMode(executor::CommunicationMode::kORCHESTRATOR);
+
+        tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE);
+
         auto const workerExecutablePath = model_state_->GetExecutorWorkerPath();
         auto const spawnProcessesEnvVar = std::getenv("TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES");
         auto const spawnProcesses = !spawnProcessesEnvVar || std::atoi(spawnProcessesEnvVar);
@@ -978,7 +981,7 @@ std::tuple<TRITONBACKEND_Response*, bool, TRITONSERVER_Error*> ModelInstanceStat
     {
         size_t contextPhaseParamsSize
             = executor::Serialization::serializedSize(response.getResult().contextPhaseParams.value());
-        std::vector<int64_t> contextPhaseParamsShape{1, contextPhaseParamsSize};
+        std::vector<int64_t> contextPhaseParamsShape{1, static_cast<int64_t>(contextPhaseParamsSize)};
         TRITONSERVER_DataType contextPhaseParamsType = TRITONSERVER_TYPE_UINT8;
         auto contextPhaseParamsBuffer = utils::getResponseBuffer<uint8_t>(tritonResponse,
             contextPhaseParamsShape, contextPhaseParamsType, OutputFieldsNames::contextPhaseParams);
18 changes: 15 additions & 3 deletions inflight_batcher_llm/src/utils.cc
@@ -535,7 +535,6 @@ executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTens
     bool returnContextLogits{false};
     extractSingleton<bool>(inputsTensors, InputFieldsNames::returnContextLogits, returnContextLogits);

-    // Note that currently excludeInputFromOutput is set from the backend parameters.
     return executor::OutputConfig(returnLogProbs, returnContextLogits, returnGenerationLogits);
 }

@@ -628,7 +627,7 @@ std::optional<executor::LoraConfig> getLoraConfigFromTensors(InputTensors const&
 }

 std::vector<executor::Request> createRequestsFromInputTensors(std::vector<InputTensors> const& inputsTensors,
-    bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
+    bool paramExcludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
     executor::RequestType requestType)
 {
     if (!isDecoupled && inputsTensors.size() > 1)
@@ -644,7 +643,20 @@
     for (auto const& inputTensors : inputsTensors)
     {
         executor::OutputConfig outConfig = utils::getOutputConfigFromTensors(inputTensors);
-        outConfig.excludeInputFromOutput = excludeInputFromOutput;
+
+        std::optional<bool> reqExcludeInputFromOutput{std::nullopt};
+        extractOptionalSingleton<bool>(
+            inputTensors, InputFieldsNames::excludeInputFromOutput, reqExcludeInputFromOutput);
+
+        // If specified in the request, set from the request
+        if (reqExcludeInputFromOutput != std::nullopt)
+        {
+            outConfig.excludeInputFromOutput = reqExcludeInputFromOutput.value();
+        }
+        else // Set from the parameter
+        {
+            outConfig.excludeInputFromOutput = paramExcludeInputFromOutput;
+        }

         executor::VecTokens inputTokens;
         if (!utils::extractVector<int32_t>(inputTensors, InputFieldsNames::inputTokens, inputTokens))
1 change: 1 addition & 0 deletions inflight_batcher_llm/src/utils.h
@@ -62,6 +62,7 @@ struct InputFieldsNames
     static constexpr char const* returnLogProbs = "return_log_probs";
     static constexpr char const* returnGenerationLogits = "return_generation_logits";
     static constexpr char const* returnContextLogits = "return_context_logits";
+    static constexpr char const* excludeInputFromOutput = "exclude_input_in_output";

     // SamplingConfig
     static constexpr char const* beamWidth = "beam_width";
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 98 files
+12 −6 README.md
+30 −33 cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt
+21 −22 cpp/tensorrt_llm/common/mpiUtils.cpp
+2 −2 cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a
+2 −2 cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a
+2 −2 cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib
+2 −2 cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt
+1 −1 ...rt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt
+1 −1 ...rrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt
+1 −1 ...rMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll
+1 −1 ...rMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib
+3 −3 ..._llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt
+1 −1 ...rt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
+1 −1 ...nels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt
+1 −1 ...rrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
+1 −1 ...rnels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt
+2 −2 ...t_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib
+2 −2 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt
+82 −6 cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h
+7 −7 cpp/tensorrt_llm/pybind/bindings.cpp
+1 −1 cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp
+1 −1 cpp/tensorrt_llm/runtime/medusaModule.cpp
+8 −0 cpp/tests/CMakeLists.txt
+114 −297 cpp/tests/kernels/mixtureOfExpertsTest.cu
+42 −0 cpp/tests/resources/scripts/case_report_wrapper.py
+51 −1 cpp/tests/resources/scripts/test_cpp.py
+0 −0 docs/source/advanced/executor.md
+2 −0 docs/source/advanced/kv-cache-reuse.md
+9 −7 docs/source/advanced/speculative-decoding.md
+2 −2 docs/source/architecture/core-concepts.md
+3 −0 docs/source/index.rst
+2 −2 docs/source/release-notes.md
+1 −1 examples/baichuan/requirements.txt
+1 −1 examples/bloom/requirements.txt
+1 −1 examples/chatglm/requirements.txt
+1 −1 examples/dbrx/requirements.txt
+77 −0 examples/deepseek_v1/README.md
+14 −0 examples/deepseek_v1/__init__.py
+215 −0 examples/deepseek_v1/convert_checkpoint.py
+5 −0 examples/deepseek_v1/requirements.txt
+1 −1 examples/falcon/requirements.txt
+1 −1 examples/gemma/requirements.txt
+1 −1 examples/gpt/requirements.txt
+1 −1 examples/gptj/requirements.txt
+1 −1 examples/gptneox/requirements.txt
+1 −1 examples/grok/requirements.txt
+1 −1 examples/internlm/requirements.txt
+1 −1 examples/jais/requirements.txt
+1 −1 examples/llama/requirements.txt
+1 −1 examples/llm-api/requirements.txt
+1 −1 examples/mamba/requirements.txt
+1 −1 examples/medusa/requirements.txt
+1 −1 examples/mixtral/requirements.txt
+1 −1 examples/mpt/requirements.txt
+1 −1 examples/nemotron/requirements.txt
+1 −1 examples/opt/requirements.txt
+1 −1 examples/phi/requirements.txt
+1 −1 examples/quantization/requirements.txt
+1 −1 examples/qwen/requirements.txt
+1 −1 examples/qwenvl/requirements.txt
+1 −1 examples/recurrentgemma/requirements.txt
+1 −1 examples/redrafter/requirements.txt
+1 −1 examples/skywork/requirements.txt
+1 −1 examples/smaug/requirements.txt
+1 −1 examples/whisper/requirements.txt
+1 −0 requirements-dev.txt
+1 −1 tensorrt_llm/commands/build.py
+18 −18 tensorrt_llm/functional.py
+2 −1 tensorrt_llm/layers/__init__.py
+1 −6 tensorrt_llm/layers/embedding.py
+51 −0 tensorrt_llm/layers/moe.py
+3 −0 tensorrt_llm/models/__init__.py
+14 −0 tensorrt_llm/models/deepseek_v1/__init__.py
+361 −0 tensorrt_llm/models/deepseek_v1/convert.py
+257 −0 tensorrt_llm/models/deepseek_v1/model.py
+1 −2 tensorrt_llm/models/llama/model.py
+41 −8 tensorrt_llm/models/model_weights_loader.py
+3 −0 tensorrt_llm/models/modeling_utils.py
+8 −1 tensorrt_llm/models/qwen/model.py
+61 −0 tensorrt_llm/module.py
+1 −1 tensorrt_llm/version.py
+1 −1 tests/bindings/test_bindings_ut.py
+106 −0 tests/conftest.py
+2 −0 tests/test_module.py
2 changes: 1 addition & 1 deletion tools/version.txt
@@ -1 +1 @@
-9f42c546baf991a2e69cb605595b6484d5388709
+7a776af38eccd9c94ccc23ff069959f2f629745e