
Update TensorRT-LLM backend #602


Merged 2 commits on Sep 24, 2024
1 change: 1 addition & 0 deletions all_models/gpt/postprocessing/config.pbtxt
@@ -1,6 +1,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: 1024
+dynamic_batching {}
 input [
     {
         name: "TOKENS_BATCH"

@@ -27,6 +27,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: ${triton_max_batch_size}
+dynamic_batching {}
 input [
     {
         name: "TOKENS_BATCH"
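An empty `dynamic_batching {}` block turns on Triton's dynamic batcher with default settings, so concurrently arriving postprocessing requests can be coalesced server-side into batches up to `max_batch_size`. A rough client-side sketch of the access pattern that benefits, assuming a running server; any additional required inputs of the postprocessing model are elided and the token values are arbitrary:

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient

# With dynamic_batching enabled, requests issued concurrently can be
# coalesced server-side up to max_batch_size.
client = grpcclient.InferenceServerClient(url="localhost:8001")
results = queue.Queue()


def on_done(result, error):  # invoked once per request
    results.put(error if error else result)


# TOKENS_BATCH shape assumed as [batch=1, beams=1, num_tokens].
for tokens in ([101, 2023], [101, 2003], [101, 1996]):
    data = np.array([[tokens]], dtype=np.int32)
    inp = grpcclient.InferInput("TOKENS_BATCH", list(data.shape), "INT32")
    inp.set_data_from_numpy(data)
    client.async_infer("postprocessing", [inp], callback=on_done)
```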
20 changes: 13 additions & 7 deletions all_models/inflight_batcher_llm/tensorrt_llm/1/model.py
@@ -175,18 +175,14 @@ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
     return trtllm.SamplingConfig(**kwargs)


-def get_output_config_from_request(request,
-                                   exclude_input_from_output,
-                                   batch_size=1,
-                                   batch_index=0):
+def get_output_config_from_request(request, batch_size=1, batch_index=0):
     kwargs = {}
     kwargs["return_log_probs"] = get_input_scalar_by_name(
         request, 'return_log_probs', batch_size, batch_index)
     kwargs["return_context_logits"] = get_input_scalar_by_name(
         request, 'return_context_logits', batch_size, batch_index)
     kwargs["return_generation_logits"] = get_input_scalar_by_name(
         request, 'return_generation_logits', batch_size, batch_index)
-    kwargs["exclude_input_from_output"] = exclude_input_from_output
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
     return trtllm.OutputConfig(**kwargs)
@@ -312,8 +308,18 @@ def convert_request(request, exclude_input_from_output, decoupled):
         sampling_config = get_sampling_config_from_request(
             request, batch_size, batch_index)
-        output_config = get_output_config_from_request(
-            request, exclude_input_from_output, batch_size, batch_index)
+        output_config = get_output_config_from_request(request, batch_size,
+                                                       batch_index)
+        req_exclude_input_from_output = get_input_scalar_by_name(
+            request, 'exclude_input_in_output', batch_size, batch_index)
+        if req_exclude_input_from_output is None:
+            # If the request doesn't specify exclude_input_in_output, fall
+            # back to the exclude_input_from_output backend parameter
+            output_config.exclude_input_from_output = (
+                exclude_input_from_output
+                if exclude_input_from_output is not None else False)
+        else:
+            output_config.exclude_input_from_output = req_exclude_input_from_output

         external_draft_tokens_config = get_external_draft_tokens_config_from_request(
             request, batch_size, batch_index)
         prompt_tuning_config = get_prompt_tuning_config_from_request(
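Net effect of the model.py change: `exclude_input_from_output` is no longer baked into the `OutputConfig` from the backend parameter alone; a per-request `exclude_input_in_output` tensor takes precedence, and the parameter is only a fallback. A minimal standalone sketch of the resolution order; `request_value` and `param_value` are hypothetical stand-ins, not names from the PR:

```python
from typing import Optional


def resolve_exclude_input(request_value: Optional[bool],
                          param_value: Optional[bool]) -> bool:
    """Precedence: per-request tensor, then backend parameter, then False."""
    if request_value is not None:  # request explicitly set the flag
        return request_value
    if param_value is not None:  # fall back to the model config parameter
        return param_value
    return False  # default: prompt tokens stay in the output


assert resolve_exclude_input(True, False) is True   # request wins
assert resolve_exclude_input(None, True) is True    # parameter fallback
assert resolve_exclude_input(None, None) is False   # default
```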
7 changes: 7 additions & 0 deletions all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -253,6 +253,13 @@ input [
         reshape: { shape: [ ] }
         optional: true
     },
+    {
+        name: "exclude_input_in_output"
+        data_type: TYPE_BOOL
+        dims: [ 1 ]
+        reshape: { shape: [ ] }
+        optional: true
+    },
     {
         name: "stop"
         data_type: TYPE_BOOL
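With the optional input declared in config.pbtxt, a client can set the flag per request. A minimal sketch using the tritonclient gRPC API; it assumes a server at localhost:8001 serving the `tensorrt_llm` model and elides the other required tensors such as `input_ids`:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Optional BOOL input: shape [1, 1] is a batch of one times dims [ 1 ];
# reshape { shape: [ ] } squeezes it to a scalar per request.
exclude = grpcclient.InferInput("exclude_input_in_output", [1, 1], "BOOL")
exclude.set_data_from_numpy(np.array([[True]], dtype=bool))

# Append `exclude` to the usual required tensors, then call
# client.infer(model_name="tensorrt_llm", inputs=inputs).
```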
1 change: 1 addition & 0 deletions build.sh
@@ -42,6 +42,7 @@ PYTHON_BACKEND_REPO_TAG=${PYTHON_BACKEND_REPO_TAG:-r24.08}
     --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
     --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
     --backend=ensemble --enable-gpu --no-container-pull \
+    --repoagent=checksum --cache=local --cache=redis \
     --image=base,${TRTLLM_BASE_IMAGE} \
     --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
     --backend=python:${PYTHON_BACKEND_REPO_TAG}
22 changes: 20 additions & 2 deletions dockerfile/Dockerfile.triton.trt_llm_backend
@@ -8,6 +8,20 @@ ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-l
 FROM ${PYTORCH_IMAGE} as pytorch_image
 FROM ${BASE_IMAGE} as install_dependencies

+ARG CCACHE_REMOTE_STORAGE
+ARG CCACHE_URL
+ENV CCACHE_DEBUG=1
+
+RUN if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+      curl -k -L ${CCACHE_URL} -o ccache.tar.gz ; \
+      tar -xzf ccache.tar.gz -C /usr/local --strip-components=1 ; \
+      rm ccache.tar.gz ; \
+      ccache --set-config=remote_only=true ; \
+      ccache --set-config=remote_storage=${CCACHE_REMOTE_STORAGE} ; \
+      ccache --set-config=log_file=/tmp/ccache.log ; \
+      ccache -p ; \
+    fi
+
 # Copy PyTorch package from PyTorch image
 COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torch /usr/local/lib/python3.10/dist-packages/torch
@@ -20,7 +34,6 @@ RUN apt-get update -q=2 && \
     apt-get install -y --no-install-recommends \
         python3-dev \
         python3-pip \
-        ccache \
         git-lfs && \
     # Remove previous TRT installation
     apt-get remove -y tensorrt* libnvinfer* && \
@@ -76,7 +89,12 @@ RUN pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 cmake==3.30.2

 COPY scripts scripts
 COPY tensorrt_llm tensorrt_llm
-RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean
+RUN cd tensorrt_llm && \
+    if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
+      python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --use_ccache ; \
+    else \
+      python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean ; \
+    fi

 # Final stage to build the TRT-LLM container
 FROM ${BASE_IMAGE} as final_stage
14 changes: 12 additions & 2 deletions inflight_batcher_llm/client/inflight_batcher_llm_client.py
@@ -123,7 +123,8 @@ def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data,
                    lora_weights_data, lora_config_data, return_log_probs_data,
                    top_k_data, top_p_data, draft_ids_data,
                    return_context_logits_data, return_generation_logits_data,
-                   decoder_input_ids_data, prompt_table_extra_id_data):
+                   decoder_input_ids_data, prompt_table_extra_id_data,
+                   exclude_input_in_output):
     inputs = [
         prepare_tensor("input_ids", input_ids_data),
         prepare_tensor("input_lengths", input_lengths_data),
@@ -185,6 +186,10 @@
             prepare_tensor("prompt_table_extra_ids",
                            prompt_table_extra_id_data),
         ]
+    if exclude_input_in_output is not None:
+        inputs += [
+            prepare_tensor("exclude_input_in_output", exclude_input_in_output),
+        ]
     return inputs


@@ -665,6 +670,11 @@ def callback(user_data, result, error):
     if decoder_input_ids is not None:
         decoder_input_ids_data = np.array(decoder_input_ids, dtype=np.int32)

+    exclude_input_in_output = None
+    if FLAGS.exclude_input_in_output:
+        exclude_input_in_output = np.array([[FLAGS.exclude_input_in_output]],
+                                           dtype=bool)
+
     if not FLAGS.vocab_size and tokenizer:
         FLAGS.vocab_size = tokenizer.vocab_size
     prompt_table_extra_id_data = None
@@ -690,7 +700,7 @@ def callback(user_data, result, error):
         lora_config_data, return_log_probs_data, top_k_data, top_p_data,
         draft_ids_data, return_context_logits_data,
         return_generation_logits_data, decoder_input_ids_data,
-        prompt_table_extra_id_data)
+        prompt_table_extra_id_data, exclude_input_in_output)

     if FLAGS.requested_outputs:
         # Must have at least output_ids in requested outputs
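Note that the client only builds the tensor when the flag is set, so an unset flag sends nothing and the server's `exclude_input_from_output` parameter applies. The flag definition itself is outside this diff; the sketch below assumes a conventional argparse store_true flag and shows the flag-to-tensor mapping:

```python
import argparse

import numpy as np

# Hypothetical reconstruction of the flag; its definition is not in this diff.
parser = argparse.ArgumentParser()
parser.add_argument("--exclude-input-in-output",
                    action="store_true",
                    help="Strip prompt tokens from output_ids")
FLAGS = parser.parse_args(["--exclude-input-in-output"])

exclude_input_in_output = None  # None: tensor omitted, server parameter applies
if FLAGS.exclude_input_in_output:
    exclude_input_in_output = np.array([[FLAGS.exclude_input_in_output]],
                                       dtype=bool)

print(exclude_input_in_output.shape)  # (1, 1)
```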
5 changes: 4 additions & 1 deletion inflight_batcher_llm/src/model_instance_state.cc
@@ -211,6 +211,9 @@ executor::ParallelConfig ModelInstanceState::getParallelConfigFromParams()
     if (useOrchestratorMode && std::atoi(useOrchestratorMode) != 0)
     {
         parallelConfig.setCommunicationMode(executor::CommunicationMode::kORCHESTRATOR);
+
+        tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE);
+
         auto const workerExecutablePath = model_state_->GetExecutorWorkerPath();
         auto const spawnProcessesEnvVar = std::getenv("TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES");
         auto const spawnProcesses = !spawnProcessesEnvVar || std::atoi(spawnProcessesEnvVar);
@@ -978,7 +981,7 @@ std::tuple<TRITONBACKEND_Response*, bool, TRITONSERVER_Error*> ModelInstanceStat
     {
         size_t contextPhaseParamsSize
             = executor::Serialization::serializedSize(response.getResult().contextPhaseParams.value());
-        std::vector<int64_t> contextPhaseParamsShape{1, contextPhaseParamsSize};
+        std::vector<int64_t> contextPhaseParamsShape{1, static_cast<int64_t>(contextPhaseParamsSize)};
         TRITONSERVER_DataType contextPhaseParamsType = TRITONSERVER_TYPE_UINT8;
         auto contextPhaseParamsBuffer = utils::getResponseBuffer<uint8_t>(tritonResponse,
             contextPhaseParamsShape, contextPhaseParamsType, OutputFieldsNames::contextPhaseParams);
18 changes: 15 additions & 3 deletions inflight_batcher_llm/src/utils.cc
@@ -535,7 +535,6 @@ executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTens
     bool returnContextLogits{false};
     extractSingleton<bool>(inputsTensors, InputFieldsNames::returnContextLogits, returnContextLogits);

-    // Note that currently excludeInputFromOutput is set from the backend parameters.
     return executor::OutputConfig(returnLogProbs, returnContextLogits, returnGenerationLogits);
 }

@@ -628,7 +627,7 @@ std::optional<executor::LoraConfig> getLoraConfigFromTensors(InputTensors const&
 }

 std::vector<executor::Request> createRequestsFromInputTensors(std::vector<InputTensors> const& inputsTensors,
-    bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
+    bool paramExcludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType,
     executor::RequestType requestType)
 {
     if (!isDecoupled && inputsTensors.size() > 1)
@@ -644,7 +643,20 @@
     for (auto const& inputTensors : inputsTensors)
     {
         executor::OutputConfig outConfig = utils::getOutputConfigFromTensors(inputTensors);
-        outConfig.excludeInputFromOutput = excludeInputFromOutput;
+
+        std::optional<bool> reqExcludeInputFromOutput{std::nullopt};
+        extractOptionalSingleton<bool>(
+            inputTensors, InputFieldsNames::excludeInputFromOutput, reqExcludeInputFromOutput);
+
+        // If specified in the request, set from the request
+        if (reqExcludeInputFromOutput != std::nullopt)
+        {
+            outConfig.excludeInputFromOutput = reqExcludeInputFromOutput.value();
+        }
+        else // Set from the parameter
+        {
+            outConfig.excludeInputFromOutput = paramExcludeInputFromOutput;
+        }

         executor::VecTokens inputTokens;
         if (!utils::extractVector<int32_t>(inputTensors, InputFieldsNames::inputTokens, inputTokens))
1 change: 1 addition & 0 deletions inflight_batcher_llm/src/utils.h
@@ -62,6 +62,7 @@ struct InputFieldsNames
     static constexpr char const* returnLogProbs = "return_log_probs";
     static constexpr char const* returnGenerationLogits = "return_generation_logits";
     static constexpr char const* returnContextLogits = "return_context_logits";
+    static constexpr char const* excludeInputFromOutput = "exclude_input_in_output";

     // SamplingConfig
     static constexpr char const* beamWidth = "beam_width";
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 98 files
+12 −6 README.md
+30 −33 cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt
+21 −22 cpp/tensorrt_llm/common/mpiUtils.cpp
+2 −2 cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a
+2 −2 cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a
+2 −2 cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib
+2 −2 cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt
+1 −1 ...rt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt
+1 −1 ...rrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt
+1 −1 ...rMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll
+1 −1 ...rMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib
+3 −3 ..._llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt
+1 −1 ...rt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
+1 −1 ...nels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt
+1 −1 ...rrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
+1 −1 ...rnels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt
+2 −2 ...t_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib
+2 −2 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt
+82 −6 cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h
+7 −7 cpp/tensorrt_llm/pybind/bindings.cpp
+1 −1 cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp
+1 −1 cpp/tensorrt_llm/runtime/medusaModule.cpp
+8 −0 cpp/tests/CMakeLists.txt
+114 −297 cpp/tests/kernels/mixtureOfExpertsTest.cu
+42 −0 cpp/tests/resources/scripts/case_report_wrapper.py
+51 −1 cpp/tests/resources/scripts/test_cpp.py
+0 −0 docs/source/advanced/executor.md
+2 −0 docs/source/advanced/kv-cache-reuse.md
+9 −7 docs/source/advanced/speculative-decoding.md
+2 −2 docs/source/architecture/core-concepts.md
+3 −0 docs/source/index.rst
+2 −2 docs/source/release-notes.md
+1 −1 examples/baichuan/requirements.txt
+1 −1 examples/bloom/requirements.txt
+1 −1 examples/chatglm/requirements.txt
+1 −1 examples/dbrx/requirements.txt
+77 −0 examples/deepseek_v1/README.md
+14 −0 examples/deepseek_v1/__init__.py
+215 −0 examples/deepseek_v1/convert_checkpoint.py
+5 −0 examples/deepseek_v1/requirements.txt
+1 −1 examples/falcon/requirements.txt
+1 −1 examples/gemma/requirements.txt
+1 −1 examples/gpt/requirements.txt
+1 −1 examples/gptj/requirements.txt
+1 −1 examples/gptneox/requirements.txt
+1 −1 examples/grok/requirements.txt
+1 −1 examples/internlm/requirements.txt
+1 −1 examples/jais/requirements.txt
+1 −1 examples/llama/requirements.txt
+1 −1 examples/llm-api/requirements.txt
+1 −1 examples/mamba/requirements.txt
+1 −1 examples/medusa/requirements.txt
+1 −1 examples/mixtral/requirements.txt
+1 −1 examples/mpt/requirements.txt
+1 −1 examples/nemotron/requirements.txt
+1 −1 examples/opt/requirements.txt
+1 −1 examples/phi/requirements.txt
+1 −1 examples/quantization/requirements.txt
+1 −1 examples/qwen/requirements.txt
+1 −1 examples/qwenvl/requirements.txt
+1 −1 examples/recurrentgemma/requirements.txt
+1 −1 examples/redrafter/requirements.txt
+1 −1 examples/skywork/requirements.txt
+1 −1 examples/smaug/requirements.txt
+1 −1 examples/whisper/requirements.txt
+1 −0 requirements-dev.txt
+1 −1 tensorrt_llm/commands/build.py
+18 −18 tensorrt_llm/functional.py
+2 −1 tensorrt_llm/layers/__init__.py
+1 −6 tensorrt_llm/layers/embedding.py
+51 −0 tensorrt_llm/layers/moe.py
+3 −0 tensorrt_llm/models/__init__.py
+14 −0 tensorrt_llm/models/deepseek_v1/__init__.py
+361 −0 tensorrt_llm/models/deepseek_v1/convert.py
+257 −0 tensorrt_llm/models/deepseek_v1/model.py
+1 −2 tensorrt_llm/models/llama/model.py
+41 −8 tensorrt_llm/models/model_weights_loader.py
+3 −0 tensorrt_llm/models/modeling_utils.py
+8 −1 tensorrt_llm/models/qwen/model.py
+61 −0 tensorrt_llm/module.py
+1 −1 tensorrt_llm/version.py
+1 −1 tests/bindings/test_bindings_ut.py
+106 −0 tests/conftest.py
+2 −0 tests/test_module.py
2 changes: 1 addition & 1 deletion tools/version.txt
@@ -1 +1 @@
-9f42c546baf991a2e69cb605595b6484d5388709
+7a776af38eccd9c94ccc23ff069959f2f629745e