
Commit 1948b34

Merge branch 'main' of https://github.com/triton-inference-server/server into spolisetty_dlis_7657
2 parents: e7b441f + 596925a

File tree: 8 files changed (+129, -34 lines)


build.py

Lines changed: 4 additions & 2 deletions

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -72,7 +72,7 @@
 
 DEFAULT_TRITON_VERSION_MAP = {
     "release_version": "2.54.0dev",
-    "triton_container_version": "24.01dev",
+    "triton_container_version": "25.01dev",
     "upstream_container_version": "24.12",
     "ort_version": "1.20.1",
     "ort_openvino_version": "2024.4.0",
@@ -1048,6 +1048,8 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
     # Install the windows- or linux-specific buildbase dependencies
     if target_platform() == "windows":
         df += """
+RUN python3 -m pip install build
+
 SHELL ["cmd", "/S", "/C"]
 """
     else:
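
The Windows buildbase change installs the PyPA "build" package, presumably so later build steps can produce Python wheels inside the image. A minimal sketch of how that frontend is typically driven, assuming a source tree containing a pyproject.toml or setup.py (the invocation is illustrative, not taken from this commit):

    # install the PEP 517 build frontend, as the new Dockerfile line does
    python3 -m pip install build
    # build a wheel from the current source tree; output lands in ./dist/
    python3 -m build --wheel .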

qa/L0_implicit_state/models/growable_memory/config.pbtxt

Lines changed: 3 additions & 1 deletion

@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,6 +28,8 @@ name: "growable_memory"
 backend: "implicit_state"
 max_batch_size: 0
 sequence_batching {
+  # Set large idle timeout to avoid inter-request timeouts for test consistency
+  max_sequence_idle_microseconds: 10000000
   control_input [
     {
       name: "START"

qa/L0_io/gen_libtorch_model.py

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+#!/usr/bin/python
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import torch.nn as nn
+
+
+class SumModule(nn.Module):
+    def __init__(self, device):
+        super(SumModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):
+        INPUT0 = INPUT0.to(self.device)
+        INPUT1 = INPUT1.to(self.device)
+        print(
+            "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
+        return INPUT0 + INPUT1
+
+
+class DiffModule(nn.Module):
+    def __init__(self, device):
+        super(DiffModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):
+        INPUT0 = INPUT0.to(self.device)
+        INPUT1 = INPUT1.to(self.device)
+        print(
+            "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
+        return INPUT0 - INPUT1
+
+
+class TestModel(nn.Module):
+    def __init__(self, device0, device1):
+        super(TestModel, self).__init__()
+        self.device0 = device0
+        self.device1 = device1
+
+        self.layer1 = SumModule(self.device0)
+        self.layer2 = DiffModule(self.device1)
+
+    def forward(self, INPUT0, INPUT1):
+        op0 = self.layer1(INPUT0, INPUT1)
+        op1 = self.layer2(INPUT0, INPUT1)
+        return op0, op1
+
+
+if torch.cuda.device_count() < 2:
+    print("Need at least 2 GPUs to run this test")
+    exit(1)
+
+devices = [("cuda:1", "cuda:0"), ("cpu", "cuda:1")]
+model_names = ["libtorch_multi_gpu", "libtorch_multi_device"]
+
+for device_pair, model_name in zip(devices, model_names):
+    model = TestModel(device_pair[0], device_pair[1])
+    model_path = "models/" + model_name + "/1/model.pt"
+    scripted_model = torch.jit.script(model)
+    scripted_model.save(model_path)
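
The generator saves TorchScript models whose SumModule and DiffModule live on different devices, which is what lets L0_io exercise host-to-device and device-to-device transfer paths. A sketch of how the script is driven, mirroring the L0_io/test.sh hunks below and assuming $MODELSDIR resolves to ./models to match the script's relative output paths (that variable comes from the test script, not this file):

    # create the version directories the generator expects, then run it
    mkdir -p $MODELSDIR/libtorch_multi_device/1 $MODELSDIR/libtorch_multi_gpu/1
    python3 gen_libtorch_model.py   # writes models/<name>/1/model.pt; needs 2 GPUs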

qa/L0_io/test.sh

Lines changed: 3 additions & 3 deletions

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -38,7 +38,8 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then
     REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
 fi
 
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+# This test requires at least 2 GPUs to test h2d and d2d transfer combinations
+export CUDA_VISIBLE_DEVICES=0,1
 
 IO_TEST_UTIL=./memory_alloc
 CLIENT_LOG="./client.log"
@@ -147,7 +148,6 @@ cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \
 
 # prepare libtorch multi-device and multi-gpu models
 cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/.
-cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py
 mkdir -p $MODELSDIR/libtorch_multi_device/1
 mkdir -p $MODELSDIR/libtorch_multi_gpu/1
 cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.
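
With CUDA_VISIBLE_DEVICES narrowed to exactly the two GPUs the test needs, a runner with fewer GPUs would now fail inside the test rather than up front. A hedged sketch of a pre-flight guard a runner could add (not part of this commit; nvidia-smi --list-gpus prints one line per GPU):

    # illustrative guard, not from the source
    if [ "$(nvidia-smi --list-gpus | wc -l)" -lt 2 ]; then
        echo "L0_io requires at least 2 GPUs; skipping"
        exit 0
    fi
    export CUDA_VISIBLE_DEVICES=0,1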

qa/L0_lifecycle/test.sh

Lines changed: 5 additions & 5 deletions

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -1576,7 +1576,7 @@ if [ `grep -c "Model 'custom_zero_1_float32' (version 1) has 1 in-flight inferen
     RET=1
 fi
 
-kill $SERVER_PID
+kill $SERVER_PID || true
 wait $SERVER_PID
 
 rm -f $CLIENT_LOG
@@ -1614,7 +1614,7 @@ if [ `grep -c "Model 'custom_sequence_int32' (version 1) has 1 in-flight inferen
     RET=1
 fi
 
-kill $SERVER_PID
+kill $SERVER_PID || true
 wait $SERVER_PID
 
 rm -f $CLIENT_LOG
@@ -1655,7 +1655,7 @@ if [ `grep -c "Model 'ensemble_zero_1_float32' (version 1) has 1 in-flight infer
     RET=1
 fi
 
-kill $SERVER_PID
+kill $SERVER_PID || true
 wait $SERVER_PID
 
 LOG_IDX=$((LOG_IDX+1))
@@ -2128,7 +2128,7 @@ if [ $? -ne 0 ]; then
 fi
 set -e
 
-kill $SERVER_PID
+kill $SERVER_PID || true
 wait $SERVER_PID
 
 LOG_IDX=$((LOG_IDX+1))
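
All four hunks apply the same hardening: when the server has already exited on its own (which these in-flight-inference tests can provoke), a bare kill returns nonzero and, under set -e, would abort the whole script. A minimal standalone sketch of the idiom, with a sleep process standing in for the server:

    #!/bin/bash
    set -e
    sleep 30 &
    SERVER_PID=$!
    # kill fails if the process is already gone; `|| true` makes that a no-op
    kill $SERVER_PID || true
    # wait reports the signal-induced exit status, so tolerate it here as well
    wait $SERVER_PID || true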

qa/L0_perf_analyzer_capi/test.sh

Lines changed: 4 additions & 14 deletions

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -218,6 +218,7 @@ if [ $(cat $CLIENT_LOG | grep ": 0 infer/sec\|: 0 usec" | wc -l) -ne 0 ]; then
 fi
 
 $PERF_ANALYZER -v -m simple_savedmodel_sequence_object -p 2000 -t5 --sync \
+    -s ${STABILITY_THRESHOLD} \
     --input-data=$SEQ_JSONDATAFILE \
     --service-kind=triton_c_api --model-repository=$DATADIR \
     --triton-server-directory=$SERVER_LIBRARY_PATH >$CLIENT_LOG 2>&1
@@ -234,6 +235,7 @@ fi
 
 set +e
 $PERF_ANALYZER -v -m graphdef_sequence_float32 --shape INPUT:2 \
+    -s ${STABILITY_THRESHOLD} \
     --input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE \
     --input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE -p2000 \
     --service-kind=triton_c_api --model-repository=$DATADIR \
@@ -250,21 +252,9 @@ if [ $(cat $CLIENT_LOG | grep -P "The supplied shape .+ is incompatible with th
 fi
 set -e
 
-# Negative test for the async mode.
-set +e
-$PERF_ANALYZER -v -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 -a \
-    --service-kind=triton_c_api --model-repository=$DATADIR \
-    --triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \
-    >$CLIENT_LOG 2>&1
-if [ $(cat $CLIENT_LOG | grep "not supported by triton_c_api service" | wc -l) -ne 1 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Test Failed\n***"
-    RET=1
-fi
-set -e
-
 for SHARED_MEMORY_TYPE in system cuda; do
     $PERF_ANALYZER -v -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 \
+        -s ${STABILITY_THRESHOLD} \
         --shared-memory=$SHARED_MEMORY_TYPE \
         --service-kind=triton_c_api --model-repository=$DATADIR \
         --triton-server-directory=$SERVER_LIBRARY_PATH >$CLIENT_LOG 2>&1
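
The recurring change here threads -s ${STABILITY_THRESHOLD} into every invocation: perf_analyzer's -s (stability percentage) flag sets how much run-to-run variance a measurement may show before being declared unstable. The variable's definition is outside this diff; a hedged sketch with an illustrative value:

    # illustrative only; the real script defines STABILITY_THRESHOLD elsewhere
    STABILITY_THRESHOLD=${STABILITY_THRESHOLD:=15}   # tolerate 15% variance
    perf_analyzer -v -m graphdef_int32_int32_int32 -p2000 -b 1 \
        -s ${STABILITY_THRESHOLD} \
        --service-kind=triton_c_api --model-repository=$DATADIR \
        --triton-server-directory=$SERVER_LIBRARY_PATH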

qa/L0_pinned_memory/test.sh

Lines changed: 8 additions & 3 deletions

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -38,6 +38,9 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then
     REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
 fi
 
+# Use "--request-count" throughout the test to PA stability criteria and
+# reduce flaky failures from PA unstable measurements.
+REQUEST_COUNT=10
 CLIENT=../clients/perf_client
 # Only use libtorch as it accepts GPU I/O and it can handle variable shape
 BACKENDS=${BACKENDS:="libtorch"}
@@ -91,7 +94,7 @@ for BACKEND in $BACKENDS; do
 
     # Sanity check that the server allocates pinned memory for large size
     set +e
-    $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:16777216
+    $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:16777216 --request-count ${REQUEST_COUNT}
     if (( $? != 0 )); then
         RET=1
     fi
@@ -128,6 +131,7 @@ for BACKEND in $BACKENDS; do
     for TENSOR_SIZE in 16384 1048576 2097152 4194304 8388608 16777216; do
         $CLIENT -i grpc -u localhost:8001 -m${ENSEMBLE_NAME} \
             --shape INPUT0:${TENSOR_SIZE} \
+            --request-count ${REQUEST_COUNT} \
            >> ${BACKEND}.${TENSOR_SIZE}.pinned.log 2>&1
         if (( $? != 0 )); then
             RET=1
@@ -150,7 +154,7 @@ for BACKEND in $BACKENDS; do
 
     # Sanity check that the server allocates non-pinned memory
     set +e
-    $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:1
+    $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:1 --request-count ${REQUEST_COUNT}
     if (( $? != 0 )); then
         RET=1
     fi
@@ -180,6 +184,7 @@ for BACKEND in $BACKENDS; do
     for TENSOR_SIZE in 16384 1048576 2097152 4194304 8388608 16777216; do
         $CLIENT -i grpc -u localhost:8001 -m${ENSEMBLE_NAME} \
             --shape INPUT0:${TENSOR_SIZE} \
+            --request-count ${REQUEST_COUNT} \
            >> ${BACKEND}.${TENSOR_SIZE}.nonpinned.log 2>&1
         if (( $? != 0 )); then
             RET=1
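
Where the previous file loosens the stability criterion, this one sidesteps it: with --request-count, perf_client sends a fixed number of requests instead of iterating measurement windows until latency stabilizes, which is what the added comment means by reducing flaky failures from unstable measurements. A hedged one-liner showing the shape of the call (model name and tensor size are illustrative):

    # illustrative invocation: 10 requests, no stability loop
    perf_client -m my_ensemble --shape INPUT0:16384 --request-count 10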
