Commit 1ea48a6

feat: Add parameters support to InferResponse (#394)

* Add parameters support to InferResponse
* Infer response to track parameters
* Add parameters to binding infer response
* Rank parameters argument up among InferResponse constructor arguments
* Add setting parameters to Triton response
* Send response parameters only on non-error
* Fix double declaration
* Unify py dictionary parameters to json str
* Add documentation
* Mark response parameters accessor const and JSON serializable
* [Docs] Note BLS response parameters are not populated currently
* [comment] Clarify why PbTensor::LoadFromSharedMemory() requires holding GIL

1 parent b771f4f commit 1ea48a6

File tree: 5 files changed (+180 / -83 lines)

README.md

Lines changed: 20 additions & 1 deletion
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -54,6 +54,7 @@ any C++ code.
 - [`finalize`](#finalize)
 - [Model Config File](#model-config-file)
 - [Inference Request Parameters](#inference-request-parameters)
+- [Inference Response Parameters](#inference-response-parameters)
 - [Managing Python Runtime and Libraries](#managing-python-runtime-and-libraries)
 - [Building Custom Python Backend Stub](#building-custom-python-backend-stub)
 - [Creating Custom Execution Environments](#creating-custom-execution-environments)
@@ -787,6 +788,24 @@ You can read more about the inference request parameters in the [parameters
 extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md)
 documentation.
 
+## Inference Response Parameters
+
+Inference response parameters may be optionally set during the construction of
+an inference response object. The parameters should be a dictionary of key value
+pairs, where keys are `str` and values are `bool`, `int` or `str`. For example,
+```python
+response = pb_utils.InferenceResponse(
+    output_tensors, parameters={"key": "value"}
+)
+```
+
+You can read more about the inference response parameters in the [parameters
+extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md)
+documentation.
+
+Inference response parameters is currently not supported on BLS inference
+responses received by BLS models.
+
 ## Managing Python Runtime and Libraries
 
 Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/)
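To make the new README snippet concrete, here is a minimal, hypothetical `model.py` sketch using the documented `parameters` argument. The model, tensor names, and parameter values are assumptions for illustration, not part of this commit.

```python
# Hypothetical model.py sketch; tensor names and parameter values are examples only.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # Echo the input tensor back as the output.
            input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            output_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
            # Keys must be str; values must be bool, int, or str.
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[output_tensor],
                    parameters={"sequence_end": True, "batch_index": 0, "note": "ok"},
                )
            )
        return responses
```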

src/infer_response.cc

Lines changed: 67 additions & 11 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,8 +39,10 @@ namespace triton { namespace backend { namespace python {
 
 InferResponse::InferResponse(
     const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError> error, const bool is_last_response, void* id)
-    : error_(error), is_last_response_(is_last_response), id_(id)
+    std::shared_ptr<PbError> error, std::string parameters,
+    const bool is_last_response, void* id)
+    : error_(error), is_last_response_(is_last_response), id_(id),
+      parameters_(std::move(parameters))
 {
   for (auto& output : output_tensors) {
     if (!output) {
@@ -58,6 +60,12 @@ InferResponse::OutputTensors()
   return output_tensors_;
 }
 
+const std::string&
+InferResponse::Parameters() const
+{
+  return parameters_;
+}
+
 bool
 InferResponse::HasError()
 {
@@ -106,6 +114,9 @@ InferResponse::SaveToSharedMemory(
       j++;
     }
     response_shm_ptr->id = id_;
+
+    parameters_shm_ = PbString::Create(shm_pool, parameters_);
+    response_shm_ptr->parameters = parameters_shm_->ShmHandle();
   }
 }
 
@@ -143,6 +154,8 @@ InferResponse::LoadFromSharedMemory(
 
   std::shared_ptr<PbError> pb_error;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
+  std::shared_ptr<PbString> parameters_shm;
+  std::string parameters;
 
   // If the error field is set, do not load output tensors from shared memory.
   if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) {
@@ -154,33 +167,44 @@
    bi::managed_external_buffer::handle_t* tensor_handle_shm =
         reinterpret_cast<bi::managed_external_buffer::handle_t*>(
            response_shm.data_.get() + sizeof(ResponseShm));
+    {
 #ifdef TRITON_PB_STUB
-    // Need to acquire the GIL to avoid hangs.
-    py::gil_scoped_acquire acquire;
+      // PbTensor::LoadFromSharedMemory() will construct Python objects if
+      // called from pb_stub, which requires holding the GIL.
+      py::gil_scoped_acquire acquire;
 #endif
-    for (size_t idx = 0; idx < requested_output_count; ++idx) {
-      std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
-          shm_pool, tensor_handle_shm[idx], open_cuda_handle);
-      output_tensors.emplace_back(std::move(pb_tensor));
+      for (size_t idx = 0; idx < requested_output_count; ++idx) {
+        std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
+            shm_pool, tensor_handle_shm[idx], open_cuda_handle);
+        output_tensors.emplace_back(std::move(pb_tensor));
+      }
     }
+
+    parameters_shm = std::move(
+        PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters));
+    parameters = parameters_shm->String();
   }
 
   return std::unique_ptr<InferResponse>(new InferResponse(
       response_shm, output_tensors, pb_error,
-      response_shm_ptr->is_last_response, response_shm_ptr->id));
+      response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm,
+      parameters));
 }
 
 InferResponse::InferResponse(
     AllocatedSharedMemory<char>& response_shm,
     std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id)
+    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+    std::shared_ptr<PbString>& parameters_shm, std::string& parameters)
 {
   response_shm_ = std::move(response_shm);
   output_tensors_ = std::move(output_tensors);
   error_ = std::move(pb_error);
   shm_handle_ = response_shm_.handle_;
   id_ = id;
   is_last_response_ = is_last_response;
+  parameters_shm_ = std::move(parameters_shm);
+  parameters_ = std::move(parameters);
 }
 
 std::shared_ptr<PbError>&
@@ -387,6 +411,38 @@ InferResponse::Send(
     cuda_copy |= cuda_used;
   }
 
+  if (!parameters_.empty()) {
+    triton::common::TritonJson::Value param;
+    THROW_IF_TRITON_ERROR(
+        param.Parse(parameters_.c_str(), parameters_.length()));
+    std::vector<std::string> param_keys;
+    THROW_IF_TRITON_ERROR(param.Members(&param_keys));
+    for (const auto& key : param_keys) {
+      triton::common::TritonJson::Value value;
+      if (!param.Find(key.c_str(), &value)) {
+        throw PythonBackendException("Unexpected missing key on parameters");
+      }
+      if (value.IsString()) {
+        std::string string_value;
+        THROW_IF_TRITON_ERROR(value.AsString(&string_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter(
+            response, key.c_str(), string_value.c_str()));
+      } else if (value.IsInt()) {
+        int64_t int_value = 0;
+        THROW_IF_TRITON_ERROR(value.AsInt(&int_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter(
+            response, key.c_str(), int_value));
+      } else if (value.IsBool()) {
+        bool bool_value = false;
+        THROW_IF_TRITON_ERROR(value.AsBool(&bool_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter(
+            response, key.c_str(), bool_value));
+      } else {
+        throw PythonBackendException("Unsupported value type on parameters");
+      }
+    }
+  }
+
 #ifdef TRITON_ENABLE_GPU
   if (cuda_copy) {
     cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
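The `TRITONBACKEND_ResponseSet*Parameter` calls added to `InferResponse::Send()` are what surface these values to clients through the parameters extension. A rough client-side sketch, assuming a hypothetical model named `identity` that takes a single FP32 input and the `tritonclient` HTTP API (not part of this change):

```python
# Hypothetical client sketch; model name, tensor shape, and URL are assumptions.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

inputs = [httpclient.InferInput("INPUT0", [1, 4], "FP32")]
inputs[0].set_data_from_numpy(np.zeros((1, 4), dtype=np.float32))

result = client.infer("identity", inputs)
# For the HTTP client, the deserialized response body includes any parameters
# the model attached, e.g. {"sequence_end": true, "batch_index": 0, "note": "ok"}.
print(result.get_response().get("parameters", {}))
```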

src/infer_response.h

Lines changed: 9 additions & 4 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ namespace triton { namespace backend { namespace python {
 
 struct ResponseShm {
   uint32_t outputs_size;
+  bi::managed_external_buffer::handle_t parameters;
   bi::managed_external_buffer::handle_t error;
   bool has_error;
   // Indicates whether this error has a message or not.
@@ -72,9 +73,10 @@ class InferResponse {
  public:
  InferResponse(
      const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError> error = nullptr,
+      std::shared_ptr<PbError> error = nullptr, std::string parameters = "",
      const bool is_last_response = true, void* id = nullptr);
  std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
+  const std::string& Parameters() const;  // JSON serializable unless empty
  void SaveToSharedMemory(
      std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu = true);
  static std::unique_ptr<InferResponse> LoadFromSharedMemory(
@@ -116,8 +118,8 @@
  InferResponse(
      AllocatedSharedMemory<char>& response_shm,
      std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError>& pb_error, const bool is_last_response,
-      void* id);
+      std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+      std::shared_ptr<PbString>& parameters_shm, std::string& parameters);
  std::vector<std::shared_ptr<PbTensor>> output_tensors_;
 
  std::shared_ptr<PbError> error_;
@@ -128,6 +130,9 @@
  bool is_last_response_;
  // Representing the request id that the response was created from.
  void* id_;
+
+  std::shared_ptr<PbString> parameters_shm_;
+  std::string parameters_;
 };
 
 }}}  // namespace triton::backend::python
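As the comment on the new `Parameters()` accessor notes, the stored value is the JSON string the stub produces from the Python dictionary (or an empty string when no parameters were given). A small sketch of that round trip, using only the standard library:

```python
import json

# What the stub serializes from the model-provided dictionary...
parameters = {"sequence_end": True, "batch_index": 0, "note": "ok"}
serialized = json.dumps(parameters)

# ...and the JSON string that Parameters() / response.parameters() hands back.
print(serialized)  # {"sequence_end": true, "batch_index": 0, "note": "ok"}
assert json.loads(serialized) == parameters
```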

src/pb_stub.cc

Lines changed: 72 additions & 59 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -104,6 +104,28 @@ PyDefaultArgumentToMutableType(const py::object& argument)
       std::string(py::str(argument.get_type())));
 }
 
+std::string
+PyParametersToJSON(const py::dict& parameters)
+{
+  for (const auto& pair : parameters) {
+    if (!py::isinstance<py::str>(pair.first)) {
+      throw PythonBackendException(
+          "Expect parameters keys to have type str, found type " +
+          std::string(py::str(pair.first.get_type())));
+    }
+    if (!py::isinstance<py::bool_>(pair.second) &&
+        !py::isinstance<py::int_>(pair.second) &&
+        !py::isinstance<py::str>(pair.second)) {
+      throw PythonBackendException(
+          "Expect parameters values to have type bool/int/str, found type " +
+          std::string(py::str(pair.second.get_type())));
+    }
+  }
+  py::module_ py_json = py::module_::import("json");
+  std::string parameters_str = py::str(py_json.attr("dumps")(parameters));
+  return parameters_str;
+}
+
 void
 AsyncEventFutureDoneCallback(const py::object& py_future)
 {
@@ -1714,59 +1736,41 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   py::class_<InferRequest, std::shared_ptr<InferRequest>>(
       module, "InferenceRequest")
       .def(
-          py::init([](const std::string& request_id,
-                      const py::object& correlation_id,
-                      const std::vector<std::shared_ptr<PbTensor>>& inputs,
-                      const std::vector<std::string>& requested_output_names,
-                      const std::string& model_name,
-                      const int64_t model_version, const uint32_t flags,
-                      const uint64_t timeout,
-                      const PreferredMemory& preferred_memory,
-                      const InferenceTrace& trace,
-                      const py::object& parameters_) {
-            py::dict parameters =
-                PyDefaultArgumentToMutableType<py::dict>(parameters_);
-            std::set<std::string> requested_outputs;
-            for (auto& requested_output_name : requested_output_names) {
-              requested_outputs.emplace(requested_output_name);
-            }
-            for (const auto& pair : parameters) {
-              if (!py::isinstance<py::str>(pair.first)) {
-                throw PythonBackendException(
-                    "Expect parameters keys to have type str, found type " +
-                    std::string(py::str(pair.first.get_type())));
-              }
-              if (!py::isinstance<py::bool_>(pair.second) &&
-                  !py::isinstance<py::int_>(pair.second) &&
-                  !py::isinstance<py::str>(pair.second)) {
-                throw PythonBackendException(
-                    "Expect parameters values to have type bool/int/str, found "
-                    "type " +
-                    std::string(py::str(pair.second.get_type())));
-              }
-            }
-            py::module_ py_json = py::module_::import("json");
-            std::string parameters_str =
-                py::str(py_json.attr("dumps")(parameters));
-
-            CorrelationId correlation_id_obj;
-            if (py::isinstance<py::int_>(correlation_id)) {
-              correlation_id_obj =
-                  CorrelationId(py::cast<uint64_t>(correlation_id));
-            } else if (py::isinstance<py::str>(correlation_id)) {
-              correlation_id_obj =
-                  CorrelationId(py::cast<std::string>(correlation_id));
-            } else {
-              throw PythonBackendException(
-                  "Correlation ID must be integer or string");
-            }
-
-            return std::make_shared<InferRequest>(
-                request_id, correlation_id_obj, inputs, requested_outputs,
-                model_name, model_version, parameters_str, flags, timeout,
-                0 /*response_factory_address*/, 0 /*request_address*/,
-                preferred_memory, trace);
-          }),
+          py::init(
+              [](const std::string& request_id,
+                 const py::object& correlation_id,
+                 const std::vector<std::shared_ptr<PbTensor>>& inputs,
+                 const std::vector<std::string>& requested_output_names,
+                 const std::string& model_name, const int64_t model_version,
+                 const uint32_t flags, const uint64_t timeout,
+                 const PreferredMemory& preferred_memory,
+                 const InferenceTrace& trace, const py::object& parameters_) {
+                py::dict parameters =
+                    PyDefaultArgumentToMutableType<py::dict>(parameters_);
+                std::set<std::string> requested_outputs;
+                for (auto& requested_output_name : requested_output_names) {
+                  requested_outputs.emplace(requested_output_name);
+                }
+                std::string parameters_str = PyParametersToJSON(parameters);
+
+                CorrelationId correlation_id_obj;
+                if (py::isinstance<py::int_>(correlation_id)) {
+                  correlation_id_obj =
+                      CorrelationId(py::cast<uint64_t>(correlation_id));
+                } else if (py::isinstance<py::str>(correlation_id)) {
+                  correlation_id_obj =
+                      CorrelationId(py::cast<std::string>(correlation_id));
+                } else {
+                  throw PythonBackendException(
+                      "Correlation ID must be integer or string");
+                }
+
+                return std::make_shared<InferRequest>(
+                    request_id, correlation_id_obj, inputs, requested_outputs,
+                    model_name, model_version, parameters_str, flags, timeout,
+                    0 /*response_factory_address*/, 0 /*request_address*/,
+                    preferred_memory, trace);
+              }),
          py::arg("request_id").none(false) = "",
          py::arg("correlation_id").none(false) = 0,
          py::arg("inputs").none(false),
@@ -1869,16 +1873,25 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   py::class_<InferResponse, std::shared_ptr<InferResponse>>(
       module, "InferenceResponse")
       .def(
-          py::init<
-              const std::vector<std::shared_ptr<PbTensor>>&,
-              std::shared_ptr<PbError>>(),
+          py::init(
+              [](const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
+                 const std::shared_ptr<PbError>& error,
+                 const py::object& parameters_) {
+                py::dict parameters =
+                    PyDefaultArgumentToMutableType<py::dict>(parameters_);
+                std::string parameters_str = PyParametersToJSON(parameters);
+                return std::make_shared<InferResponse>(
+                    output_tensors, error, parameters_str /* parameters */);
+              }),
          py::arg("output_tensors") = py::list(),
-          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr))
+          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr),
+          py::arg("parameters") = py::none())
      .def(
          "output_tensors", &InferResponse::OutputTensors,
          py::return_value_policy::reference)
      .def("has_error", &InferResponse::HasError)
-      .def("error", &InferResponse::Error);
+      .def("error", &InferResponse::Error)
+      .def("parameters", &InferResponse::Parameters);
 
   py::class_<ResponseSender, std::shared_ptr<ResponseSender>>(
       module, "InferenceResponseSender")