
Commit 27f04d1

Add support for response sender in the default mode (#364)
* Add response sender to non-decoupled models and unify data pipelines (#360)
* Add response sender to non-decoupled model and unify data pipelines
* Rename variable and class name
* Fix decoupled batch statistics to account for implicit batch size (#361)
* Fix decoupled gpu output error handling (#362)
* Fix decoupled gpu output error handling
* Return full error string upon exception from model
* Response sender to check for improper non-decoupled model usage (#363)
* Response sender to check for improper non-decoupled model usage
* Force close response sender on exception
* Rename functions
1 parent 9d2c513 commit 27f04d1


9 files changed: +241 −813 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -479,6 +479,12 @@ Upon return from the execute function all tensor data associated with the
 InferenceRequest objects passed to the function are deleted, and so
 InferenceRequest objects should not be retained by the Python model.
 
+Starting from 24.06, models may choose to send the response using the
+`InferenceResponseSender` as illustrated on [Decoupled mode](#decoupled-mode).
+Since the model is in default mode, it must send exactly one response per
+request. The `pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag must be sent
+either with the response or as a flag only response afterward.
+
 #### Error Handling
 
 In case one of the requests has an error, you can use the `TritonError` object
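
For context, a minimal sketch of a default-mode model that follows the new README text above; the tensor names `INPUT0`/`OUTPUT0` and the identity copy are illustrative placeholders, not anything defined by this commit:

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            response_sender = request.get_response_sender()

            # Default mode: build exactly one response per request.
            input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            output_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
            response = pb_utils.InferenceResponse(output_tensors=[output_tensor])

            # The FINAL flag may accompany the response itself...
            response_sender.send(
                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            # ...or be sent afterward as a flag-only call:
            # response_sender.send(response)
            # response_sender.send(
            #     flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        # When responses go through the sender, execute() returns None
        # instead of a list of pb_utils.InferenceResponse objects.
        return None
```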

src/infer_request.cc

Lines changed: 7 additions & 12 deletions
@@ -74,7 +74,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, nullptr /* is_decoupled */,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -272,7 +272,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
 std::unique_ptr<InferRequest>
 InferRequest::LoadFromSharedMemory(
     std::unique_ptr<SharedMemoryManager>& shm_pool,
-    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle)
+    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle,
+    bool const* is_model_decoupled)
 {
   AllocatedSharedMemory<char> infer_request_shm =
       shm_pool->Load<char>(request_handle);
@@ -328,7 +329,7 @@ InferRequest::LoadFromSharedMemory(
   return std::unique_ptr<InferRequest>(new InferRequest(
       infer_request_shm, request_id_shm, correlation_id_shm,
       requested_output_names_shm, model_name_shm, input_tensors, parameters_shm,
-      infer_trace_shm));
+      infer_trace_shm, is_model_decoupled));
 }
 
 InferRequest::InferRequest(
@@ -339,7 +340,8 @@ InferRequest::InferRequest(
     std::unique_ptr<PbString>& model_name_shm,
     std::vector<std::shared_ptr<PbTensor>>& input_tensors,
     std::unique_ptr<PbString>& parameters_shm,
-    std::unique_ptr<InferenceTrace>& infer_trace_shm)
+    std::unique_ptr<InferenceTrace>& infer_trace_shm,
+    bool const* is_model_decoupled)
     : infer_request_shm_(std::move(infer_request_shm)),
       request_id_shm_(std::move(request_id_shm)),
       requested_output_names_shm_(std::move(requested_output_names_shm)),
@@ -387,7 +389,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, is_model_decoupled,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -402,13 +404,6 @@
 std::shared_ptr<ResponseSender>
 InferRequest::GetResponseSender()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-  if (!stub->IsDecoupled()) {
-    throw PythonBackendException(
-        "'get_response_sender' function must be called only when the model is "
-        "using the decoupled transaction policy.");
-  }
-
   return response_sender_;
 }
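
Since `GetResponseSender` no longer rejects non-decoupled models, the usage checks named in the commit message move into the sender itself. A hedged sketch of how that change surfaces on the Python side; the exact behavior of a second `send()` in default mode is an assumption for illustration, not quoted from this commit:

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            # Before this commit, get_response_sender() raised an error for
            # default-mode (non-decoupled) models; it now returns the sender.
            sender = request.get_response_sender()

            sender.send(
                pb_utils.InferenceResponse(output_tensors=[]),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            # Assumption for illustration: a second send() from a default-mode
            # model would be rejected by the sender's improper-usage check.
            # sender.send(pb_utils.InferenceResponse(output_tensors=[]))
        return None
```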

src/infer_request.h

Lines changed: 3 additions & 2 deletions
@@ -118,7 +118,7 @@ class InferRequest {
   static std::unique_ptr<InferRequest> LoadFromSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool,
       bi::managed_external_buffer::handle_t request_handle,
-      bool open_cuda_handle);
+      bool open_cuda_handle, bool const* is_model_decoupled);
 
   /// Disallow copying the inference request object.
   DISALLOW_COPY_AND_ASSIGN(InferRequest);
@@ -135,7 +135,8 @@
       std::unique_ptr<PbString>& model_name_shm,
       std::vector<std::shared_ptr<PbTensor>>& input_tensors,
       std::unique_ptr<PbString>& parameters_shm,
-      std::unique_ptr<InferenceTrace>& infer_trace_shm);
+      std::unique_ptr<InferenceTrace>& infer_trace_shm,
+      bool const* is_model_decoupled);
 
   std::string request_id_;
   CorrelationId correlation_id_;
