
Commit d0aee20

Authored by yf711, chilo-ms, wangyems, moraxu, and pengwa
[ORT 1.18.1 Release] Cherry pick 3rd round (#21129)
### Description

Adding critical TensorRT EP support.

---------

Co-authored-by: Chi Lo <[email protected]>
Co-authored-by: Ye Wang <[email protected]>
Co-authored-by: Michal Guzek <[email protected]>
Co-authored-by: pengwa <[email protected]>
Co-authored-by: wejoncy <[email protected]>
Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Pranav Sharma <[email protected]>
Co-authored-by: Adam Pocock <[email protected]>
Co-authored-by: cao lei <[email protected]>
Co-authored-by: Adrian Lizarraga <[email protected]>
Co-authored-by: inisis <[email protected]>
Co-authored-by: Jeff Bloomfield <[email protected]>
Co-authored-by: mo-ja <[email protected]>
Co-authored-by: kunal-vaishnavi <[email protected]>
Co-authored-by: Sumit Agarwal <[email protected]>
Co-authored-by: Atanas Dimitrov <[email protected]>
Co-authored-by: Justin Chu <[email protected]>
Co-authored-by: Yufeng Li <[email protected]>
Co-authored-by: Dhruv Matani <[email protected]>
Co-authored-by: Dhruv Matani <[email protected]>
Co-authored-by: wangshuai09 <[email protected]>
Co-authored-by: Xiaoyu <[email protected]>
Co-authored-by: Xu Xing <[email protected]>
Co-authored-by: Dmitri Smirnov <[email protected]>
Co-authored-by: Rachel Guo <[email protected]>
Co-authored-by: Sai Kishan Pampana <[email protected]>
Co-authored-by: rachguo <[email protected]>
Co-authored-by: Jian Chen <[email protected]>
Co-authored-by: Shubham Bhokare <[email protected]>
Co-authored-by: Yulong Wang <[email protected]>
Co-authored-by: Andrew Fantino <[email protected]>
Co-authored-by: Thomas Boby <[email protected]>
Co-authored-by: Tianlei Wu <[email protected]>
Co-authored-by: Scott McKay <[email protected]>
Co-authored-by: Michal Guzek <[email protected]>
Co-authored-by: George Wu <[email protected]>
Co-authored-by: Baiju Meswani <[email protected]>
1 parent 8bfcf14 commit d0aee20

22 files changed (+613 −97 lines)

cgmanifests/generated/cgmanifest.json (+1 −1)

@@ -216,7 +216,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "bacfaaa951653cd4e72efe727a543567cb38f7de",
+          "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d",
           "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
         },
         "comments": "onnx_tensorrt"

cmake/deps.txt (+1 −1)

@@ -38,7 +38,7 @@ mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01
 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
 #use the latest commit of 10.0-GA
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874

cmake/deps_update_and_upload.py (+2 −2)

@@ -6,9 +6,9 @@
 #
 # Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish.
 # E.g.:
-# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164
 # # check contents of C:/temp/onnxruntime_deps
-# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164 --no-download --do-upload
 #
 # Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml.

docs/ContribOperators.md (+2 −0)

@@ -1597,6 +1597,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Usually each single EPContext associate with a graph partition.But for some case like QNN, it has single EPContext contains all partitions.In that case, the node with ep_cache_context should set main_context=1. Other nodes set main_context=0 and skip ep_cache_context.The path is relative to this Onnx file. Default is 1.</dd>
 <dt><tt>notes</tt> : string</dt>
 <dd>(Optional) Some notes for the model</dd>
+<dt><tt>onnx_model_filename</tt> : string</dt>
+<dd>(Optional) Filename of the original ONNX model.</dd>
 <dt><tt>partition_name</tt> : string</dt>
 <dd>(Optional) partitioned graph name.</dd>
 <dt><tt>source</tt> : string</dt>

include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h (+14 −3)

@@ -64,10 +64,21 @@ struct OrtTensorRTProviderOptionsV2 {
  * - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
  * - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
  *
+ * 3. In the case of building weight-stripped engines, the same security reasons as listed in 1) apply to the
+ *    "onnx_model_filename" node attribute of EP context node, which contains a filename of the ONNX model with the
+ *    weights needed for the refit process. User can specify a folder path relative to the current working
+ *    directory by means of the "trt_onnx_model_folder_path" option.
+ *
  */
-  int trt_dump_ep_context_model{0};               // Dump EP context node model
-  const char* trt_ep_context_file_path{nullptr};  // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path.
-  int trt_ep_context_embed_mode{0};               // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
+  int trt_dump_ep_context_model{0};                 // Dump EP context node model
+  const char* trt_ep_context_file_path{nullptr};    // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path.
+  int trt_ep_context_embed_mode{0};                 // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
+  int trt_weight_stripped_engine_enable{0};         // Enable weight-stripped engine build. Default 0 = false,
+                                                    // nonzero = true
+  const char* trt_onnx_model_folder_path{nullptr};  // Folder path relative to the current working directory for
+                                                    // the ONNX model containing the weights (applicable only when
+                                                    // the "trt_weight_stripped_engine_enable" option is enabled)
 
   const char* trt_engine_cache_prefix{nullptr};  // specify engine cache prefix
+  int trt_engine_hw_compatible{0};               // Enable hardware compatibility. Default 0 = false, nonzero = true
 };
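
For reference, a minimal usage sketch (not part of this commit) of how the three new options might be wired up through the C++ API. It assumes direct value-initialization of the public OrtTensorRTProviderOptionsV2 struct together with the existing Ort::SessionOptions::AppendExecutionProvider_TensorRT_V2 overload; "model.onnx" and "stripped_weights_dir" are hypothetical names.

// Sketch only: enable weight-stripped engine build plus hardware-compatible engines.
#include <onnxruntime_cxx_api.h>
#include <tensorrt_provider_options.h>  // defines OrtTensorRTProviderOptionsV2; exact include path may vary by package

int main() {
  OrtTensorRTProviderOptionsV2 trt_options{};
  trt_options.trt_weight_stripped_engine_enable = 1;                // nonzero = true
  trt_options.trt_onnx_model_folder_path = "stripped_weights_dir";  // folder (relative to cwd) holding the ONNX model with the weights
  trt_options.trt_engine_hw_compatible = 1;                         // nonzero = true

  Ort::SessionOptions session_options;
  session_options.AppendExecutionProvider_TensorRT_V2(trt_options);

  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "trt_weight_stripped_demo"};
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);
  return 0;
}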

onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu (+1 −0)

@@ -17,6 +17,7 @@
 // Licensed under the MIT License.
 
 #include <algorithm>
+#include <cfloat>
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <math.h>
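
A note on the lone new include: <cfloat> supplies the C float-limit macros. The motivation is an assumption on my part, but kernels of this kind typically use a macro such as FLT_MAX that previously arrived only through transitive includes; a standalone illustration:

// Illustration only (assumes the kernel relies on FLT_MAX or similar):
// including <cfloat> provides the macro directly instead of transitively.
#include <cfloat>
#include <cstdio>

int main() {
  std::printf("FLT_MAX = %g\n", FLT_MAX);  // largest finite float
  return 0;
}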

onnxruntime/core/graph/contrib_ops/contrib_defs.cc (+5 −0)

@@ -3299,6 +3299,11 @@ void RegisterContribSchemas() {
         "(Optional) SDK version used to convert the model.",
         AttributeProto::STRING,
         OPTIONAL_VALUE)
+    .Attr(
+        "onnx_model_filename",
+        "(Optional) Filename of the original ONNX model.",
+        AttributeProto::STRING,
+        OPTIONAL_VALUE)
     .Attr(
         "hardware_architecture",
         "(Optional) Hardware architecture.",

onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc (+72 −5)

@@ -8,8 +8,10 @@
 #include "onnx_ctx_model_helper.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/framework/execution_provider.h"
+#include "tensorrt_execution_provider.h"
 
 namespace onnxruntime {
+extern TensorrtLogger& GetTensorrtLogger(bool verbose_log);
 
 /*
  * Check whether the graph has the EP context contrib op.
@@ -67,7 +69,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
                                            char* engine_data,
                                            size_t size,
                                            const int64_t embed_mode,
-                                           std::string compute_capability,
+                                           const std::string compute_capability,
+                                           const std::string onnx_model_path,
                                            const logging::Logger* logger) {
   auto model_build = graph_viewer.CreateModel(*logger);
   auto& graph_build = model_build->MainGraph();
@@ -88,6 +91,7 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
   auto attr_0 = ONNX_NAMESPACE::AttributeProto::Create();  // embed_mode
   auto attr_1 = ONNX_NAMESPACE::AttributeProto::Create();  // ep_cache_context
   auto attr_2 = ONNX_NAMESPACE::AttributeProto::Create();  // hardware_architecture
+  auto attr_3 = ONNX_NAMESPACE::AttributeProto::Create();  // onnx_model_filename
   std::string engine_data_str = "";
   attr_0->set_name(EMBED_MODE);
   attr_0->set_type(onnx::AttributeProto_AttributeType_INT);
@@ -106,13 +110,17 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
   attr_2->set_name(COMPUTE_CAPABILITY);
   attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
   attr_2->set_s(compute_capability);
+  attr_3->set_name(ONNX_MODEL_FILENAME);
+  attr_3->set_type(onnx::AttributeProto_AttributeType_STRING);
+  attr_3->set_s(std::filesystem::path(onnx_model_path).filename().string());
 
   auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
-  int num_attributes = 3;
+  constexpr int num_attributes = 4;
   node_attributes->reserve(num_attributes);
   node_attributes->emplace(EMBED_MODE, *attr_0);
   node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);
   node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
+  node_attributes->emplace(ONNX_MODEL_FILENAME, *attr_3);
 
   // Create EP context node
   graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -205,7 +213,7 @@ void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
   LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path;
 }
 
-bool IsAbsolutePath(std::string& path_string) {
+bool IsAbsolutePath(const std::string& path_string) {
 #ifdef _WIN32
   onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
   auto path = std::filesystem::path(ort_path_string.c_str());
@@ -219,7 +227,7 @@ bool IsAbsolutePath(std::string& path_string) {
 }
 
 // Like "../file_path"
-bool IsRelativePathToParentPath(std::string& path_string) {
+bool IsRelativePathToParentPath(const std::string& path_string) {
 #ifdef _WIN32
   onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
   auto path = std::filesystem::path(ort_path_string.c_str());
@@ -236,6 +244,28 @@ bool IsRelativePathToParentPath(std::string& path_string) {
 #endif
 }
 
+/*
+ * Get the weight-refitted engine cache path from a weight-stripped engine cache path.
+ *
+ * Weight-stripped engine:
+ * An engine with its weights stripped; its size is smaller than a regular engine's.
+ * The cache name of a weight-stripped engine is TensorrtExecutionProvider_TRTKernel_XXXXX.stripped.engine
+ *
+ * Weight-refitted engine:
+ * An engine whose weights have been refitted; it's simply a regular engine.
+ * The cache name of a weight-refitted engine is TensorrtExecutionProvider_TRTKernel_XXXXX.engine
+ */
+std::string GetWeightRefittedEnginePath(std::string stripped_engine_cache) {
+  std::filesystem::path stripped_engine_cache_path(stripped_engine_cache);
+  std::string refitted_engine_cache_path = stripped_engine_cache_path.stem().stem().string() + ".engine";
+  return refitted_engine_cache_path;
+}
+
+bool IsWeightStrippedEngineCache(std::filesystem::path& engine_cache_path) {
+  // The weight-stripped engine cache has the naming of xxx.stripped.engine
+  return engine_cache_path.stem().extension().string() == ".stripped";
+}
+
 Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
   if (!ValidateEPCtxNode(graph_viewer)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "It's not a valid EP Context node");
@@ -271,6 +301,22 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
   // The engine cache and context model (current model) should be in the same directory
   std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_));
   auto engine_cache_path = ctx_model_dir.append(cache_path);
+  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] GetEpContextFromGraph engine_cache_path: " + engine_cache_path.string();
+
+  // If it's a weight-stripped engine cache, it needs to be refitted even though the refit flag is not enabled
+  if (!weight_stripped_engine_refit_) {
+    weight_stripped_engine_refit_ = IsWeightStrippedEngineCache(engine_cache_path);
+  }
+
+  // If the serialized refitted engine is present, use it directly without refitting the engine again
+  if (weight_stripped_engine_refit_) {
+    const std::filesystem::path refitted_engine_cache_path = GetWeightRefittedEnginePath(engine_cache_path.string());
+    if (std::filesystem::exists(refitted_engine_cache_path)) {
+      LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] " + refitted_engine_cache_path.string() + " exists.";
+      engine_cache_path = refitted_engine_cache_path.string();
+      weight_stripped_engine_refit_ = false;
+    }
+  }
 
   if (!std::filesystem::exists(engine_cache_path)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
@@ -290,6 +336,21 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
                            "TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string());
   }
   LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string();
+
+  if (weight_stripped_engine_refit_) {
+    const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s();
+    std::string weight_stripped_engine_cache = engine_cache_path.string();
+    auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename,
+                                                         onnx_model_folder_path_,
+                                                         weight_stripped_engine_cache,
+                                                         true /* path check for security */,
+                                                         (*trt_engine_).get(),
+                                                         true /* serialize refitted engine to disk */,
+                                                         detailed_build_log_);
+    if (status != Status::OK()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
+    }
+  }
 }
 return Status::OK();
}
@@ -306,7 +367,13 @@ bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewe
   // Show the warning if compute capability is not matched
   if (attrs.count(COMPUTE_CAPABILITY) > 0) {
     std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s();
-    if (model_compute_capability != compute_capability_) {
+    // Verify if engine was compiled with ampere+ hardware compatibility enabled
+    if (model_compute_capability == "80+") {
+      LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine is compatible to all Ampere+ GPU (except Jetson)";
+      if (std::stoi(compute_capability_) < 80) {
+        LOGS_DEFAULT(WARNING) << "[TensorRT EP] However, this GPU doesn't match. The compute capability of the GPU: " << compute_capability_;
+      }
+    } else if (model_compute_capability != compute_capability_) {
       LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal";
       LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability;
       LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_;
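
The cache-naming convention introduced above (".stripped.engine" versus ".engine") can be checked in isolation; a standalone sketch using only std::filesystem, with the XXXXX placeholder taken from the comment in the diff:

// Standalone check of the convention used by GetWeightRefittedEnginePath
// and IsWeightStrippedEngineCache above.
#include <cassert>
#include <filesystem>
#include <string>

int main() {
  std::filesystem::path cache("TensorrtExecutionProvider_TRTKernel_XXXXX.stripped.engine");
  // stem() drops ".engine"; the remaining extension is then ".stripped"
  assert(cache.stem().extension().string() == ".stripped");
  // stem().stem() drops both suffixes; re-appending ".engine" gives the refitted cache name
  std::string refitted = cache.stem().stem().string() + ".engine";
  assert(refitted == "TensorrtExecutionProvider_TRTKernel_XXXXX.engine");
  return 0;
}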

onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h (+20 −4)

@@ -5,6 +5,7 @@
 
 #include <string>
 #include <filesystem>
+#include <memory>
 
 #include "core/providers/tensorrt/nv_includes.h"
 #include "core/providers/shared_library/provider_api.h"
@@ -15,6 +16,7 @@ static const std::string EPCONTEXT_OP = "EPContext";
 static const std::string EMBED_MODE = "embed_mode";
 static const std::string EP_CACHE_CONTEXT = "ep_cache_context";
 static const std::string COMPUTE_CAPABILITY = "hardware_architecture";
+static const std::string ONNX_MODEL_FILENAME = "onnx_model_filename";
 static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft";
 static const std::string EPCONTEXT_WARNING =
     "It's suggested to set the ORT graph optimization level to 0 and \
@@ -29,12 +31,13 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
                                            char* engine_data,
                                            size_t size,
                                            const int64_t embed_mode,
-                                           std::string compute_capability,
+                                           const std::string compute_capability,
+                                           const std::string onnx_model_path,
                                            const logging::Logger* logger);
 std::string GetCtxModelPath(const std::string& ep_context_file_path,
                             const std::string& original_model_path);
-bool IsAbsolutePath(std::string& path_string);
-bool IsRelativePathToParentPath(std::string& path_string);
+bool IsAbsolutePath(const std::string& path_string);
+bool IsRelativePathToParentPath(const std::string& path_string);
 void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
                   const std::string& ctx_model_path);
 void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto,
@@ -46,7 +49,17 @@ class TensorRTCacheModelHandler {
   TensorRTCacheModelHandler(std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine,
                             nvinfer1::IRuntime* trt_runtime,
                             std::string ep_context_model_path,
-                            std::string compute_capability) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), ep_context_model_path_(ep_context_model_path), compute_capability_(compute_capability) {
+                            std::string compute_capability,
+                            bool weight_stripped_engine_refit,
+                            std::string onnx_model_folder_path,
+                            bool detailed_build_log)
+      : trt_engine_(trt_engine),
+        trt_runtime_(trt_runtime),
+        ep_context_model_path_(ep_context_model_path),
+        compute_capability_(compute_capability),
+        weight_stripped_engine_refit_(weight_stripped_engine_refit),
+        onnx_model_folder_path_(onnx_model_folder_path),
+        detailed_build_log_(detailed_build_log) {
   }
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler);
 
@@ -59,5 +72,8 @@ class TensorRTCacheModelHandler {
   nvinfer1::IRuntime* trt_runtime_;
   std::string ep_context_model_path_;  // If using context model, it implies context model and engine cache is in the same directory
   std::string compute_capability_;
+  bool weight_stripped_engine_refit_;
+  std::string onnx_model_folder_path_;
+  bool detailed_build_log_;
 };  // TRTCacheModelHandler
 }  // namespace onnxruntime
