
Commit e84c8aa (1 parent: b70c913)

feat: Adding profiling support to the runtime

Signed-off-by: Naren Dasan <[email protected]>

24 files changed: +238 -142 lines

core/compiler.cpp (+4 -4)

@@ -31,7 +31,7 @@ void AddEngineToGraph(
     torch::jit::script::Module mod,
     std::shared_ptr<torch::jit::Graph>& g,
     const std::string& serialized_engine,
-    runtime::CudaDevice& device_info,
+    runtime::CUDADevice& device_info,
     std::string engine_id = "",
     bool fallback = false) {
   auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(
@@ -166,7 +166,7 @@ partitioning::GraphAndMapping BuildHybridGraph(
       auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params);
       auto temp_g = std::make_shared<torch::jit::Graph>();
       auto device_spec = convert_info.engine_settings.device;
-      auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+      auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type);
       AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);

       seg_block.update_graph(temp_g);
@@ -283,7 +283,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

   auto device_spec = cfg.convert_info.engine_settings.device;
-  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  auto cuda_device = runtime::CUDADevice(device_spec.gpu_id, device_spec.device_type);

   for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {
@@ -342,7 +342,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
   return new_mod;
 }

-torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) {
+torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device) {
   std::ostringstream engine_id;
   engine_id << reinterpret_cast<const int*>(&engine);
   torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());

core/compiler.h (+1 -1)

@@ -28,7 +28,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::

 torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);

-torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device);
+torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CUDADevice cuda_device);

 void set_device(const int gpu_id);

core/conversion/converters/impl/expand.cpp (+3 -2)

@@ -374,12 +374,13 @@ auto expand_registrations TORCHTRT_UNUSED =

           // Collapse repeated dimension back into desired dimension
           std::vector<int64_t> collapse_shape_vec;
-          for (int k = 0; k < repeat_shape_dims.nbDims; k++) {
+          for (int64_t k = 0; k < repeat_shape_dims.nbDims; k++) {
             if (k == dim) {
-              int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[++k];
+              int64_t collapse_dim = repeat_shape_dims.d[k] * repeat_shape_dims.d[k+1];
               // Set dim size to -1 if repeat is being done on dynamic dim
               collapse_dim = std::max(collapse_dim, (int64_t)-1);
               collapse_shape_vec.push_back(collapse_dim);
+              k++;
             } else {
               collapse_shape_vec.push_back(repeat_shape_dims.d[k]);
             }

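Why the expand.cpp hunk is more than cosmetic: in the removed line, `k` is both read (`d[k]`) and modified (`d[++k]`) as unsequenced operands of a single multiplication, which is undefined behavior in C++. The replacement reads `d[k]` and `d[k+1]`, then advances `k` in its own statement. A minimal standalone sketch of the two patterns (not part of the commit):

    #include <cstdio>

    int main() {
      int d[] = {2, 3, 4};
      int k = 0;

      // Undefined behavior: k is modified and read without sequencing.
      // int collapse_dim = d[k] * d[++k];

      // Well-defined, matching the patched loop body:
      int collapse_dim = d[k] * d[k + 1]; // 2 * 3
      k++;                                // skip the dimension just collapsed
      std::printf("%d\n", collapse_dim);  // prints 6
      return 0;
    }
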
core/conversion/converters/impl/select.cpp (+1 -1)

@@ -280,7 +280,7 @@ auto select_registrations TORCHTRT_UNUSED =

           std::vector<nvinfer1::ITensor*> tensors;
           std::vector<int32_t> adv_idx_indices;
-          for (auto i = 0; i < ts.size(); i++) {
+          for (size_t i = 0; i < ts.size(); i++) {
             auto t = ts[i];
             if (t.isTensor()) {
               auto torch_tensor = t.toTensor().to(torch::kInt32);

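The select.cpp change is a signedness fix: `auto i = 0` deduces `int`, so `i < ts.size()` compares a signed `int` against the unsigned `size_t` returned by `size()`, drawing sign-compare warnings and misbehaving for containers larger than `INT_MAX`. A small illustration (not part of the commit):

    #include <vector>

    void iterate(const std::vector<int>& ts) {
      // auto deduces int: signed/unsigned comparison against ts.size()
      // for (auto i = 0; i < ts.size(); i++) { ... }

      // Matches the container's size type, as in the patched loop:
      for (size_t i = 0; i < ts.size(); i++) {
        (void)ts[i]; // placeholder for the real loop body
      }
    }
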
core/runtime/BUILD (+8 -2)

@@ -13,7 +13,7 @@ config_setting(
 cc_library(
     name = "runtime",
     srcs = [
-        "CudaDevice.cpp",
+        "CUDADevice.cpp",
         "DeviceList.cpp",
         "TRTEngine.cpp",
         "execute_engine.cpp",
@@ -22,6 +22,8 @@ cc_library(
     ],
     hdrs = [
         "runtime.h",
+        "CUDADevice.h",
+        "TRTEngine.h"
     ],
     deps = [
         "@tensorrt//:nvinfer",
@@ -36,6 +38,10 @@ cc_library(

 pkg_tar(
     name = "include",
-    srcs = ["runtime.h"],
+    srcs = [
+        "runtime.h",
+        "CUDADevice.h",
+        "TRTEngine.h"
+    ],
     package_dir = "core/runtime/",
 )

core/runtime/CMakeLists.txt (+3 -1)

@@ -2,7 +2,7 @@ set(lib_name "core_runtime")
 add_library(${lib_name} OBJECT)

 set(CXX_SRCS
-    "${CMAKE_CURRENT_SOURCE_DIR}/CudaDevice.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/DeviceList.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/execute_engine.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.cpp"
@@ -12,6 +12,8 @@ set(CXX_SRCS

 set(HEADER_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/runtime.h"
+    "${CMAKE_CURRENT_SOURCE_DIR}/CUDADevice.h"
+    "${CMAKE_CURRENT_SOURCE_DIR}/TRTEngine.h"
 )

 target_sources(${lib_name}

core/runtime/CudaDevice.cpp renamed to core/runtime/CUDADevice.cpp (+8 -8)

@@ -11,10 +11,10 @@ const std::string DEVICE_INFO_DELIM = "%";

 typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex;

-CudaDevice::CudaDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}
+CUDADevice::CUDADevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}

-CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
-  CudaDevice cuda_device;
+CUDADevice::CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
+  CUDADevice cuda_device;
   cudaDeviceProp device_prop;

   // Device ID
@@ -41,7 +41,7 @@ CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
 // NOTE: Serialization Format for Device Info:
 // id%major%minor%(enum)device_type%device_name

-CudaDevice::CudaDevice(std::string device_info) {
+CUDADevice::CUDADevice(std::string device_info) {
   LOG_DEBUG("Deserializing Device Info: " << device_info);

   std::vector<std::string> tokens;
@@ -66,7 +66,7 @@ CudaDevice::CudaDevice(std::string device_info) {
   LOG_DEBUG("Deserialized Device Info: " << *this);
 }

-CudaDevice& CudaDevice::operator=(const CudaDevice& other) {
+CUDADevice& CUDADevice::operator=(const CUDADevice& other) {
   id = other.id;
   major = other.major;
   minor = other.minor;
@@ -75,7 +75,7 @@ CudaDevice& CudaDevice::operator=(const CudaDevice& other) {
   return (*this);
 }

-std::string CudaDevice::serialize() {
+std::string CUDADevice::serialize() {
   std::vector<std::string> content;
   content.resize(DEVICE_NAME_IDX + 1);

@@ -98,13 +98,13 @@ std::string CudaDevice::serialize() {
   return serialized_device_info;
 }

-std::string CudaDevice::getSMCapability() const {
+std::string CUDADevice::getSMCapability() const {
   std::stringstream ss;
   ss << major << "." << minor;
   return ss.str();
 }

-std::ostream& operator<<(std::ostream& os, const CudaDevice& device) {
+std::ostream& operator<<(std::ostream& os, const CUDADevice& device) {
   os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.'
      << device.minor << ", Type: " << device.device_type << ')';
   return os;

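The NOTE in the hunk above documents the `%`-delimited wire format: `id%major%minor%(enum)device_type%device_name`. As an illustration of that format (not part of the commit; the device values are hypothetical), tokenizing a serialized string the way the deserializing constructor does:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      // Hypothetical serialized device info, following the documented format.
      std::string device_info = "0%8%6%0%NVIDIA GeForce RTX 3090";

      // Split on DEVICE_INFO_DELIM ("%").
      std::vector<std::string> tokens;
      std::stringstream ss(device_info);
      std::string token;
      while (std::getline(ss, token, '%')) {
        tokens.push_back(token);
      }

      // Indices follow the SerializedDeviceInfoIndex enum:
      // ID_IDX, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX
      std::cout << "id=" << tokens[0] << " sm=" << tokens[1] << '.' << tokens[2]
                << " type=" << tokens[3] << " name=" << tokens[4] << '\n';
      return 0;
    }
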
core/runtime/CUDADevice.h (new file, +33)

@@ -0,0 +1,33 @@
+#pragma once
+#include <string>
+#include "NvInfer.h"
+
+namespace torch_tensorrt {
+namespace core {
+namespace runtime {
+
+struct CUDADevice {
+  int64_t id; // CUDA device id
+  int64_t major; // CUDA compute major version
+  int64_t minor; // CUDA compute minor version
+  nvinfer1::DeviceType device_type;
+  std::string device_name;
+
+  CUDADevice();
+  CUDADevice(int64_t gpu_id, nvinfer1::DeviceType device_type);
+  CUDADevice(std::string serialized_device_info);
+  ~CUDADevice() = default;
+  CUDADevice(const CUDADevice& other) = default;
+  CUDADevice& operator=(const CUDADevice& other);
+  std::string serialize();
+  std::string getSMCapability() const;
+  friend std::ostream& operator<<(std::ostream& os, const CUDADevice& device);
+};
+
+void set_cuda_device(CUDADevice& cuda_device);
+// Gets the current active GPU (DLA will not show up through this)
+CUDADevice get_current_device();
+
+} // namespace runtime
+} // namespace core
+} // namespace torch_tensorrt

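Tying the header to the serialization logic above, typical round-trip usage inside the runtime looks like this sketch (not part of the commit; the GPU id is hypothetical):

    #include <iostream>
    #include <string>
    #include "core/runtime/CUDADevice.h"

    using torch_tensorrt::core::runtime::CUDADevice;

    void round_trip_example() {
      // Build a handle for GPU 0, then round-trip it through the string
      // form that accompanies serialized engines.
      CUDADevice device(0, nvinfer1::DeviceType::kGPU);
      std::string serialized = device.serialize();
      CUDADevice restored(serialized);
      std::cout << restored << '\n'; // uses the friend operator<< above
    }
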
core/runtime/DeviceList.cpp (+3 -3)

@@ -15,19 +15,19 @@ DeviceList::DeviceList() {
   }

   for (int i = 0; i < num_devices; i++) {
-    device_list[i] = CudaDevice(i, nvinfer1::DeviceType::kGPU);
+    device_list[i] = CUDADevice(i, nvinfer1::DeviceType::kGPU);
   }

   // REVIEW: DO WE CARE ABOUT DLA?

   LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list());
 }

-void DeviceList::insert(int device_id, CudaDevice cuda_device) {
+void DeviceList::insert(int device_id, CUDADevice cuda_device) {
   device_list[device_id] = cuda_device;
 }

-CudaDevice DeviceList::find(int device_id) {
+CUDADevice DeviceList::find(int device_id) {
   return device_list[device_id];
 }

core/runtime/TRTEngine.cpp (+11 -3)

@@ -16,7 +16,7 @@ std::string slugify(std::string s) {
   return s;
 }

-TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device) {
+TRTEngine::TRTEngine(std::string serialized_engine, CUDADevice cuda_device) {
   std::string _name = "deserialized_trt";
   new (this) TRTEngine(_name, serialized_engine, cuda_device);
 }
@@ -33,11 +33,11 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info) {
   std::string _name = serialized_info[NAME_IDX];
   std::string engine_info = serialized_info[ENGINE_IDX];

-  CudaDevice cuda_device(serialized_info[DEVICE_IDX]);
+  CUDADevice cuda_device(serialized_info[DEVICE_IDX]);
   new (this) TRTEngine(_name, engine_info, cuda_device);
 }

-TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device) {
+TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device) {
   auto most_compatible_device = get_most_compatible_device(cuda_device);
   TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");
   device_info = most_compatible_device.value();
@@ -85,6 +85,14 @@ TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDe
   LOG_DEBUG(*this);
 }

+void TRTEngine::set_paths() {
+  execution_profile_path = profile_path + "/" + name + "_execution_profile.trace";
+  device_profile_path = profile_path + "/" + name + "_device_config_profile.trace";
+  input_profile_path = profile_path + "/" + name + "_input_profile.trace";
+  output_profile_path = profile_path + "/" + name + "_output_profile.trace";
+  enqueue_profile_path = profile_path + "/" + name + "_enqueue_profile.trace";
+}
+
 TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
   rt = other.rt;
   cuda_engine = other.cuda_engine;

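With the default `profile_path` of "/tmp" (declared in TRTEngine.h below) and a hypothetical engine named `my_module`, `set_paths()` produces:

    /tmp/my_module_execution_profile.trace
    /tmp/my_module_device_config_profile.trace
    /tmp/my_module_input_profile.trace
    /tmp/my_module_output_profile.trace
    /tmp/my_module_enqueue_profile.trace
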
core/runtime/TRTEngine.h (new file, +55)

@@ -0,0 +1,55 @@
+#pragma once
+#include <map>
+#include <memory>
+#include <mutex>
+#include <utility>
+#include "ATen/core/function_schema.h"
+#include "NvInfer.h"
+#include "core/util/prelude.h"
+#include "torch/custom_class.h"
+
+namespace torch_tensorrt {
+namespace core {
+namespace runtime {
+
+struct TRTEngine : torch::CustomClassHolder {
+  // Each engine needs its own runtime object
+  std::shared_ptr<nvinfer1::IRuntime> rt;
+  std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
+  std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
+  std::pair<uint64_t, uint64_t> num_io;
+  std::string name;
+  std::mutex mu;
+  CUDADevice device_info;
+
+  std::string execution_profile_path;
+  std::string device_profile_path;
+  std::string input_profile_path;
+  std::string output_profile_path;
+  std::string enqueue_profile_path;
+  std::string profile_path = "/tmp";
+
+  std::unordered_map<uint64_t, uint64_t> in_binding_map;
+  std::unordered_map<uint64_t, uint64_t> out_binding_map;
+
+#ifndef NDEBUG
+  bool debug = true;
+#else
+  bool debug = false;
+#endif
+
+  ~TRTEngine() = default;
+  TRTEngine(std::string serialized_engine, CUDADevice cuda_device);
+  TRTEngine(std::vector<std::string> serialized_info);
+  TRTEngine(std::string mod_name, std::string serialized_engine, CUDADevice cuda_device);
+  TRTEngine& operator=(const TRTEngine& other);
+  std::string to_str() const;
+  void set_paths();
+  friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
+  // TODO: Implement a call method
+  // c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
+};
+
+} // namespace runtime
+} // namespace core
+} // namespace torch_tensorrt

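Because `profile_path` is a public member defaulting to "/tmp" and `set_paths()` simply re-derives the five trace paths from it, profile output can in principle be redirected in two steps; a sketch (destination hypothetical, assuming no dedicated setter exists in this commit):

    // engine is a TRTEngine instance owned by the runtime
    engine.profile_path = "/var/log/torchtrt"; // hypothetical destination
    engine.set_paths();                        // re-derive the *.trace paths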