pytorch
diff --git a/.circleci/config.yml
+185-175 b/.circleci/config.yml
+185-175
diff --git a/README.md
+2-2 b/README.md
+2-2
diff --git a/WORKSPACE
+11-11 b/WORKSPACE
+11-11
diff --git a/core/conversion/converters/impl/conv_deconv.cpp
+14 b/core/conversion/converters/impl/conv_deconv.cpp
+14
diff --git a/core/runtime/TRTEngine.cpp
+46-46 b/core/runtime/TRTEngine.cpp
+46-46
diff --git a/core/runtime/execute_engine.cpp
+19-24 b/core/runtime/execute_engine.cpp
+19-24
diff --git a/docker/WORKSPACE.docker
+4-4 b/docker/WORKSPACE.docker
+4-4
diff --git a/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html
+2-2 b/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html
+2-2
diff --git a/docs/_cpp_api/classtorch__tensorrt_1_1Device_1_1DeviceType.html
+2-2 b/docs/_cpp_api/classtorch__tensorrt_1_1Device_1_1DeviceType.html
+2-2
@@ -114,10 +114,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.
 
 - Bazel 5.2.0
-- Libtorch 1.13.0 (built with CUDA 11.7)
+- Libtorch 2.0.0.dev20230103 (built with CUDA 11.7)
 - CUDA 11.7
 - cuDNN 8.5.0
-- TensorRT 8.5.0
+- TensorRT 8.5.1.7
 
 ## Prebuilt Binaries and Wheel files
 
 
@@ -41,7 +41,7 @@ local_repository(
 new_local_repository(
     name = "cuda",
     build_file = "@//third_party/cuda:BUILD",
-    path = "/usr/local/cuda-11.6/",
+    path = "/usr/local/cuda-11.7/",
 )
 
 new_local_repository(
@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "b565c662435fd58ec295fa0791388ea52ad0f5fd33517b2d7c0fdcc91b6db531",
+    sha256 = "59b8b5e1954a86d50b79c13f06398d385b200da13e37a08ecf31d3c62e5ca127",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu116/libtorch-cxx11-abi-shared-with-deps-1.14.0.dev20221114%2Bcu116.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "fbb37446c33b05c1e26256c09f6ffb46cea1f6ff9ee2ad5b79b146d09023b0c1",
+    sha256 = "e260fc7476be89d1650953e8643e9f7363845f5a52de4bab87ac0e619c1f6ad4",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu116/libtorch-shared-with-deps-1.14.0.dev20221114%2Bcu116.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 # Download these tarballs manually from the NVIDIA website
@@ -76,20 +76,20 @@ http_archive(
 http_archive(
     name = "cudnn",
     build_file = "@//third_party/cudnn/archive:BUILD",
-    sha256 = "ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01",
-    strip_prefix = "cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive",
+    sha256 = "5454a6fd94f008728caae9adad993c4e85ef36302e26bce43bea7d458a5e7b6d",
+    strip_prefix = "cudnn-linux-x86_64-8.5.0.96_cuda11-archive",
     urls = [
-        "https://developer.nvidia.com/compute/cudnn/secure/8.4.1/local_installers/11.6/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz",
+        "https://developer.nvidia.com/compute/cudnn/secure/8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz",
     ],
 )
 
 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "8d7c2085c1639dcc73875048c23598a8526ce3089136876e31d90258e49e4f61",
-    strip_prefix = "TensorRT-8.4.3.1",
+    sha256 = "39cc7f077057d1363794e8ff51c4cf21a5dbeccf1116b0020ba0dae0f3063076",
+    strip_prefix = "TensorRT-8.5.1.7",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.3/tars/tensorrt-8.4.3.1.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.5.1/tars/TensorRT-8.5.1.7.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz",
     ],
 )
 
 
@@ -102,6 +102,20 @@ bool add_conv_deconv(ConversionCtx* ctx, const torch::jit::Node* n, args& args)
   }
 
   auto w = Weights(ctx, args[1].unwrapToTensor());
+  // TODO: Remove this when conv3d with kernel size=1 bug is fixed.
+  // Github issue: https://github.com/pytorch/TensorRT/issues/1445
+  // Warn (conversion still proceeds) when a 3D kernel has extent 1 in every dimension.
+  const bool is_3d_kernel = w.kernel_shape.nbDims == 3;
+  bool is_kernel_size_one = true;
+  for (int64_t i = 0; i < w.kernel_shape.nbDims; i++) {
+    // Compare against an integer literal instead of 1.0f to avoid an int-to-float conversion.
+    is_kernel_size_one = is_kernel_size_one && (w.kernel_shape.d[i] == 1);
+  }
+  if (is_kernel_size_one && is_3d_kernel) {
+    LOG_WARNING(
+        "Conv3d layer with kernel size = 1 configuration incurs a failure with TensorRT tactic optimizer in some cases. "
+        "Github issue: https://github.com/pytorch/TensorRT/issues/1445. Other conv variants do not have this issue.");
+  }
   auto dims = in->getDimensions();
   auto orig_dims = dims;
   LOG_DEBUG("Input dims: " << orig_dims);
 
@@ -68,8 +68,8 @@ TRTEngine::TRTEngine(
     uint64_t inputs = 0;
     uint64_t outputs = 0;
 
-    for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) {
-      std::string bind_name = cuda_engine->getBindingName(x);
+    for (int64_t trt_idx = 0; trt_idx < cuda_engine->getNbIOTensors(); trt_idx++) {
+      std::string bind_name = cuda_engine->getIOTensorName(trt_idx);
       LOG_DEBUG("Binding name: " << bind_name);
       auto delim = bind_name.find(".");
       if (delim == std::string::npos) {
@@ -80,46 +80,45 @@ TRTEngine::TRTEngine(
                 << bind_name
                 << "\nEnsure module was compiled with Torch-TensorRT.ts or follows Torch-TensorRT Runtime conventions");
       }
-
       std::string idx_s = bind_name.substr(delim + 1);
-      uint64_t idx = static_cast<uint64_t>(std::stoi(idx_s));
+      uint64_t pyt_idx = static_cast<uint64_t>(std::stoi(idx_s));
 
-      if (cuda_engine->bindingIsInput(x)) {
+      if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
         inputs++;
-        in_binding_map[x] = idx;
-        LOG_DEBUG("TRT Binding: " << x << ": PYT Input: " << idx);
+        in_binding_map[trt_idx] = pyt_idx;
+        LOG_DEBUG("TRT Binding index: " << trt_idx << " corresponds to PYT Input index: " << pyt_idx);
       } else {
         outputs++;
-        out_binding_map[x] = idx;
-        LOG_DEBUG("TRT Binding: " << x << ": PYT Output: " << idx);
+        out_binding_map[trt_idx] = pyt_idx;
+        LOG_DEBUG("TRT Binding index: " << trt_idx << " corresponds to PYT Output index: " << pyt_idx);
       }
     }
 
     num_io = std::make_pair(inputs, outputs);
     in_binding_names.resize(inputs);
     out_binding_names.resize(outputs);
-
-    for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) {
-      std::string bind_name = cuda_engine->getBindingName(x);
-      if (cuda_engine->bindingIsInput(x)) {
+    for (int64_t x = 0; x < cuda_engine->getNbIOTensors(); x++) {
+      std::string bind_name = cuda_engine->getIOTensorName(x);
+      if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
         in_binding_names[in_binding_map.at(x)] = bind_name;
       } else {
         out_binding_names[out_binding_map.at(x)] = bind_name;
       }
     }
   } else {
-    uint64_t inputs = _in_binding_names.size();
-    in_binding_names.resize(inputs);
-    for (size_t pyt_idx = 0; pyt_idx < inputs; pyt_idx++) {
+    uint64_t inputs_size = _in_binding_names.size();
+    in_binding_names.resize(inputs_size);
+    for (size_t pyt_idx = 0; pyt_idx < inputs_size; pyt_idx++) {
       auto binding_name = _in_binding_names[pyt_idx];
       auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str());
-      TORCHTRT_CHECK((trt_idx >= 0), "Could not find a TensorRT engine binding for input named " << binding_name);
+      std::string engine_binded_name = cuda_engine->getIOTensorName(pyt_idx);
       TORCHTRT_CHECK(
-          cuda_engine->bindingIsInput(trt_idx),
+          (binding_name == engine_binded_name),
+          "Could not find a TensorRT engine binding for input named " << binding_name);
+      TORCHTRT_CHECK(
+          (cuda_engine->getTensorIOMode(binding_name.c_str()) == nvinfer1::TensorIOMode::kINPUT),
           "Binding " << binding_name << " specified as input but found as output in TensorRT engine");
-      LOG_DEBUG(
-          "Input binding name: " << binding_name << " (trt binding idx: " << trt_idx << ", "
-                                 << "pyt arg idx: " << pyt_idx << ")");
+      LOG_DEBUG("Input binding name: " << binding_name << " (pyt arg idx: " << pyt_idx << ")");
       in_binding_map[trt_idx] = pyt_idx;
       in_binding_names[pyt_idx] = _in_binding_names[pyt_idx];
     }
@@ -129,17 +128,18 @@ TRTEngine::TRTEngine(
     for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) {
       auto binding_name = _out_binding_names[pyt_idx];
       auto trt_idx = cuda_engine->getBindingIndex(binding_name.c_str());
-      TORCHTRT_CHECK((trt_idx >= 0), "Could not find a TensorRT engine binding for output named " << binding_name);
+      std::string engine_binded_name = cuda_engine->getIOTensorName(inputs_size + pyt_idx);
+      TORCHTRT_CHECK(
+          (binding_name == engine_binded_name),
+          "Could not find a TensorRT engine binding for output named " << binding_name);
       TORCHTRT_CHECK(
-          !cuda_engine->bindingIsInput(trt_idx),
+          !(cuda_engine->getTensorIOMode(binding_name.c_str()) == nvinfer1::TensorIOMode::kINPUT),
           "Binding " << binding_name << " specified as output but found as input in TensorRT engine");
-      LOG_DEBUG(
-          "Output binding name: " << binding_name << " (trt binding idx: " << trt_idx << ", "
-                                  << "pyt return idx: " << pyt_idx << ")");
+      LOG_DEBUG("Output binding name: " << binding_name << " (pyt return idx: " << pyt_idx << ")");
       out_binding_map[trt_idx] = pyt_idx;
       out_binding_names[pyt_idx] = binding_name;
     }
-    num_io = std::make_pair(inputs, outputs);
+    num_io = std::make_pair(inputs_size, outputs);
   }
 
 #ifndef NDEBUG
@@ -149,10 +149,10 @@ TRTEngine::TRTEngine(
 }
 
 TRTEngine::~TRTEngine() {
+  rt.reset();
   trt_engine_profiler.reset();
   exec_ctx.reset();
   cuda_engine.reset();
-  rt.reset();
 }
 
 void TRTEngine::disable_profiling() {
@@ -164,7 +164,7 @@ void TRTEngine::disable_profiling() {
 }
 
 void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) {
-  auto inspector = cuda_engine->createEngineInspector();
+  auto inspector = make_trt(cuda_engine->createEngineInspector());
   std::ofstream f(path);
   f << std::string(inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON));
   f.close();
@@ -208,23 +208,23 @@ std::string TRTEngine::to_str() const {
   std::stringstream ss;
   ss << "Torch-TensorRT TensorRT Engine:" << std::endl;
   ss << "  Name: " << name << std::endl;
-  ss << "  Bindings: {" << std::endl;
-  for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) {
-    if (cuda_engine->bindingIsInput(x)) {
-      const uint64_t pyt_idx = in_binding_map.at(x);
-  ss << "    (" << x << ": " << in_binding_names.at(pyt_idx) << ") Input: [" << std::endl;
-  ss << "      pytorch arg idx: " << pyt_idx << std::endl;
-  ss << "        shape: " << exec_ctx->getBindingDimensions(x) << std::endl;
-  ss << "        dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(x)) << std::endl;
-  ss << "    ]" << std::endl;
-    } else {
-      const uint64_t pyt_idx = out_binding_map.at(x);
-  ss << "    (" << x <<  ": " << out_binding_names.at(pyt_idx) << ") Output: [" << std::endl;
-  ss << "      pytorch return idx: " << pyt_idx << std::endl;
-  ss << "        shape: " << exec_ctx->getBindingDimensions(x) << std::endl;
-  ss << "        dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getBindingDataType(x)) << std::endl;
-  ss << "    ]" << std::endl;
-    }
+  // Tensor names come from the recorded binding names (indexed by PyTorch arg/return position).
+  ss << "  Inputs: [" << std::endl;
+  for (uint64_t i = 0; i < num_io.first; i++) {
+    const char* in_name = in_binding_names.at(i).c_str();
+    ss << "    id: " << i << std::endl;
+    ss << "      shape: " << exec_ctx->getTensorShape(in_name) << std::endl;
+    ss << "      dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getTensorDataType(in_name))
+       << std::endl;
+  }
+  ss << "  ]" << std::endl;
+  ss << "  Outputs: [" << std::endl;
+  for (uint64_t o = 0; o < num_io.second; o++) {
+    const char* out_name = out_binding_names.at(o).c_str();
+    ss << "    id: " << o << std::endl;
+    ss << "      shape: " << exec_ctx->getTensorShape(out_name) << std::endl;
+    ss << "      dtype: " << util::TRTDataTypeToScalarType(exec_ctx->getEngine().getTensorDataType(out_name))
+       << std::endl;
   }
-  ss << "  }" << std::endl;
+  ss << "  ]" << std::endl;
   ss << "  Device: " << device_info << std::endl;
 
@@ -121,36 +121,30 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }
   }
 
-  std::vector<void*> gpu_handles;
-  std::vector<at::Tensor> contig_inputs{};
   {
     std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
     if (compiled_engine->profile_execution) {
       input_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
-
-    contig_inputs.reserve(inputs.size());
-
     for (size_t i = 0; i < inputs.size(); i++) {
-      uint64_t pyt_idx = compiled_engine->in_binding_map[i];
+      std::string name = compiled_engine->in_binding_names[i];
       TORCHTRT_CHECK(
-          inputs[pyt_idx].is_cuda(),
-          "Expected input tensors to have device cuda, found device " << inputs[pyt_idx].device());
-      auto expected_type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(i));
+          inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+      auto expected_type =
+          util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
       TORCHTRT_CHECK(
-          inputs[pyt_idx].dtype() == expected_type,
-          "Expected input tensors to have type " << expected_type << ", found type " << inputs[pyt_idx].dtype());
-      auto dims = core::util::toDimsPad(inputs[pyt_idx].sizes(), 1);
+          inputs[i].dtype() == expected_type,
+          "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
+      auto dims = core::util::toDimsPad(inputs[i].sizes(), 1);
       auto shape = core::util::toVec(dims);
-      contig_inputs.push_back(inputs[pyt_idx].view(shape).contiguous());
-      LOG_DEBUG("Input shape: " << dims);
-      compiled_engine->exec_ctx->setBindingDimensions(i, dims);
-      gpu_handles.push_back(contig_inputs.back().data_ptr());
+      LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
+      compiled_engine->exec_ctx->setInputShape(name.c_str(), dims);
+      inputs[i] = inputs[i].view(shape).contiguous(); // store in the by-value `inputs` so the buffer outlives enqueue
+      compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].data_ptr());
     }
     TORCHTRT_CHECK(
-        compiled_engine->exec_ctx->allInputDimensionsSpecified(),
-        "Not enough inputs provided (torch.ops.tensorrt.execute_engine)");
+        compiled_engine->exec_ctx->allInputShapesSpecified(), "Not enough inputs provided (runtime.RunCudaEngine)");
   }
 
   std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
@@ -163,26 +157,27 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
     for (size_t o = inputs.size(); o < (compiled_engine->num_io.first + compiled_engine->num_io.second); o++) {
       uint64_t pyt_idx = compiled_engine->out_binding_map[o];
-      auto out_shape = compiled_engine->exec_ctx->getBindingDimensions(o);
-      LOG_DEBUG("Output shape: " << out_shape);
+      std::string name = compiled_engine->out_binding_names[pyt_idx];
+      auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
+      LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
       auto dims = core::util::toVec(out_shape);
-      auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getBindingDataType(o));
+      auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
       outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
-      gpu_handles.push_back(outputs[pyt_idx].data_ptr());
+      compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr());
     }
   }
+
   {
     std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
     if (compiled_engine->profile_execution) {
       enqueue_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
     }
-
     c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
 
     // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it.
     std::unique_lock<std::mutex> lock(compiled_engine->mu);
-    compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr);
+    compiled_engine->exec_ctx->enqueueV3(stream);
     if (compiled_engine->profile_execution) {
       LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
       dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
 
@@ -51,17 +51,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
+    sha256 = "59b8b5e1954a86d50b79c13f06398d385b200da13e37a08ecf31d3c62e5ca127",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
+    sha256 = "e260fc7476be89d1650953e8643e9f7363845f5a52de4bab87ac0e619c1f6ad4",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 ####################################################################################
 
@@ -10,7 +10,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
-  <title>Class DataType &mdash; Torch-TensorRT v1.4.0dev0+544654f documentation</title>
+  <title>Class DataType &mdash; Torch-TensorRT v1.4.0dev0+b638e78 documentation</title>
 
 
 
@@ -215,7 +215,7 @@
 
 
                 <div class="version">
-                  v1.4.0dev0+544654f
+                  v1.4.0dev0+b638e78
                 </div>
 
 
 
@@ -10,7 +10,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
-  <title>Class Device::DeviceType &mdash; Torch-TensorRT v1.4.0dev0+544654f documentation</title>
+  <title>Class Device::DeviceType &mdash; Torch-TensorRT v1.4.0dev0+b638e78 documentation</title>
 
 
 
@@ -215,7 +215,7 @@
 
 
                 <div class="version">
-                  v1.4.0dev0+544654f
+                  v1.4.0dev0+b638e78
                 </div>