
Commit f9059e0

chore: resolve merge conflicts
Signed-off-by: Dheeraj Peri <[email protected]>
2 parents d50498d + e07687d commit f9059e0

File tree: 266 files changed (+7462 -1752 lines)


.circleci/config.yml (+755 -53)

Large diffs are not rendered by default.

.github/code-owners.yml (+1 -1)

@@ -110,7 +110,7 @@
 - "peri044"
 - "bowang007"

-"component: docker":
+"channel: docker":
 - "andi4191"
 - "narendasan"


.gitignore (+3)

@@ -62,3 +62,6 @@ bazel-Torch-TensorRT-Preview
 docsrc/src/
 bazel-TensorRT
 bazel-tensorrt
+.pytest_cache
+*.cache
+*cifar-10-batches-py*

README.md (+8 -7)

@@ -2,13 +2,14 @@

 [![Documentation](https://img.shields.io/badge/docs-master-brightgreen)](https://nvidia.github.io/Torch-TensorRT/)

-> Ahead of Time (AOT) compiling for PyTorch JIT
+> Ahead of Time (AOT) compiling for PyTorch JIT and FX

-Torch-TensorRT is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module.
+Torch-TensorRT is a compiler for PyTorch/TorchScript/FX, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript or FX program into an module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module.

 Resources:
 - [Documentation](https://nvidia.github.io/Torch-TensorRT/)
-- [Torch-TensorRT Explained in 2 minutes!](https://www.youtube.com/watch?v=TU5BMU6iYZ0&ab_channel=NVIDIADeveloper)
+- [FX path Documentation](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst)
+- [Torch-TensorRT Explained in 2 minutes!](https://www.youtube.com/watch?v=TU5BMU6iYZ0&ab_channel=NVIDIADeveloper)
 - [Comprehensive Discusion (GTC Event)](https://www.nvidia.com/en-us/on-demand/session/gtcfall21-a31107/)
 - [Pre-built Docker Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch). To use this container, make an NGC account and sign in to NVIDIA's registry with an API key. Refer to [this guide](https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#registering-activating-ngc-account) for the same.


@@ -111,10 +112,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.

 - Bazel 5.1.1
-- Libtorch 1.11.0 (built with CUDA 11.3)
+- Libtorch 1.12.0 (built with CUDA 11.3)
 - CUDA 11.3
-- cuDNN 8.2.1
-- TensorRT 8.2.4.2
+- cuDNN 8.4.1
+- TensorRT 8.4.1.5

 ## Prebuilt Binaries and Wheel files


@@ -213,7 +214,7 @@ bazel build //:libtorchtrt --compilation_mode opt
 ```

 ### FX path (Python only) installation
-If the user plan to try FX path (Python only) and would like to avoid bazel build. Please follow the steps below.
+If the user plans to try FX path (Python only) and would like to avoid bazel build. Please follow the steps below.
 ``` shell
 cd py && python3 setup.py install --fx-only
 ```
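The README hunk above describes the AOT workflow: the module is compiled once ahead of time, and the result behaves like any other TorchScript module at runtime. As a rough illustration, not part of this commit, the sketch below loads a module saved the way the README's torch.jit.save(...) snippet does and runs it from C++. It assumes the file name from that snippet, a CUDA-capable device, an illustrative input shape, and that the Torch-TensorRT runtime library is linked (or loaded) so the embedded TensorRT engine can execute.

```cpp
#include <torch/script.h>
#include <iostream>
#include <vector>

int main() {
  // Load the TorchScript module that Torch-TensorRT produced and torch.jit.save() wrote out.
  // The file name matches the README snippet; any path to a TRT-embedded module works.
  torch::jit::script::Module mod = torch::jit::load("trt_torchscript_module.ts");
  mod.to(torch::kCUDA);

  // Run it exactly like a regular TorchScript module; the TensorRT engine is invoked internally.
  // The input shape here is illustrative and must match what the module was compiled for.
  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::randn({1, 3, 224, 224}, torch::kCUDA));

  at::Tensor out = mod.forward(inputs).toTensor();
  std::cout << "output sizes: " << out.sizes() << std::endl;
  return 0;
}
```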

WORKSPACE (+10 -10)

@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
+    sha256 = "80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip"],
 )

 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
+    sha256 = "8e35371403f7052d9e9b43bcff383980dbde4df028986dc1dab539953481d55f",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.0%2Bcu113.zip"],
 )

 # Download these tarballs manually from the NVIDIA website

@@ -76,20 +76,20 @@ http_archive(
 http_archive(
     name = "cudnn",
     build_file = "@//third_party/cudnn/archive:BUILD",
-    sha256 = "0e5d2df890b9967efa6619da421310d97323565a79f05a1a8cb9b7165baad0d7",
-    strip_prefix = "cuda",
+    sha256 = "ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01",
+    strip_prefix = "cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz",
+        "https://developer.nvidia.com/compute/cudnn/secure/8.4.1/local_installers/11.6/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz",
     ],
 )

 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
-    strip_prefix = "TensorRT-8.2.4.2",
+    sha256 = "8107861af218694130f170e071f49814fa3e27f1386ce7cb6d807ac05a7fcf0e",
+    strip_prefix = "TensorRT-8.4.1.5",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz",
     ],
 )


core/compiler.cpp (+19 -23)

@@ -359,14 +359,6 @@ void MapInputsAndDetermineDTypes(
   }
 }

-uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) {
-  if (device.major < 6) {
-    return 256 * (1 << 20);
-  } else {
-    return 1 << 30;
-  }
-}
-
 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
   // Go through Lowering to simplify graph and extract weight parameters
   auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

@@ -380,14 +372,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   // Infer the type of an input from the weights of the calculation
   auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
-  auto device_spec = cfg.convert_info.engine_settings.device;
-  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // auto device_spec = cfg.convert_info.engine_settings.device;
+  // auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);


@@ -399,14 +391,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
   auto device_spec = cfg.convert_info.engine_settings.device;
   auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {

@@ -436,7 +428,11 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
     auto graph_and_mapping =
         ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params, fallback_nodes);
     new_g = graph_and_mapping.first;
-    LOG_INFO("Graph after Fallback: " << *new_g);
+    // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly
+    for (size_t i = 0; i < new_g->inputs().size(); ++i) {
+      new_g->inputs()[i]->setDebugName(std::string("input_") + std::to_string(i));
+    }
+    LOG_INFO(*new_g << "(GraphAfterFallback)");

     // if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
     // module
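For context on the input-renaming hunk above: after partitioning/fallback the graph's inputs can carry arbitrary debug names, and the commit normalizes them to input_0, input_1, ... so the re-serialized TorchScript module deserializes cleanly. The standalone sketch below, which is not part of the commit and uses illustrative input names, shows the same renaming pattern on a freshly built torch::jit::Graph.

```cpp
#include <torch/script.h>
#include <torch/csrc/jit/ir/ir.h>
#include <iostream>
#include <memory>
#include <string>

int main() {
  // Build a tiny graph whose inputs have arbitrary debug names.
  auto g = std::make_shared<torch::jit::Graph>();
  g->addInput("self_1");
  g->addInput("x.3");

  // Normalize the input names, mirroring the post-fallback renaming in the diff:
  // every graph input becomes input_<index>.
  for (size_t i = 0; i < g->inputs().size(); ++i) {
    g->inputs()[i]->setDebugName(std::string("input_") + std::to_string(i));
  }

  // Print the graph IR; the inputs now appear as %input_0, %input_1.
  std::cout << *g << std::endl;
  return 0;
}
```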

core/conversion/conversion.cpp (+1 -1)

@@ -188,7 +188,7 @@ void AddInputs(
       ctx->input_is_dynamic = true;
     }

-    ctx->value_tensor_map[in] = trt_in;
+    ctx->RecordNewITensor(in, trt_in);
     ctx->num_inputs += 1;
   }


core/conversion/conversionctx/ConversionCtx.cpp (+28 -6)

@@ -20,9 +20,11 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
      << "\n Debuggable Engine: " << s.debug \
      << "\n GPU ID: " << s.device.gpu_id \
      << "\n Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
-     << "\n Min Timing Iterations: " << s.num_min_timing_iters \
      << "\n Avg Timing Iterations: " << s.num_avg_timing_iters \
-     << "\n Max Workspace Size: " << s.workspace_size;
+     << "\n Max Workspace Size: " << s.workspace_size \
+     << "\n DLA SRAM Size: " << s.dla_sram_size \
+     << "\n DLA Local DRAM Size: " << s.dla_local_dram_size \
+     << "\n DLA Global DRAM Size: " << s.dla_global_dram_size;

   os << "\n Device Type: " << s.device.device_type \
      << "\n GPU ID: " << s.device.gpu_id;

@@ -104,9 +106,11 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
     cfg->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
   }

-  cfg->setMinTimingIterations(settings.num_min_timing_iters);
   cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
-  cfg->setMaxWorkspaceSize(settings.workspace_size);
+  if (settings.workspace_size != 0){
+    cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size);
+  }
+
   cfg->setDefaultDeviceType(settings.device.device_type);
   cfg->setEngineCapability(settings.capability);


@@ -120,6 +124,15 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
         settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(),
         "DLA supports only fp16 or int8 precision");
     cfg->setDLACore(settings.device.dla_core);
+    if (settings.dla_sram_size != 1048576){
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size);
+    }
+    if (settings.dla_local_dram_size != 1073741824){
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size);
+    }
+    if (settings.dla_global_dram_size != 536870912){
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size);
+    }
   }
 }


@@ -130,8 +143,8 @@ ConversionCtx::~ConversionCtx() {
 }

 nvinfer1::ITensor* ConversionCtx::AssociateValueAndTensor(const torch::jit::Value* value, nvinfer1::ITensor* tensor) {
-  tensor->setName(value->debugName().c_str());
-  this->value_tensor_map[value] = tensor;
+  RecordNewITensor(value, tensor);
+
   return tensor;
 }


@@ -140,6 +153,15 @@ torch::jit::IValue* ConversionCtx::AssociateValueAndIValue(const torch::jit::Val
   return &this->evaluated_value_map[value];
 }

+void ConversionCtx::RecordNewITensor(const torch::jit::Value* value, nvinfer1::ITensor* tensor) {
+  value_tensor_map[value] = tensor;
+  auto ret = seen_itensors.insert(tensor);
+  if (!ret.second) {
+    LOG_WARNING(
+        "Trying to record the value " << value->debugName() << " with the ITensor " << tensor->getName() << " again.");
+  }
+}
+
 std::string ConversionCtx::SerializeEngine() {
 #if NV_TENSORRT_MAJOR > 7
   auto serialized_network = builder->buildSerializedNetwork(*net, *cfg);
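The ConversionCtx.cpp hunks above move from the deprecated setMaxWorkspaceSize() to TensorRT's memory-pool interface and only touch a pool when the user overrides the corresponding default. A minimal standalone sketch of that IBuilderConfig API is below; it is not from the commit, the logger class and pool sizes are illustrative, and the DLA pools only matter when a DLA core is actually targeted.

```cpp
#include <NvInfer.h>
#include <iostream>
#include <memory>

// Minimal logger required by the TensorRT builder; prints only warnings and errors.
class Logger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= Severity::kWARNING) {
      std::cout << msg << std::endl;
    }
  }
};

int main() {
  Logger logger;
  auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());

  // TensorRT 8.4: workspace is one of several memory pools. Leaving it untouched keeps
  // TensorRT's default limit, which is why the commit only sets it for a non-zero
  // user-provided workspace_size.
  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1ULL << 30);  // 1 GiB, illustrative

  // DLA-managed pools, analogous to the dla_*_size settings added in this commit.
  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, 1ULL << 20);
  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, 1ULL << 30);
  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, 512ULL << 20);

  std::cout << "workspace pool limit: "
            << config->getMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE)
            << " bytes" << std::endl;
  return 0;
}
```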

core/conversion/conversionctx/ConversionCtx.h (+7 -1)

@@ -33,9 +33,11 @@ struct BuilderSettings {
   Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
-  uint64_t num_min_timing_iters = 2;
   uint64_t num_avg_timing_iters = 1;
   uint64_t workspace_size = 0;
+  uint64_t dla_sram_size = 1048576;
+  uint64_t dla_local_dram_size = 1073741824;
+  uint64_t dla_global_dram_size = 536870912;

   BuilderSettings() = default;
   BuilderSettings(const BuilderSettings& other) = default;

@@ -46,6 +48,7 @@ struct ConversionCtx {
   ConversionCtx(BuilderSettings settings);
   std::string SerializeEngine();
   nvinfer1::ITensor* AssociateValueAndTensor(const torch::jit::Value* value, nvinfer1::ITensor* tensor);
+  void RecordNewITensor(const torch::jit::Value* value, nvinfer1::ITensor* tensor);
   torch::jit::IValue* AssociateValueAndIValue(const torch::jit::Value* value, torch::jit::IValue tensor);
   bool CheckLayerAddition(const torch::jit::Node* n);


@@ -69,6 +72,9 @@ struct ConversionCtx {

   std::unordered_map<const torch::jit::Value*, nvinfer1::ITensor*> value_tensor_map;
   std::unordered_map<const torch::jit::Value*, torch::jit::IValue> evaluated_value_map;
+
+  // record already named ITensors to prevent rewriting another name to the same tensor
+  std::unordered_set<nvinfer1::ITensor*> seen_itensors;
 };

 } // namespace conversion
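The seen_itensors member declared above backs RecordNewITensor(): std::unordered_set::insert reports through its returned pair whether the tensor pointer was already registered, which is what triggers the duplicate-recording warning. A generic sketch of that idiom, with illustrative stand-in types and no TensorRT dependency, is below.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Stand-ins for torch::jit::Value* / nvinfer1::ITensor* in the real code.
struct Value { std::string name; };
struct Tensor { std::string name; };

std::unordered_map<const Value*, Tensor*> value_tensor_map;
std::unordered_set<Tensor*> seen_tensors;

// Mirrors the RecordNewITensor pattern: always update the map, but warn if the
// same tensor object is being registered under a second value.
void RecordNewTensor(const Value* value, Tensor* tensor) {
  value_tensor_map[value] = tensor;
  auto ret = seen_tensors.insert(tensor);
  if (!ret.second) {
    std::cout << "warning: tensor " << tensor->name
              << " recorded again for value " << value->name << "\n";
  }
}

int main() {
  Value a{"a"}, b{"b"};
  Tensor t{"t0"};
  RecordNewTensor(&a, &t);  // first registration, no warning
  RecordNewTensor(&b, &t);  // same tensor again -> warning
  return 0;
}
```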
