pytorch
diff --git a/‎.circleci/config.yml
Lines changed: 185 additions & 175 deletions b/‎.circleci/config.yml
Lines changed: 185 additions & 175 deletions
diff --git a/‎README.md
Lines changed: 4 additions & 4 deletions b/‎README.md
Lines changed: 4 additions & 4 deletions
diff --git a/‎WORKSPACE
Lines changed: 11 additions & 11 deletions b/‎WORKSPACE
Lines changed: 11 additions & 11 deletions
diff --git a/‎core/compiler.cpp
Lines changed: 53 additions & 31 deletions b/‎core/compiler.cpp
Lines changed: 53 additions & 31 deletions
diff --git a/‎core/conversion/conversion.cpp
Lines changed: 1 addition & 1 deletion b/‎core/conversion/conversion.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/conversion/converters/impl/conv_deconv.cpp
Lines changed: 14 additions & 0 deletions b/‎core/conversion/converters/impl/conv_deconv.cpp
Lines changed: 14 additions & 0 deletions
diff --git a/‎core/conversion/converters/impl/max.cpp
Lines changed: 13 additions & 1 deletion b/‎core/conversion/converters/impl/max.cpp
Lines changed: 13 additions & 1 deletion
diff --git a/‎core/conversion/converters/impl/select.cpp
Lines changed: 15 additions & 1 deletion b/‎core/conversion/converters/impl/select.cpp
Lines changed: 15 additions & 1 deletion
diff --git a/‎core/ir/Input.cpp
Lines changed: 6 additions & 6 deletions b/‎core/ir/Input.cpp
Lines changed: 6 additions & 6 deletions
diff --git a/‎core/ir/ir.h
Lines changed: 4 additions & 3 deletions b/‎core/ir/ir.h
Lines changed: 4 additions & 3 deletions
@@ -114,10 +114,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.
 
 - Bazel 5.2.0
-- Libtorch 1.12.1 (built with CUDA 11.6)
-- CUDA 11.6
-- cuDNN 8.4.1
-- TensorRT 8.4.3.1
+- Libtorch 2.0.0.dev20230103 (built with CUDA 11.7)
+- CUDA 11.7
+- cuDNN 8.5.0
+- TensorRT 8.5.1.7
 
 ## Prebuilt Binaries and Wheel files
 
 
@@ -41,7 +41,7 @@ local_repository(
 new_local_repository(
     name = "cuda",
     build_file = "@//third_party/cuda:BUILD",
-    path = "/usr/local/cuda-11.6/",
+    path = "/usr/local/cuda-11.7/",
 )
 
 new_local_repository(
@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "b565c662435fd58ec295fa0791388ea52ad0f5fd33517b2d7c0fdcc91b6db531",
+    sha256 = "59b8b5e1954a86d50b79c13f06398d385b200da13e37a08ecf31d3c62e5ca127",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu116/libtorch-cxx11-abi-shared-with-deps-1.14.0.dev20221114%2Bcu116.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "fbb37446c33b05c1e26256c09f6ffb46cea1f6ff9ee2ad5b79b146d09023b0c1",
+    sha256 = "e260fc7476be89d1650953e8643e9f7363845f5a52de4bab87ac0e619c1f6ad4",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu116/libtorch-shared-with-deps-1.14.0.dev20221114%2Bcu116.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
 )
 
 # Download these tarballs manually from the NVIDIA website
@@ -76,20 +76,20 @@ http_archive(
 http_archive(
     name = "cudnn",
     build_file = "@//third_party/cudnn/archive:BUILD",
-    sha256 = "ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01",
-    strip_prefix = "cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive",
+    sha256 = "5454a6fd94f008728caae9adad993c4e85ef36302e26bce43bea7d458a5e7b6d",
+    strip_prefix = "cudnn-linux-x86_64-8.5.0.96_cuda11-archive",
     urls = [
-        "https://developer.nvidia.com/compute/cudnn/secure/8.4.1/local_installers/11.6/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz",
+        "https://developer.nvidia.com/compute/cudnn/secure/8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz",
     ],
 )
 
 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "8d7c2085c1639dcc73875048c23598a8526ce3089136876e31d90258e49e4f61",
-    strip_prefix = "TensorRT-8.4.3.1",
+    sha256 = "39cc7f077057d1363794e8ff51c4cf21a5dbeccf1116b0020ba0dae0f3063076",
+    strip_prefix = "TensorRT-8.5.1.7",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.3/tars/tensorrt-8.4.3.1.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.5.1/tars/TensorRT-8.5.1.7.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz",
     ],
 )
 
 
@@ -187,7 +187,7 @@ partitioning::GraphAndMapping BuildHybridGraph(
   return partitioning::stitch(&partitioning_ctx, block);
 }
 
-void MapInputsAndDetermineDTypes(
+ir::TypeMap MapInputsAndDetermineDTypes(
     CompileSpec& cfg,
     std::shared_ptr<torch::jit::Graph>& g,
     ir::StaticParams& static_params,
@@ -197,6 +197,7 @@ void MapInputsAndDetermineDTypes(
   cfg.partitioning_info.collection_input_spec_map =
       ir::CollectionInputSpecMap(cfg.convert_info.collection_input_spec_map);
 
+  ir::TypeMap inferred_dtypes;
   auto collection_inputs = ir::get_collection_inputs(g, static_params);
   LOG_DEBUG(
       "In MapInputsAndDetermineDTypes, the g->inputs() size is "
@@ -218,13 +219,13 @@ void MapInputsAndDetermineDTypes(
         LOG_INFO(
             "Since input type is not explicitly defined, infering using first tensor calculation\n  Inferred input "
             << in->debugName() << " has type " << est_type_opt[i].value());
-        spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value());
+        spec[i].dtype = est_type_opt[i].value();
       } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) {
         // If we cannot calculate the type and the user did not define the type, then default to FP32
         LOG_WARNING(
             "Cannot infer input type from calcuations in graph for input "
             << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
-        spec[i].dtype = nvinfer1::DataType::kFLOAT;
+        spec[i].dtype = at::kFloat;
       } else if (spec[i].dtype_is_user_defined && cfg.partitioning_info.enabled) {
         if (!est_type_opt[i]) {
           LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
@@ -236,37 +237,35 @@ void MapInputsAndDetermineDTypes(
           auto warn_str = ss.str();
           LOG_WARNING(warn_str);
           // Overwrite type map with user settings
-          first_use_type_map[in][i] = {
-              util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)};
-
-        } else {
-          if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) !=
-              est_type_opt[i].value()) {
-            std::stringstream ss;
-            ss << "For input " << in->debugName() << ", found user specified input dtype as ";
-            ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
-            ss << ", however when inspecting the graph, the input type expected was inferred to be ";
-            ss << est_type_opt[i].value() << std::endl;
-            ss << "The compiler is going to use the user setting "
-               << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
-            ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
-            ss << "compatibility with PyTorch's data type convention is required.\n";
-            ss << "If you do indeed see errors at runtime either:\n";
-            ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
-            ss << "- Disable partial compilation by setting require_full_compilation to True";
-            auto warn_str = ss.str();
-            LOG_WARNING(warn_str);
-            // Overwrite type map with user settings
-            first_use_type_map[in][i] = {
-                util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)};
-          }
+          first_use_type_map[in][i] = {cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype};
+
+        } else if (cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype != est_type_opt[i].value()) {
+          std::stringstream ss;
+          ss << "For input " << in->debugName() << ", found user specified input dtype as ";
+          ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
+          ss << ", however when inspecting the graph, the input type expected was inferred to be ";
+          ss << est_type_opt[i].value() << std::endl;
+          ss << "The compiler is going to use the user setting "
+             << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
+          ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
+          ss << "compatibility with PyTorch's data type convention is required.\n";
+          ss << "If you do indeed see errors at runtime either:\n";
+          ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
+          ss << "- Disable partial compilation by setting require_full_compilation to True";
+          auto warn_str = ss.str();
+          LOG_WARNING(warn_str);
+          // Overwrite type map with user settings
+          first_use_type_map[in][i] = {cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype};
         }
       } else {
         // The user defined the type so no changes are necessary
       }
+
+      // Insert entry for Value pointer and determined ScalarType
+      inferred_dtypes.insert({in, {spec[i].dtype}});
     }
   }
-  // }
+  return inferred_dtypes;
 }
 
 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
@@ -284,6 +283,15 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
 
   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
 
+  // Ensure that none of the specified input types is acceptable as an input yet incompatible with TRT
+  // Currently, at::kLong is the only type that is acceptable but TRT-incompatible
+  for (auto value_to_dtypes : first_use_types) {
+    for (auto dtype : value_to_dtypes.second) {
+      TORCHTRT_CHECK(
+          !dtype || dtype.value() != at::kLong, "Cannot specify Int64 input for a model fully compiled in TRT");
+    }
+  }
+
   auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
 
   return engine;
@@ -307,10 +315,24 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
       // Infer the type of an input from the weights of the calculation
       auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block());
 
-      MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+      // Extract map of graph input Value pointer to its determined dtype
+      auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+
+      // Check whether any of the input types are Long
+      bool user_requested_long = false;
+      for (auto dtype : type_map) {
+        user_requested_long |= dtype.second && (dtype.second.value() == at::kLong);
+      }
+
+      // Use dtype map to autocast Tensor-type inputs to Long dtype as necessary
+      if (cfg.partitioning_info.enabled && cfg.partitioning_info.truncate_long_and_double && user_requested_long) {
+        auto casts_inserted = lowering::AutocastLongInputs(g, type_map, cfg.lower_info.getGPUDeviceString());
+        user_requested_long &= (casts_inserted > 0);
+      }
+
       auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
       auto outputIsCollection = conversion::OutputIsCollection(g->block());
-      if (cfg.partitioning_info.enabled &&
+      if (cfg.partitioning_info.enabled && !user_requested_long &&
           (cfg.lower_info.forced_fallback_modules.size() == 0 &&
            cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) &&
           !outputIsCollection) {
@@ -320,7 +342,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
       if (cfg.partitioning_info.enabled &&
           (!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
              cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) ||
-           outputIsCollection)) {
+           outputIsCollection || user_requested_long)) {
         auto graph_and_mapping = BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types);
         new_g = graph_and_mapping.first;
         // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly
 
@@ -183,7 +183,7 @@ void AddInputs(ConversionCtx* ctx, c10::ArrayRef<const torch::jit::Value*> input
         "Adding Input " << in->debugName() << " (named: " << name << "): " << spec
                         << " in engine (conversion.AddInputs)");
 
-    auto trt_in = ctx->net->addInput(name.c_str(), spec.dtype, spec.input_shape);
+    auto trt_in = ctx->net->addInput(name.c_str(), util::ScalarTypeToTRTDataType(spec.dtype), spec.input_shape);
     TORCHTRT_CHECK(trt_in, "Failed to add input node: " << in->debugName() << " (conversion.AddInputs)");
     trt_in->setAllowedFormats(1U << static_cast<int>(spec.format));
 
 
@@ -102,6 +102,20 @@ bool add_conv_deconv(ConversionCtx* ctx, const torch::jit::Node* n, args& args)
   }
 
   auto w = Weights(ctx, args[1].unwrapToTensor());
+  // TODO: Remove this when conv3d with kernel size=1 bug is fixed.
+  // Github issue: https://github.com/pytorch/TensorRT/issues/1445
+  bool is_kernel_size_one = true;
+  bool is_3d_kernel = w.kernel_shape.nbDims == 3;
+  for (int64_t i = 0; i < w.kernel_shape.nbDims; i++) {
+    if (w.kernel_shape.d[i] != 1.0f) {
+      is_kernel_size_one = false;
+    }
+  }
+  if (is_kernel_size_one && is_3d_kernel) {
+    LOG_WARNING(
+        "Conv3d layer with kernel size = 1 configuration incurs a failure with TensorRT tactic optimizer in some cases. \
+    Github issue: https://github.com/pytorch/TensorRT/issues/1445. Other conv variants do not have this issue.");
+  }
   auto dims = in->getDimensions();
   auto orig_dims = dims;
   LOG_DEBUG("Input dims: " << orig_dims);
 
@@ -22,6 +22,11 @@ bool min_max_dim(ConversionCtx* ctx, const torch::jit::Node* n, args& args, nvin
   if (dim < 0) {
     dim = selfDim.size() + dim;
   }
+  bool int_input = self->getType() == nvinfer1::DataType::kINT32;
+  if (int_input) {
+    LOG_DEBUG("topk layer does not support int32 inputs, adding cast to float");
+    self = castITensor(ctx, self, nvinfer1::DataType::kFLOAT, util::node_info(n) + "_input");
+  }
   uint32_t reduce_axes_mask = 1 << dim;
   auto topk_layer = ctx->net->addTopK(*self, topKOperation, 1, reduce_axes_mask);
   TORCHTRT_CHECK(topk_layer, "Unable to create topk layer from node: " << *n);
@@ -44,7 +49,10 @@ bool min_max_dim(ConversionCtx* ctx, const torch::jit::Node* n, args& args, nvin
     out0 = ctx->AssociateValueAndTensor(n->outputs()[0], topk_layer->getOutput(0));
     out1 = ctx->AssociateValueAndTensor(n->outputs()[1], topk_layer->getOutput(1));
   }
-
+  if (int_input) {
+    LOG_DEBUG("Adding cast of topK layer output back to int32");
+    out0 = castITensor(ctx, out0, nvinfer1::DataType::kINT32, util::node_info(n) + "_output");
+  }
   LOG_DEBUG("Output tensor(0) shape: " << out0->getDimensions());
   LOG_DEBUG("Output tensor(1) shape: " << out1->getDimensions());
 
@@ -59,6 +67,10 @@ bool arg_min_max(ConversionCtx* ctx, const torch::jit::Node* n, args& args, nvin
   if (dim < 0) {
     dim = selfDim.size() + dim;
   }
+  if (self->getType() == nvinfer1::DataType::kINT32) {
+    LOG_DEBUG("topk layer does not support int32 inputs, adding cast to float");
+    self = castITensor(ctx, self, nvinfer1::DataType::kFLOAT, util::node_info(n) + "_input");
+  }
   uint32_t reduce_axes_mask = 1 << dim;
   auto topk_layer = ctx->net->addTopK(*self, topKOperation, 1, reduce_axes_mask);
   TORCHTRT_CHECK(topk_layer, "Unable to create topk layer from node: " << *n);
 
@@ -16,7 +16,7 @@ namespace impl {
 namespace {
 
 bool add_split(ConversionCtx* ctx, const torch::jit::Node* n, args& args, bool split_list, bool unbind) {
-  auto in = args[0].ITensor();
+  auto in = args[0].ITensorOrFreeze(ctx);
   auto numOutputs = 1, numRemainder = 0;
   std::vector<int64_t> sizes;
 
@@ -736,8 +736,22 @@ auto select_registrations TORCHTRT_UNUSED =
             {"aten::where.self(Tensor condition, Tensor self, Tensor other) -> (Tensor)",
              [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                auto condition = args[0].ITensorOrFreeze(ctx);
+               auto condition_nbDims = condition->getDimensions().nbDims;
                auto x = args[1].ITensorOrFreeze(ctx);
+               auto x_nbDims = x->getDimensions().nbDims;
                auto y = args[2].ITensorOrFreeze(ctx);
+               auto y_nbDims = y->getDimensions().nbDims;
+
+               // Get maximum rank of all input tensors
+               auto max_nbDims = std::max(condition_nbDims, std::max(x_nbDims, y_nbDims));
+
+               // TensorRT requires all inputs to Select layers to have the same rank, so for each
+               // tensor input, ensure that its rank is equal to the maximum number of dimensions.
+               // If not, left-pad the tensor's dimensions with 1s until the max rank is achieved
+               condition =
+                   addPadding(ctx, n, condition, max_nbDims, /*bool trailing =*/false, /*bool use_zeros =*/false);
+               x = addPadding(ctx, n, x, max_nbDims, /*bool trailing =*/false, /*bool use_zeros =*/false);
+               y = addPadding(ctx, n, y, max_nbDims, /*bool trailing =*/false, /*bool use_zeros =*/false);
 
                auto layer = ctx->net->addSelect(*condition, *x, *y);
 
 
@@ -75,7 +75,7 @@ bool valid_input_domain(std::vector<int64_t> domain) {
 
 Input::Input(
     std::vector<int64_t> shape,
-    nvinfer1::DataType dtype,
+    at::ScalarType dtype,
     nvinfer1::TensorFormat format,
     bool dtype_is_user_defined,
     std::vector<int64_t> tensor_domain) {
@@ -89,10 +89,10 @@ Input::Input(
   input_shape = util::toDims(shape);
   input_is_dynamic = false;
 
-  TORCHTRT_CHECK(valid_input_dtype(dtype), "Unsupported input data type: " << dtype);
+  TORCHTRT_CHECK(valid_input_dtype(util::ScalarTypeToTRTDataType(dtype)), "Unsupported input data type: " << dtype);
   this->dtype = dtype;
   TORCHTRT_CHECK(
-      valid_dtype_format_combo(dtype, format),
+      valid_dtype_format_combo(util::ScalarTypeToTRTDataType(dtype), format),
       "Unsupported combination of dtype and tensor format: ("
           << dtype << ", " << format
           << "), Torch-TensorRT only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
@@ -109,7 +109,7 @@ Input::Input(
     std::vector<int64_t> min_shape,
     std::vector<int64_t> opt_shape,
     std::vector<int64_t> max_shape,
-    nvinfer1::DataType dtype,
+    at::ScalarType dtype,
     nvinfer1::TensorFormat format,
     bool dtype_is_user_defined,
     std::vector<int64_t> tensor_domain) {
@@ -148,10 +148,10 @@ Input::Input(
 
   input_shape = util::toDims(dyn_shape);
 
-  TORCHTRT_CHECK(valid_input_dtype(dtype), "Unsupported input data type: " << dtype);
+  TORCHTRT_CHECK(valid_input_dtype(util::ScalarTypeToTRTDataType(dtype)), "Unsupported input data type: " << dtype);
   this->dtype = dtype;
   TORCHTRT_CHECK(
-      valid_dtype_format_combo(dtype, format),
+      valid_dtype_format_combo(util::ScalarTypeToTRTDataType(dtype), format),
       "Unsupported combination of dtype and tensor format: ("
           << dtype << ", " << format
           << "), Torch-TensorRT only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
 
@@ -29,18 +29,19 @@ struct Input : torch::CustomClassHolder {
   Input(){};
   Input(
       std::vector<int64_t> shape,
-      nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT,
+      at::ScalarType dtype = at::kFloat,
       nvinfer1::TensorFormat format = nvinfer1::TensorFormat::kLINEAR,
       bool dtype_is_user_defined = false,
       std::vector<int64_t> tensor_domain = std::vector<int64_t>{0, 2});
   Input(
       std::vector<int64_t> min_shape,
       std::vector<int64_t> opt_shape,
       std::vector<int64_t> max_shape,
-      nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT,
+      at::ScalarType dtype = at::kFloat,
       nvinfer1::TensorFormat format = nvinfer1::TensorFormat::kLINEAR,
       bool dtype_is_user_defined = false,
       std::vector<int64_t> tensor_domain = std::vector<int64_t>{0, 2});
+
   friend std::ostream& operator<<(std::ostream& os, const Input& input);
 
   bool input_is_dynamic = false;
@@ -50,7 +51,7 @@ struct Input : torch::CustomClassHolder {
   nvinfer1::Dims min;
   nvinfer1::Dims max;
   nvinfer1::Dims opt;
-  nvinfer1::DataType dtype;
+  at::ScalarType dtype;
   nvinfer1::TensorFormat format;
   int id;
 };