
Commit 8583a4c

fix: Update paradigm for device casting to depend on user-specified device
- Add field to LowerInfo to hold device information
- Update internal Device struct location to allow streamlined imports
- Update BUILD files
- Build device strings in the lowering phase using the user-specified target device
- Update CMakeLists to reflect the IR dependency in lowering
- Update runtime device relocation code to run regardless of whether a device switch is required
1 parent 1d5712d commit 8583a4c

12 files changed (+87 −53 lines changed)

core/conversion/conversionctx/BUILD

+1
@@ -21,6 +21,7 @@ cc_library(
     deps = [
         "@tensorrt//:nvinfer",
         "//core/util:prelude",
+        "//core/ir",
     ] + select({
         ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
         "//conditions:default": ["@libtorch//:libtorch"],

core/conversion/conversionctx/ConversionCtx.h

+2-9
@@ -9,28 +9,21 @@
 #include "torch/csrc/jit/ir/ir.h"

 #include <cuda_runtime.h>
+#include "core/ir/ir.h"
 #include "core/util/prelude.h"

 namespace torch_tensorrt {
 namespace core {
 namespace conversion {

-struct Device {
-  nvinfer1::DeviceType device_type;
-  int64_t gpu_id;
-  int64_t dla_core;
-  bool allow_gpu_fallback;
-  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
-};
-
 struct BuilderSettings {
   std::set<nvinfer1::DataType> enabled_precisions = {};
   bool sparse_weights = false;
   bool disable_tf32 = false;
   bool refit = false;
   bool debug = false;
   bool truncate_long_and_double = false;
-  Device device;
+  ir::Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
   uint64_t num_avg_timing_iters = 1;

core/ir/ir.h

+8
@@ -11,6 +11,14 @@ namespace torch_tensorrt {
 namespace core {
 namespace ir {

+struct Device {
+  nvinfer1::DeviceType device_type;
+  int64_t gpu_id;
+  int64_t dla_core;
+  bool allow_gpu_fallback;
+  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
+};
+
 struct Input : torch::CustomClassHolder {
   Input(){};
   Input(

core/lowering/BUILD

+1
@@ -24,6 +24,7 @@ cc_library(
     deps = [
         "//core/lowering/passes",
         "//core/util:prelude",
+        "//core/ir",
    ] + select({
        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
        "//conditions:default": ["@libtorch//:libtorch"],

core/lowering/CMakeLists.txt

+4-1
@@ -15,6 +15,8 @@ set(HEADER_FILES
 target_sources(${lib_name}
     PRIVATE
         ${CXX_SRCS}
+    PUBLIC
+        $<TARGET_OBJECTS:core_ir>
         $<TARGET_OBJECTS:core_util>
 )

@@ -25,8 +27,9 @@ target_include_directories(${lib_name}

 target_link_libraries(${lib_name}
     PUBLIC
+        TensorRT::nvinfer
         torch
-    PRIVATE
+        core_ir
         core_util
 )

core/lowering/lowering.cpp

+3-3
@@ -70,9 +70,9 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::I
   passes::SiluToSigmoidMultipication(g);
   passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
-  passes::UnpackAndCastMaskedFill(g);
-  passes::UnpackAndCastNumToTensor(g);
-  passes::UnpackAndCastFull(g);
+  passes::UnpackAndCastMaskedFill(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastNumToTensor(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastFull(g, lower_info.getGPUDeviceString());
   passes::ReplaceScalarImplicit(g);
   passes::RewriteInputsWithParams(g, params);
   LOG_GRAPH(*g);

core/lowering/lowering.h

+6
@@ -1,5 +1,6 @@
 #pragma once
 #include <memory>
+#include "core/ir/ir.h"
 #include "torch/csrc/jit/ir/ir.h"

 namespace torch_tensorrt {
@@ -15,8 +16,13 @@ struct LowerInfo {
   // Since these QDQ nodes will be identical as they share same input, one of them is eliminated due to CSE lowering
   // pass. Disable this in order to not disturb TensorRT's QAT optimizations.
   bool disable_cse = false;
+  ir::Device target_device;
   std::vector<std::string> forced_fallback_modules;
   friend std::ostream& operator<<(std::ostream& os, const LowerInfo& l);
+
+  std::string getGPUDeviceString() {
+    return "cuda:" + std::to_string(target_device.gpu_id);
+  };
 };

 void LowerBlock(torch::jit::Block* b);
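
For context, a minimal sketch (not part of this commit) of how the new LowerInfo field and helper are expected to behave, assuming the torch_tensorrt::core::lowering namespace used by the surrounding files:

#include <iostream>
#include "core/lowering/lowering.h"

int main() {
  torch_tensorrt::core::lowering::LowerInfo lower_info;
  lower_info.target_device.gpu_id = 1;  // populated from the user-specified compile spec device
  // The device-casting passes receive this string and splice it into their rewrite patterns
  std::cout << lower_info.getGPUDeviceString() << std::endl;  // prints "cuda:1"
  return 0;
}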

core/lowering/passes/device_casting.cpp

+32-14
@@ -8,68 +8,86 @@ namespace core {
 namespace lowering {
 namespace passes {

-void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph) {
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
   std::string masked_fill_pattern = R"IR(
     graph(%self, %mask, %value):
       %out: Tensor = aten::masked_fill_(%self, %mask, %value)
       return (%out))IR";

   // Calls to masked_fill_ often utilize CPU tensors, and as such
-  // should be casted to CUDA to avoid device mismatch errors
-  std::string unpacked_pattern = R"IR(
+  // should be moved to gpu to avoid device mismatch errors
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
     graph(%self, %mask, %value):
-      %device: Device = prim::Constant[value="cuda"]()
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
       %dtype: NoneType = prim::Constant()
       %false: bool = prim::Constant[value=0]()
       %mask_cuda: Tensor = aten::to(%mask, %device, %dtype, %false, %false)
       %self_cuda: Tensor = aten::to(%self, %device, %dtype, %false, %false)
-      %out: Tensor = aten::masked_fill_(%self_cuda, %mask_cuda, %value)
+      %out: Tensor = aten::masked_fill(%self_cuda, %mask_cuda, %value)
       return (%out))IR";

+  auto unpacked_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
   torch::jit::SubgraphRewriter masked_fill_rewriter;
   masked_fill_rewriter.RegisterRewritePattern(masked_fill_pattern, unpacked_pattern);
   masked_fill_rewriter.runOnGraph(graph);
   LOG_GRAPH("After unpack and cast masked_fill_: " << *graph);
 }

-void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph) {
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
   std::string num_to_tensor_cast_pattern = R"IR(
     graph(%1: Scalar):
       %2: Tensor = prim::NumToTensor(%1)
       return (%2))IR";

-  // 0D Tensors are initialized on cpu, and need to be casted to CUDA
+  // 0D Tensors are initialized on cpu, and need to be moved to gpu
   // to avoid device mismatch issues
-  std::string num_to_tensor_clean_pattern = R"IR(
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
     graph(%1: Scalar):
       %2: Tensor = prim::NumToTensor(%1)
-      %device: Device = prim::Constant[value="cuda"]()
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
       %dtype: NoneType = prim::Constant()
       %false: bool = prim::Constant[value=0]()
       %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
       return (%3))IR";

+  auto num_to_tensor_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
   torch::jit::SubgraphRewriter num_to_tensor_cast_rewriter;
   num_to_tensor_cast_rewriter.RegisterRewritePattern(num_to_tensor_cast_pattern, num_to_tensor_clean_pattern);
   num_to_tensor_cast_rewriter.runOnGraph(graph);

   LOG_GRAPH("After unpack and cast NumToTensor: " << *graph);
 }

-void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph) {
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
   std::string full_cast_pattern = R"IR(
     graph(%1, %2, %3, %4, %5, %6):
       %out: Tensor = aten::full(%1, %2, %3, %4, %5, %6)
       return (%out))IR";

-  // Tensors created via aten::full are initialized on cpu, and need to be casted to CUDA
+  // Tensors created via aten::full are initialized on cpu, and need to be casted to gpu
   // to avoid device mismatch issues
-  std::string full_clean_pattern = R"IR(
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
     graph(%1, %2, %3, %4, %5, %6):
-      %cuda: Device = prim::Constant[value="cuda"]()
-      %out: Tensor = aten::full(%1, %2, %3, %4, %cuda, %6)
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
+      %out: Tensor = aten::full(%1, %2, %3, %4, %device, %6)
       return (%out))IR";

+  auto full_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
   torch::jit::SubgraphRewriter full_cast_rewriter;
   full_cast_rewriter.RegisterRewritePattern(full_cast_pattern, full_clean_pattern);
   full_cast_rewriter.runOnGraph(graph);
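
As an illustration only (a standalone sketch, not code from this commit): with target_device_name set to "cuda:0", the concatenation performed in UnpackAndCastFull yields a rewrite pattern whose Device constant names the user-specified device rather than a hard-coded "cuda".

#include <iostream>
#include <string>

int main() {
  // Same splitting scheme as the pass: the device name is spliced between the two halves
  std::string clean_pattern_part_1 = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %device: Device = prim::Constant[value=")IR";
  std::string clean_pattern_part_2 = R"IR("]()
      %out: Tensor = aten::full(%1, %2, %3, %4, %device, %6)
      return (%out))IR";

  std::string target_device_name = "cuda:0";  // e.g. the value returned by LowerInfo::getGPUDeviceString()
  // Prints the assembled pattern with %device bound to prim::Constant[value="cuda:0"]()
  std::cout << clean_pattern_part_1 + target_device_name + clean_pattern_part_2 << std::endl;
  return 0;
}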

core/lowering/passes/passes.h

+3-3
@@ -41,9 +41,9 @@ void SiluToSigmoidMultipication(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph);
 void RewriteInputsWithParams(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params);
 void UnpackHardSigmoid(std::shared_ptr<torch::jit::Graph>& graph);
-void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph);
-void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph);
-void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
 void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph);

 } // namespace passes

core/runtime/execute_engine.cpp

+15-16
@@ -80,23 +80,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   } else {
     // Target device is current device
     target_device += std::to_string(curr_device.id);
+  }
+
+  // For each input, ensure its current device is the desired target device
+  for (size_t i = 0; i < inputs.size(); i++) {
+    at::Tensor* in = &inputs[i];
+    std::string current_tensor_device = in->device().str();

-    // For each input, ensure its current device is the desired target device
-    for (size_t i = 0; i < inputs.size(); i++) {
-      at::Tensor* in = &inputs[i];
-      std::string current_tensor_device = in->device().str();
-
-      // If current device string does not match target device, display warning and move tensor accordingly
-      if (current_tensor_device != target_device) {
-        LOG_WARNING(
-            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
-                     << " but should be on " << target_device
-                     << ". This tensor is being moved manually by the runtime but "
-                     << "for performance considerations, ensure your inputs are all on GPU "
-                     << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
-                     << "warning persists.");
-        *in = in->to(torch::Device(target_device));
-      }
+    // If current device string does not match target device, display warning and move tensor accordingly
+    if (current_tensor_device != target_device) {
+      LOG_WARNING(
+          "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+                   << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
+                   << "for performance considerations, ensure your inputs are all on GPU "
+                   << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+                   << "warning persists.");
+      *in = in->to(torch::Device(target_device));
     }
   }
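
A minimal libtorch sketch (an assumption-laden illustration, not the runtime's actual code path) of the per-input check that now runs regardless of whether a device switch was required; the helper name ensure_on_target is hypothetical:

#include <string>
#include <torch/torch.h>

// Hypothetical helper mirroring the runtime's per-input device check
void ensure_on_target(at::Tensor& in, const std::string& target_device) {
  if (in.device().str() != target_device) {
    // The runtime additionally logs a warning here; this sketch only performs the move
    in = in.to(torch::Device(target_device));
  }
}

int main() {
  at::Tensor t = torch::zeros({2, 2});  // CPU tensor
  ensure_on_target(t, "cuda:0");        // requires a CUDA-capable build and device
  return 0;
}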

cpp/src/compile_spec.cpp

+5
@@ -110,6 +110,7 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.debug = external.debug;
   internal.convert_info.engine_settings.truncate_long_and_double = external.truncate_long_and_double;
   internal.convert_info.engine_settings.device.allow_gpu_fallback = external.device.allow_gpu_fallback;
+  internal.lower_info.target_device.allow_gpu_fallback = external.device.allow_gpu_fallback;

   TORCHTRT_CHECK(
       !(external.require_full_compilation && (external.torch_executed_ops.size() > 0)),
@@ -130,10 +131,12 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   switch (external.device.device_type) {
     case Device::DeviceType::kDLA:
       internal.convert_info.engine_settings.device.device_type = nvinfer1::DeviceType::kDLA;
+      internal.lower_info.target_device.device_type = nvinfer1::DeviceType::kDLA;
       break;
     case Device::DeviceType::kGPU:
     default:
       internal.convert_info.engine_settings.device.device_type = nvinfer1::DeviceType::kGPU;
+      internal.lower_info.target_device.device_type = nvinfer1::DeviceType::kGPU;
   }

   switch (external.capability) {
@@ -150,6 +153,8 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {

   internal.convert_info.engine_settings.device.gpu_id = external.device.gpu_id;
   internal.convert_info.engine_settings.device.dla_core = external.device.dla_core;
+  internal.lower_info.target_device.gpu_id = external.device.gpu_id;
+  internal.lower_info.target_device.dla_core = external.device.dla_core;
   internal.convert_info.engine_settings.num_avg_timing_iters = external.num_avg_timing_iters;
   internal.convert_info.engine_settings.workspace_size = external.workspace_size;
   internal.convert_info.engine_settings.dla_sram_size = external.dla_sram_size;

tests/core/lowering/test_device_casting.cpp

+7-7
@@ -23,7 +23,7 @@ TEST(LoweringPasses, UnpackAndCastMaskedFillLowersCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in, in2, in3});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastMaskedFill(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastMaskedFill(g, "cuda:0");
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in, in2, in3});

@@ -43,7 +43,7 @@ TEST(LoweringPasses, UnpackAndCastNumToTensorLowersIntCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g, "cuda:0");
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

@@ -63,7 +63,7 @@ TEST(LoweringPasses, UnpackAndCastNumToTensorLowersFloatCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g, "cuda:0");
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

@@ -86,7 +86,7 @@ TEST(LoweringPasses, UnpackAndCastFullIntLowersCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g, "cuda:0");
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

@@ -110,7 +110,7 @@ TEST(LoweringPasses, UnpackAndCastFullFloatLowersCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g, "cuda:0");
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

@@ -124,7 +124,7 @@ TEST(LoweringPasses, ReplaceScalarImplicitLowersCorrectly) {
        %5 : int = prim::Constant[value=0]()
        %false : bool = prim::Constant[value=0]()
        %none : NoneType = prim::Constant()
-       %cuda : Device = prim::Constant[value="cuda"]()
+       %cuda : Device = prim::Constant[value="cuda:0"]()
        %3 : int = aten::size(%x.1, %5)
        %y.2 : Tensor = prim::NumToTensor(%3)
        %y.1 : Tensor = aten::to(%y.2, %cuda, %none, %false, %false)

@@ -162,7 +162,7 @@ TEST(LoweringPasses, ReplaceScalarImplicitIntNumToTensorLowersCorrectly) {
   torch::jit::parseIR(graph, g.get());

   auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
-  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
+  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g, "cuda:0");
   torch_tensorrt::core::lowering::passes::ReplaceScalarImplicit(g);
   torch::jit::EliminateCommonSubexpression(g);
   auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
