Commit 1d5712d

fix: Device casting issues with certain aten operators
- Investigated an issue arising with the BART-base model (https://huggingface.co/facebook/bart-base) where certain tensor inputs to TensorRT were on the CPU, despite users explicitly casting all inputs properly
- Traced the issue to internally-generated 0D tensors, mask tensors, and operations returning CPU tensors passed between Torch and Torch-TensorRT engines
- Added lowering passes to ensure these edge cases are handled appropriately and tensors are located on the proper device at runtime, and added a validation check in the runtime to keep models from crashing due to device mismatches
- Added tests for the lowering passes to ensure output values are accurate
1 parent 19e536a commit 1d5712d

File tree

8 files changed: +338 −1 lines changed

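For context, the failure mode this commit targets can be reproduced with a minimal eager-mode sketch (illustrative only, not part of the commit; exact error wording varies by PyTorch version):

#include "torch/torch.h"

// Illustrative sketch of the bug class this commit addresses: a CUDA tensor
// combined with an internally-generated CPU tensor (here, a boolean mask)
// produces a device mismatch at runtime.
int main() {
  auto x = torch::rand({2, 3}, torch::kCUDA);
  auto mask = torch::zeros({2, 3}, torch::kBool); // defaults to the CPU
  x.masked_fill_(mask, 1.0); // expected: device mismatch error/warning
  return 0;
}

The lowering passes and runtime check below rewrite or guard these patterns so tensors end up on the correct device.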

core/lowering/lowering.cpp

+4
@@ -70,6 +70,10 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params) {
   passes::SiluToSigmoidMultipication(g);
   passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
+  passes::UnpackAndCastMaskedFill(g);
+  passes::UnpackAndCastNumToTensor(g);
+  passes::UnpackAndCastFull(g);
+  passes::ReplaceScalarImplicit(g);
   passes::RewriteInputsWithParams(g, params);
   LOG_GRAPH(*g);
 }

core/lowering/passes/BUILD

+1
@@ -14,6 +14,7 @@ cc_library(
     name = "passes",
     srcs = [
         "convNd_to_convolution.cpp",
+        "device_casting.cpp",
         "exception_elimination.cpp",
         "fuse_addmm_branches.cpp",
         "linear_to_addmm.cpp",

core/lowering/passes/CMakeLists.txt

+1
@@ -1,5 +1,6 @@
 target_sources(${lib_name}
     PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/convNd_to_convolution.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/device_casting.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/exception_elimination.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/fuse_addmm_branches.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/linear_to_addmm.cpp"
core/lowering/passes/device_casting.cpp (new file)

+103

#include "torch/csrc/jit/ir/constants.h"
#include "torch/csrc/jit/passes/subgraph_rewrite.h"

#include "core/util/prelude.h"

namespace torch_tensorrt {
namespace core {
namespace lowering {
namespace passes {

void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string masked_fill_pattern = R"IR(
    graph(%self, %mask, %value):
      %out: Tensor = aten::masked_fill_(%self, %mask, %value)
      return (%out))IR";

  // Calls to masked_fill_ often utilize CPU tensors, and as such
  // should be cast to CUDA to avoid device mismatch errors
  std::string unpacked_pattern = R"IR(
    graph(%self, %mask, %value):
      %device: Device = prim::Constant[value="cuda"]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %mask_cuda: Tensor = aten::to(%mask, %device, %dtype, %false, %false)
      %self_cuda: Tensor = aten::to(%self, %device, %dtype, %false, %false)
      %out: Tensor = aten::masked_fill_(%self_cuda, %mask_cuda, %value)
      return (%out))IR";

  torch::jit::SubgraphRewriter masked_fill_rewriter;
  masked_fill_rewriter.RegisterRewritePattern(masked_fill_pattern, unpacked_pattern);
  masked_fill_rewriter.runOnGraph(graph);
  LOG_GRAPH("After unpack and cast masked_fill_: " << *graph);
}

void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string num_to_tensor_cast_pattern = R"IR(
    graph(%1: Scalar):
      %2: Tensor = prim::NumToTensor(%1)
      return (%2))IR";

  // 0D Tensors are initialized on the CPU, and need to be cast to CUDA
  // to avoid device mismatch issues
  std::string num_to_tensor_clean_pattern = R"IR(
    graph(%1: Scalar):
      %2: Tensor = prim::NumToTensor(%1)
      %device: Device = prim::Constant[value="cuda"]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
      return (%3))IR";

  torch::jit::SubgraphRewriter num_to_tensor_cast_rewriter;
  num_to_tensor_cast_rewriter.RegisterRewritePattern(num_to_tensor_cast_pattern, num_to_tensor_clean_pattern);
  num_to_tensor_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast NumToTensor: " << *graph);
}

void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string full_cast_pattern = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %out: Tensor = aten::full(%1, %2, %3, %4, %5, %6)
      return (%out))IR";

  // Tensors created via aten::full are initialized on the CPU, and need to be
  // cast to CUDA to avoid device mismatch issues
  std::string full_clean_pattern = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %cuda: Device = prim::Constant[value="cuda"]()
      %out: Tensor = aten::full(%1, %2, %3, %4, %cuda, %6)
      return (%out))IR";

  torch::jit::SubgraphRewriter full_cast_rewriter;
  full_cast_rewriter.RegisterRewritePattern(full_cast_pattern, full_clean_pattern);
  full_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast full: " << *graph);
}

void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string scalar_implicit_cast_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::ScalarImplicit(%1)
      return (%2))IR";

  // ScalarImplicit can only unpack 0D tensors, whereas Tensors operated on by
  // TensorRT are padded to 1 dimension. aten::item() resolves this conflict
  std::string scalar_implicit_clean_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::item(%1)
      return (%2))IR";

  torch::jit::SubgraphRewriter scalar_implicit_cast_rewriter;
  scalar_implicit_cast_rewriter.RegisterRewritePattern(scalar_implicit_cast_pattern, scalar_implicit_clean_pattern);
  scalar_implicit_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After replace scalar implicit: " << *graph);
}

} // namespace passes
} // namespace lowering
} // namespace core
} // namespace torch_tensorrt
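A minimal sketch (not part of the commit) of exercising one of these passes on a hand-parsed graph, mirroring what the unit tests below do with correctness checks added:

#include <iostream>
#include <memory>
#include <string>
#include "core/lowering/passes/passes.h"
#include "torch/csrc/jit/ir/ir.h"
#include "torch/csrc/jit/ir/irparser.h"

int main() {
  // A graph containing the CPU-initialized 0D tensor pattern
  const std::string ir = R"IR(
    graph(%x: int):
      %t: Tensor = prim::NumToTensor(%x)
      return (%t))IR";

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(ir, g.get());

  // After the pass, an aten::to(..., "cuda", ...) cast follows the
  // prim::NumToTensor node
  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
  std::cout << *g;
  return 0;
}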

core/lowering/passes/passes.h

+4
@@ -41,6 +41,10 @@ void SiluToSigmoidMultipication(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph);
 void RewriteInputsWithParams(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params);
 void UnpackHardSigmoid(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph);
+void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph);

 } // namespace passes
 } // namespace lowering

core/runtime/execute_engine.cpp

+26 −1

@@ -63,16 +63,41 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
   CudaDevice curr_device = get_current_device();
   LOG_DEBUG("Current Device: " << curr_device);

+  // Generic target device prefix
+  std::string target_device = "cuda:";
+
   if (is_switch_required(curr_device, compiled_engine->device_info)) {
     // Scan through available CUDA devices and set the CUDA device context correctly
     CudaDevice device = select_cuda_device(compiled_engine->device_info);
     set_cuda_device(device);

-    std::string target_device = "cuda:" + std::to_string(device.id);
+    // Target device is the new device
+    target_device += std::to_string(device.id);

     for (auto& in : inputs) {
       in = in.to(torch::Device(target_device));
     }
+  } else {
+    // Target device is the current device
+    target_device += std::to_string(curr_device.id);
+
+    // For each input, ensure its current device is the desired target device
+    for (size_t i = 0; i < inputs.size(); i++) {
+      at::Tensor* in = &inputs[i];
+      std::string current_tensor_device = in->device().str();
+
+      // If the current device does not match the target device, warn and move the tensor
+      if (current_tensor_device != target_device) {
+        LOG_WARNING(
+            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+                     << " but should be on " << target_device
+                     << ". This tensor is being moved manually by the runtime but "
+                     << "for performance considerations, ensure your inputs are all on GPU "
+                     << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+                     << "warning persists.");
+        *in = in->to(torch::Device(target_device));
+      }
+    }
   }

   std::vector<void*> gpu_handles;
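On the caller's side, the warning path above can be avoided entirely by placing all inputs on the engine's device up front. A minimal libtorch sketch (not part of the commit; the module path and device id are illustrative):

#include <vector>
#include "torch/script.h"

int main() {
  // Load a Torch-TensorRT compiled TorchScript module (path is hypothetical)
  torch::jit::Module trt_mod = torch::jit::load("trt_module.ts");
  torch::Device target(torch::kCUDA, 0); // assumed to match the engine's device

  // Explicit device placement keeps the runtime from moving tensors itself
  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::rand({1, 3, 224, 224}).to(target));

  auto out = trt_mod.forward(inputs).toTensor();
  return 0;
}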

tests/core/lowering/BUILD

+5
@@ -31,6 +31,10 @@ lowering_test(
     name = "test_conv1d_pass",
 )

+lowering_test(
+    name = "test_device_casting",
+)
+
 lowering_test(
     name = "test_exception_elimination_pass",
 )
@@ -95,6 +99,7 @@ test_suite(
     name = "lowering_tests",
     tests = [
         ":test_conv1d_pass",
+        ":test_device_casting",
         ":test_exception_elimination_pass",
         ":test_linear_to_addmm",
         ":test_module_fallback_passes",
tests/core/lowering/test_device_casting.cpp (new file)

+194

#include <string>
#include "core/compiler.h"
#include "core/lowering/passes/passes.h"
#include "core/util/prelude.h"
#include "gtest/gtest.h"
#include "tests/util/util.h"
#include "torch/csrc/jit/ir/irparser.h"
#include "torch/csrc/jit/ir/subgraph_matcher.h"
#include "torch/csrc/jit/passes/common_subexpression_elimination.h"
#include "torch/torch.h"

TEST(LoweringPasses, UnpackAndCastMaskedFillLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: Tensor, %x.2: Tensor, %x.3: float):
      %2 : Tensor = aten::masked_fill_(%x.1, %x.2, %x.3)
      return (%2))IR";

  auto in = at::rand({2, 3, 5, 7}, {at::kCUDA});
  auto in2 = at::rand({2, 3, 5, 7}, {at::kCUDA}).to(torch::kBool);
  auto in3 = 7.3;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in, in2, in3});
  torch_tensorrt::core::lowering::passes::UnpackAndCastMaskedFill(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in, in2, in3});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}

TEST(LoweringPasses, UnpackAndCastNumToTensorLowersIntCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: int):
      %2 : Tensor = prim::NumToTensor(%x.1)
      return (%2))IR";

  auto in = 1;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}

TEST(LoweringPasses, UnpackAndCastNumToTensorLowersFloatCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: float):
      %2 : Tensor = prim::NumToTensor(%x.1)
      return (%2))IR";

  auto in = 78.1;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}

TEST(LoweringPasses, UnpackAndCastFullIntLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: int):
      %5 : NoneType = prim::Constant()
      %2 : int = prim::Constant[value=3]()
      %10 : int[] = prim::ListConstruct(%2, %2)
      %out : Tensor = aten::full(%10, %x.1, %5, %5, %5, %5)
      return (%out))IR";

  auto in = 4;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
      jit_pre_results[0].toTensor(), jit_post_results[0].toTensor().cpu(), 2e-6));
}

TEST(LoweringPasses, UnpackAndCastFullFloatLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: float):
      %5 : NoneType = prim::Constant()
      %2 : int = prim::Constant[value=5]()
      %3 : int = prim::Constant[value=4]()
      %10 : int[] = prim::ListConstruct(%2, %3)
      %out : Tensor = aten::full(%10, %x.1, %5, %5, %5, %5)
      return (%out))IR";

  auto in = 54.1;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
      jit_pre_results[0].toTensor(), jit_post_results[0].toTensor().cpu(), 2e-6));
}

TEST(LoweringPasses, ReplaceScalarImplicitLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: Tensor):
      %5 : int = prim::Constant[value=0]()
      %false : bool = prim::Constant[value=0]()
      %none : NoneType = prim::Constant()
      %cuda : Device = prim::Constant[value="cuda"]()
      %3 : int = aten::size(%x.1, %5)
      %y.2 : Tensor = prim::NumToTensor(%3)
      %y.1 : Tensor = aten::to(%y.2, %cuda, %none, %false, %false)
      %19 : Tensor[] = prim::ListConstruct(%x.1, %y.1)
      %21 : Tensor, %22 : Tensor = prim::ListUnpack(%19)
      %2 : Scalar = aten::ScalarImplicit(%22)
      %out : Tensor = prim::NumToTensor(%2)
      return (%out))IR";

  auto in = at::rand({2, 3, 5, 7}, {at::kCUDA});

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::ReplaceScalarImplicit(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}

TEST(LoweringPasses, ReplaceScalarImplicitIntNumToTensorLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: int):
      %1 : Tensor = prim::NumToTensor(%x.1)
      %2 : Scalar = aten::ScalarImplicit(%1)
      %3 : Tensor = prim::NumToTensor(%2)
      return (%3))IR";

  auto in = 25;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
  torch_tensorrt::core::lowering::passes::ReplaceScalarImplicit(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}

TEST(LoweringPasses, ReplaceScalarImplicitFloatLowersCorrectly) {
  const auto graph = R"IR(
    graph(%x.1: float):
      %1 : Tensor = prim::NumToTensor(%x.1)
      %2 : Scalar = aten::ScalarImplicit(%1)
      %3 : Tensor = prim::NumToTensor(%2)
      return (%3))IR";

  auto in = 2.5;

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(graph, g.get());

  auto jit_pre_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});
  torch_tensorrt::core::lowering::passes::ReplaceScalarImplicit(g);
  torch::jit::EliminateCommonSubexpression(g);
  auto jit_post_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {in});

  ASSERT_TRUE(
      torch_tensorrt::tests::util::almostEqual(jit_pre_results[0].toTensor(), jit_post_results[0].toTensor(), 2e-6));
}
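Assuming the lowering_test macro maps the target name to a matching source file, the new tests can be run individually (e.g. bazel test //tests/core/lowering:test_device_casting) or via the lowering_tests suite; the exact target labels are inferred from the BUILD entries above.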
