#include "torch/csrc/jit/ir/constants.h"
#include "torch/csrc/jit/passes/subgraph_rewrite.h"

#include "core/util/prelude.h"

namespace torch_tensorrt {
namespace core {
namespace lowering {
namespace passes {

void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
  std::string masked_fill_pattern = R"IR(
    graph(%self, %mask, %value):
      %out: Tensor = aten::masked_fill_(%self, %mask, %value)
      return (%out))IR";

  // Calls to masked_fill_ often utilize CPU tensors, and as such
  // should be moved to the GPU to avoid device mismatch errors
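  // Note that the in-place aten::masked_fill_ is also replaced with the
  // functional aten::masked_fill, since the rewrite operates on device-cast
  // copies of %self and %mask rather than mutating the original input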

  // Separate string into portions to insert device name
  std::string clean_pattern_part_1 = R"IR(
    graph(%self, %mask, %value):
      %device: Device = prim::Constant[value=")IR";

  std::string clean_pattern_part_2 = R"IR("]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %mask_cuda: Tensor = aten::to(%mask, %device, %dtype, %false, %false)
      %self_cuda: Tensor = aten::to(%self, %device, %dtype, %false, %false)
      %out: Tensor = aten::masked_fill(%self_cuda, %mask_cuda, %value)
      return (%out))IR";

  auto unpacked_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;

  torch::jit::SubgraphRewriter masked_fill_rewriter;
  masked_fill_rewriter.RegisterRewritePattern(masked_fill_pattern, unpacked_pattern);
  masked_fill_rewriter.runOnGraph(graph);
  LOG_GRAPH("After unpack and cast masked_fill_: " << *graph);
}

void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
  std::string num_to_tensor_cast_pattern = R"IR(
    graph(%1: Scalar):
      %2: Tensor = prim::NumToTensor(%1)
      return (%2))IR";

  // 0D Tensors are initialized on the CPU and need to be moved to the GPU
  // to avoid device mismatch issues
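  // The replacement keeps prim::NumToTensor and appends an aten::to cast
  // of its output to the target device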

  // Separate string into portions to insert device name
  std::string clean_pattern_part_1 = R"IR(
    graph(%1: Scalar):
      %2: Tensor = prim::NumToTensor(%1)
      %device: Device = prim::Constant[value=")IR";

  std::string clean_pattern_part_2 = R"IR("]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
      return (%3))IR";

  auto num_to_tensor_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;

  torch::jit::SubgraphRewriter num_to_tensor_cast_rewriter;
  num_to_tensor_cast_rewriter.RegisterRewritePattern(num_to_tensor_cast_pattern, num_to_tensor_clean_pattern);
  num_to_tensor_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast NumToTensor: " << *graph);
}

void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
  std::string full_cast_pattern = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %out: Tensor = aten::full(%1, %2, %3, %4, %5, %6)
      return (%out))IR";

  // Tensors created via aten::full are initialized on the CPU by default, so the
  // output should be created directly on the target device to avoid device mismatch issues
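  // Rather than inserting an aten::to cast, the rewrite overrides the device
  // argument (%5) of aten::full with a constant for the target device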

  // Separate string into portions to insert device name
  std::string clean_pattern_part_1 = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %device: Device = prim::Constant[value=")IR";

  std::string clean_pattern_part_2 = R"IR("]()
      %out: Tensor = aten::full(%1, %2, %3, %4, %device, %6)
      return (%out))IR";

  auto full_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;

  torch::jit::SubgraphRewriter full_cast_rewriter;
  full_cast_rewriter.RegisterRewritePattern(full_cast_pattern, full_clean_pattern);
  full_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast full: " << *graph);
}

void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string scalar_implicit_cast_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::ScalarImplicit(%1)
      return (%2))IR";

  // ScalarImplicit can only unpack 0D tensors, whereas Tensors operated on by
  // TensorRT are padded to 1 dimension. aten::item() resolves this conflict
  std::string scalar_implicit_clean_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::item(%1)
      return (%2))IR";

  torch::jit::SubgraphRewriter scalar_implicit_cast_rewriter;
  scalar_implicit_cast_rewriter.RegisterRewritePattern(scalar_implicit_cast_pattern, scalar_implicit_clean_pattern);
  scalar_implicit_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After replace ScalarImplicit: " << *graph);
}

} // namespace passes
} // namespace lowering
} // namespace core
} // namespace torch_tensorrt
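
// Example usage (illustrative sketch only, not part of this file): these passes
// expect the target device rendered as a string such as "cuda:0" and are meant
// to be run on a graph during lowering. The wrapper name below is a hypothetical
// helper, not an existing API.
//
//   void RunDeviceCastingPasses(std::shared_ptr<torch::jit::Graph>& g, const std::string& device) {
//     torch_tensorrt::core::lowering::passes::UnpackAndCastMaskedFill(g, device);
//     torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g, device);
//     torch_tensorrt::core::lowering::passes::UnpackAndCastFull(g, device);
//     torch_tensorrt::core::lowering::passes::ReplaceScalarImplicit(g);
//   }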