From d00240e4d8ff157b947e150110c334a1f79d7c4d Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 12 Aug 2022 11:15:00 -0700 Subject: [PATCH 01/11] feat(aten::hardsigmoid): Unpack hardsigmoid Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/lowering/lowering.cpp | 1 + core/lowering/passes/BUILD | 1 + core/lowering/passes/passes.h | 1 + core/lowering/passes/unpack_hardsigmoid.cpp | 43 +++++++++ tests/core/lowering/BUILD | 5 ++ .../core/lowering/test_unpack_hardsigmoid.cpp | 87 +++++++++++++++++++ 6 files changed, 138 insertions(+) create mode 100644 core/lowering/passes/unpack_hardsigmoid.cpp create mode 100644 tests/core/lowering/test_unpack_hardsigmoid.cpp diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index 8bbae296c3..5442440422 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -41,6 +41,7 @@ void LowerGraph(std::shared_ptr& g, LowerInfo lower_info) { passes::MarkNodesForFallback(g, true); } passes::UnpackHardSwish(g); + passes::UnpackHardSigmoid(g); passes::EliminateExceptionOrPassPattern(g); passes::ReduceToOperation(g); passes::ReduceGelu(g); diff --git a/core/lowering/passes/BUILD b/core/lowering/passes/BUILD index 1f6a0cde8f..d5f3616f8d 100644 --- a/core/lowering/passes/BUILD +++ b/core/lowering/passes/BUILD @@ -30,6 +30,7 @@ cc_library( "silu_to_sigmoid_multiplication.cpp", "unpack_addmm.cpp", "unpack_batch_norm.cpp", + "unpack_hardsigmoid.cpp", "unpack_hardswish.cpp", "unpack_log_softmax.cpp", "unpack_std.cpp", diff --git a/core/lowering/passes/passes.h b/core/lowering/passes/passes.h index 73bd9f61d7..3b946593e2 100644 --- a/core/lowering/passes/passes.h +++ b/core/lowering/passes/passes.h @@ -38,6 +38,7 @@ void UnpackVar(std::shared_ptr& graph); void AliasOperators(std::shared_ptr& graph); void SiluToSigmoidMultipication(std::shared_ptr& graph); void UnpackHardSwish(std::shared_ptr& graph); +void UnpackHardSigmoid(std::shared_ptr& graph); } // namespace passes } // namespace lowering diff --git a/core/lowering/passes/unpack_hardsigmoid.cpp b/core/lowering/passes/unpack_hardsigmoid.cpp new file mode 100644 index 0000000000..876196215a --- /dev/null +++ b/core/lowering/passes/unpack_hardsigmoid.cpp @@ -0,0 +1,43 @@ +#include "torch/csrc/jit/passes/subgraph_rewrite.h" + +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace lowering { +namespace passes { + +void UnpackHardSigmoid(std::shared_ptr& graph) { + std::string hardsigmoid_pattern = R"IR( + graph(%input): + %result = aten::hardsigmoid(%input) + return (%result))IR"; + + std::string hardsigmoid_pattern_inplace = R"IR( + graph(%input): + %result = aten::hardsigmoid_(%input) + return (%result))IR"; + + std::string new_pattern = R"IR( + graph(%x.1): + %22 : float = prim::Constant[value=0.5]() + %3 : int = prim::Constant[value=6]() + %5 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=0]() + %4 : Tensor = aten::div(%x.1, %3) + %9 : Tensor = aten::add(%4, %22, %5) + %21 : Tensor = aten::clamp(%9, %10, %5) + return (%21))IR"; + + torch::jit::SubgraphRewriter rewriter; + rewriter.RegisterRewritePattern(hardsigmoid_pattern, new_pattern); + rewriter.RegisterRewritePattern(hardsigmoid_pattern_inplace, new_pattern); + rewriter.runOnGraph(graph); + + LOG_GRAPH("Post unpack hardsigmoid: " << *graph); +} + +} // namespace passes +} // namespace lowering +} // namespace core +} // namespace torch_tensorrt diff --git a/tests/core/lowering/BUILD b/tests/core/lowering/BUILD index b33685a647..75ae818905 100644 --- 
a/tests/core/lowering/BUILD +++ b/tests/core/lowering/BUILD @@ -75,6 +75,10 @@ lowering_test( name = "test_silu_to_sigmoid_multiplication", ) +lowering_test( + name = "test_unpack_hardsigmoid", +) + lowering_test( name = "test_unpack_hardswish", ) @@ -98,6 +102,7 @@ test_suite( ":test_remove_detach_pass", ":test_remove_dropout_pass", ":test_remove_unnecessary_casts", + ":test_unpack_hardsigmoid", ":test_unpack_hardswish", ":test_unpack_reduce_ops", ":test_view_to_reshape_pass", diff --git a/tests/core/lowering/test_unpack_hardsigmoid.cpp b/tests/core/lowering/test_unpack_hardsigmoid.cpp new file mode 100644 index 0000000000..f8206511be --- /dev/null +++ b/tests/core/lowering/test_unpack_hardsigmoid.cpp @@ -0,0 +1,87 @@ +#include +#include "core/compiler.h" +#include "core/lowering/passes/passes.h" +#include "gtest/gtest.h" +#include "tests/util/util.h" +#include "torch/csrc/jit/ir/irparser.h" +#include "torch/csrc/jit/ir/subgraph_matcher.h" + +TEST(LoweringPasses, UnpackHardSigmoid) { + std::string source_graph = R"IR( + graph(%input): + %result = aten::hardsigmoid(%input) + return (%result))IR"; + + std::string target_graph = R"IR( + graph(%x.1): + %22 : float = prim::Constant[value=0.5]() + %3 : int = prim::Constant[value=6]() + %5 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=0]() + %4 : Tensor = aten::div(%x.1, %3) + %9 : Tensor = aten::add(%4, %22, %5) + %21 : Tensor = aten::clamp(%9, %10, %5) + return (%21))IR"; + + torch_tensorrt::core::util::logging::get_logger().set_reportable_log_level( + torch_tensorrt::core::util::logging::LogLevel::kGRAPH); + auto sg = std::make_shared(); + torch::jit::parseIR(source_graph, &*sg); + + auto in = at::rand({10, 100}, {at::kCUDA}); + auto sg_params = torch_tensorrt::core::ir::get_static_params(sg->inputs(), {}); + auto sg_results = torch_tensorrt::tests::util::RunGraph(sg, sg_params, {in}); + + torch_tensorrt::core::lowering::passes::UnpackHardSigmoid(sg); + + auto tg = std::make_shared(); + torch::jit::parseIR(target_graph, &*tg); + + ASSERT_TRUE(!torch::jit::findPatternMatches(*tg, *sg).empty()); + + in = at::clone(in); + auto tg_params = torch_tensorrt::core::ir::get_static_params(tg->inputs(), {}); + auto tg_results = torch_tensorrt::tests::util::RunGraph(tg, tg_params, {in}); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(sg_results[0], tg_results[0], 2e-6)); +} + +TEST(LoweringPasses, UnpackHardSigmoidInPlace) { + std::string source_graph = R"IR( + graph(%input): + %result = aten::hardsigmoid_(%input) + return (%result))IR"; + + std::string target_graph = R"IR( + graph(%x.1): + %22 : float = prim::Constant[value=0.5]() + %3 : int = prim::Constant[value=6]() + %5 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=0]() + %4 : Tensor = aten::div(%x.1, %3) + %9 : Tensor = aten::add(%4, %22, %5) + %21 : Tensor = aten::clamp(%9, %10, %5) + return (%21))IR"; + + torch_tensorrt::core::util::logging::get_logger().set_reportable_log_level( + torch_tensorrt::core::util::logging::LogLevel::kGRAPH); + auto sg = std::make_shared(); + torch::jit::parseIR(source_graph, &*sg); + + auto in = at::rand({10, 100}, {at::kCUDA}); + auto sg_params = torch_tensorrt::core::ir::get_static_params(sg->inputs(), {}); + auto sg_results = torch_tensorrt::tests::util::RunGraph(sg, sg_params, {in}); + + torch_tensorrt::core::lowering::passes::UnpackHardSigmoid(sg); + + auto tg = std::make_shared(); + torch::jit::parseIR(target_graph, &*tg); + + ASSERT_TRUE(!torch::jit::findPatternMatches(*tg, *sg).empty()); + + in = 
at::clone(in); + auto tg_params = torch_tensorrt::core::ir::get_static_params(tg->inputs(), {}); + auto tg_results = torch_tensorrt::tests::util::RunGraph(tg, tg_params, {in}); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(sg_results[0], tg_results[0], 2e-6)); +} From c9e504d8bbee4848698726e6e3dbc51274965843 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 13 Aug 2022 16:48:25 -0700 Subject: [PATCH 02/11] chore: reorg in prep for state centralizing Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 14 +++---- core/compiler.h | 2 +- core/lowering/passes/CMakeLists.txt | 1 + core/partitioning/BUILD | 10 +---- core/partitioning/CMakeLists.txt | 37 ++++++++++-------- core/partitioning/partitioning.cpp | 24 ++++++------ core/partitioning/partitioning.h | 21 +++++++--- core/partitioning/partitioninginfo/BUILD | 39 +++++++++++++++++++ .../partitioninginfo/CMakeLists.txt | 12 ++++++ .../PartitioningInfo.cpp} | 4 +- .../PartitioningInfo.h} | 4 +- core/partitioning/segmentedblock/BUILD | 39 +++++++++++++++++++ .../segmentedblock/CMakeLists.txt | 12 ++++++ .../{ => segmentedblock}/SegmentedBlock.cpp | 0 .../{ => segmentedblock}/SegmentedBlock.h | 1 - core/partitioning/shape_analysis.cpp | 19 ++++----- core/partitioning/shape_analysis.h | 20 ---------- cpp/src/compile_spec.cpp | 8 ++-- 18 files changed, 179 insertions(+), 88 deletions(-) create mode 100644 core/partitioning/partitioninginfo/BUILD create mode 100644 core/partitioning/partitioninginfo/CMakeLists.txt rename core/partitioning/{PartitionInfo.cpp => partitioninginfo/PartitioningInfo.cpp} (82%) rename core/partitioning/{PartitionInfo.h => partitioninginfo/PartitioningInfo.h} (79%) create mode 100644 core/partitioning/segmentedblock/BUILD create mode 100644 core/partitioning/segmentedblock/CMakeLists.txt rename core/partitioning/{ => segmentedblock}/SegmentedBlock.cpp (100%) rename core/partitioning/{ => segmentedblock}/SegmentedBlock.h (98%) delete mode 100644 core/partitioning/shape_analysis.h diff --git a/core/compiler.cpp b/core/compiler.cpp index 7b58dbb2c1..8b62aeba4d 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -227,11 +227,11 @@ GraphAndMapping ConstructFallbackGraph( ir::StaticParams static_params, std::unordered_map& fallback_nodes) { auto convert_cfg = cfg.convert_info; - auto partition_info = cfg.partition_info; + auto partitioning_info = cfg.partitioning_info; auto new_g = std::make_shared(); - auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info, fallback_nodes); + auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partitioning_info, fallback_nodes); // the mapping from lowering graph => fallback global graph std::unordered_map old_to_new_g; @@ -339,7 +339,7 @@ void MapInputsAndDetermineDTypes( "Cannot infer input type from calcuations in graph for input " << in->debugName() << ". Assuming it is Float32. 
If not, specify input type explicity"); spec[i].dtype = nvinfer1::DataType::kFLOAT; - } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { + } else if (spec[i].dtype_is_user_defined && cfg.partitioning_info.enabled) { if (!est_type_opt[i]) { LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); std::stringstream ss; @@ -424,15 +424,15 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true); auto outputIsCollection = conversion::OutputIsCollection(g->block()); - if (cfg.partition_info.enabled && + if (cfg.partitioning_info.enabled && (cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { + cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { LOG_INFO("Skipping partitioning since model is fully supported"); } - if (cfg.partition_info.enabled && + if (cfg.partitioning_info.enabled && (!(cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || + cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || outputIsCollection)) { std::unordered_map fallback_nodes; auto collection_input_ivalues_map = diff --git a/core/compiler.h b/core/compiler.h index c8dc85020b..1b7b3defe8 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -19,7 +19,7 @@ struct CompileSpec { ir::GraphInputs graph_inputs; conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; - partitioning::PartitionInfo partition_info; + partitioning::PartitioningInfo partitioning_info; }; bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::string method_name); diff --git a/core/lowering/passes/CMakeLists.txt b/core/lowering/passes/CMakeLists.txt index a8cda65e71..48e644a70d 100644 --- a/core/lowering/passes/CMakeLists.txt +++ b/core/lowering/passes/CMakeLists.txt @@ -17,6 +17,7 @@ target_sources(${lib_name} "${CMAKE_CURRENT_SOURCE_DIR}/silu_to_sigmoid_multiplication.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/unpack_addmm.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/unpack_batch_norm.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/unpack_hardsigmoid.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/unpack_hardswish.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/unpack_log_softmax.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/unpack_std.cpp" diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index fbc9eeac7a..22b3ff6729 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -13,22 +13,19 @@ config_setting( cc_library( name = "partitioning", srcs = [ - "PartitionInfo.cpp", - "SegmentedBlock.cpp", "partitioning.cpp", "shape_analysis.cpp", ], hdrs = [ - "PartitionInfo.h", - "SegmentedBlock.h", "partitioning.h", - "shape_analysis.h", ], deps = [ "//core/util:prelude", "//core/ir", "//core/conversion", "//core/lowering", + "//core/partitioning/partitioninginfo", + "//core/partitioning/segmentedblock", ] + select({ ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], "//conditions:default": ["@libtorch//:libtorch"], @@ -39,10 +36,7 @@ cc_library( pkg_tar( name = "include", srcs = [ - "PartitionInfo.h", - "SegmentedBlock.h", "partitioning.h", - "shape_analysis.h", ], package_dir = "core/partitioning/", ) diff --git a/core/partitioning/CMakeLists.txt 
b/core/partitioning/CMakeLists.txt index 15784f638e..7ce16fd67f 100644 --- a/core/partitioning/CMakeLists.txt +++ b/core/partitioning/CMakeLists.txt @@ -1,33 +1,38 @@ set(lib_name "core_partitioning") add_library(${lib_name} OBJECT) -target_sources(${lib_name} - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/SegmentedBlock.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/shape_analysis.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/partitioning.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/PartitionInfo.cpp" - $ - PUBLIC $ - $ +set(CXX_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/partitioning.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/shape_analysis.cpp" ) set(HEADER_FILES - "${CMAKE_CURRENT_SOURCE_DIR}/SegmentedBlock.h" - "${CMAKE_CURRENT_SOURCE_DIR}/shape_analysis.h" - "${CMAKE_CURRENT_SOURCE_DIR}/PartitionInfo.h" "${CMAKE_CURRENT_SOURCE_DIR}/partitioning.h" ) -target_include_directories(${lib_name} PUBLIC "$") +target_sources(${lib_name} + PRIVATE + ${CXX_SRCS} + PUBLIC + $ + $ + $ +) + target_link_libraries(${lib_name} PUBLIC - torch TensorRT::nvinfer + torch core_ir core_util - PRIVATE core_conversion ) -# Install headers -install(FILES ${HEADER_FILES} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/torch_tensorrt/core/partitioning/") +target_include_directories(${lib_name} + PUBLIC "$" +) + +add_subdirectory(partitioninginfo) +add_subdirectory(segmentedblock) + +install(FILES ${HEADER_FILES} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/torch_tensorrt/core/partitioning") \ No newline at end of file diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 28bfd0712c..9018ce1fd9 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -1,12 +1,12 @@ -#include "partitioning.h" - #include -#include "core/conversion/conversion.h" -#include "core/conversion/evaluators/evaluators.h" -#include "core/partitioning/shape_analysis.h" + #include "torch/csrc/jit/passes/constant_pooling.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "core/conversion/conversion.h" +#include "core/conversion/evaluators/evaluators.h" +#include "core/partitioning/partitioning.h" + namespace torch_tensorrt { namespace core { namespace partitioning { @@ -357,11 +357,11 @@ void find_min_block_size_fallback_nodes( PartitionedGraph segment_graph( torch::jit::Block* block, - const PartitionInfo& partition_info, + const PartitioningInfo& partitioning_info, std::unordered_map& global_fallback_nodes) { - auto min_block_size = partition_info.min_block_size; + auto min_block_size = partitioning_info.min_block_size; std::unordered_set forced_fallback_ops( - partition_info.forced_fallback_operators.begin(), partition_info.forced_fallback_operators.end()); + partitioning_info.forced_fallback_operators.begin(), partitioning_info.forced_fallback_operators.end()); // get the initial fallback nodes (nodes that are unsupported or forced fallback) get_fallback_nodes(block, forced_fallback_ops, global_fallback_nodes); @@ -450,16 +450,16 @@ PartitionedGraph segment_graph( PartitionedGraph Partition( torch::jit::Block* block, std::unordered_map& example_tensor_map, - const PartitionInfo& partition_info, + const PartitioningInfo& partitioning_info, std::unordered_map& global_fallback_nodes) { - LOG_DEBUG(partition_info); + LOG_DEBUG(partitioning_info); // if there is nonTensor input/output for the entire graph, fallback the node that consumes/produces this nonTensor // output fallback_graph_nontensor_in_out(block, global_fallback_nodes); // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module 
into PyTorch and TensorRT sub blocks"); - PartitionedGraph segmented_blocks = segment_graph(block, partition_info, global_fallback_nodes); + PartitionedGraph segmented_blocks = segment_graph(block, partitioning_info, global_fallback_nodes); // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks @@ -471,7 +471,7 @@ PartitionedGraph Partition( registerSegmentsOutputs(segmented_blocks, block); // run shape analysis on each segmented block - runShapeAnalysis(segmented_blocks, example_tensor_map, partition_info); + runShapeAnalysis(segmented_blocks, example_tensor_map, partitioning_info); for (uint64_t i = 0; i < segmented_blocks.size(); i++) { segmented_blocks[i].update_id(i); diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index f1eb38df8a..54a5334e07 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -3,12 +3,12 @@ #include #include +#include "torch/csrc/jit/ir/ir.h" + #include "core/ir/ir.h" -#include "core/partitioning/PartitionInfo.h" -#include "core/partitioning/SegmentedBlock.h" -#include "core/partitioning/shape_analysis.h" +#include "core/partitioning/partitioninginfo/PartitioningInfo.h" +#include "core/partitioning/segmentedblock/SegmentedBlock.h" #include "core/util/prelude.h" -#include "torch/csrc/jit/ir/ir.h" namespace torch_tensorrt { namespace core { @@ -30,15 +30,24 @@ enum FallbackNodeType { kNON_TENSOR, }; +std::unordered_map generateRandomInputs( + std::unordered_map>& input_ranges, + std::unordered_map>>& input_types); + +void runShapeAnalysis( + std::vector& segmented_blocks, + std::unordered_map& ivalues_maps, + const PartitioningInfo& partitioning_info); + PartitionedGraph segment_graph( torch::jit::Block* block, - const PartitionInfo& partition_info, + const PartitioningInfo& partitioning_info, std::unordered_map& fallback_nodes); PartitionedGraph Partition( torch::jit::Block* block, std::unordered_map& example_tensor_map, - const PartitionInfo& partition_info, + const PartitioningInfo& partitioning_info, std::unordered_map& fallback_nodes); std::ostream& operator<<(std::ostream& os, const PartitionedGraph& g); diff --git a/core/partitioning/partitioninginfo/BUILD b/core/partitioning/partitioninginfo/BUILD new file mode 100644 index 0000000000..74e34d134b --- /dev/null +++ b/core/partitioning/partitioninginfo/BUILD @@ -0,0 +1,39 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") +load("@rules_pkg//:pkg.bzl", "pkg_tar") + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "use_pre_cxx11_abi", + values = { + "define": "abi=pre_cxx11_abi", + }, +) + +cc_library( + name = "partitioninginfo", + srcs = [ + "PartitioningInfo.cpp", + ], + hdrs = [ + "PartitioningInfo.h", + ], + deps = [ + "//core/util:prelude", + "//core/ir", + "//core/conversion", + "//core/lowering", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }), + alwayslink = True, +) + +pkg_tar( + name = "include", + srcs = [ + "PartitioningInfo.h", + ], + package_dir = "core/partitioning/partitioninginfo", +) diff --git a/core/partitioning/partitioninginfo/CMakeLists.txt b/core/partitioning/partitioninginfo/CMakeLists.txt new file mode 100644 index 0000000000..86c7388daf --- /dev/null +++ b/core/partitioning/partitioninginfo/CMakeLists.txt @@ -0,0 +1,12 @@ +set(sub_lib_name "partitioninginfo") + +target_sources(${lib_name} + PRIVATE 
"${CMAKE_CURRENT_SOURCE_DIR}/PartitioningInfo.cpp" +) + +set(HEADER_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/PartitioningInfo.h" +) + +# Install headers +install(FILES ${HEADER_FILES} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/torch_tensorrt/core/partitioning/${sub_lib_name}") diff --git a/core/partitioning/PartitionInfo.cpp b/core/partitioning/partitioninginfo/PartitioningInfo.cpp similarity index 82% rename from core/partitioning/PartitionInfo.cpp rename to core/partitioning/partitioninginfo/PartitioningInfo.cpp index 59e29a9bf1..16bdd7b9a7 100644 --- a/core/partitioning/PartitionInfo.cpp +++ b/core/partitioning/partitioninginfo/PartitioningInfo.cpp @@ -2,13 +2,13 @@ #include #include -#include "core/partitioning/PartitionInfo.h" +#include "core/partitioning/partitioninginfo/PartitioningInfo.h" namespace torch_tensorrt { namespace core { namespace partitioning { // clang-format off -std::ostream& operator<<(std::ostream& os, const PartitionInfo& s) { +std::ostream& operator<<(std::ostream& os, const PartitioningInfo& s) { os << "Settings requested for Torch Fallback:" \ << "\n \"enabled\": "; if (s.enabled) { diff --git a/core/partitioning/PartitionInfo.h b/core/partitioning/partitioninginfo/PartitioningInfo.h similarity index 79% rename from core/partitioning/PartitionInfo.h rename to core/partitioning/partitioninginfo/PartitioningInfo.h index dc63597912..d57d79368f 100644 --- a/core/partitioning/PartitionInfo.h +++ b/core/partitioning/partitioninginfo/PartitioningInfo.h @@ -8,14 +8,14 @@ namespace torch_tensorrt { namespace core { namespace partitioning { -struct PartitionInfo { +struct PartitioningInfo { bool enabled = false; uint64_t min_block_size = 1; std::vector forced_fallback_operators; bool truncate_long_and_double; }; -std::ostream& operator<<(std::ostream& os, const PartitionInfo& s); +std::ostream& operator<<(std::ostream& os, const PartitioningInfo& s); } // namespace partitioning } // namespace core diff --git a/core/partitioning/segmentedblock/BUILD b/core/partitioning/segmentedblock/BUILD new file mode 100644 index 0000000000..8efe1e6b0a --- /dev/null +++ b/core/partitioning/segmentedblock/BUILD @@ -0,0 +1,39 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") +load("@rules_pkg//:pkg.bzl", "pkg_tar") + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "use_pre_cxx11_abi", + values = { + "define": "abi=pre_cxx11_abi", + }, +) + +cc_library( + name = "segmentedblock", + srcs = [ + "SegmentedBlock.cpp", + ], + hdrs = [ + "SegmentedBlock.h", + ], + deps = [ + "//core/util:prelude", + "//core/ir", + "//core/conversion", + "//core/lowering", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }), + alwayslink = True, +) + +pkg_tar( + name = "include", + srcs = [ + "SegmentedBlock.h", + ], + package_dir = "core/partitioning/segmentedblock", +) diff --git a/core/partitioning/segmentedblock/CMakeLists.txt b/core/partitioning/segmentedblock/CMakeLists.txt new file mode 100644 index 0000000000..ad6d9ee875 --- /dev/null +++ b/core/partitioning/segmentedblock/CMakeLists.txt @@ -0,0 +1,12 @@ +set(sub_lib_name "segmentedblock") + +target_sources(${lib_name} + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/SegmentedBlock.cpp" +) + +set(HEADER_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/SegmentedBlock.h" +) + +# Install headers +install(FILES ${HEADER_FILES} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/torch_tensorrt/core/partitioning/${sub_lib_name}") diff --git a/core/partitioning/SegmentedBlock.cpp 
b/core/partitioning/segmentedblock/SegmentedBlock.cpp similarity index 100% rename from core/partitioning/SegmentedBlock.cpp rename to core/partitioning/segmentedblock/SegmentedBlock.cpp diff --git a/core/partitioning/SegmentedBlock.h b/core/partitioning/segmentedblock/SegmentedBlock.h similarity index 98% rename from core/partitioning/SegmentedBlock.h rename to core/partitioning/segmentedblock/SegmentedBlock.h index f7d8a0b612..0e04237f63 100644 --- a/core/partitioning/SegmentedBlock.h +++ b/core/partitioning/segmentedblock/SegmentedBlock.h @@ -5,7 +5,6 @@ #include "NvInfer.h" #include "core/ir/ir.h" -#include "core/partitioning/PartitionInfo.h" #include "torch/csrc/jit/ir/ir.h" namespace torch_tensorrt { diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 7a36529949..06344a2ac6 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -1,9 +1,10 @@ -#include "core/partitioning/shape_analysis.h" -#include -#include "core/util/prelude.h" +#include "ATen/ATen.h" #include "torch/csrc/jit/api/module.h" #include "torch/csrc/jit/passes/constant_pooling.h" +#include "core/partitioning/partitioning.h" +#include "core/util/prelude.h" + namespace torch_tensorrt { namespace core { namespace partitioning { @@ -61,7 +62,7 @@ std::unordered_map generateRandomI void getSegmentsOutputByRunning( SegmentedBlock& seg_block, std::unordered_map& ivalues_maps, - const PartitionInfo& partition_info) { + const PartitioningInfo& partitioning_info) { // create a module to run the graph auto g = seg_block.g(); auto copy_g = g->copy(); @@ -151,13 +152,13 @@ void getSegmentsOutputByRunning( // shape inference auto cur_ivalue = ivalues_maps[i]; at::ScalarType t = cur_ivalue.toTensor().scalar_type(); - if (!partition_info.truncate_long_and_double && (t == at::kLong || t == at::kDouble)) { + if (!partitioning_info.truncate_long_and_double && (t == at::kLong || t == at::kDouble)) { TORCHTRT_THROW_ERROR( "Unable to process subgraph input type of at::kLong/at::kDouble, try to compile model with truncate_long_and_double enabled"); - } else if (partition_info.truncate_long_and_double && t == at::kLong) { + } else if (partitioning_info.truncate_long_and_double && t == at::kLong) { cur_ivalue = cur_ivalue.toTensor().to(at::kInt); LOG_WARNING("Truncating graph input type from at::kLong to at::kInt"); - } else if (partition_info.truncate_long_and_double && t == at::kDouble) { + } else if (partitioning_info.truncate_long_and_double && t == at::kDouble) { cur_ivalue = cur_ivalue.toTensor().to(at::kFloat); LOG_WARNING("Truncating graph input type from at::kDouble to at::kFloat"); } @@ -183,11 +184,11 @@ void getSegmentsOutputByRunning( void runShapeAnalysis( std::vector& segmented_blocks, std::unordered_map& example_tensor_map, - const PartitionInfo& partition_info) { + const PartitioningInfo& partitioning_info) { // register every segment's input shape, and it's running output IValues for (auto& seg_block : segmented_blocks) { torch::jit::ConstantPooling(seg_block.g()); - getSegmentsOutputByRunning(seg_block, example_tensor_map, partition_info); + getSegmentsOutputByRunning(seg_block, example_tensor_map, partitioning_info); } return; } diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h deleted file mode 100644 index 780449d514..0000000000 --- a/core/partitioning/shape_analysis.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "core/ir/ir.h" -#include "core/partitioning/SegmentedBlock.h" -#include "torch/csrc/jit/ir/ir.h" - 
-namespace torch_tensorrt { -namespace core { -namespace partitioning { - -std::unordered_map generateRandomInputs( - std::unordered_map>& input_ranges, - std::unordered_map>>& input_types); - -void runShapeAnalysis( - std::vector& segmented_blocks, - std::unordered_map& ivalues_maps, - const PartitionInfo& partition_info); - -} // namespace partitioning -} // namespace core -} // namespace torch_tensorrt diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index cfbc228396..3d7d9b15d3 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -121,10 +121,10 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { "require_full_compilation is enabled however the list of modules to run in torch is not empty (Found " << external.torch_executed_modules.size() << " modules)"); - internal.partition_info.enabled = !external.require_full_compilation; - internal.partition_info.min_block_size = external.min_block_size; - internal.partition_info.forced_fallback_operators = std::move(external.torch_executed_ops); - internal.partition_info.truncate_long_and_double = external.truncate_long_and_double; + internal.partitioning_info.enabled = !external.require_full_compilation; + internal.partitioning_info.min_block_size = external.min_block_size; + internal.partitioning_info.forced_fallback_operators = std::move(external.torch_executed_ops); + internal.partitioning_info.truncate_long_and_double = external.truncate_long_and_double; internal.lower_info.forced_fallback_modules = std::move(external.torch_executed_modules); switch (external.device.device_type) { From 85f8cc309599d78a60d31097677fb49936d364f8 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 14 Aug 2022 00:27:02 -0700 Subject: [PATCH 03/11] refactor(//core/partitioning): Centralizing partitioning state Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 50 ++-- core/partitioning/BUILD | 1 + core/partitioning/CMakeLists.txt | 1 + core/partitioning/partitioning.cpp | 219 +++++++----------- core/partitioning/partitioning.h | 47 +--- core/partitioning/partitioningctx/BUILD | 40 ++++ .../partitioningctx/CMakeLists.txt | 12 + .../partitioningctx/PartitioningCtx.cpp | 123 ++++++++++ .../partitioningctx/PartitioningCtx.h | 69 ++++++ .../partitioninginfo/PartitioningInfo.h | 3 + core/partitioning/shape_analysis.cpp | 9 +- .../lowering/test_module_fallback_passes.cpp | 2 +- .../lowering/test_view_to_reshape_pass.cpp | 4 +- tests/core/partitioning/test_conditionals.cpp | 6 +- .../test_fallback_graph_output.cpp | 8 +- .../core/partitioning/test_loading_model.cpp | 2 +- .../core/partitioning/test_loop_fallback.cpp | 4 +- .../test_resolve_nontensor_inputs.cpp | 33 ++- tests/core/partitioning/test_segmentation.cpp | 152 ++++++------ .../core/partitioning/test_shape_analysis.cpp | 18 +- .../core/partitioning/test_stitched_graph.cpp | 4 +- .../partitioning/test_tensorrt_conversion.cpp | 4 +- 22 files changed, 496 insertions(+), 315 deletions(-) create mode 100644 core/partitioning/partitioningctx/BUILD create mode 100644 core/partitioning/partitioningctx/CMakeLists.txt create mode 100644 core/partitioning/partitioningctx/PartitioningCtx.cpp create mode 100644 core/partitioning/partitioningctx/PartitioningCtx.h diff --git a/core/compiler.cpp b/core/compiler.cpp index 8b62aeba4d..ddf59c4a0c 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -219,19 +219,16 @@ void AddIfBlockToGraph( return; } -GraphAndMapping ConstructFallbackGraph( +GraphAndMapping ConstructFallbackGraph_( 
torch::jit::script::Module& new_mod, torch::jit::Block* block, - std::unordered_map example_tensor_map, - CompileSpec cfg, + partitioning::PartitioningCtx* partitioning_ctx, + conversion::ConversionInfo convert_info, ir::StaticParams static_params, - std::unordered_map& fallback_nodes) { - auto convert_cfg = cfg.convert_info; - auto partitioning_info = cfg.partitioning_info; - + std::unordered_map example_tensor_map) { auto new_g = std::make_shared(); - auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partitioning_info, fallback_nodes); + auto segmented_blocks = partitioning::partition(partitioning_ctx, block, example_tensor_map); // the mapping from lowering graph => fallback global graph std::unordered_map old_to_new_g; @@ -240,7 +237,7 @@ GraphAndMapping ConstructFallbackGraph( } for (auto& seg_block : segmented_blocks) { - LOG_INFO(seg_block << "(GraphInSegmentedBlock)\n"); + LOG_INFO("Block segment:" << seg_block); std::ostringstream trt_engine_id; trt_engine_id << reinterpret_cast(&seg_block); @@ -254,12 +251,12 @@ GraphAndMapping ConstructFallbackGraph( inputs.push_back(in); } // update the input ranges for each segments - convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); + convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); // TODO mapping Inputs Ivalue to flatten one here - auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params); + auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params); auto temp_g = std::make_shared(); - auto device_spec = convert_cfg.engine_settings.device; + auto device_spec = convert_info.engine_settings.device; auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true); @@ -272,8 +269,8 @@ GraphAndMapping ConstructFallbackGraph( // convert the 2 blocks in prim::if and get the converted graph with mappings std::vector graph_and_mappings; for (auto cur_block : if_node->blocks()) { - graph_and_mappings.push_back( - ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params, fallback_nodes)); + graph_and_mappings.push_back(ConstructFallbackGraph_( + new_mod, cur_block, partitioning_ctx, convert_info, static_params, example_tensor_map)); } AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g); @@ -303,6 +300,23 @@ GraphAndMapping ConstructFallbackGraph( return {new_g, old_to_new_g}; } +GraphAndMapping ConstructFallbackGraph( + torch::jit::script::Module& new_mod, + torch::jit::Block* block, + CompileSpec cfg, + ir::StaticParams static_params, + ir::CollectionTypeMap first_use_types) { + auto convert_info = cfg.convert_info; + auto partitioning_info = cfg.partitioning_info; + + auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info); + auto collection_input_ivalues_map = + partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); + + return ConstructFallbackGraph_( + new_mod, block, &partitioning_ctx, convert_info, static_params, collection_input_ivalues_map); +} + void MapInputsAndDetermineDTypes( CompileSpec& cfg, std::shared_ptr& g, @@ -310,6 +324,8 @@ void MapInputsAndDetermineDTypes( ir::CollectionTypeMap& first_use_type_map) { cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); + 
cfg.partitioning_info.collection_input_spec_map = + ir::CollectionInputSpecMap(cfg.convert_info.collection_input_spec_map); auto collection_inputs = ir::get_collection_inputs(g, static_params); LOG_DEBUG( @@ -434,11 +450,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) (!(cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || outputIsCollection)) { - std::unordered_map fallback_nodes; - auto collection_input_ivalues_map = - partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); - auto graph_and_mapping = ConstructFallbackGraph( - new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, fallback_nodes); + auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), cfg, static_params, first_use_types); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index 22b3ff6729..a1aa49ad4f 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -24,6 +24,7 @@ cc_library( "//core/ir", "//core/conversion", "//core/lowering", + "//core/partitioning/partitioningctx", "//core/partitioning/partitioninginfo", "//core/partitioning/segmentedblock", ] + select({ diff --git a/core/partitioning/CMakeLists.txt b/core/partitioning/CMakeLists.txt index 7ce16fd67f..7f83b3d891 100644 --- a/core/partitioning/CMakeLists.txt +++ b/core/partitioning/CMakeLists.txt @@ -32,6 +32,7 @@ target_include_directories(${lib_name} PUBLIC "$" ) +add_subdirectory(partitioningctx) add_subdirectory(partitioninginfo) add_subdirectory(segmentedblock) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 9018ce1fd9..1f1b341c55 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -6,6 +6,7 @@ #include "core/conversion/conversion.h" #include "core/conversion/evaluators/evaluators.h" #include "core/partitioning/partitioning.h" +#include "core/partitioning/partitioningctx/PartitioningCtx.h" namespace torch_tensorrt { namespace core { @@ -92,13 +93,11 @@ std::vector getDependencyNodes( // check if the input and output of the graph is Tensor after collection is enabled. 
If it is, then fallback related // nodes -void fallback_graph_nontensor_in_out( - torch::jit::Block* block, - std::unordered_map& global_fallback_nodes) { +void fallback_graph_nontensor_in_out(PartitioningCtx* ctx, torch::jit::Block* block) { // fallback nodes that produce entire graph's nonTensor output for (auto i : block->outputs()) { if (!isTensor(i)) { - global_fallback_nodes.insert({i->node(), FallbackNodeType::kNON_TENSOR}); + ctx->setNodeExecutorDecision(i->node(), NodeExecutorDecision::kNON_TENSOR); } } @@ -106,15 +105,13 @@ void fallback_graph_nontensor_in_out( for (auto i : block->inputs()) { if (!isTensor(i)) { for (auto use : i->uses()) { - global_fallback_nodes.insert({use.user, FallbackNodeType::kNON_TENSOR}); + ctx->setNodeExecutorDecision(use.user, NodeExecutorDecision::kNON_TENSOR); } } } } -void find_all_fallback_nodes( - std::unordered_map& initial_fallback_nodes, - std::unordered_map& global_fallback_nodes) { +void find_all_fallback_nodes(PartitioningCtx* ctx, NodeExecutorDecisionMap& initial_fallback_nodes) { // initial_fallback_nodes are the fallback nodes that we have before we run BFS in this function // global_fallback_nodes are the fallback nodes that we maintain globally std::queue q; @@ -128,8 +125,9 @@ void find_all_fallback_nodes( q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too for (auto input : cur_node->inputs()) { + // NOTE: This does not make sense, does this rely on shortciruiting to work right? if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && - global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { + ctx->setNodeExecutorDecision(input->node(), NodeExecutorDecision::kNON_TENSOR)) { q.push(input->node()); } } @@ -138,8 +136,9 @@ void find_all_fallback_nodes( if (!isTensor(output)) { for (auto use : output->uses()) { auto node = use.user; + // NOTE: This does not make sense, does this rely on shortciruiting to work right? 
if (node->kind() != torch::jit::prim::Constant && - global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { + ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR)) { q.push(node); } } @@ -148,32 +147,32 @@ void find_all_fallback_nodes( } } -void resolveTRTNonTensorInputs(PartitionedGraph& segmented_blocks) { +void resolveTRTNonTensorInputs(PartitioningCtx* ctx) { // if a TRT segment has nonTensor Inputs, the nodes that produce this nonTensor Inputs must in another TensorRT engine // because we have already found the interface between Torch and TRT in segmentation phase // what we do here is just find the dependency nodes of the TRT segments that have nonTensor inputs - for (size_t i = 0; i < segmented_blocks.size(); ++i) { - if (segmented_blocks[i].target() == SegmentedBlock::kTensorRT) { + for (size_t i = 0; i < ctx->blocks.size(); ++i) { + if (ctx->blocks[i].target() == SegmentedBlock::kTensorRT) { std::vector inputs_to_resolve; - for (auto input : segmented_blocks[i].raw_inputs()) { + for (auto input : ctx->blocks[i].raw_inputs()) { if (!isTensor(input)) { inputs_to_resolve.push_back(input); } } if (!inputs_to_resolve.empty()) { - std::vector dependency_nodes = getDependencyNodes(inputs_to_resolve, segmented_blocks[i]); + std::vector dependency_nodes = getDependencyNodes(inputs_to_resolve, ctx->blocks[i]); dependency_nodes.insert( - dependency_nodes.end(), segmented_blocks[i].raw_nodes().begin(), segmented_blocks[i].raw_nodes().end()); - segmented_blocks[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes); + dependency_nodes.end(), ctx->blocks[i].raw_nodes().begin(), ctx->blocks[i].raw_nodes().end()); + ctx->blocks[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes); } } } } -void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Block* block) { +void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { // find the corresponding raw values in original global graph for this segmented block's inputs/outputs std::set input_values; - for (auto& seg_block : segmented_blocks) { + for (auto& seg_block : ctx->blocks) { for (auto& input : seg_block.raw_inputs()) { input_values.insert(input); } @@ -186,7 +185,7 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo // should be careful here because some in-place operations don't return any values, there is no output for this kind // of segment identify the output for each mini-graph by checking if any value in this graph is used later we // shouldn't register nonTensor output for TensorRT segments - for (auto& seg_block : segmented_blocks) { + for (auto& seg_block : ctx->blocks) { for (auto& mini_graph_input : input_values) { if (std::find(seg_block.raw_inputs().begin(), seg_block.raw_inputs().end(), mini_graph_input) == seg_block.raw_inputs().end() && @@ -215,20 +214,21 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo } } - std::for_each(segmented_blocks.begin(), segmented_blocks.end(), [](SegmentedBlock& seg_block) { + std::for_each(ctx->blocks.begin(), ctx->blocks.end(), [](SegmentedBlock& seg_block) { torch::jit::EliminateDeadCode(seg_block.g()); }); // erase segments which still have no output - segmented_blocks.erase( + ctx->blocks.erase( std::remove_if( - segmented_blocks.begin(), - segmented_blocks.end(), + ctx->blocks.begin(), + ctx->blocks.end(), [](SegmentedBlock& seg_block) { return seg_block.raw_outputs().empty(); }), - segmented_blocks.end()); + 
ctx->blocks.end()); return; } +// Need to check if this makes sense might be a root cause of some issues of over aggressive fallback bool checkLoopEvaluatable(torch::jit::Node* n) { bool compile_to_trt = true; for (auto bn : n->blocks()[0]->nodes()) { @@ -243,44 +243,9 @@ bool checkLoopEvaluatable(torch::jit::Node* n) { return compile_to_trt; } -bool check_node_fallback(torch::jit::Node* n, const std::unordered_map& fallback_nodes) { - if (fallback_nodes.count(n)) { - if (fallback_nodes.at(n) == FallbackNodeType::kUNSUPPORTED) { - LOG_GRAPH("Node not supported by conversion: " << util::node_info(n)); - } else if (fallback_nodes.at(n) == FallbackNodeType::kOPERATOR_FALLBACK) { - LOG_GRAPH("Node explicitly set to run in torch: " << util::node_info(n)); - } else if (fallback_nodes.at(n) == FallbackNodeType::kMODULE_FALLBACK) { - LOG_GRAPH("Node is within a module set to run in torch: " << util::node_info(n)); - } else if (fallback_nodes.at(n) == FallbackNodeType::kMIN_BLOCK_FALLBACK) { - LOG_GRAPH("Node fallback to Torch because of min_block_size" << util::node_info(n)); - } else { - LOG_GRAPH( - "Node fallback to Torch because the NonTensor dependencies with other fallback nodes: " - << util::node_info(n)); - } - return false; - } - - LOG_GRAPH("Node is going to run in TensorRT: " << util::node_info(n)); - return true; -} - -void finalize_block( - PartitionedGraph& g, - SegmentedBlock::SegmentedBlockTarget kind, - std::vector& nodes) { - LOG_DEBUG("Finalizing in progress " << SegmentedBlock::target_to_str(kind) << " block"); - g.emplace_back(g.size(), kind, nodes); - nodes.clear(); - LOG_DEBUG(g.back()); -} - // use this function to get all initial fallback nodes (nodes that are unsupported or forced fallback) // we use a map to indicate the reason why it's fallback to torch -void get_fallback_nodes( - torch::jit::Block* block, - const std::unordered_set& forced_fallback_ops, - std::unordered_map& fallback_nodes) { +void get_fallback_nodes(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); for (const auto n : nodes) { if (n->kind() == torch::jit::prim::Constant) { @@ -289,94 +254,90 @@ void get_fallback_nodes( // If the op is not supported by the conversion phase it should run in PyTorch if (!conversion::OpSupported(n)) { - fallback_nodes.insert({n, FallbackNodeType::kUNSUPPORTED}); + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kUNSUPPORTED); } // If the user specifies the op to run in Torch it should run in PyTorch - if (forced_fallback_ops.find(n->kind().toQualString()) != forced_fallback_ops.end()) { - fallback_nodes.insert({n, FallbackNodeType::kOPERATOR_FALLBACK}); + if (ctx->forced_fallback_ops.find(n->kind().toQualString()) != ctx->forced_fallback_ops.end()) { + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kOPERATOR_FALLBACK); } // If the user specifies the module containing this op to run in torch it should run in PyTorch const auto to_compile_sym = c10::Symbol::attr("to_compile"); if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { - fallback_nodes.insert({n, FallbackNodeType::kMODULE_FALLBACK}); + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kMODULE_FALLBACK); } } return; } -std::vector traverse_nodes_for_min_block_size( - torch::jit::Block* block, - const std::unordered_map& global_fallback_nodes, - size_t min_block_size) { +std::vector traverse_nodes_for_min_block_size(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); std::vector cur_trt_nodes; std::vector 
min_block_fallback_nodes; for (const auto n : nodes) { - if (n->kind() == torch::jit::prim::Constant) + if (n->kind() == torch::jit::prim::Constant) { continue; + } // check if current node fallback or not - if (!global_fallback_nodes.count(n)) { + if (!ctx->shouldNodeRunInTorch(n)) { // if this node is not in fallback nodes, then it's in trt segments cur_trt_nodes.push_back(n); } else { - if (cur_trt_nodes.size() < min_block_size) { + if (cur_trt_nodes.size() < ctx->settings.min_block_size) { min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); } cur_trt_nodes.clear(); } } - if (cur_trt_nodes.size() < min_block_size) { + if (cur_trt_nodes.size() < ctx->settings.min_block_size) { min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); } return min_block_fallback_nodes; } -void find_min_block_size_fallback_nodes( - torch::jit::Block* block, - std::unordered_map& global_fallback_nodes, - size_t min_block_size) { +void find_min_block_size_fallback_nodes(PartitioningCtx* ctx, torch::jit::Block* block) { // first traverse all the nodes to find the initial nodes that don't meet the min_block_size requirement - auto min_block_fallback_nodes = traverse_nodes_for_min_block_size(block, global_fallback_nodes, min_block_size); - std::unordered_map initial_fallback_nodes; + auto min_block_fallback_nodes = traverse_nodes_for_min_block_size(ctx, block); + NodeExecutorDecisionMap initial_fallback_nodes; // keep fallback until all segments meet the min_block_size requirement while (!min_block_fallback_nodes.empty()) { for (const auto i : min_block_fallback_nodes) { - initial_fallback_nodes.insert({i, FallbackNodeType::kMIN_BLOCK_FALLBACK}); + initial_fallback_nodes.insert({i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK}); + ctx->setNodeExecutorDecision(i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); } - global_fallback_nodes.insert(initial_fallback_nodes.begin(), initial_fallback_nodes.end()); // find the fallback nodes because of dependency with min_block_size caused fallback nodes - find_all_fallback_nodes(initial_fallback_nodes, global_fallback_nodes); + find_all_fallback_nodes(ctx, initial_fallback_nodes); // keep traverse the graph until there is no node fallback because of min_block_size - min_block_fallback_nodes = traverse_nodes_for_min_block_size(block, global_fallback_nodes, min_block_size); + min_block_fallback_nodes = traverse_nodes_for_min_block_size(ctx, block); } } -PartitionedGraph segment_graph( - torch::jit::Block* block, - const PartitioningInfo& partitioning_info, - std::unordered_map& global_fallback_nodes) { - auto min_block_size = partitioning_info.min_block_size; - std::unordered_set forced_fallback_ops( - partitioning_info.forced_fallback_operators.begin(), partitioning_info.forced_fallback_operators.end()); - +void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block) { // get the initial fallback nodes (nodes that are unsupported or forced fallback) - get_fallback_nodes(block, forced_fallback_ops, global_fallback_nodes); + get_fallback_nodes(ctx, block); // For fallback nodes, if it consumes any NonTensor inputs or TensorList inputs, then the node that produces this // input should also fallback Similarly, if it produces any NonTensor outputs or TensorList outputs, then the node // that produces this input should also fallback // TODO: don't need to fallback the TensorList related nodes once the collection feature is supported - 
find_all_fallback_nodes(global_fallback_nodes, global_fallback_nodes); + find_all_fallback_nodes(ctx, ctx->node_executor_decision_map); // find all fallback nodes because of the min_block_size requirement - find_min_block_size_fallback_nodes(block, global_fallback_nodes, min_block_size); + find_min_block_size_fallback_nodes(ctx, block); auto nodes = block->nodes(); - PartitionedGraph segmented_blocks; + + // NOTE: Realize this may be redundant, but will let us have an explicit state for each node. Maybe there is a better + // way for (auto n : nodes) { + // if (!ctx->shouldNodeRunInTorch(n) && !ctx->isNodeExecutorKnown(n)) { + // if (conversion::OpSupported(n)) { + // ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT); + // } + // } + // } // segment the nodes std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; @@ -386,24 +347,29 @@ PartitionedGraph segment_graph( continue; } // the outputs of trt subgraph shouldn't be collections - if (check_node_fallback(n, global_fallback_nodes)) { + if (!ctx->shouldNodeRunInTorch(n)) { in_prog_trt_blk_nodes.push_back(n); // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block - if (in_prog_trt_blk_nodes.size() >= min_block_size && !in_prog_pyt_blk_nodes.empty()) { - finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size && !in_prog_pyt_blk_nodes.empty()) { + ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } } else { // If there is an active TRT block that is valid segment and reset the active TRT block // otherwise add it to the active PyTorch block and reset - if (in_prog_trt_blk_nodes.size() >= min_block_size) { - finalize_block(segmented_blocks, SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); + if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size) { + ctx->finalizeNewBlock(SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); } else { LOG_DEBUG( - "In progress TRT block does not meet minimum block size requirements, therefore folding into in progress PyTorch block"); + "In progress TRT block does not meet minimum block size requirements (" + << in_prog_trt_blk_nodes.size() << ", expected at least " << ctx->settings.min_block_size + << "), therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); + for (auto n : in_prog_pyt_blk_nodes) { + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); + } } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -412,20 +378,20 @@ PartitionedGraph segment_graph( LOG_DEBUG( "Hit a conditional statement, finializing in progress PYT block and creating a new one for the conditional"); if (!in_prog_pyt_blk_nodes.empty()) { - finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } auto cond_node = std::vector{n}; - finalize_block(segmented_blocks, SegmentedBlock::kTorch, cond_node); + ctx->finalizeNewBlock(SegmentedBlock::kTorch, cond_node); continue; } else if (n->kind() == torch::jit::prim::Loop) { if (!in_prog_pyt_blk_nodes.empty()) { - finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } 
if (checkLoopEvaluatable(n)) { in_prog_trt_blk_nodes.push_back(n); } else { auto loop_node = std::vector{n}; - finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node); + ctx->finalizeNewBlock(SegmentedBlock::kTorch, loop_node); } continue; } @@ -435,60 +401,47 @@ PartitionedGraph segment_graph( // if there is any kTorch nodes left, then either the last nodes are kTorch or last nodes are kTensorRT but num < // min_block_size - if (in_prog_trt_blk_nodes.size() >= min_block_size) { - finalize_block(segmented_blocks, SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); + if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size) { + ctx->finalizeNewBlock(SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); } if (!in_prog_pyt_blk_nodes.empty() || !in_prog_trt_blk_nodes.empty()) { in_prog_pyt_blk_nodes.insert( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); - finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } - return segmented_blocks; + return; } -PartitionedGraph Partition( - torch::jit::Block* block, - std::unordered_map& example_tensor_map, - const PartitioningInfo& partitioning_info, - std::unordered_map& global_fallback_nodes) { - LOG_DEBUG(partitioning_info); +PartitionedGraph partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { + LOG_DEBUG(ctx->settings); // if there is nonTensor input/output for the entire graph, fallback the node that consumes/produces this nonTensor // output - fallback_graph_nontensor_in_out(block, global_fallback_nodes); + fallback_graph_nontensor_in_out(ctx, block); // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); - PartitionedGraph segmented_blocks = segment_graph(block, partitioning_info, global_fallback_nodes); + segment_graph(ctx, block); // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks // resolve nonTensor inputs/outputs - resolveTRTNonTensorInputs(segmented_blocks); + resolveTRTNonTensorInputs(ctx); // register input/output torch::jit::Value for segmented graphs LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); - registerSegmentsOutputs(segmented_blocks, block); + registerSegmentsOutputs(ctx, block); // run shape analysis on each segmented block - runShapeAnalysis(segmented_blocks, example_tensor_map, partitioning_info); + runShapeAnalysis(ctx, example_tensor_map); - for (uint64_t i = 0; i < segmented_blocks.size(); i++) { - segmented_blocks[i].update_id(i); + for (uint64_t i = 0; i < ctx->blocks.size(); i++) { + ctx->blocks[i].update_id(i); } - LOG_INFO(segmented_blocks); - - return segmented_blocks; -} + LOG_INFO(ctx->blocks); -std::ostream& operator<<(std::ostream& os, const PartitionedGraph& g) { - os << "Partitioned Graph: ["; - for (auto b : g) { - os << b; - } - os << "]"; - return os; + return std::move(ctx->blocks); } } // namespace partitioning diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 54a5334e07..320944d1d2 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -6,6 +6,7 @@ #include "torch/csrc/jit/ir/ir.h" #include "core/ir/ir.h" +#include "core/partitioning/partitioningctx/PartitioningCtx.h" #include "core/partitioning/partitioninginfo/PartitioningInfo.h" #include 
"core/partitioning/segmentedblock/SegmentedBlock.h" #include "core/util/prelude.h" @@ -14,43 +15,15 @@ namespace torch_tensorrt { namespace core { namespace partitioning { -typedef std::vector PartitionedGraph; - -enum FallbackNodeType { - /// Node is not supported by TensorRT - kUNSUPPORTED, - /// Node is explicitly forced to fallback to Pytorch due to operator fallback - kOPERATOR_FALLBACK, - /// Node is explicitly forced to fallback to Pytorch due to module fallback - kMODULE_FALLBACK, - /// This node is in a TRT segment which does not satisfy min_block_size - /// and hence is forced to fallback. - kMIN_BLOCK_FALLBACK, - /// This node produces/consumes non-tensor inputs - kNON_TENSOR, -}; - -std::unordered_map generateRandomInputs( - std::unordered_map>& input_ranges, - std::unordered_map>>& input_types); - -void runShapeAnalysis( - std::vector& segmented_blocks, - std::unordered_map& ivalues_maps, - const PartitioningInfo& partitioning_info); - -PartitionedGraph segment_graph( - torch::jit::Block* block, - const PartitioningInfo& partitioning_info, - std::unordered_map& fallback_nodes); - -PartitionedGraph Partition( - torch::jit::Block* block, - std::unordered_map& example_tensor_map, - const PartitioningInfo& partitioning_info, - std::unordered_map& fallback_nodes); - -std::ostream& operator<<(std::ostream& os, const PartitionedGraph& g); +typedef std::unordered_map ExampleIValues; + +ExampleIValues generateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); + +void runShapeAnalysis(PartitioningCtx* ctx, ExampleIValues& ivalues_maps); + +void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block); + +PartitionedGraph partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map); } // namespace partitioning } // namespace core diff --git a/core/partitioning/partitioningctx/BUILD b/core/partitioning/partitioningctx/BUILD new file mode 100644 index 0000000000..6895f8d451 --- /dev/null +++ b/core/partitioning/partitioningctx/BUILD @@ -0,0 +1,40 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") +load("@rules_pkg//:pkg.bzl", "pkg_tar") + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "use_pre_cxx11_abi", + values = { + "define": "abi=pre_cxx11_abi", + }, +) + +cc_library( + name = "partitioningctx", + srcs = [ + "PartitioningCtx.cpp", + ], + hdrs = [ + "PartitioningCtx.h", + ], + deps = [ + "//core/util:prelude", + "//core/ir", + "//core/conversion", + "//core/partitioning/segmentedblock", + "//core/partitioning/partitioninginfo", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }), + alwayslink = True, +) + +pkg_tar( + name = "include", + srcs = [ + "PartitioningCtx.h", + ], + package_dir = "core/partitioning/partitioningctx", +) diff --git a/core/partitioning/partitioningctx/CMakeLists.txt b/core/partitioning/partitioningctx/CMakeLists.txt new file mode 100644 index 0000000000..090167f829 --- /dev/null +++ b/core/partitioning/partitioningctx/CMakeLists.txt @@ -0,0 +1,12 @@ +set(sub_lib_name "partitioningctx") + +target_sources(${lib_name} + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/PartitioningCtx.cpp" +) + +set(HEADER_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/PartitioningCtx.h" +) + +# Install headers +install(FILES ${HEADER_FILES} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/torch_tensorrt/core/partitioning/${sub_lib_name}") diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp 
b/core/partitioning/partitioningctx/PartitioningCtx.cpp new file mode 100644 index 0000000000..6a7ddf9f92 --- /dev/null +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -0,0 +1,123 @@ +#include + +#include "core/partitioning/partitioningctx/PartitioningCtx.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace partitioning { + +PartitioningCtx::PartitioningCtx(torch::jit::Block* b, PartitioningInfo info) + : settings(info), + forced_fallback_ops(info.forced_fallback_operators.begin(), info.forced_fallback_operators.end()) { + LOG_DEBUG(settings); + //_load_nodes_into_decision_map(b); +} + +void PartitioningCtx::_load_nodes_into_decision_map(torch::jit::Block* b) { + for (const auto n : b->nodes()) { + node_executor_decision_map[n] = NodeExecutorDecision::kUNKNOWN; + for (const auto sub_b : n->blocks()) { + _load_nodes_into_decision_map(sub_b); + } + } +} + +void PartitioningCtx::finalizeNewBlock( + SegmentedBlock::SegmentedBlockTarget kind, + std::vector& nodes) { + LOG_DEBUG("Finalizing in progress " << SegmentedBlock::target_to_str(kind) << " block"); + blocks.emplace_back(blocks.size(), kind, nodes); + + // TODO: Can we not need this? + nodes.clear(); + LOG_DEBUG(blocks.back()); +} + +bool PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision) { + auto iter = node_executor_decision_map.find(n); + auto prev_decision = NodeExecutorDecision::kUNKNOWN; + if (iter != node_executor_decision_map.end()) { + prev_decision = iter->second; + } + LOG_GRAPH("Setting node " << util::node_info(n) << " " << decision << " (previously was " << prev_decision << ")"); + + // NOTE: This is this way due to partitioning.cpp L#134 I dont know if this is what we should do. + auto result = node_executor_decision_map.insert({n, decision}); + return result.second; +} + +bool PartitioningCtx::shouldNodeRunInTorch(torch::jit::Node* n) { + auto iter = node_executor_decision_map.find(n); + auto decision = NodeExecutorDecision::kUNKNOWN; + if (iter != node_executor_decision_map.end()) { + decision = iter->second; + } + + if (decision == NodeExecutorDecision::kCONVERT || decision == NodeExecutorDecision::kUNKNOWN) { + return false; + } else { + return true; + } +} + +bool PartitioningCtx::shouldNodeRunInTensorRT(torch::jit::Node* n) { + auto iter = node_executor_decision_map.find(n); + auto decision = NodeExecutorDecision::kUNKNOWN; + if (iter != node_executor_decision_map.end()) { + decision = iter->second; + } + + if (decision == NodeExecutorDecision::kCONVERT) { + return true; + } else { + return false; + } +} + +bool PartitioningCtx::isNodeExecutorKnown(torch::jit::Node* n) { + auto iter = node_executor_decision_map.find(n); + auto decision = NodeExecutorDecision::kUNKNOWN; + if (iter != node_executor_decision_map.end()) { + decision = iter->second; + } + + if (decision == NodeExecutorDecision::kUNKNOWN) { + return false; + } else { + return true; + } +} + +std::ostream& operator<<(std::ostream& os, const NodeExecutorDecision& format) { + switch (format) { + case NodeExecutorDecision::kUNSUPPORTED: + return os << "to run torch due to lack of converter support"; + case NodeExecutorDecision::kOPERATOR_FALLBACK: + return os << "to run torch due to user expectily requesting op kind runs in torch"; + case NodeExecutorDecision::kMODULE_FALLBACK: + return os << "to run torch due to being a member of a module user has requested to run in torch"; + case NodeExecutorDecision::kMIN_BLOCK_FALLBACK: + return os << "to run torch due owning 
block not large enough to exceed user specified min_block_size"; + case NodeExecutorDecision::kNON_TENSOR: + return os << "to run torch due to producing or consuming non-tensor values"; + case NodeExecutorDecision::kCONVERT: + return os << "to run in tensorrt"; + case NodeExecutorDecision::kUNKNOWN: + default: + return os << "unknown node executor decision"; + } +} + +std::ostream& operator<<(std::ostream& os, const PartitionedGraph& g) { + os << "Partitioned Graph: ["; + for (auto b : g) { + os << b; + } + os << "]"; + return os; +} + +} // namespace partitioning +} // namespace core +} // namespace torch_tensorrt diff --git a/core/partitioning/partitioningctx/PartitioningCtx.h b/core/partitioning/partitioningctx/PartitioningCtx.h new file mode 100644 index 0000000000..d13476b017 --- /dev/null +++ b/core/partitioning/partitioningctx/PartitioningCtx.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include + +#include "core/partitioning/partitioninginfo/PartitioningInfo.h" +#include "core/partitioning/segmentedblock/SegmentedBlock.h" + +namespace torch_tensorrt { +namespace core { +namespace partitioning { + +enum NodeExecutorDecision { + /// Node is not supported by TensorRT + kUNSUPPORTED, + /// Node is explicitly forced to fallback to Pytorch due to operator fallback + kOPERATOR_FALLBACK, + /// Node is explicitly forced to fallback to Pytorch due to module fallback + kMODULE_FALLBACK, + /// This node is in a TRT segment which does not satisfy min_block_size + /// and hence is forced to fallback. + kMIN_BLOCK_FALLBACK, + /// This node produces/consumes non-tensor inputs + kNON_TENSOR, + /// This node is going to be converted + kCONVERT, + /// Sentinel + kUNKNOWN, +}; + +std::ostream& operator<<(std::ostream& os, const NodeExecutorDecision& format); + +typedef std::unordered_map NodeExecutorDecisionMap; + +typedef std::vector PartitionedGraph; + +std::ostream& operator<<(std::ostream& os, const PartitionedGraph& g); + +struct UsageInfo { + size_t produce_id; // id of segmented block which contains a raw value of a given torch::jit::Value + std::vector torch_use_id; // ids of segmented blocks which are of type Pytorch + std::vector tensorrt_use_id; // ids of segmented blocks which are of type TensorRT +}; + +struct PartitioningCtx { + // TODO: Make the set a part of settings not stand alone + PartitioningInfo settings; + NodeExecutorDecisionMap node_executor_decision_map; + PartitionedGraph blocks; + std::unordered_set forced_fallback_ops; + + PartitioningCtx(torch::jit::Block* b, PartitioningInfo info); + bool setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision); + void finalizeNewBlock(SegmentedBlock::SegmentedBlockTarget kind, std::vector& nodes); + bool shouldNodeRunInTorch(torch::jit::Node* n); + bool shouldNodeRunInTensorRT(torch::jit::Node* n); + bool isNodeExecutorKnown(torch::jit::Node* n); + + private: + void _load_nodes_into_decision_map(torch::jit::Block* b); +}; + +std::ostream& operator<<(std::ostream& os, const PartitioningCtx& s); + +} // namespace partitioning +} // namespace core +} // namespace torch_tensorrt diff --git a/core/partitioning/partitioninginfo/PartitioningInfo.h b/core/partitioning/partitioninginfo/PartitioningInfo.h index d57d79368f..8eb052e0fa 100644 --- a/core/partitioning/partitioninginfo/PartitioningInfo.h +++ b/core/partitioning/partitioninginfo/PartitioningInfo.h @@ -4,11 +4,14 @@ #include #include +#include "core/ir/ir.h" + namespace torch_tensorrt { namespace core { namespace partitioning { struct 
PartitioningInfo { + ir::CollectionInputSpecMap collection_input_spec_map; bool enabled = false; uint64_t min_block_size = 1; std::vector forced_fallback_operators; diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 06344a2ac6..ebc279e9da 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -181,14 +181,11 @@ void getSegmentsOutputByRunning( seg_block.register_intypes(input_types); } -void runShapeAnalysis( - std::vector& segmented_blocks, - std::unordered_map& example_tensor_map, - const PartitioningInfo& partitioning_info) { +void runShapeAnalysis(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { // register every segment's input shape, and it's running output IValues - for (auto& seg_block : segmented_blocks) { + for (auto& seg_block : ctx->blocks) { torch::jit::ConstantPooling(seg_block.g()); - getSegmentsOutputByRunning(seg_block, example_tensor_map, partitioning_info); + getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); } return; } diff --git a/tests/core/lowering/test_module_fallback_passes.cpp b/tests/core/lowering/test_module_fallback_passes.cpp index f11882df8b..ef518434bd 100644 --- a/tests/core/lowering/test_module_fallback_passes.cpp +++ b/tests/core/lowering/test_module_fallback_passes.cpp @@ -100,7 +100,7 @@ TEST(Lowering, LowerAndPartitionSimpleModuleFallbackCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 1, 16, 16})}; torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; cfg.lower_info.forced_fallback_modules.push_back("ModuleFallbackSub"); auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); diff --git a/tests/core/lowering/test_view_to_reshape_pass.cpp b/tests/core/lowering/test_view_to_reshape_pass.cpp index d1f787bc10..a6254bccde 100644 --- a/tests/core/lowering/test_view_to_reshape_pass.cpp +++ b/tests/core/lowering/test_view_to_reshape_pass.cpp @@ -66,8 +66,8 @@ TEST(LoweringPasses, ViewToReshapeResultsCorrectly) { std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({2, 3, 4, 5})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; - cfg.partition_info.forced_fallback_operators.push_back("aten::permute"); + cfg.partitioning_info.enabled = true; + cfg.partitioning_info.forced_fallback_operators.push_back("aten::permute"); torch::jit::script::Module mod(c10::QualifiedName("module")); diff --git a/tests/core/partitioning/test_conditionals.cpp b/tests/core/partitioning/test_conditionals.cpp index 424fac86e0..ba336db663 100644 --- a/tests/core/partitioning/test_conditionals.cpp +++ b/tests/core/partitioning/test_conditionals.cpp @@ -34,7 +34,7 @@ TEST(Partitioning, FallbackOnConditionalsCorrectly) { std::vector inputs{torch_tensorrt::core::ir::Input({3, 3, 16, 16})}; auto g = mod.get_method("forward").graph(); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module new_mod = torch_tensorrt::core::CompileGraph(mod, cfg); auto new_g = new_mod.get_method("forward").graph(); @@ -65,8 +65,8 @@ TEST(Partitioning, FallbackInplaceOPInConditionalsCorrectly) { torch_tensorrt::core::ir::Input({4, 4}), torch_tensorrt::core::ir::Input({4, 4})}; auto g = mod.get_method("forward").graph(); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; - 
cfg.partition_info.forced_fallback_operators.push_back("prim::ListConstruct"); + cfg.partitioning_info.enabled = true; + cfg.partitioning_info.forced_fallback_operators.push_back("prim::ListConstruct"); auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); diff --git a/tests/core/partitioning/test_fallback_graph_output.cpp b/tests/core/partitioning/test_fallback_graph_output.cpp index 98fc4e6128..f1351741f5 100644 --- a/tests/core/partitioning/test_fallback_graph_output.cpp +++ b/tests/core/partitioning/test_fallback_graph_output.cpp @@ -28,8 +28,8 @@ TEST(Partitioning, ComputeResNet50FallbackGraphCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})}; torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; - cfg.partition_info.forced_fallback_operators.push_back("aten::add"); + cfg.partitioning_info.enabled = true; + cfg.partitioning_info.forced_fallback_operators.push_back("aten::add"); auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); @@ -58,8 +58,8 @@ TEST(Partitioning, ComputeMobileNetFallbackGraphCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})}; auto g = mod.get_method("forward").graph(); torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; - cfg.partition_info.forced_fallback_operators.push_back("aten::hardtanh"); + cfg.partitioning_info.enabled = true; + cfg.partitioning_info.forced_fallback_operators.push_back("aten::hardtanh"); auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); diff --git a/tests/core/partitioning/test_loading_model.cpp b/tests/core/partitioning/test_loading_model.cpp index 057aaff2d8..b42368fe3e 100644 --- a/tests/core/partitioning/test_loading_model.cpp +++ b/tests/core/partitioning/test_loading_model.cpp @@ -28,7 +28,7 @@ TEST(Partitioning, ComputeResNet50FallbackGraphCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})}; torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); diff --git a/tests/core/partitioning/test_loop_fallback.cpp b/tests/core/partitioning/test_loop_fallback.cpp index 83556b5512..5f6bc2ae4d 100644 --- a/tests/core/partitioning/test_loop_fallback.cpp +++ b/tests/core/partitioning/test_loop_fallback.cpp @@ -25,7 +25,7 @@ TEST(Partitioning, CheckLoopFallbackEvalCompilesCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 10})}; torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); @@ -53,7 +53,7 @@ TEST(Partitioning, CheckLoopFallbackNoEvalCompilesCorrectly) { std::vector input_ranges{torch_tensorrt::core::ir::Input({1, 10})}; torch_tensorrt::core::CompileSpec cfg(input_ranges); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); diff --git 
a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index 30656a3d9e..3df65deca3 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -60,10 +60,10 @@ TEST(Partitioning, ResolveNonTensorInputsForIFBlockCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({3, 4})); inputs.push_back(torch_tensorrt::core::ir::Input({3, 4})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; - cfg.partition_info.forced_fallback_operators.push_back("aten::sub"); + cfg.partitioning_info.enabled = true; + cfg.partitioning_info.forced_fallback_operators.push_back("aten::sub"); cfg.convert_info.engine_settings.truncate_long_and_double = true; - cfg.partition_info.truncate_long_and_double = true; + cfg.partitioning_info.truncate_long_and_double = true; torch::jit::script::Module mod(c10::QualifiedName("module")); @@ -109,8 +109,8 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; + torch_tensorrt::core::partitioning::PartitioningInfo partitioning_info; + partitioning_info.enabled = true; std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({1, 3, 16, 16})); inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3})); @@ -123,9 +123,9 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); - std::unordered_map fallback_nodes; + torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::Partition(g->block(), input_ivalues_map, partition_info, fallback_nodes); + torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { @@ -168,8 +168,8 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; + torch_tensorrt::core::partitioning::PartitioningInfo partitioning_info; + partitioning_info.enabled = true; std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({1, 3, 16, 16})); inputs.push_back(torch_tensorrt::core::ir::Input({16, 6, 3, 3})); @@ -182,9 +182,9 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); - std::unordered_map fallback_nodes; + torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::Partition(g->block(), input_ivalues_map, partition_info, fallback_nodes); + torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { @@ -244,7 +244,7 @@ TEST(Partitioning, ConvertForTensorListInputsInFallbackCorrectly) { std::vector inputs; 
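The recurring mechanical change across these test updates is the rename from cfg.partition_info to cfg.partitioning_info (and from PartitionInfo to PartitioningInfo). Under the new spelling, a compile configuration that enables partitioning and forces an operator to run in Torch looks roughly like the sketch below; `mod` stands for an already scripted module and aten::relu is only an example op, neither is prescribed by the patch:

std::vector<torch_tensorrt::core::ir::Input> inputs{torch_tensorrt::core::ir::Input({1, 3, 224, 224})};
torch_tensorrt::core::CompileSpec cfg(inputs);
cfg.partitioning_info.enabled = true;                                     // was cfg.partition_info.enabled
cfg.partitioning_info.min_block_size = 3;
cfg.partitioning_info.forced_fallback_operators.push_back("aten::relu");  // example op only
auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg);              // `mod`: existing torch::jit::script::Module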
inputs.push_back(torch_tensorrt::core::ir::Input({1, 3, 16, 16})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module mod(c10::QualifiedName("module")); auto self = g->insertInput(0, "self_1"); @@ -361,8 +361,8 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { g->registerOutput(get_ins_node->output()); g->registerOutput(get_outs_node->output()); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; + torch_tensorrt::core::partitioning::PartitioningInfo partitioning_info; + partitioning_info.enabled = true; std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); @@ -374,9 +374,8 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); - std::unordered_map fallback_nodes; - auto segmented_blocks = - torch_tensorrt::core::partitioning::Partition(g->block(), input_ivalues_map, partition_info, fallback_nodes); + torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); + auto segmented_blocks = torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { diff --git a/tests/core/partitioning/test_segmentation.cpp b/tests/core/partitioning/test_segmentation.cpp index bf8a36d081..efee4ec85a 100644 --- a/tests/core/partitioning/test_segmentation.cpp +++ b/tests/core/partitioning/test_segmentation.cpp @@ -6,9 +6,14 @@ #include "torch/script.h" #include "torch_tensorrt/torch_tensorrt.h" +namespace torch_tensorrt { +namespace core { +namespace partitioning { +namespace tests { + bool checkSegmentedBlockNumber( - torch_tensorrt::core::partitioning::PartitionedGraph& segmented_blocks, - torch_tensorrt::core::partitioning::SegmentedBlock::SegmentedBlockTarget target, + PartitionedGraph& segmented_blocks, + SegmentedBlock::SegmentedBlockTarget target, int target_count) { int64_t cnt = 0; for (auto& seg_block : segmented_blocks) { @@ -27,7 +32,7 @@ bool checkSegmentedBlockNumber( } bool checkSegmentedBlockNodesMapping( - std::vector& segmented_blocks, + std::vector& segmented_blocks, std::shared_ptr g, std::vector> nodes_index) { std::vector graph_nodes; @@ -71,17 +76,15 @@ TEST(Partitioning, SegmentSequentialModelCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 2)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1, 2}, {3}, {4}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 2)); + 
ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3}, {4}})); } TEST(Partitioning, SegmentSequentialModelWithMinBlockSizeCorrectly) { @@ -106,18 +109,16 @@ TEST(Partitioning, SegmentSequentialModelWithMinBlockSizeCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - partition_info.min_block_size = 3; - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1, 2}, {3, 4}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + partitioning_info.min_block_size = 3; + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3, 4}})); } TEST(Partitioning, SegmentModelWithMinBlockSizeCausedFallbackCorrectly) { @@ -146,18 +147,16 @@ TEST(Partitioning, SegmentModelWithMinBlockSizeCausedFallbackCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - partition_info.min_block_size = 3; - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1, 2, 3}, {4, 5, 6, 7}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + partitioning_info.min_block_size = 3; + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2, 3}, {4, 5, 6, 7}})); } TEST(Partitioning, SegmentSequentialModelWithForcedOPCorrectly) { @@ -182,18 +181,16 @@ TEST(Partitioning, SegmentSequentialModelWithForcedOPCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - partition_info.forced_fallback_operators.push_back("aten::relu"); - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, 
torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 3)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 2)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0}, {1}, {2}, {3}, {4}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + partitioning_info.forced_fallback_operators.push_back("aten::relu"); + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 3)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 2)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0}, {1}, {2}, {3}, {4}})); } TEST(Partitioning, SegmentBranchModelCorrectly) { @@ -219,17 +216,15 @@ TEST(Partitioning, SegmentBranchModelCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 2)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1}, {2}, {3, 4, 5, 6}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 2)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1}, {2}, {3, 4, 5, 6}})); } TEST(Partitioning, SegmentBranchModelWithMinBlockSizeCorrectly) { @@ -255,18 +250,16 @@ TEST(Partitioning, SegmentBranchModelWithMinBlockSizeCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - partition_info.min_block_size = 3; - std::unordered_map fallback_nodes; - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1, 2}, {3, 4, 5, 6}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + partitioning_info.min_block_size = 3; + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3, 4, 5, 6}})); } TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { @@ -296,16 +289,19 @@ TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { auto g = 
std::make_shared(); torch::jit::parseIR(graph, g.get()); + LOG_GRAPH(*g); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; - partition_info.forced_fallback_operators.push_back("aten::relu"); - std::unordered_map fallback_nodes; - torch_tensorrt::core::partitioning::PartitionedGraph segmented_blocks = - torch_tensorrt::core::partitioning::segment_graph(g->block(), partition_info, fallback_nodes); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTensorRT, 3)); - ASSERT_TRUE( - checkSegmentedBlockNumber(segmented_blocks, torch_tensorrt::core::partitioning::SegmentedBlock::kTorch, 2)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(segmented_blocks, g, {{0, 1}, {2}, {3}, {4}, {5, 6}})); + PartitioningInfo partitioning_info; + partitioning_info.enabled = true; + partitioning_info.forced_fallback_operators.push_back("aten::relu"); + PartitioningCtx ctx(g->block(), partitioning_info); + segment_graph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 3)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 2)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1}, {2}, {3}, {4}, {5, 6}})); } + +} // namespace tests +} // namespace partitioning +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp index 98b375f121..e2767185c6 100644 --- a/tests/core/partitioning/test_shape_analysis.cpp +++ b/tests/core/partitioning/test_shape_analysis.cpp @@ -48,8 +48,8 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; + torch_tensorrt::core::partitioning::PartitioningInfo partitioning_info; + partitioning_info.enabled = true; std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); inputs.push_back(torch_tensorrt::core::ir::Input({32, 3, 3, 3})); @@ -66,9 +66,10 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) { input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); - std::unordered_map fallback_nodes; + + torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::Partition(g->block(), input_ivalues_map, partition_info, fallback_nodes); + torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); ASSERT_TRUE(checkSegmentedBlockInputShape( segmented_blocks, @@ -101,8 +102,8 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { auto g = std::make_shared(); torch::jit::parseIR(graph, g.get()); - torch_tensorrt::core::partitioning::PartitionInfo partition_info; - partition_info.enabled = true; + torch_tensorrt::core::partitioning::PartitioningInfo partitioning_info; + partitioning_info.enabled = true; std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); inputs.push_back(torch_tensorrt::core::ir::Input({32, 3, 3, 3})); @@ -117,9 +118,10 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto 
input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); - std::unordered_map fallback_nodes; + + torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::Partition(g->block(), input_ivalues_map, partition_info, fallback_nodes); + torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); ASSERT_TRUE(checkSegmentedBlockInputShape( segmented_blocks, diff --git a/tests/core/partitioning/test_stitched_graph.cpp b/tests/core/partitioning/test_stitched_graph.cpp index 61c5b58552..4332668506 100644 --- a/tests/core/partitioning/test_stitched_graph.cpp +++ b/tests/core/partitioning/test_stitched_graph.cpp @@ -75,7 +75,7 @@ TEST(Partitioning, StitchSequentialModelSegmentedBlockCorrectly) { std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module new_mod = torch_tensorrt::core::CompileGraph(mod, cfg); auto fallback_g = new_mod.get_method("forward").graph(); ASSERT_TRUE(checkAllInputsExistInStitchedGraph(fallback_g)); @@ -133,7 +133,7 @@ TEST(Partitioning, StitchBranchModelSegmentedBlockCorrectly) { std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module new_mod = torch_tensorrt::core::CompileGraph(mod, cfg); auto fallback_g = new_mod.get_method("forward").graph(); ASSERT_TRUE(checkAllInputsExistInStitchedGraph(fallback_g)); diff --git a/tests/core/partitioning/test_tensorrt_conversion.cpp b/tests/core/partitioning/test_tensorrt_conversion.cpp index 8b42f95e24..41431c76db 100644 --- a/tests/core/partitioning/test_tensorrt_conversion.cpp +++ b/tests/core/partitioning/test_tensorrt_conversion.cpp @@ -57,7 +57,7 @@ TEST(Partitioning, ConvertSequentialModelSegmentedBlockCorrectly) { std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module mod(c10::QualifiedName("module")); auto self = g->insertInput(0, "self_1"); @@ -116,7 +116,7 @@ TEST(Partitioning, ConvertBranchModelSegmentedBlockCorrectly) { std::vector inputs; inputs.push_back(torch_tensorrt::core::ir::Input({3, 3, 16, 16})); torch_tensorrt::core::CompileSpec cfg(inputs); - cfg.partition_info.enabled = true; + cfg.partitioning_info.enabled = true; torch::jit::script::Module mod(c10::QualifiedName("module")); auto self = g->insertInput(0, "self_1"); From 1a4084b39a4725cf4818c2ebcab455cd1793031e Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 14 Aug 2022 10:47:15 -0700 Subject: [PATCH 04/11] chore: update usage in py api Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 96fef793fd..1721ffd6c9 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -313,10 +313,10 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { info.convert_info.engine_settings.device.gpu_id = 
device.gpu_id; info.convert_info.engine_settings.device.dla_core = device.dla_core; info.convert_info.engine_settings.device.allow_gpu_fallback = device.allow_gpu_fallback; - info.partition_info.enabled = torch_fallback.enabled; - info.partition_info.min_block_size = torch_fallback.min_block_size; - info.partition_info.forced_fallback_operators = torch_fallback.forced_fallback_operators; - info.partition_info.truncate_long_and_double = truncate_long_and_double; + info.partitioning_info.enabled = torch_fallback.enabled; + info.partitioning_info.min_block_size = torch_fallback.min_block_size; + info.partitioning_info.forced_fallback_operators = torch_fallback.forced_fallback_operators; + info.partitioning_info.truncate_long_and_double = truncate_long_and_double; info.lower_info.forced_fallback_modules = torch_fallback.forced_fallback_modules; info.convert_info.engine_settings.truncate_long_and_double = truncate_long_and_double; From 4cc31430f85266245f09bdabab9d0a594341832d Mon Sep 17 00:00:00 2001 From: Naren Dasan <1790613+narendasan@users.noreply.github.com> Date: Fri, 26 Aug 2022 13:33:19 -0700 Subject: [PATCH 05/11] fix: Update new info name --- core/compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 5412a5e38e..71eb7d6795 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -442,7 +442,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) auto outputIsCollection = conversion::OutputIsCollection(g->block()); if (cfg.partitioning_info.enabled && (cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) && + cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) && !outputIsCollection) { LOG_INFO("Skipping partitioning since model is fully supported"); } From c3082f5261157aa9c8a64d9866ca1f1a7c6d5bbd Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 30 Aug 2022 15:38:03 -0700 Subject: [PATCH 06/11] refactor: refactor the NodeExecutor logic Signed-off-by: Bo Wang --- core/compiler.cpp | 2 +- core/partitioning/partitioning.cpp | 319 +++++++++--------- core/partitioning/partitioning.h | 2 +- .../partitioningctx/PartitioningCtx.cpp | 15 +- .../partitioningctx/PartitioningCtx.h | 1 + 5 files changed, 181 insertions(+), 158 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 71eb7d6795..7e9a042471 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -228,7 +228,7 @@ GraphAndMapping ConstructFallbackGraph_( std::unordered_map example_tensor_map) { auto new_g = std::make_shared(); - auto segmented_blocks = partitioning::partition(partitioning_ctx, block, example_tensor_map); + auto segmented_blocks = partitioning::Partition(partitioning_ctx, block, example_tensor_map); // the mapping from lowering graph => fallback global graph std::unordered_map old_to_new_g; diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index ec21a9af73..ea29f53078 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -1,3 +1,7 @@ + + + + #include #include "torch/csrc/jit/passes/constant_pooling.h" @@ -31,6 +35,136 @@ bool containNonTensorOutputs(torch::jit::Node* n) { return false; } + + +// Check if the inputs and outputs of the graph are Tensor. 
If not, then fallback connected nodes +void SetInputsOutputsConnectedNodes(PartitioningCtx* ctx, torch::jit::Block* block) { + // fallback nodes that produce entire graph's nonTensor output + for (auto i : block->outputs()) { + if (!isTensor(i)) { + ctx->setNodeExecutorDecision(i->node(), NodeExecutorDecision::kNON_TENSOR); + } + } + + // fallback nodes that consume entire graph's nonTensor input + for (auto i : block->inputs()) { + if (!isTensor(i)) { + for (auto use : i->uses()) { + ctx->setNodeExecutorDecision(use.user, NodeExecutorDecision::kNON_TENSOR); + } + } + } +} + +// Find and set all explicit fallback nodes (nodes that are unsupported or forced fallback) +// we use a map to indicate the reason why it's fallback to torch +// For any node that's not explicitly fallback, we set it to run in TensorRT for now +void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { + auto nodes = block->nodes(); + const auto to_compile_sym = c10::Symbol::attr("to_compile"); + + for (const auto n : nodes) { + if (n->kind() == torch::jit::prim::Constant) { + continue; + } + + if (!conversion::OpSupported(n)) { + // If the op is not supported by the conversion phase it should run in PyTorch + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kUNSUPPORTED); + } else if (ctx->forced_fallback_ops.find(n->kind().toQualString()) != ctx->forced_fallback_ops.end()) { + // If the user specifies the op to run in Torch it should run in PyTorch + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kOPERATOR_FALLBACK); + } else if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { + // If the user specifies the module containing this op to run in torch it should run in PyTorch + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kMODULE_FALLBACK); + } else { + // Set the rest nodes to TensorRt + ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT); + } + } + return; +} + +// For a given set of fallback nodes, check their inputs/outputs, if any inputs/outputs of them are NonTensor, +// then the nodes that produces/consumes those values should also fallback +void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector& initial_fallback_nodes) { + // initial_fallback_nodes are the fallback nodes that we have before we run BFS in this function + std::queue q; + for (auto& node : initial_fallback_nodes) { + q.push(node.first); + } + + while (!q.empty()) { + auto cur_node = q.front(); + q.pop(); + // for every node that produces this fallback node's NonTensor input, they should fallback too + for (auto input : cur_node->inputs()) { + if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && + ctx->shouldNodeRunInTensorRT(input->node())) { + ctx->setNodeExecutorDecision(input->node(), NodeExecutorDecision::kNON_TENSOR); + q.push(input->node()); + } + } + // for every node that consumes this fallback node's NonTensor output, they should fallback too + for (auto output : cur_node->outputs()) { + if (!isTensor(output)) { + for (auto use : output->uses()) { + auto node = use.user; + if (node->kind() != torch::jit::prim::Constant && + ctx->shouldNodeRunInTensorRT(node)) { + ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR); + q.push(node); + } + } + } + } + } +} + +// Sub-function that traverses the entire block and check if TensorRT node sequence satisfy min_block_size +std::vector TraverseNodesForMinBlockSize(PartitioningCtx* ctx, torch::jit::Block* block) { + auto nodes = block->nodes(); + std::vector 
cur_trt_nodes; + std::vector min_block_fallback_nodes; + for (const auto n : nodes) { + if (n->kind() == torch::jit::prim::Constant) { + continue; + } + + // check if current node fallback or not + if (!ctx->shouldNodeRunInTorch(n)) { + cur_trt_nodes.push_back(n); + } else { + if (cur_trt_nodes.size() < ctx->settings.min_block_size) { + min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); + } + cur_trt_nodes.clear(); + } + } + if (cur_trt_nodes.size() < ctx->settings.min_block_size) { + min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); + } + return min_block_fallback_nodes; +} + + +// Set the nodes that fallback because of min_block_size +void SetMinBlockFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { + // first traverse all the nodes to find the initial nodes that don't meet the min_block_size requirement + auto min_block_fallback_nodes = TraverseNodesForMinBlockSize(ctx, block); + + // keep fallback until all segments meet the min_block_size requirement + while (!min_block_fallback_nodes.empty()) { + for (const auto i : min_block_fallback_nodes) { + ctx->setNodeExecutorDecision(i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); + } + // find the fallback nodes because of dependency with min_block_size caused fallback nodes + SetNonTensorConnectedNodes(ctx, min_block_fallback_nodes); + // keep traverse the graph until there is no node fallback because of min_block_size + min_block_fallback_nodes = TraverseNodesForMinBlockSize(ctx, block); + } +} + bool isModifyingNodes(torch::jit::Node* node, torch::jit::Value* val) { const torch::jit::FunctionSchema* schema = node->maybeSchema(); if (!schema) { @@ -97,62 +231,6 @@ std::vector getDependencyNodes( return stk; } -// check if the input and output of the graph is Tensor after collection is enabled. If it is, then fallback related -// nodes -void fallback_graph_nontensor_in_out(PartitioningCtx* ctx, torch::jit::Block* block) { - // fallback nodes that produce entire graph's nonTensor output - for (auto i : block->outputs()) { - if (!isTensor(i)) { - ctx->setNodeExecutorDecision(i->node(), NodeExecutorDecision::kNON_TENSOR); - } - } - - // fallback nodes that consume entire graph's nonTensor input - for (auto i : block->inputs()) { - if (!isTensor(i)) { - for (auto use : i->uses()) { - ctx->setNodeExecutorDecision(use.user, NodeExecutorDecision::kNON_TENSOR); - } - } - } -} - -void find_all_fallback_nodes(PartitioningCtx* ctx, NodeExecutorDecisionMap& initial_fallback_nodes) { - // initial_fallback_nodes are the fallback nodes that we have before we run BFS in this function - // global_fallback_nodes are the fallback nodes that we maintain globally - std::queue q; - for (auto& node : initial_fallback_nodes) { - q.push(node.first); - } - - std::unordered_set visited_nodes; - while (!q.empty()) { - auto cur_node = q.front(); - q.pop(); - // for every node that produces this fallback node's NonTensor input, they should fallback too - for (auto input : cur_node->inputs()) { - // NOTE: This does not make sense, does this rely on shortciruiting to work right? 
- if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && - ctx->setNodeExecutorDecision(input->node(), NodeExecutorDecision::kNON_TENSOR)) { - q.push(input->node()); - } - } - // for every node that consumes this fallback node's NonTensor output, they should fallback too - for (auto output : cur_node->outputs()) { - if (!isTensor(output)) { - for (auto use : output->uses()) { - auto node = use.user; - // NOTE: This does not make sense, does this rely on shortciruiting to work right? - if (node->kind() != torch::jit::prim::Constant && - ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR)) { - q.push(node); - } - } - } - } - } -} - void resolveTRTNonTensorInputs(PartitioningCtx* ctx) { // if a TRT segment has nonTensor Inputs, the nodes that produce this nonTensor Inputs must in another TensorRT engine // because we have already found the interface between Torch and TRT in segmentation phase @@ -250,102 +328,10 @@ bool checkLoopEvaluatable(torch::jit::Node* n) { return compile_to_trt; } -// use this function to get all initial fallback nodes (nodes that are unsupported or forced fallback) -// we use a map to indicate the reason why it's fallback to torch -void get_fallback_nodes(PartitioningCtx* ctx, torch::jit::Block* block) { - auto nodes = block->nodes(); - for (const auto n : nodes) { - if (n->kind() == torch::jit::prim::Constant) { - continue; - } - - // If the op is not supported by the conversion phase it should run in PyTorch - if (!conversion::OpSupported(n)) { - ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kUNSUPPORTED); - } - - // If the user specifies the op to run in Torch it should run in PyTorch - if (ctx->forced_fallback_ops.find(n->kind().toQualString()) != ctx->forced_fallback_ops.end()) { - ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kOPERATOR_FALLBACK); - } - - // If the user specifies the module containing this op to run in torch it should run in PyTorch - const auto to_compile_sym = c10::Symbol::attr("to_compile"); - if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { - ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kMODULE_FALLBACK); - } - } - return; -} - -std::vector traverse_nodes_for_min_block_size(PartitioningCtx* ctx, torch::jit::Block* block) { - auto nodes = block->nodes(); - std::vector cur_trt_nodes; - std::vector min_block_fallback_nodes; - for (const auto n : nodes) { - if (n->kind() == torch::jit::prim::Constant) { - continue; - } - - // check if current node fallback or not - if (!ctx->shouldNodeRunInTorch(n)) { - // if this node is not in fallback nodes, then it's in trt segments - cur_trt_nodes.push_back(n); - } else { - if (cur_trt_nodes.size() < ctx->settings.min_block_size) { - min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); - } - cur_trt_nodes.clear(); - } - } - if (cur_trt_nodes.size() < ctx->settings.min_block_size) { - min_block_fallback_nodes.insert(min_block_fallback_nodes.end(), cur_trt_nodes.begin(), cur_trt_nodes.end()); - } - return min_block_fallback_nodes; -} - -void find_min_block_size_fallback_nodes(PartitioningCtx* ctx, torch::jit::Block* block) { - // first traverse all the nodes to find the initial nodes that don't meet the min_block_size requirement - auto min_block_fallback_nodes = traverse_nodes_for_min_block_size(ctx, block); - NodeExecutorDecisionMap initial_fallback_nodes; - - // keep fallback until all segments meet the min_block_size requirement - while 
(!min_block_fallback_nodes.empty()) { - for (const auto i : min_block_fallback_nodes) { - initial_fallback_nodes.insert({i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK}); - ctx->setNodeExecutorDecision(i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); - } - // find the fallback nodes because of dependency with min_block_size caused fallback nodes - find_all_fallback_nodes(ctx, initial_fallback_nodes); - // keep traverse the graph until there is no node fallback because of min_block_size - min_block_fallback_nodes = traverse_nodes_for_min_block_size(ctx, block); - } -} - -void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block) { - // get the initial fallback nodes (nodes that are unsupported or forced fallback) - get_fallback_nodes(ctx, block); - - // For fallback nodes, if it consumes any NonTensor inputs or TensorList inputs, then the node that produces this - // input should also fallback Similarly, if it produces any NonTensor outputs or TensorList outputs, then the node - // that produces this input should also fallback - // TODO: don't need to fallback the TensorList related nodes once the collection feature is supported - find_all_fallback_nodes(ctx, ctx->node_executor_decision_map); - - // find all fallback nodes because of the min_block_size requirement - find_min_block_size_fallback_nodes(ctx, block); +void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); - // NOTE: Realize this may be redundant, but will let us have an explicit state for each node. Maybe there is a better - // way for (auto n : nodes) { - // if (!ctx->shouldNodeRunInTorch(n) && !ctx->isNodeExecutorKnown(n)) { - // if (conversion::OpSupported(n)) { - // ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT); - // } - // } - // } - // segment the nodes std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; for (const auto n : nodes) { @@ -420,18 +406,41 @@ void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block) { return; } -PartitionedGraph partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { +void SetNodeExecutorDecision(PartitioningCtx* ctx, torch::jit::Block* block) { + // First, find all the explicit fallback nodes that should run in Torch: + // 1. nodes that are unsupported + // 2. nodes that the user specifies to run in torch + // 3. nodes that the user specifies the module containing this op to run in torch + // At the same time, set all the rest nodes to NodeExecutorDecision::kCONVERT + SetExplicitFallbackNodes(ctx, block); + + // Second, check if there is nonTensor input/output for the block, if there is, then fallback the nodes that + // consume/produce this nonTensor value + SetInputsOutputsConnectedNodes(ctx, block); + + // Third, for fallback nodes, if it consumes any NonTensor inputs, then the nodes that produce this + // input should also fallback. Similarly, if it produces any NonTensor outputs, then the nodes + // that consume this output should also fallback + auto cur_fallback_nodes = ctx->getNodesRunInTorch(); + SetNonTensorConnectedNodes(ctx, cur_fallback_nodes); + + // Finally, check if all current tensorrt blocks satisfy the min_block_size requirement. 
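Taken together, the passes above run in a fixed order: explicit fallbacks first (unsupported ops, user-forced operators, user-forced modules, everything else marked kCONVERT), then graph-boundary non-tensor inputs/outputs, then propagation along non-tensor dependencies, and finally the min_block_size sweep. Once they have run, every non-constant node carries a decision that can be queried directly; a small illustrative sketch (LogExecutorDecisions is a hypothetical helper, not part of this patch):

#include "core/partitioning/partitioningctx/PartitioningCtx.h"
#include "core/util/prelude.h"

// Sketch: report where each node will execute once the decision map is filled in.
void LogExecutorDecisions(torch_tensorrt::core::partitioning::PartitioningCtx* ctx, torch::jit::Block* block) {
  for (const auto n : block->nodes()) {
    if (n->kind() == torch::jit::prim::Constant) {
      continue; // constants are not tracked in the decision map
    }
    if (ctx->shouldNodeRunInTensorRT(n)) {
      LOG_DEBUG("Runs in TensorRT: " << torch_tensorrt::core::util::node_info(n));
    } else {
      LOG_DEBUG("Falls back to Torch: " << torch_tensorrt::core::util::node_info(n));
    }
  }
}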
+ // We need to traverse the whole graph many times here + SetMinBlockFallbackNodes(ctx, block); +} + + + +PartitionedGraph Partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { LOG_DEBUG(ctx->settings); - // if there is nonTensor input/output for the entire graph, fallback the node that consumes/produces this nonTensor - // output - fallback_graph_nontensor_in_out(ctx, block); + + SetNodeExecutorDecision(ctx, block); // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); - segment_graph(ctx, block); + SegmentGraph(ctx, block); // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks - // resolve nonTensor inputs/outputs resolveTRTNonTensorInputs(ctx); diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 320944d1d2..29aa683b5c 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -23,7 +23,7 @@ void runShapeAnalysis(PartitioningCtx* ctx, ExampleIValues& ivalues_maps); void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block); -PartitionedGraph partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map); +PartitionedGraph Partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map); } // namespace partitioning } // namespace core diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp b/core/partitioning/partitioningctx/PartitioningCtx.cpp index 6a7ddf9f92..7acd6e6b7d 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.cpp +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -11,11 +11,14 @@ PartitioningCtx::PartitioningCtx(torch::jit::Block* b, PartitioningInfo info) : settings(info), forced_fallback_ops(info.forced_fallback_operators.begin(), info.forced_fallback_operators.end()) { LOG_DEBUG(settings); - //_load_nodes_into_decision_map(b); + _load_nodes_into_decision_map(b); } void PartitioningCtx::_load_nodes_into_decision_map(torch::jit::Block* b) { for (const auto n : b->nodes()) { + if (n->kind() == torch::jit::prim::Constant) { + continue; + } node_executor_decision_map[n] = NodeExecutorDecision::kUNKNOWN; for (const auto sub_b : n->blocks()) { _load_nodes_into_decision_map(sub_b); @@ -89,6 +92,16 @@ bool PartitioningCtx::isNodeExecutorKnown(torch::jit::Node* n) { } } +std::vector PartitionCtx::getNodesRunInTorch() { + std::vector nodes_run_in_torch; + for (auto i : node_executor_decision_map) { + if (i.second == NodeExecutorDecision::kCONVERT) { + nodes_run_in_torch.push_back(i.first); + } + } + return nodes_run_in_torch; +} + std::ostream& operator<<(std::ostream& os, const NodeExecutorDecision& format) { switch (format) { case NodeExecutorDecision::kUNSUPPORTED: diff --git a/core/partitioning/partitioningctx/PartitioningCtx.h b/core/partitioning/partitioningctx/PartitioningCtx.h index d13476b017..cc5bb3d774 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.h +++ b/core/partitioning/partitioningctx/PartitioningCtx.h @@ -57,6 +57,7 @@ struct PartitioningCtx { bool shouldNodeRunInTorch(torch::jit::Node* n); bool shouldNodeRunInTensorRT(torch::jit::Node* n); bool isNodeExecutorKnown(torch::jit::Node* n); + std::vector getNodesRunInTorch(); private: void _load_nodes_into_decision_map(torch::jit::Block* b); From a165811b2c43d3723d685708ab2ae212608138b7 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 30 Aug 2022 16:02:51 -0700 
Subject: [PATCH 07/11] fix: fix typo Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 23 ++++--------------- .../partitioningctx/PartitioningCtx.cpp | 2 +- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index ea29f53078..1852359aaf 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -1,16 +1,10 @@ - - - - +#include "core/partitioning/partitioning.h" #include - -#include "torch/csrc/jit/passes/constant_pooling.h" -#include "torch/csrc/jit/passes/dead_code_elimination.h" - #include "core/conversion/conversion.h" #include "core/conversion/evaluators/evaluators.h" -#include "core/partitioning/partitioning.h" #include "core/partitioning/partitioningctx/PartitioningCtx.h" +#include "torch/csrc/jit/passes/constant_pooling.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" namespace torch_tensorrt { namespace core { @@ -35,8 +29,6 @@ bool containNonTensorOutputs(torch::jit::Node* n) { return false; } - - // Check if the inputs and outputs of the graph are Tensor. If not, then fallback connected nodes void SetInputsOutputsConnectedNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // fallback nodes that produce entire graph's nonTensor output @@ -91,7 +83,7 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector q; for (auto& node : initial_fallback_nodes) { - q.push(node.first); + q.push(node); } while (!q.empty()) { @@ -110,8 +102,7 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vectoruses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && - ctx->shouldNodeRunInTensorRT(node)) { + if (node->kind() != torch::jit::prim::Constant && ctx->shouldNodeRunInTensorRT(node)) { ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR); q.push(node); } @@ -147,7 +138,6 @@ std::vector TraverseNodesForMinBlockSize(PartitioningCtx* ctx return min_block_fallback_nodes; } - // Set the nodes that fallback because of min_block_size void SetMinBlockFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // first traverse all the nodes to find the initial nodes that don't meet the min_block_size requirement @@ -328,7 +318,6 @@ bool checkLoopEvaluatable(torch::jit::Node* n) { return compile_to_trt; } - void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); @@ -429,8 +418,6 @@ void SetNodeExecutorDecision(PartitioningCtx* ctx, torch::jit::Block* block) { SetMinBlockFallbackNodes(ctx, block); } - - PartitionedGraph Partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { LOG_DEBUG(ctx->settings); diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp b/core/partitioning/partitioningctx/PartitioningCtx.cpp index 7acd6e6b7d..b83f6f5ca5 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.cpp +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -92,7 +92,7 @@ bool PartitioningCtx::isNodeExecutorKnown(torch::jit::Node* n) { } } -std::vector PartitionCtx::getNodesRunInTorch() { +std::vector PartitioningCtx::getNodesRunInTorch() { std::vector nodes_run_in_torch; for (auto i : node_executor_decision_map) { if (i.second == NodeExecutorDecision::kCONVERT) { From ff2270703973c93d75c8e264a21eb9434b11c3e1 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 6 Sep 2022 19:07:07 -0700 Subject: [PATCH 08/11] refactor: extract stitching phase out of compiler.cpp Signed-off-by: Bo Wang --- 
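Notes on this patch (kept below the --- so they stay out of the commit message): after this refactor, compiler.cpp no longer clones segments into a fallback graph itself; BuildHybridGraph only drives the partitioning module and lets it stitch the result. The following is a rough sketch of the intended call sequence using the interfaces introduced in this patch; block, partitioning_info, inputs_map and input_types are placeholders for values the caller (compiler.cpp) already has, and the sketch is illustrative rather than a drop-in replacement for BuildHybridGraph.

    // Sketch of the refactored partition-then-stitch flow (assumes the
    // caller has a torch::jit::Block* `block`, a PartitioningInfo
    // `partitioning_info`, and the collection input spec/type maps).
    using namespace torch_tensorrt::core;

    // 1. Build a context that records every node's execution decision and,
    //    later, the segmented blocks for each original block.
    partitioning::PartitioningCtx ctx(block, partitioning_info);

    // 2. Generate example inputs and partition every recorded block into
    //    in-order Torch / TensorRT segments (stored in ctx.partitioned_blocks).
    auto example_inputs = partitioning::generateRandomInputs(inputs_map, input_types);
    partitioning::Partition(&ctx, example_inputs);

    // 3. (Conversion of the TensorRT segments into engines happens here,
    //    iterating over ctx.partitioned_blocks as BuildHybridGraph does.)

    // 4. Stitch the segments back into one global graph; Stitch() recurses
    //    into prim::If sub-blocks through the same ctx.
    partitioning::GraphAndMapping stitched = partitioning::Stitch(&ctx, block);
    std::shared_ptr<torch::jit::Graph> new_g = stitched.first;

Keeping the segmented blocks inside PartitioningCtx, keyed by their original block, is what lets Stitch() recurse into conditional sub-blocks without threading extra state back through compiler.cpp.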
core/compiler.cpp | 212 +++--------------- core/partitioning/BUILD | 1 + core/partitioning/partitioning.cpp | 110 +++++---- core/partitioning/partitioning.h | 9 +- .../partitioningctx/PartitioningCtx.cpp | 19 +- .../partitioningctx/PartitioningCtx.h | 6 +- core/partitioning/shape_analysis.cpp | 4 +- core/partitioning/stitching.cpp | 153 +++++++++++++ 8 files changed, 274 insertions(+), 240 deletions(-) create mode 100644 core/partitioning/stitching.cpp diff --git a/core/compiler.cpp b/core/compiler.cpp index 7e9a042471..178d3c48c6 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -11,7 +11,6 @@ #include "torch/csrc/jit/frontend/function_schema_parser.h" #include "torch/csrc/jit/ir/ir.h" -#include "torch/csrc/jit/ir/ir_views.h" #include "torch/csrc/jit/passes/graph_fuser.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_graph.h" @@ -128,193 +127,54 @@ bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::stri return conversion::VerifyConverterSupportForBlock(g->block()); } -void AddSegmentedBlockToGraph( - std::shared_ptr& g, - partitioning::SegmentedBlock& seg, - std::unordered_map& old_to_new_g) { - // old_to_new_g contains: original global graph value => new global graph value, - // mini_to_new_g: mini graph value -> new graph value - std::unordered_map mini_to_new_g; - size_t input_idx = 0; - if (seg.target() == partitioning::SegmentedBlock::kTensorRT && g->inputs().size() > 0) { - if (g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { - auto self = g->insertInput(0, "self_1"); - self->setType(seg.inputs()[0]->type()); - } - mini_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; - } - - for (auto& raw_input : seg.raw_inputs()) { - if (old_to_new_g.count(raw_input)) { - mini_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; - } - } - - for (const auto n : seg.nodes()) { - util::cloneNode(n, g, mini_to_new_g); - } - - // original graph value => new global graph value - for (size_t i = 0; i < seg.raw_outputs().size(); ++i) { - old_to_new_g[seg.raw_outputs()[i]] = mini_to_new_g[seg.outputs()[i]]; - } - size_t offset = seg.target() == partitioning::SegmentedBlock::kTensorRT ? 
1 : 0; - for (size_t i = 0; i < seg.raw_inputs().size(); ++i) { - if (!old_to_new_g.count(seg.raw_inputs()[i])) { - old_to_new_g[seg.raw_inputs()[i]] = mini_to_new_g[seg.inputs()[i + offset]]; - } - } - - return; -} - -typedef std::pair, std::unordered_map> - GraphAndMapping; - -void AddIfBlockToGraph( - std::shared_ptr& new_g, - torch::jit::Node* if_node, - const std::vector& graph_and_mappings, - std::unordered_map& old_to_new_g) { - torch::jit::IfView if_view(if_node); - - // create a new if node in new_g and add corresponding inputs - auto new_if = new_g->insertNode(new_g->create(torch::jit::prim::If, {}, 0)); - new_if->addInput(util::getOrAddInputForValue(if_view.cond(), new_g, old_to_new_g)); - - // iterate over all blocks and add them to new created prim::If - for (auto graph_and_mapping : graph_and_mappings) { - auto new_if_block = new_if->addBlock(); - auto cur_block_graph = graph_and_mapping.first; - auto cur_block_mapping = graph_and_mapping.second; - std::unordered_map block_graph_to_new_g; - for (auto& i : cur_block_mapping) { - // for every pair in then_mapping, old_value => mini graph value, if old_value also appears in old_to_new_g, then - // it's mini graph's input - if (old_to_new_g.count(i.first)) { - block_graph_to_new_g[i.second] = old_to_new_g[i.first]; - } - } - - auto env = [&](torch::jit::Value* v) { return util::getOrAddInputForValue(v, new_g, block_graph_to_new_g); }; - new_if_block->cloneFrom(cur_block_graph->block(), env); - if (cur_block_graph->inputs().size() && - cur_block_graph->inputs()[0]->type()->str().find("__torch__") != std::string::npos) { - if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { - auto self = new_g->insertInput(0, "self_1"); - self->setType(cur_block_graph->inputs()[0]->type()); - } - block_graph_to_new_g[cur_block_graph->inputs()[0]] = new_g->inputs()[0]; - } - for (int i = cur_block_graph->inputs().size() - 1; i >= 0; --i) { - new_if_block->inputs()[i]->replaceAllUsesWith(block_graph_to_new_g[cur_block_graph->inputs()[i]]); - new_if_block->eraseInput(i); - } - } - for (auto ov : if_view.outputs()) { - auto no = new_if->addOutput(); - old_to_new_g[ov] = no; - no->copyMetadata(ov); - } - return; -} - -GraphAndMapping ConstructFallbackGraph_( +partitioning::GraphAndMapping BuildHybridGraph( torch::jit::script::Module& new_mod, torch::jit::Block* block, - partitioning::PartitioningCtx* partitioning_ctx, - conversion::ConversionInfo convert_info, + CompileSpec cfg, ir::StaticParams static_params, - std::unordered_map example_tensor_map) { - auto new_g = std::make_shared(); + ir::CollectionTypeMap first_use_types) { + auto convert_info = cfg.convert_info; + auto partitioning_info = cfg.partitioning_info; + + auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info); + auto collection_input_ivalues_map = + partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); - auto segmented_blocks = partitioning::Partition(partitioning_ctx, block, example_tensor_map); + partitioning::Partition(&partitioning_ctx, collection_input_ivalues_map); - // the mapping from lowering graph => fallback global graph - std::unordered_map old_to_new_g; - for (auto input : block->inputs()) { - util::getOrAddInputForValue(input, new_g, old_to_new_g); - } + for (auto &partitioned_block : partitioning_ctx.partitioned_blocks) { + partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second; - for (auto& seg_block : segmented_blocks) { - LOG_INFO("Block segment:" << 
seg_block); - std::ostringstream trt_engine_id; - trt_engine_id << reinterpret_cast(&seg_block); - - if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) { - auto shapes = seg_block.in_shapes(); - auto types = seg_block.in_types(); - std::vector inputs; - for (size_t i = 0; i < shapes.size(); i++) { - auto in = ir::Input(shapes[i]); - in.dtype = util::ScalarTypeToTRTDataType(types[i]); - inputs.push_back(in); - } - // update the input ranges for each segments - convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); - - // TODO mapping Inputs Ivalue to flatten one here - auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params); - auto temp_g = std::make_shared(); - auto device_spec = convert_info.engine_settings.device; - auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); - AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true); - - seg_block.update_graph(temp_g); - AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); - } else { - if (seg_block.raw_nodes()[0]->kind() == torch::jit::prim::If) { - auto if_node = seg_block.raw_nodes()[0]; - - // convert the 2 blocks in prim::if and get the converted graph with mappings - std::vector graph_and_mappings; - for (auto cur_block : if_node->blocks()) { - graph_and_mappings.push_back(ConstructFallbackGraph_( - new_mod, cur_block, partitioning_ctx, convert_info, static_params, example_tensor_map)); + for (auto& seg_block : segmented_blocks) { + LOG_INFO("Block segment:" << seg_block); + std::ostringstream trt_engine_id; + trt_engine_id << reinterpret_cast(&seg_block); + + if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) { + auto shapes = seg_block.in_shapes(); + auto types = seg_block.in_types(); + std::vector inputs; + for (size_t i = 0; i < shapes.size(); i++) { + auto in = ir::Input(shapes[i]); + in.dtype = util::ScalarTypeToTRTDataType(types[i]); + inputs.push_back(in); } - AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g); + // update the input ranges for each segments + convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); - } else { - AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); - } - } - } + // TODO mapping Inputs Ivalue to flatten one here + auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_info, static_params); + auto temp_g = std::make_shared(); + auto device_spec = convert_info.engine_settings.device; + auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type); + AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true); - if (block->outputs().size() > 1) { - std::vector fallback_graph_vector; - for (auto& output : block->outputs()) { - if (old_to_new_g.count(output)) { - fallback_graph_vector.push_back(old_to_new_g[output]); + seg_block.update_graph(temp_g); } } - torch::jit::ArrayRef fallback_graph_outputs(fallback_graph_vector); - auto return_tuple_node = new_g->createTuple(fallback_graph_outputs); - new_g->block()->appendNode(return_tuple_node); - // Set the output as the produced tuple - new_g->registerOutput(return_tuple_node->outputs()[0]); - } else { - if (block->outputs().size() && old_to_new_g.count(block->outputs()[0])) { - new_g->registerOutput(old_to_new_g[block->outputs()[0]]); - } } - return {new_g, old_to_new_g}; -} - -GraphAndMapping ConstructFallbackGraph( - torch::jit::script::Module& new_mod, - 
torch::jit::Block* block, - CompileSpec cfg, - ir::StaticParams static_params, - ir::CollectionTypeMap first_use_types) { - auto convert_info = cfg.convert_info; - auto partitioning_info = cfg.partitioning_info; - - auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info); - auto collection_input_ivalues_map = - partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); - return ConstructFallbackGraph_( - new_mod, block, &partitioning_ctx, convert_info, static_params, collection_input_ivalues_map); + return partitioning::Stitch(&partitioning_ctx, block); } void MapInputsAndDetermineDTypes( @@ -451,7 +311,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) (!(cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || outputIsCollection)) { - auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), cfg, static_params, first_use_types); + auto graph_and_mapping = BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index a1aa49ad4f..f1f60ef651 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -15,6 +15,7 @@ cc_library( srcs = [ "partitioning.cpp", "shape_analysis.cpp", + "stitching.cpp" ], hdrs = [ "partitioning.h", diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1852359aaf..1a596a2704 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -73,6 +73,7 @@ void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // Set the rest nodes to TensorRt ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT); } + } return; } @@ -221,23 +222,24 @@ std::vector getDependencyNodes( return stk; } -void resolveTRTNonTensorInputs(PartitioningCtx* ctx) { +void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { // if a TRT segment has nonTensor Inputs, the nodes that produce this nonTensor Inputs must in another TensorRT engine // because we have already found the interface between Torch and TRT in segmentation phase // what we do here is just find the dependency nodes of the TRT segments that have nonTensor inputs - for (size_t i = 0; i < ctx->blocks.size(); ++i) { - if (ctx->blocks[i].target() == SegmentedBlock::kTensorRT) { + PartitionedGraph& cur_partitioned_block = ctx->partitioned_blocks[block]; + for (size_t i = 0; i < cur_partitioned_block.size(); ++i) { + if (cur_partitioned_block[i].target() == SegmentedBlock::kTensorRT) { std::vector inputs_to_resolve; - for (auto input : ctx->blocks[i].raw_inputs()) { + for (auto input : cur_partitioned_block[i].raw_inputs()) { if (!isTensor(input)) { inputs_to_resolve.push_back(input); } } if (!inputs_to_resolve.empty()) { - std::vector dependency_nodes = getDependencyNodes(inputs_to_resolve, ctx->blocks[i]); + std::vector dependency_nodes = getDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]); dependency_nodes.insert( - dependency_nodes.end(), ctx->blocks[i].raw_nodes().begin(), ctx->blocks[i].raw_nodes().end()); - ctx->blocks[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes); + dependency_nodes.end(), 
cur_partitioned_block[i].raw_nodes().begin(), cur_partitioned_block[i].raw_nodes().end()); + cur_partitioned_block[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes); } } } @@ -245,9 +247,10 @@ void resolveTRTNonTensorInputs(PartitioningCtx* ctx) { void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { // find the corresponding raw values in original global graph for this segmented block's inputs/outputs + PartitionedGraph& cur_partitioned_block = ctx->partitioned_blocks[block]; auto cmp = [](torch::jit::Value* a, torch::jit::Value* b) { return a->unique() < b->unique(); }; std::set input_values(cmp); - for (auto& seg_block : ctx->blocks) { + for (auto& seg_block : cur_partitioned_block) { for (auto& input : seg_block.raw_inputs()) { input_values.insert(input); } @@ -260,7 +263,7 @@ void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { // should be careful here because some in-place operations don't return any values, there is no output for this kind // of segment identify the output for each mini-graph by checking if any value in this graph is used later we // shouldn't register nonTensor output for TensorRT segments - for (auto& seg_block : ctx->blocks) { + for (auto& seg_block : cur_partitioned_block) { for (auto& mini_graph_input : input_values) { if (std::find(seg_block.raw_inputs().begin(), seg_block.raw_inputs().end(), mini_graph_input) == seg_block.raw_inputs().end() && @@ -289,16 +292,16 @@ void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { } } - std::for_each(ctx->blocks.begin(), ctx->blocks.end(), [](SegmentedBlock& seg_block) { + std::for_each(cur_partitioned_block.begin(), cur_partitioned_block.end(), [](SegmentedBlock& seg_block) { torch::jit::EliminateDeadCode(seg_block.g()); }); // erase segments which still have no output - ctx->blocks.erase( + cur_partitioned_block.erase( std::remove_if( - ctx->blocks.begin(), - ctx->blocks.end(), + cur_partitioned_block.begin(), + cur_partitioned_block.end(), [](SegmentedBlock& seg_block) { return seg_block.raw_outputs().empty(); }), - ctx->blocks.end()); + cur_partitioned_block.end()); return; } @@ -318,12 +321,25 @@ bool checkLoopEvaluatable(torch::jit::Node* n) { return compile_to_trt; } +void finalizeNewBlock( + PartitionedGraph& g, + SegmentedBlock::SegmentedBlockTarget kind, + std::vector& nodes) { + LOG_DEBUG("Finalizing in progress " << SegmentedBlock::target_to_str(kind) << " block"); + g.emplace_back(g.size(), kind, nodes); + nodes.clear(); + LOG_DEBUG(g.back()); +} + void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); // segment the nodes + PartitionedGraph segmented_blocks; + std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; for (const auto n : nodes) { + // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; @@ -335,13 +351,13 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size && !in_prog_pyt_blk_nodes.empty()) { - ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } } else { // If there is an active TRT block that is valid segment and reset the active TRT block // otherwise add it to 
the active PyTorch block and reset if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size) { - ctx->finalizeNewBlock(SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); } else { LOG_DEBUG( "In progress TRT block does not meet minimum block size requirements (" @@ -349,9 +365,6 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { << "), therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); - for (auto n : in_prog_pyt_blk_nodes) { - ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); - } } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -360,20 +373,20 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { LOG_DEBUG( "Hit a conditional statement, finializing in progress PYT block and creating a new one for the conditional"); if (!in_prog_pyt_blk_nodes.empty()) { - ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } auto cond_node = std::vector{n}; - ctx->finalizeNewBlock(SegmentedBlock::kTorch, cond_node); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, cond_node); continue; } else if (n->kind() == torch::jit::prim::Loop) { if (!in_prog_pyt_blk_nodes.empty()) { - ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } if (checkLoopEvaluatable(n)) { in_prog_trt_blk_nodes.push_back(n); } else { auto loop_node = std::vector{n}; - ctx->finalizeNewBlock(SegmentedBlock::kTorch, loop_node); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, loop_node); } continue; } @@ -384,18 +397,20 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { // if there is any kTorch nodes left, then either the last nodes are kTorch or last nodes are kTensorRT but num < // min_block_size if (in_prog_trt_blk_nodes.size() >= ctx->settings.min_block_size) { - ctx->finalizeNewBlock(SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTensorRT, in_prog_trt_blk_nodes); } if (!in_prog_pyt_blk_nodes.empty() || !in_prog_trt_blk_nodes.empty()) { in_prog_pyt_blk_nodes.insert( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); - ctx->finalizeNewBlock(SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); + finalizeNewBlock(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } + + ctx->partitioned_blocks.insert({block, segmented_blocks}); return; } -void SetNodeExecutorDecision(PartitioningCtx* ctx, torch::jit::Block* block) { +void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) { // First, find all the explicit fallback nodes that should run in Torch: // 1. nodes that are unsupported // 2. 
nodes that the user specifies to run in torch @@ -418,33 +433,42 @@ void SetNodeExecutorDecision(PartitioningCtx* ctx, torch::jit::Block* block) { SetMinBlockFallbackNodes(ctx, block); } -PartitionedGraph Partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { +void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { LOG_DEBUG(ctx->settings); - SetNodeExecutorDecision(ctx, block); + // Go through all the blocks to do the partitioning + for (torch::jit::Block* block : ctx->original_blocks) { - // segment lowering global graph into blocks - LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); - SegmentGraph(ctx, block); + // Find all the fallback nodes and build execution decision LUT for all nodes + SetNodeExecutorLUT(ctx, block); - // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks - // resolve nonTensor inputs/outputs - resolveTRTNonTensorInputs(ctx); + // segment lowering global graph into blocks + SegmentGraph(ctx, block); - // register input/output torch::jit::Value for segmented graphs - LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); - registerSegmentsOutputs(ctx, block); + // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks + // resolve nonTensor inputs/outputs + resolveTRTNonTensorInputs(ctx, block); - // run shape analysis on each segmented block - runShapeAnalysis(ctx, example_tensor_map); + // register input/output torch::jit::Value for segmented graphs + LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); + registerSegmentsOutputs(ctx, block); + + for (auto &i : ctx->partitioned_blocks[block]) { + LOG_DEBUG(i); + } + + // run shape analysis on each segmented block + runShapeAnalysis(ctx, block, example_tensor_map); - for (uint64_t i = 0; i < ctx->blocks.size(); i++) { - ctx->blocks[i].update_id(i); } - LOG_INFO(ctx->blocks); - return std::move(ctx->blocks); + +// for (uint64_t i = 0; i < ctx->blocks.size(); i++) { +// ctx->blocks[i].update_id(i); +// } + + } } // namespace partitioning diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 29aa683b5c..391ad67d53 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -17,13 +17,18 @@ namespace partitioning { typedef std::unordered_map ExampleIValues; +typedef std::pair, std::unordered_map> + GraphAndMapping; + ExampleIValues generateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); -void runShapeAnalysis(PartitioningCtx* ctx, ExampleIValues& ivalues_maps); +void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps); void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block); -PartitionedGraph Partition(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map); +GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block); + +void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map); } // namespace partitioning } // namespace core diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp b/core/partitioning/partitioningctx/PartitioningCtx.cpp index b83f6f5ca5..4b8368db3f 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.cpp +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -15,6 +15,7 @@ 
PartitioningCtx::PartitioningCtx(torch::jit::Block* b, PartitioningInfo info) } void PartitioningCtx::_load_nodes_into_decision_map(torch::jit::Block* b) { + original_blocks.push_back(b); for (const auto n : b->nodes()) { if (n->kind() == torch::jit::prim::Constant) { continue; @@ -26,18 +27,7 @@ void PartitioningCtx::_load_nodes_into_decision_map(torch::jit::Block* b) { } } -void PartitioningCtx::finalizeNewBlock( - SegmentedBlock::SegmentedBlockTarget kind, - std::vector& nodes) { - LOG_DEBUG("Finalizing in progress " << SegmentedBlock::target_to_str(kind) << " block"); - blocks.emplace_back(blocks.size(), kind, nodes); - - // TODO: Can we not need this? - nodes.clear(); - LOG_DEBUG(blocks.back()); -} - -bool PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision) { +void PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision) { auto iter = node_executor_decision_map.find(n); auto prev_decision = NodeExecutorDecision::kUNKNOWN; if (iter != node_executor_decision_map.end()) { @@ -46,8 +36,9 @@ bool PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorD LOG_GRAPH("Setting node " << util::node_info(n) << " " << decision << " (previously was " << prev_decision << ")"); // NOTE: This is this way due to partitioning.cpp L#134 I dont know if this is what we should do. - auto result = node_executor_decision_map.insert({n, decision}); - return result.second; + + auto result = node_executor_decision_map[n] = decision; + return ; } bool PartitioningCtx::shouldNodeRunInTorch(torch::jit::Node* n) { diff --git a/core/partitioning/partitioningctx/PartitioningCtx.h b/core/partitioning/partitioningctx/PartitioningCtx.h index cc5bb3d774..aceede90a6 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.h +++ b/core/partitioning/partitioningctx/PartitioningCtx.h @@ -47,13 +47,13 @@ struct UsageInfo { struct PartitioningCtx { // TODO: Make the set a part of settings not stand alone PartitioningInfo settings; + std::vector original_blocks; NodeExecutorDecisionMap node_executor_decision_map; - PartitionedGraph blocks; + std::unordered_map partitioned_blocks; std::unordered_set forced_fallback_ops; PartitioningCtx(torch::jit::Block* b, PartitioningInfo info); - bool setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision); - void finalizeNewBlock(SegmentedBlock::SegmentedBlockTarget kind, std::vector& nodes); + void setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision); bool shouldNodeRunInTorch(torch::jit::Node* n); bool shouldNodeRunInTensorRT(torch::jit::Node* n); bool isNodeExecutorKnown(torch::jit::Node* n); diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index ebc279e9da..e10f8bec5f 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -181,9 +181,9 @@ void getSegmentsOutputByRunning( seg_block.register_intypes(input_types); } -void runShapeAnalysis(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { +void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { // register every segment's input shape, and it's running output IValues - for (auto& seg_block : ctx->blocks) { + for (auto& seg_block : ctx->partitioned_blocks[block]) { torch::jit::ConstantPooling(seg_block.g()); getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); } diff --git a/core/partitioning/stitching.cpp b/core/partitioning/stitching.cpp 
new file mode 100644 index 0000000000..f8a96337db --- /dev/null +++ b/core/partitioning/stitching.cpp @@ -0,0 +1,153 @@ +#include "ATen/ATen.h" +#include "torch/csrc/jit/api/module.h" +#include "torch/csrc/jit/ir/ir_views.h" + +#include "core/partitioning/partitioning.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace partitioning { + +void AddSegmentedBlockToGraph( + std::shared_ptr& g, + partitioning::SegmentedBlock& seg, + std::unordered_map& old_to_new_g) { + // old_to_new_g contains: original global graph value => new global graph value, + // mini_to_new_g: mini graph value -> new graph value + std::unordered_map mini_to_new_g; + size_t input_idx = 0; + if (seg.target() == partitioning::SegmentedBlock::kTensorRT && g->inputs().size() > 0) { + if (g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { + auto self = g->insertInput(0, "self_1"); + self->setType(seg.inputs()[0]->type()); + } + mini_to_new_g[seg.inputs()[input_idx++]] = g->inputs()[0]; + } + + for (auto& raw_input : seg.raw_inputs()) { + if (old_to_new_g.count(raw_input)) { + mini_to_new_g[seg.inputs()[input_idx++]] = old_to_new_g[raw_input]; + } + } + + for (const auto n : seg.nodes()) { + util::cloneNode(n, g, mini_to_new_g); + } + + // original graph value => new global graph value + for (size_t i = 0; i < seg.raw_outputs().size(); ++i) { + old_to_new_g[seg.raw_outputs()[i]] = mini_to_new_g[seg.outputs()[i]]; + } + size_t offset = seg.target() == partitioning::SegmentedBlock::kTensorRT ? 1 : 0; + for (size_t i = 0; i < seg.raw_inputs().size(); ++i) { + if (!old_to_new_g.count(seg.raw_inputs()[i])) { + old_to_new_g[seg.raw_inputs()[i]] = mini_to_new_g[seg.inputs()[i + offset]]; + } + } + + return; +} + +void AddIfBlockToGraph( + std::shared_ptr& new_g, + torch::jit::Node* if_node, + const std::vector& graph_and_mappings, + std::unordered_map& old_to_new_g) { + torch::jit::IfView if_view(if_node); + + // create a new if node in new_g and add corresponding inputs + auto new_if = new_g->insertNode(new_g->create(torch::jit::prim::If, {}, 0)); + new_if->addInput(util::getOrAddInputForValue(if_view.cond(), new_g, old_to_new_g)); + + // iterate over all blocks and add them to new created prim::If + for (auto graph_and_mapping : graph_and_mappings) { + auto new_if_block = new_if->addBlock(); + auto cur_block_graph = graph_and_mapping.first; + auto cur_block_mapping = graph_and_mapping.second; + std::unordered_map block_graph_to_new_g; + for (auto& i : cur_block_mapping) { + // for every pair in then_mapping, old_value => mini graph value, if old_value also appears in old_to_new_g, then + // it's mini graph's input + if (old_to_new_g.count(i.first)) { + block_graph_to_new_g[i.second] = old_to_new_g[i.first]; + } + } + + auto env = [&](torch::jit::Value* v) { return util::getOrAddInputForValue(v, new_g, block_graph_to_new_g); }; + new_if_block->cloneFrom(cur_block_graph->block(), env); + if (cur_block_graph->inputs().size() && + cur_block_graph->inputs()[0]->type()->str().find("__torch__") != std::string::npos) { + if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) { + auto self = new_g->insertInput(0, "self_1"); + self->setType(cur_block_graph->inputs()[0]->type()); + } + block_graph_to_new_g[cur_block_graph->inputs()[0]] = new_g->inputs()[0]; + } + for (int i = cur_block_graph->inputs().size() - 1; i >= 0; --i) { + new_if_block->inputs()[i]->replaceAllUsesWith(block_graph_to_new_g[cur_block_graph->inputs()[i]]); + new_if_block->eraseInput(i); 
+ } + } + for (auto ov : if_view.outputs()) { + auto no = new_if->addOutput(); + old_to_new_g[ov] = no; + no->copyMetadata(ov); + } + return; +} + + +GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { + auto new_g = std::make_shared(); + + // the mapping from lowering graph => fallback global graph + std::unordered_map old_to_new_g; + for (auto input : block->inputs()) { + util::getOrAddInputForValue(input, new_g, old_to_new_g); + } + + for (auto seg_block : ctx->partitioned_blocks[block]) { + LOG_INFO("Block segment:" << seg_block); + if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) { + AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); + } else { + if (seg_block.raw_nodes()[0]->kind() == torch::jit::prim::If) { + auto if_node = seg_block.raw_nodes()[0]; + + // convert the 2 blocks in prim::if and get the converted graph with mappings + std::vector graph_and_mappings; + for (auto cur_block : if_node->blocks()) { + graph_and_mappings.push_back(Stitch(ctx, cur_block)); + } + AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g); + + } else { + AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); + } + } + } + + if (block->outputs().size() > 1) { + std::vector fallback_graph_vector; + for (auto& output : block->outputs()) { + if (old_to_new_g.count(output)) { + fallback_graph_vector.push_back(old_to_new_g[output]); + } + } + torch::jit::ArrayRef fallback_graph_outputs(fallback_graph_vector); + auto return_tuple_node = new_g->createTuple(fallback_graph_outputs); + new_g->block()->appendNode(return_tuple_node); + // Set the output as the produced tuple + new_g->registerOutput(return_tuple_node->outputs()[0]); + } else { + if (block->outputs().size() && old_to_new_g.count(block->outputs()[0])) { + new_g->registerOutput(old_to_new_g[block->outputs()[0]]); + } + } + return {new_g, old_to_new_g}; + +} +} +} +} From 77f4c091d63daeb36261bf09a60385d889c5956e Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 7 Sep 2022 19:02:20 -0700 Subject: [PATCH 09/11] fix: fix bugs found when running tests Signed-off-by: Bo Wang --- core/compiler.cpp | 4 +- core/partitioning/BUILD | 2 +- core/partitioning/partitioning.cpp | 95 ++++++++----------- core/partitioning/partitioning.h | 6 +- .../partitioningctx/PartitioningCtx.cpp | 52 +++------- .../partitioningctx/PartitioningCtx.h | 4 +- core/partitioning/shape_analysis.cpp | 16 ++-- core/partitioning/stitching.cpp | 8 +- .../test_fallback_graph_output.cpp | 4 +- .../test_resolve_nontensor_inputs.cpp | 16 ++-- tests/core/partitioning/test_segmentation.cpp | 58 +++++------ .../core/partitioning/test_shape_analysis.cpp | 12 +-- 12 files changed, 121 insertions(+), 156 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 178d3c48c6..10f68847b0 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -138,11 +138,11 @@ partitioning::GraphAndMapping BuildHybridGraph( auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info); auto collection_input_ivalues_map = - partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); + partitioning::GenerateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); partitioning::Partition(&partitioning_ctx, collection_input_ivalues_map); - for (auto &partitioned_block : partitioning_ctx.partitioned_blocks) { + for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) { partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second; for (auto& seg_block 
: segmented_blocks) { diff --git a/core/partitioning/BUILD b/core/partitioning/BUILD index f1f60ef651..4204939684 100644 --- a/core/partitioning/BUILD +++ b/core/partitioning/BUILD @@ -15,7 +15,7 @@ cc_library( srcs = [ "partitioning.cpp", "shape_analysis.cpp", - "stitching.cpp" + "stitching.cpp", ], hdrs = [ "partitioning.h", diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1a596a2704..86cfd6d73b 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -73,7 +73,6 @@ void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // Set the rest nodes to TensorRt ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT); } - } return; } @@ -103,7 +102,8 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vectoruses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && ctx->shouldNodeRunInTensorRT(node)) { + if (node->kind() != torch::jit::prim::Constant && node->kind() != torch::jit::prim::Return && + ctx->shouldNodeRunInTensorRT(node)) { ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR); q.push(node); } @@ -175,7 +175,7 @@ bool isModifyingNodes(torch::jit::Node* node, torch::jit::Value* val) { return false; } -std::vector findModifyingNodes( +std::vector FindModifyingNodes( torch::jit::Value* val, const std::unordered_set& seg_block_nodes) { std::vector modifying_nodes; @@ -192,7 +192,7 @@ std::vector findModifyingNodes( } // this function is only used when a TRT segment produces nonTensor values which are used by later TRT segment -std::vector getDependencyNodes( +std::vector GetDependencyNodes( const std::vector& vals, const SegmentedBlock& seg_block) { // get all nodes in the segmentedblock @@ -208,7 +208,7 @@ std::vector getDependencyNodes( auto node = cur_val->node(); if (node->kind() != torch::jit::prim::Constant && !visited.count(node)) { visited.insert(node); - auto modifying_nodes = findModifyingNodes(cur_val, seg_block_nodes); + auto modifying_nodes = FindModifyingNodes(cur_val, seg_block_nodes); stk.insert(stk.end(), modifying_nodes.rbegin(), modifying_nodes.rend()); stk.push_back(node); for (auto input : node->inputs()) { @@ -222,7 +222,7 @@ std::vector getDependencyNodes( return stk; } -void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { +void ResolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { // if a TRT segment has nonTensor Inputs, the nodes that produce this nonTensor Inputs must in another TensorRT engine // because we have already found the interface between Torch and TRT in segmentation phase // what we do here is just find the dependency nodes of the TRT segments that have nonTensor inputs @@ -236,16 +236,19 @@ void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { } } if (!inputs_to_resolve.empty()) { - std::vector dependency_nodes = getDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]); + std::vector dependency_nodes = + GetDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]); dependency_nodes.insert( - dependency_nodes.end(), cur_partitioned_block[i].raw_nodes().begin(), cur_partitioned_block[i].raw_nodes().end()); + dependency_nodes.end(), + cur_partitioned_block[i].raw_nodes().begin(), + cur_partitioned_block[i].raw_nodes().end()); cur_partitioned_block[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes); } } } } -void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { +void 
RegisterSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { // find the corresponding raw values in original global graph for this segmented block's inputs/outputs PartitionedGraph& cur_partitioned_block = ctx->partitioned_blocks[block]; auto cmp = [](torch::jit::Value* a, torch::jit::Value* b) { return a->unique() < b->unique(); }; @@ -331,7 +334,33 @@ void finalizeNewBlock( LOG_DEBUG(g.back()); } +void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) { + // First, find all the explicit fallback nodes that should run in Torch: + // 1. nodes that are unsupported + // 2. nodes that the user specifies to run in torch + // 3. nodes that the user specifies the module containing this op to run in torch + // At the same time, set all the rest nodes to NodeExecutorDecision::kCONVERT + SetExplicitFallbackNodes(ctx, block); + + // Second, check if there is nonTensor input/output for the block, if there is, then fallback the nodes that + // consume/produce this nonTensor value + SetInputsOutputsConnectedNodes(ctx, block); + + // Third, for fallback nodes, if it consumes any NonTensor inputs, then the nodes that produce this + // input should also fallback. Similarly, if it produces any NonTensor outputs, then the nodes + // that consume this output should also fallback + auto cur_fallback_nodes = ctx->getNodesRunInTorch(); + SetNonTensorConnectedNodes(ctx, cur_fallback_nodes); + + // Finally, check if all current tensorrt blocks satisfy the min_block_size requirement. + // We need to traverse the whole graph many times here + SetMinBlockFallbackNodes(ctx, block); +} + void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { + // Find all the fallback nodes and build execution decision LUT for all nodes + SetNodeExecutorLUT(ctx, block); + auto nodes = block->nodes(); // segment the nodes @@ -339,13 +368,12 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; for (const auto n : nodes) { - // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; } // the outputs of trt subgraph shouldn't be collections - if (!ctx->shouldNodeRunInTorch(n)) { + if (ctx->shouldNodeRunInTensorRT(n)) { in_prog_trt_blk_nodes.push_back(n); // If there is an active PyTorch block and we have passed the threshold for a valid TRT @@ -410,65 +438,26 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { return; } -void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) { - // First, find all the explicit fallback nodes that should run in Torch: - // 1. nodes that are unsupported - // 2. nodes that the user specifies to run in torch - // 3. nodes that the user specifies the module containing this op to run in torch - // At the same time, set all the rest nodes to NodeExecutorDecision::kCONVERT - SetExplicitFallbackNodes(ctx, block); - - // Second, check if there is nonTensor input/output for the block, if there is, then fallback the nodes that - // consume/produce this nonTensor value - SetInputsOutputsConnectedNodes(ctx, block); - - // Third, for fallback nodes, if it consumes any NonTensor inputs, then the nodes that produce this - // input should also fallback. 
Similarly, if it produces any NonTensor outputs, then the nodes - // that consume this output should also fallback - auto cur_fallback_nodes = ctx->getNodesRunInTorch(); - SetNonTensorConnectedNodes(ctx, cur_fallback_nodes); - - // Finally, check if all current tensorrt blocks satisfy the min_block_size requirement. - // We need to traverse the whole graph many times here - SetMinBlockFallbackNodes(ctx, block); -} - void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { LOG_DEBUG(ctx->settings); // Go through all the blocks to do the partitioning for (torch::jit::Block* block : ctx->original_blocks) { - - // Find all the fallback nodes and build execution decision LUT for all nodes - SetNodeExecutorLUT(ctx, block); - // segment lowering global graph into blocks SegmentGraph(ctx, block); // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks // resolve nonTensor inputs/outputs - resolveTRTNonTensorInputs(ctx, block); + ResolveTRTNonTensorInputs(ctx, block); // register input/output torch::jit::Value for segmented graphs LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); - registerSegmentsOutputs(ctx, block); + RegisterSegmentsOutputs(ctx, block); - for (auto &i : ctx->partitioned_blocks[block]) { - LOG_DEBUG(i); - } // run shape analysis on each segmented block - runShapeAnalysis(ctx, block, example_tensor_map); - + RunShapeAnalysis(ctx, block, example_tensor_map); } - - - -// for (uint64_t i = 0; i < ctx->blocks.size(); i++) { -// ctx->blocks[i].update_id(i); -// } - - } } // namespace partitioning diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index 391ad67d53..cf59e3439c 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -20,11 +20,11 @@ typedef std::unordered_map Example typedef std::pair, std::unordered_map> GraphAndMapping; -ExampleIValues generateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); +ExampleIValues GenerateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); -void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps); +void RunShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps); -void segment_graph(PartitioningCtx* ctx, torch::jit::Block* block); +void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block); GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block); diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp b/core/partitioning/partitioningctx/PartitioningCtx.cpp index 4b8368db3f..b2951e42b2 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.cpp +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -15,7 +15,9 @@ PartitioningCtx::PartitioningCtx(torch::jit::Block* b, PartitioningInfo info) } void PartitioningCtx::_load_nodes_into_decision_map(torch::jit::Block* b) { - original_blocks.push_back(b); + if (!b->owningNode() || b->owningNode()->kind() != torch::jit::prim::Loop) { + original_blocks.push_back(b); + } for (const auto n : b->nodes()) { if (n->kind() == torch::jit::prim::Constant) { continue; @@ -33,60 +35,28 @@ void PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorD if (iter != node_executor_decision_map.end()) { prev_decision = iter->second; } - LOG_GRAPH("Setting node " << util::node_info(n) << " " << decision << " (previously was " << 
prev_decision << ")"); - - // NOTE: This is this way due to partitioning.cpp L#134 I dont know if this is what we should do. + LOG_DEBUG("Setting node " << util::node_info(n) << " " << decision << " (previously was " << prev_decision << ")"); - auto result = node_executor_decision_map[n] = decision; - return ; + node_executor_decision_map[n] = decision; + return; } bool PartitioningCtx::shouldNodeRunInTorch(torch::jit::Node* n) { auto iter = node_executor_decision_map.find(n); - auto decision = NodeExecutorDecision::kUNKNOWN; - if (iter != node_executor_decision_map.end()) { - decision = iter->second; - } - - if (decision == NodeExecutorDecision::kCONVERT || decision == NodeExecutorDecision::kUNKNOWN) { - return false; - } else { - return true; + if (iter == node_executor_decision_map.end()) { + LOG_ERROR("No info about node " << *n << " execution decision status."); } + return iter->second != NodeExecutorDecision::kCONVERT; } bool PartitioningCtx::shouldNodeRunInTensorRT(torch::jit::Node* n) { - auto iter = node_executor_decision_map.find(n); - auto decision = NodeExecutorDecision::kUNKNOWN; - if (iter != node_executor_decision_map.end()) { - decision = iter->second; - } - - if (decision == NodeExecutorDecision::kCONVERT) { - return true; - } else { - return false; - } -} - -bool PartitioningCtx::isNodeExecutorKnown(torch::jit::Node* n) { - auto iter = node_executor_decision_map.find(n); - auto decision = NodeExecutorDecision::kUNKNOWN; - if (iter != node_executor_decision_map.end()) { - decision = iter->second; - } - - if (decision == NodeExecutorDecision::kUNKNOWN) { - return false; - } else { - return true; - } + return !shouldNodeRunInTorch(n); } std::vector PartitioningCtx::getNodesRunInTorch() { std::vector nodes_run_in_torch; for (auto i : node_executor_decision_map) { - if (i.second == NodeExecutorDecision::kCONVERT) { + if (i.second != NodeExecutorDecision::kCONVERT) { nodes_run_in_torch.push_back(i.first); } } diff --git a/core/partitioning/partitioningctx/PartitioningCtx.h b/core/partitioning/partitioningctx/PartitioningCtx.h index aceede90a6..ed8e705be5 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.h +++ b/core/partitioning/partitioningctx/PartitioningCtx.h @@ -47,8 +47,11 @@ struct UsageInfo { struct PartitioningCtx { // TODO: Make the set a part of settings not stand alone PartitioningInfo settings; + // records all the original blocks topologically in the module std::vector original_blocks; + // mapping: node=> execution status NodeExecutorDecisionMap node_executor_decision_map; + // LUT of the segmented blocks for each blocks in the module std::unordered_map partitioned_blocks; std::unordered_set forced_fallback_ops; @@ -56,7 +59,6 @@ struct PartitioningCtx { void setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision decision); bool shouldNodeRunInTorch(torch::jit::Node* n); bool shouldNodeRunInTensorRT(torch::jit::Node* n); - bool isNodeExecutorKnown(torch::jit::Node* n); std::vector getNodesRunInTorch(); private: diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index e10f8bec5f..20ff36eb88 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -9,7 +9,7 @@ namespace torch_tensorrt { namespace core { namespace partitioning { -at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { +at::Tensor GenerateSingleInput(ir::Input& input, c10::optional& type_opt) { auto cur_shape = input.input_shape; std::vector shape; shape.insert(shape.begin(), 
std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); @@ -25,7 +25,7 @@ at::Tensor generateSingleInput(ir::Input& input, c10::optional& return in; } -std::unordered_map generateRandomInputs( +std::unordered_map GenerateRandomInputs( std::unordered_map>& inputs, std::unordered_map>>& types) { // generate random inputs for running pytorch segments @@ -38,7 +38,7 @@ std::unordered_map generateRandomI c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); for (size_t i = 0; i < input.second.size(); i++) { - auto in = generateSingleInput(input.second[i], types[input.first][i]); + auto in = GenerateSingleInput(input.second[i], types[input.first][i]); generic_list.push_back(in.clone()); } ivalue_map[input.first] = c10::IValue(generic_list); @@ -46,20 +46,20 @@ std::unordered_map generateRandomI // create tuple std::vector list; for (size_t i = 0; i < input.second.size(); i++) { - auto in = generateSingleInput(input.second[i], types[input.first][i]); + auto in = GenerateSingleInput(input.second[i], types[input.first][i]); list.push_back(in.clone()); } auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr ivalue_map[input.first] = c10::IValue(tuple); } else { - auto in = generateSingleInput(input.second[0], types[input.first][0]); + auto in = GenerateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); } } return ivalue_map; } -void getSegmentsOutputByRunning( +void GetSegmentsOutputByRunning( SegmentedBlock& seg_block, std::unordered_map& ivalues_maps, const PartitioningInfo& partitioning_info) { @@ -181,11 +181,11 @@ void getSegmentsOutputByRunning( seg_block.register_intypes(input_types); } -void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { +void RunShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { // register every segment's input shape, and it's running output IValues for (auto& seg_block : ctx->partitioned_blocks[block]) { torch::jit::ConstantPooling(seg_block.g()); - getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); + GetSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); } return; } diff --git a/core/partitioning/stitching.cpp b/core/partitioning/stitching.cpp index f8a96337db..42cfe9d98e 100644 --- a/core/partitioning/stitching.cpp +++ b/core/partitioning/stitching.cpp @@ -97,7 +97,6 @@ void AddIfBlockToGraph( return; } - GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { auto new_g = std::make_shared(); @@ -146,8 +145,7 @@ GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { } } return {new_g, old_to_new_g}; - -} -} -} } +} // namespace partitioning +} // namespace core +} // namespace torch_tensorrt diff --git a/tests/core/partitioning/test_fallback_graph_output.cpp b/tests/core/partitioning/test_fallback_graph_output.cpp index f1351741f5..111563c0f3 100644 --- a/tests/core/partitioning/test_fallback_graph_output.cpp +++ b/tests/core/partitioning/test_fallback_graph_output.cpp @@ -34,7 +34,7 @@ TEST(Partitioning, ComputeResNet50FallbackGraphCorrectly) { auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor(); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6)); + 
ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-1)); } TEST(Partitioning, ComputeMobileNetFallbackGraphCorrectly) { @@ -64,6 +64,6 @@ TEST(Partitioning, ComputeMobileNetFallbackGraphCorrectly) { auto jit_results = mod.forward(jit_inputs_ivalues).toTensor(); auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg); auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor(); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-1)); } #endif diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index 3df65deca3..b454162993 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -122,10 +122,11 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); + torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); + ctx.partitioned_blocks.begin()->second; int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { @@ -181,10 +182,12 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); + + torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); std::vector segmented_blocks = - torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); + ctx.partitioned_blocks.begin()->second; int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { @@ -373,9 +376,10 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - auto segmented_blocks = torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); + torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + auto segmented_blocks = ctx.partitioned_blocks.begin()->second; int torch_block_cnt = 0, trt_block_cnt = 0; for (const auto& segmented_block : segmented_blocks) { diff --git a/tests/core/partitioning/test_segmentation.cpp b/tests/core/partitioning/test_segmentation.cpp index efee4ec85a..a5c5571a13 100644 --- 
a/tests/core/partitioning/test_segmentation.cpp +++ b/tests/core/partitioning/test_segmentation.cpp @@ -81,10 +81,10 @@ TEST(Partitioning, SegmentSequentialModelCorrectly) { PartitioningInfo partitioning_info; partitioning_info.enabled = true; PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 2)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3}, {4}})); + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 2)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3}, {4}})); } TEST(Partitioning, SegmentSequentialModelWithMinBlockSizeCorrectly) { @@ -115,10 +115,10 @@ TEST(Partitioning, SegmentSequentialModelWithMinBlockSizeCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3, 4}})); + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3, 4}})); } TEST(Partitioning, SegmentModelWithMinBlockSizeCausedFallbackCorrectly) { @@ -153,10 +153,10 @@ TEST(Partitioning, SegmentModelWithMinBlockSizeCausedFallbackCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2, 3}, {4, 5, 6, 7}})); + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2, 3}, {4, 5, 6, 7}})); } TEST(Partitioning, SegmentSequentialModelWithForcedOPCorrectly) { @@ -187,10 +187,10 @@ TEST(Partitioning, SegmentSequentialModelWithForcedOPCorrectly) { partitioning_info.enabled = true; partitioning_info.forced_fallback_operators.push_back("aten::relu"); PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 3)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 2)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0}, {1}, {2}, {3}, {4}})); + SegmentGraph(&ctx, g->block()); + 
ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 3)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 2)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0}, {1}, {2}, {3}, {4}})); } TEST(Partitioning, SegmentBranchModelCorrectly) { @@ -221,10 +221,10 @@ TEST(Partitioning, SegmentBranchModelCorrectly) { PartitioningInfo partitioning_info; partitioning_info.enabled = true; PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 2)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1}, {2}, {3, 4, 5, 6}})); + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 2)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1}, {2}, {3, 4, 5, 6}})); } TEST(Partitioning, SegmentBranchModelWithMinBlockSizeCorrectly) { @@ -256,10 +256,10 @@ TEST(Partitioning, SegmentBranchModelWithMinBlockSizeCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 1)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 1)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1, 2}, {3, 4, 5, 6}})); + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); + ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3, 4, 5, 6}})); } TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { @@ -295,10 +295,12 @@ TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { partitioning_info.enabled = true; partitioning_info.forced_fallback_operators.push_back("aten::relu"); PartitioningCtx ctx(g->block(), partitioning_info); - segment_graph(&ctx, g->block()); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTensorRT, 3)); - ASSERT_TRUE(checkSegmentedBlockNumber(ctx.blocks, SegmentedBlock::kTorch, 2)); - ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.blocks, g, {{0, 1}, {2}, {3}, {4}, {5, 6}})); + + SegmentGraph(&ctx, g->block()); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 3)); + ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 2)); + ASSERT_TRUE( + checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1}, {2}, {3}, {4}, {5, 6}})); } } // namespace tests diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp index e2767185c6..1727935752 100644 --- a/tests/core/partitioning/test_shape_analysis.cpp +++ b/tests/core/partitioning/test_shape_analysis.cpp @@ -65,11 +65,11 @@ TEST(Partitioning, 
InferSequentialModelSegmentedBlockShapeCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); + torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + auto segmented_blocks = ctx.partitioned_blocks.begin()->second; ASSERT_TRUE(checkSegmentedBlockInputShape( segmented_blocks, @@ -117,11 +117,11 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - std::vector segmented_blocks = - torch_tensorrt::core::partitioning::partition(&ctx, g->block(), input_ivalues_map); + torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + auto segmented_blocks = ctx.partitioned_blocks.begin()->second; ASSERT_TRUE(checkSegmentedBlockInputShape( segmented_blocks, From 34366b36c36ed7488222eb9050e5d0121549e62b Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 7 Sep 2022 23:54:38 -0700 Subject: [PATCH 10/11] fix: change the shouldNodeRunInTorch logic Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 5 +--- core/partitioning/partitioning.h | 2 -- .../partitioningctx/PartitioningCtx.cpp | 24 +++++++++++++++---- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 86cfd6d73b..6e1dc07f7b 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -2,7 +2,6 @@ #include #include "core/conversion/conversion.h" #include "core/conversion/evaluators/evaluators.h" -#include "core/partitioning/partitioningctx/PartitioningCtx.h" #include "torch/csrc/jit/passes/constant_pooling.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" @@ -102,8 +101,7 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vectoruses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && node->kind() != torch::jit::prim::Return && - ctx->shouldNodeRunInTensorRT(node)) { + if (node->kind() != torch::jit::prim::Constant && ctx->shouldNodeRunInTensorRT(node)) { ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR); q.push(node); } @@ -454,7 +452,6 @@ void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); RegisterSegmentsOutputs(ctx, block); - // run shape analysis on each segmented block RunShapeAnalysis(ctx, block, example_tensor_map); } diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index cf59e3439c..a147dffea5 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -7,8 +7,6 @@ #include "core/ir/ir.h" #include "core/partitioning/partitioningctx/PartitioningCtx.h" 
-#include "core/partitioning/partitioninginfo/PartitioningInfo.h" -#include "core/partitioning/segmentedblock/SegmentedBlock.h" #include "core/util/prelude.h" namespace torch_tensorrt { diff --git a/core/partitioning/partitioningctx/PartitioningCtx.cpp b/core/partitioning/partitioningctx/PartitioningCtx.cpp index b2951e42b2..7bcaaea120 100644 --- a/core/partitioning/partitioningctx/PartitioningCtx.cpp +++ b/core/partitioning/partitioningctx/PartitioningCtx.cpp @@ -43,14 +43,30 @@ void PartitioningCtx::setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorD bool PartitioningCtx::shouldNodeRunInTorch(torch::jit::Node* n) { auto iter = node_executor_decision_map.find(n); - if (iter == node_executor_decision_map.end()) { - LOG_ERROR("No info about node " << *n << " execution decision status."); + auto decision = NodeExecutorDecision::kUNKNOWN; + + if (iter != node_executor_decision_map.end()) { + decision = iter->second; + } + if (decision == NodeExecutorDecision::kCONVERT || decision == NodeExecutorDecision::kUNKNOWN) { + return false; + } else { + return true; } - return iter->second != NodeExecutorDecision::kCONVERT; } bool PartitioningCtx::shouldNodeRunInTensorRT(torch::jit::Node* n) { - return !shouldNodeRunInTorch(n); + auto iter = node_executor_decision_map.find(n); + auto decision = NodeExecutorDecision::kUNKNOWN; + if (iter != node_executor_decision_map.end()) { + decision = iter->second; + } + + if (decision == NodeExecutorDecision::kCONVERT) { + return true; + } else { + return false; + } } std::vector PartitioningCtx::getNodesRunInTorch() { From d053b4d5138a27f6a493119acfc7f497cf80edb7 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 22 Sep 2022 10:49:32 -0700 Subject: [PATCH 11/11] chore: address review comments Signed-off-by: Dheeraj Peri --- core/compiler.cpp | 6 +-- core/partitioning/partitioning.cpp | 52 +++++++++---------- core/partitioning/partitioning.h | 10 ++-- core/partitioning/shape_analysis.cpp | 16 +++--- core/partitioning/stitching.cpp | 14 ++--- .../test_resolve_nontensor_inputs.cpp | 12 ++--- tests/core/partitioning/test_segmentation.cpp | 16 +++--- .../core/partitioning/test_shape_analysis.cpp | 8 +-- 8 files changed, 67 insertions(+), 67 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 10f68847b0..118ca7aa1c 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -138,9 +138,9 @@ partitioning::GraphAndMapping BuildHybridGraph( auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info); auto collection_input_ivalues_map = - partitioning::GenerateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); + partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types); - partitioning::Partition(&partitioning_ctx, collection_input_ivalues_map); + partitioning::partition(&partitioning_ctx, collection_input_ivalues_map); for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) { partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second; @@ -174,7 +174,7 @@ partitioning::GraphAndMapping BuildHybridGraph( } } - return partitioning::Stitch(&partitioning_ctx, block); + return partitioning::stitch(&partitioning_ctx, block); } void MapInputsAndDetermineDTypes( diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 6e1dc07f7b..eb8c86de50 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -29,7 +29,7 @@ bool containNonTensorOutputs(torch::jit::Node* n) { } // Check if 
the inputs and outputs of the graph are Tensor. If not, then fallback connected nodes -void SetInputsOutputsConnectedNodes(PartitioningCtx* ctx, torch::jit::Block* block) { +void setInputsOutputsConnectedNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // fallback nodes that produce entire graph's nonTensor output for (auto i : block->outputs()) { if (!isTensor(i)) { @@ -50,7 +50,7 @@ void SetInputsOutputsConnectedNodes(PartitioningCtx* ctx, torch::jit::Block* blo // Find and set all explicit fallback nodes (nodes that are unsupported or forced fallback) // we use a map to indicate the reason why it's fallback to torch // For any node that's not explicitly fallback, we set it to run in TensorRT for now -void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { +void setExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); const auto to_compile_sym = c10::Symbol::attr("to_compile"); @@ -78,7 +78,7 @@ void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // For a given set of fallback nodes, check their inputs/outputs, if any inputs/outputs of them are NonTensor, // then the nodes that produces/consumes those values should also fallback -void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector& initial_fallback_nodes) { +void setNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector& initial_fallback_nodes) { // initial_fallback_nodes are the fallback nodes that we have before we run BFS in this function std::queue q; for (auto& node : initial_fallback_nodes) { @@ -112,7 +112,7 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector TraverseNodesForMinBlockSize(PartitioningCtx* ctx, torch::jit::Block* block) { +std::vector traverseNodesForMinBlockSize(PartitioningCtx* ctx, torch::jit::Block* block) { auto nodes = block->nodes(); std::vector cur_trt_nodes; std::vector min_block_fallback_nodes; @@ -138,9 +138,9 @@ std::vector TraverseNodesForMinBlockSize(PartitioningCtx* ctx } // Set the nodes that fallback because of min_block_size -void SetMinBlockFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { +void setMinBlockFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { // first traverse all the nodes to find the initial nodes that don't meet the min_block_size requirement - auto min_block_fallback_nodes = TraverseNodesForMinBlockSize(ctx, block); + auto min_block_fallback_nodes = traverseNodesForMinBlockSize(ctx, block); // keep fallback until all segments meet the min_block_size requirement while (!min_block_fallback_nodes.empty()) { @@ -148,9 +148,9 @@ void SetMinBlockFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) { ctx->setNodeExecutorDecision(i, NodeExecutorDecision::kMIN_BLOCK_FALLBACK); } // find the fallback nodes because of dependency with min_block_size caused fallback nodes - SetNonTensorConnectedNodes(ctx, min_block_fallback_nodes); + setNonTensorConnectedNodes(ctx, min_block_fallback_nodes); // keep traverse the graph until there is no node fallback because of min_block_size - min_block_fallback_nodes = TraverseNodesForMinBlockSize(ctx, block); + min_block_fallback_nodes = traverseNodesForMinBlockSize(ctx, block); } } @@ -173,7 +173,7 @@ bool isModifyingNodes(torch::jit::Node* node, torch::jit::Value* val) { return false; } -std::vector FindModifyingNodes( +std::vector findModifyingNodes( torch::jit::Value* val, const std::unordered_set& seg_block_nodes) { std::vector modifying_nodes; @@ -190,7 +190,7 @@ std::vector 
FindModifyingNodes( } // this function is only used when a TRT segment produces nonTensor values which are used by later TRT segment -std::vector GetDependencyNodes( +std::vector getDependencyNodes( const std::vector& vals, const SegmentedBlock& seg_block) { // get all nodes in the segmentedblock @@ -206,7 +206,7 @@ std::vector GetDependencyNodes( auto node = cur_val->node(); if (node->kind() != torch::jit::prim::Constant && !visited.count(node)) { visited.insert(node); - auto modifying_nodes = FindModifyingNodes(cur_val, seg_block_nodes); + auto modifying_nodes = findModifyingNodes(cur_val, seg_block_nodes); stk.insert(stk.end(), modifying_nodes.rbegin(), modifying_nodes.rend()); stk.push_back(node); for (auto input : node->inputs()) { @@ -220,7 +220,7 @@ std::vector GetDependencyNodes( return stk; } -void ResolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { +void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { // if a TRT segment has nonTensor Inputs, the nodes that produce this nonTensor Inputs must in another TensorRT engine // because we have already found the interface between Torch and TRT in segmentation phase // what we do here is just find the dependency nodes of the TRT segments that have nonTensor inputs @@ -235,7 +235,7 @@ void ResolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { } if (!inputs_to_resolve.empty()) { std::vector dependency_nodes = - GetDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]); + getDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]); dependency_nodes.insert( dependency_nodes.end(), cur_partitioned_block[i].raw_nodes().begin(), @@ -246,7 +246,7 @@ void ResolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) { } } -void RegisterSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { +void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) { // find the corresponding raw values in original global graph for this segmented block's inputs/outputs PartitionedGraph& cur_partitioned_block = ctx->partitioned_blocks[block]; auto cmp = [](torch::jit::Value* a, torch::jit::Value* b) { return a->unique() < b->unique(); }; @@ -332,32 +332,32 @@ void finalizeNewBlock( LOG_DEBUG(g.back()); } -void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) { +void setNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) { // First, find all the explicit fallback nodes that should run in Torch: // 1. nodes that are unsupported // 2. nodes that the user specifies to run in torch // 3. nodes that the user specifies the module containing this op to run in torch // At the same time, set all the rest nodes to NodeExecutorDecision::kCONVERT - SetExplicitFallbackNodes(ctx, block); + setExplicitFallbackNodes(ctx, block); // Second, check if there is nonTensor input/output for the block, if there is, then fallback the nodes that // consume/produce this nonTensor value - SetInputsOutputsConnectedNodes(ctx, block); + setInputsOutputsConnectedNodes(ctx, block); // Third, for fallback nodes, if it consumes any NonTensor inputs, then the nodes that produce this // input should also fallback. 
Similarly, if it produces any NonTensor outputs, then the nodes // that consume this output should also fallback auto cur_fallback_nodes = ctx->getNodesRunInTorch(); - SetNonTensorConnectedNodes(ctx, cur_fallback_nodes); + setNonTensorConnectedNodes(ctx, cur_fallback_nodes); // Finally, check if all current tensorrt blocks satisfy the min_block_size requirement. // We need to traverse the whole graph many times here - SetMinBlockFallbackNodes(ctx, block); + setMinBlockFallbackNodes(ctx, block); } -void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { +void segmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { // Find all the fallback nodes and build execution decision LUT for all nodes - SetNodeExecutorLUT(ctx, block); + setNodeExecutorLUT(ctx, block); auto nodes = block->nodes(); @@ -436,24 +436,24 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) { return; } -void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { +void partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) { LOG_DEBUG(ctx->settings); // Go through all the blocks to do the partitioning for (torch::jit::Block* block : ctx->original_blocks) { // segment lowering global graph into blocks - SegmentGraph(ctx, block); + segmentGraph(ctx, block); // It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks // resolve nonTensor inputs/outputs - ResolveTRTNonTensorInputs(ctx, block); + resolveTRTNonTensorInputs(ctx, block); // register input/output torch::jit::Value for segmented graphs LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs"); - RegisterSegmentsOutputs(ctx, block); + registerSegmentsOutputs(ctx, block); // run shape analysis on each segmented block - RunShapeAnalysis(ctx, block, example_tensor_map); + runShapeAnalysis(ctx, block, example_tensor_map); } } diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h index a147dffea5..3038f6c52f 100644 --- a/core/partitioning/partitioning.h +++ b/core/partitioning/partitioning.h @@ -18,15 +18,15 @@ typedef std::unordered_map Example typedef std::pair, std::unordered_map> GraphAndMapping; -ExampleIValues GenerateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); +ExampleIValues generateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types); -void RunShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps); +void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps); -void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block); +void segmentGraph(PartitioningCtx* ctx, torch::jit::Block* block); -GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block); +GraphAndMapping stitch(PartitioningCtx* ctx, torch::jit::Block* block); -void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map); +void partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map); } // namespace partitioning } // namespace core diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index baa3691090..514681a088 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -9,7 +9,7 @@ namespace torch_tensorrt { namespace core { namespace partitioning { -at::Tensor GenerateSingleInput(ir::Input& input, c10::optional& type_opt) { +at::Tensor generateSingleInput(ir::Input& 
input, c10::optional& type_opt) { auto cur_shape = input.input_shape; std::vector shape; shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); @@ -25,7 +25,7 @@ at::Tensor GenerateSingleInput(ir::Input& input, c10::optional& return in; } -std::unordered_map GenerateRandomInputs( +std::unordered_map generateRandomInputs( std::unordered_map>& inputs, std::unordered_map>>& types) { // generate random inputs for running pytorch segments @@ -38,7 +38,7 @@ std::unordered_map GenerateRandomI c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); for (size_t i = 0; i < input.second.size(); i++) { - auto in = GenerateSingleInput(input.second[i], types[input.first][i]); + auto in = generateSingleInput(input.second[i], types[input.first][i]); generic_list.push_back(in.clone()); } ivalue_map[input.first] = c10::IValue(generic_list); @@ -46,20 +46,20 @@ std::unordered_map GenerateRandomI // create tuple std::vector list; for (size_t i = 0; i < input.second.size(); i++) { - auto in = GenerateSingleInput(input.second[i], types[input.first][i]); + auto in = generateSingleInput(input.second[i], types[input.first][i]); list.push_back(in.clone()); } auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr ivalue_map[input.first] = c10::IValue(tuple); } else { - auto in = GenerateSingleInput(input.second[0], types[input.first][0]); + auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); } } return ivalue_map; } -void GetSegmentsOutputByRunning( +void getSegmentsOutputByRunning( SegmentedBlock& seg_block, std::unordered_map& ivalues_maps, const PartitioningInfo& partitioning_info) { @@ -181,11 +181,11 @@ void GetSegmentsOutputByRunning( seg_block.register_intypes(input_types); } -void RunShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { +void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) { // register every segment's input shape, and it's running output IValues for (auto& seg_block : ctx->partitioned_blocks[block]) { torch::jit::ConstantPooling(seg_block.g()); - GetSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); + getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings); } return; } diff --git a/core/partitioning/stitching.cpp b/core/partitioning/stitching.cpp index 42cfe9d98e..6ed5a27463 100644 --- a/core/partitioning/stitching.cpp +++ b/core/partitioning/stitching.cpp @@ -9,7 +9,7 @@ namespace torch_tensorrt { namespace core { namespace partitioning { -void AddSegmentedBlockToGraph( +void addSegmentedBlockToGraph( std::shared_ptr& g, partitioning::SegmentedBlock& seg, std::unordered_map& old_to_new_g) { @@ -49,7 +49,7 @@ void AddSegmentedBlockToGraph( return; } -void AddIfBlockToGraph( +void addIfBlockToGraph( std::shared_ptr& new_g, torch::jit::Node* if_node, const std::vector& graph_and_mappings, @@ -97,7 +97,7 @@ void AddIfBlockToGraph( return; } -GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { +GraphAndMapping stitch(PartitioningCtx* ctx, torch::jit::Block* block) { auto new_g = std::make_shared(); // the mapping from lowering graph => fallback global graph @@ -109,7 +109,7 @@ GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { for (auto seg_block : ctx->partitioned_blocks[block]) { LOG_INFO("Block segment:" << seg_block); if (seg_block.target() == 
partitioning::SegmentedBlock::kTensorRT) { - AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); + addSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); } else { if (seg_block.raw_nodes()[0]->kind() == torch::jit::prim::If) { auto if_node = seg_block.raw_nodes()[0]; @@ -117,12 +117,12 @@ GraphAndMapping Stitch(PartitioningCtx* ctx, torch::jit::Block* block) { // convert the 2 blocks in prim::if and get the converted graph with mappings std::vector graph_and_mappings; for (auto cur_block : if_node->blocks()) { - graph_and_mappings.push_back(Stitch(ctx, cur_block)); + graph_and_mappings.push_back(stitch(ctx, cur_block)); } - AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g); + addIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g); } else { - AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); + addSegmentedBlockToGraph(new_g, seg_block, old_to_new_g); } } } diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index b454162993..950859e524 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -122,9 +122,9 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + torch_tensorrt::core::partitioning::partition(&ctx, input_ivalues_map); std::vector segmented_blocks = ctx.partitioned_blocks.begin()->second; @@ -182,10 +182,10 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + torch_tensorrt::core::partitioning::partition(&ctx, input_ivalues_map); std::vector segmented_blocks = ctx.partitioned_blocks.begin()->second; @@ -376,9 +376,9 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + torch_tensorrt::core::partitioning::partition(&ctx, input_ivalues_map); auto segmented_blocks = ctx.partitioned_blocks.begin()->second; int torch_block_cnt = 0, trt_block_cnt = 0; diff --git a/tests/core/partitioning/test_segmentation.cpp b/tests/core/partitioning/test_segmentation.cpp index a5c5571a13..8d47af553e 100644 --- 
a/tests/core/partitioning/test_segmentation.cpp +++ b/tests/core/partitioning/test_segmentation.cpp @@ -81,7 +81,7 @@ TEST(Partitioning, SegmentSequentialModelCorrectly) { PartitioningInfo partitioning_info; partitioning_info.enabled = true; PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 2)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3}, {4}})); @@ -115,7 +115,7 @@ TEST(Partitioning, SegmentSequentialModelWithMinBlockSizeCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3, 4}})); @@ -153,7 +153,7 @@ TEST(Partitioning, SegmentModelWithMinBlockSizeCausedFallbackCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2, 3}, {4, 5, 6, 7}})); @@ -187,7 +187,7 @@ TEST(Partitioning, SegmentSequentialModelWithForcedOPCorrectly) { partitioning_info.enabled = true; partitioning_info.forced_fallback_operators.push_back("aten::relu"); PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 3)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 2)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0}, {1}, {2}, {3}, {4}})); @@ -221,7 +221,7 @@ TEST(Partitioning, SegmentBranchModelCorrectly) { PartitioningInfo partitioning_info; partitioning_info.enabled = true; PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 2)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1}, {2}, {3, 4, 5, 6}})); @@ -256,7 +256,7 @@ TEST(Partitioning, SegmentBranchModelWithMinBlockSizeCorrectly) { partitioning_info.enabled = true; partitioning_info.min_block_size = 3; PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 1)); 
ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 1)); ASSERT_TRUE(checkSegmentedBlockNodesMapping(ctx.partitioned_blocks.begin()->second, g, {{0, 1, 2}, {3, 4, 5, 6}})); @@ -296,7 +296,7 @@ TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { partitioning_info.forced_fallback_operators.push_back("aten::relu"); PartitioningCtx ctx(g->block(), partitioning_info); - SegmentGraph(&ctx, g->block()); + segmentGraph(&ctx, g->block()); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTensorRT, 3)); ASSERT_TRUE(checkSegmentedBlockNumber(ctx.partitioned_blocks.begin()->second, SegmentedBlock::kTorch, 2)); ASSERT_TRUE( @@ -306,4 +306,4 @@ TEST(Partitioning, SegmentBranchModelWithForcedFallbackOPCorrectly) { } // namespace tests } // namespace partitioning } // namespace core -} // namespace torch_tensorrt \ No newline at end of file +} // namespace torch_tensorrt diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp index 1727935752..87c42c0e47 100644 --- a/tests/core/partitioning/test_shape_analysis.cpp +++ b/tests/core/partitioning/test_shape_analysis.cpp @@ -65,10 +65,10 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + torch_tensorrt::core::partitioning::partition(&ctx, input_ivalues_map); auto segmented_blocks = ctx.partitioned_blocks.begin()->second; ASSERT_TRUE(checkSegmentedBlockInputShape( @@ -117,10 +117,10 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); input_types.insert({g->inputs()[i], {{at::kFloat}}}); } - auto input_ivalues_map = torch_tensorrt::core::partitioning::GenerateRandomInputs(inputs_map, input_types); + auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); torch_tensorrt::core::partitioning::PartitioningCtx ctx(g->block(), partitioning_info); - torch_tensorrt::core::partitioning::Partition(&ctx, input_ivalues_map); + torch_tensorrt::core::partitioning::partition(&ctx, input_ivalues_map); auto segmented_blocks = ctx.partitioned_blocks.begin()->second; ASSERT_TRUE(checkSegmentedBlockInputShape(