Skip to content

Commit dbaab17

Browse files
authored
Merge pull request #1414 from pytorch/dyn_shapes
feat(//core/partitioning) : Dynamic shapes + fallback
2 parents e3b9929 + cbe04cb commit dbaab17

12 files changed

+292
-52
lines changed

core/compiler.cpp

+7-11
Original file line numberDiff line numberDiff line change
@@ -137,10 +137,13 @@ partitioning::GraphAndMapping BuildHybridGraph(
137137
auto partitioning_info = cfg.partitioning_info;
138138

139139
auto partitioning_ctx = partitioning::PartitioningCtx(block, partitioning_info);
140-
auto collection_input_ivalues_map =
141-
partitioning::generateRandomInputs(partitioning_info.collection_input_spec_map, first_use_types);
140+
partitioning_ctx.input_types_map = first_use_types;
142141

143-
partitioning::partition(&partitioning_ctx, collection_input_ivalues_map);
142+
// Generate a dictionary of input torch::jit::Value's to their min, opt, max tensors and store in ctx
143+
// TODO: Combine this within partition call
144+
partitioning::populateInputIValues(&partitioning_ctx);
145+
146+
partitioning::partition(&partitioning_ctx);
144147

145148
for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) {
146149
partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second;
@@ -151,14 +154,7 @@ partitioning::GraphAndMapping BuildHybridGraph(
151154
trt_engine_id << reinterpret_cast<const int*>(&seg_block);
152155

153156
if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
154-
auto shapes = seg_block.in_shapes();
155-
auto types = seg_block.in_types();
156-
std::vector<ir::Input> inputs;
157-
for (size_t i = 0; i < shapes.size(); i++) {
158-
auto in = ir::Input(shapes[i]);
159-
in.dtype = util::ScalarTypeToTRTDataType(types[i]);
160-
inputs.push_back(in);
161-
}
157+
auto inputs = seg_block.construct_inputs_spec();
162158
// update the input ranges for each segments
163159
convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
164160

core/ir/ir.h

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ namespace torch_tensorrt {
1111
namespace core {
1212
namespace ir {
1313

14+
enum class ShapeMode {
15+
kMIN,
16+
kOPT,
17+
kMAX,
18+
};
19+
1420
struct Device {
1521
nvinfer1::DeviceType device_type;
1622
int64_t gpu_id;

core/partitioning/partitioning.cpp

+41-4
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,35 @@ void segmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) {
536536
return;
537537
}
538538

539-
void partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) {
539+
bool isInputDynamic(PartitioningCtx* ctx) {
540+
// Check if inputs have dynamic shapes
541+
bool input_is_dynamic = true;
542+
auto inputs_map = ctx->settings.collection_input_spec_map;
543+
for (auto inputs : inputs_map) {
544+
for (auto input : inputs.second) {
545+
if (!input.input_is_dynamic) {
546+
input_is_dynamic = false;
547+
}
548+
}
549+
}
550+
return input_is_dynamic;
551+
}
552+
553+
void populateInputIValues(PartitioningCtx* ctx) {
554+
if (isInputDynamic(ctx)) {
555+
ctx->min_input_ivalues_map = partitioning::generateRandomInputs(
556+
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMIN);
557+
ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
558+
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
559+
ctx->max_input_ivalues_map = partitioning::generateRandomInputs(
560+
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMAX);
561+
} else {
562+
ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
563+
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
564+
}
565+
}
566+
567+
void partition(PartitioningCtx* ctx) {
540568
LOG_DEBUG(ctx->settings);
541569

542570
// Go through all the blocks to do the partitioning
@@ -546,15 +574,24 @@ void partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) {
546574

547575
// It's possible that some TensorRT blocks have nonTensor inputs/output because they are interleaved by Torch blocks
548576
// resolve nonTensor inputs/outputs
577+
LOG_DEBUG("Resolving non-tensor inputs for segmented blocks");
549578
resolveTRTNonTensorInputs(ctx, block);
550579

551580
// register input/output torch::jit::Value for segmented graphs
552581
LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs");
553582
registerSegmentsOutputs(ctx, block);
554583

555-
// run shape analysis on each segmented block
556-
LOG_DEBUG("Running shape analysis for segmented graphs");
557-
runShapeAnalysis(ctx, block, example_tensor_map);
584+
// In case of dynamic shape inputs, run shape analysis on each segmented block for min/opt/max ranges and register
585+
// output shapes for each block accordingly
586+
if (isInputDynamic(ctx)) {
587+
LOG_DEBUG("Performing shape analysis for segmented blocks using min/opt/max shapes for inputs");
588+
runShapeAnalysis(ctx, block, ctx->min_input_ivalues_map, ir::ShapeMode::kMIN);
589+
runShapeAnalysis(ctx, block, ctx->opt_input_ivalues_map, ir::ShapeMode::kOPT);
590+
runShapeAnalysis(ctx, block, ctx->max_input_ivalues_map, ir::ShapeMode::kMAX);
591+
} else {
592+
LOG_DEBUG("Performing shape analysis for segmented blocks using static shapes for inputs");
593+
runShapeAnalysis(ctx, block, ctx->opt_input_ivalues_map, ir::ShapeMode::kOPT);
594+
}
558595
}
559596
}
560597

core/partitioning/partitioning.h

+12-3
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,24 @@ typedef std::unordered_map<const torch::jit::Value*, torch::jit::IValue> Example
1818
typedef std::pair<std::shared_ptr<torch::jit::Graph>, std::unordered_map<torch::jit::Value*, torch::jit::Value*>>
1919
GraphAndMapping;
2020

21-
ExampleIValues generateRandomInputs(ir::CollectionInputSpecMap& input_ranges, ir::CollectionTypeMap& input_types);
21+
ExampleIValues generateRandomInputs(
22+
ir::CollectionInputSpecMap& input_ranges,
23+
ir::CollectionTypeMap& input_types,
24+
const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT);
2225

23-
void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& ivalues_maps);
26+
void populateInputIValues(PartitioningCtx* ctx);
27+
28+
void runShapeAnalysis(
29+
PartitioningCtx* ctx,
30+
torch::jit::Block* block,
31+
ExampleIValues& ivalues_maps,
32+
const ir::ShapeMode& shape_mode);
2433

2534
void segmentGraph(PartitioningCtx* ctx, torch::jit::Block* block);
2635

2736
GraphAndMapping stitch(PartitioningCtx* ctx, torch::jit::Block* block);
2837

29-
void partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map);
38+
void partition(PartitioningCtx* ctx);
3039

3140
} // namespace partitioning
3241
} // namespace core

core/partitioning/partitioningctx/PartitioningCtx.h

+4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ struct UsageInfo {
4747
struct PartitioningCtx {
4848
// TODO: Make the set a part of settings not stand alone
4949
PartitioningInfo settings;
50+
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> min_input_ivalues_map;
51+
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> opt_input_ivalues_map;
52+
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> max_input_ivalues_map;
5053
// records all the original blocks topologically in the module
5154
std::vector<torch::jit::Block*> original_blocks;
5255
// mapping: node=> execution status
@@ -60,6 +63,7 @@ struct PartitioningCtx {
6063
bool shouldNodeRunInTorch(torch::jit::Node* n);
6164
bool shouldNodeRunInTensorRT(torch::jit::Node* n);
6265
std::vector<torch::jit::Node*> getNodesRunInTorch();
66+
std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>> input_types_map;
6367

6468
private:
6569
void _load_nodes_into_decision_map(torch::jit::Block* b);

core/partitioning/segmentedblock/SegmentedBlock.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "SegmentedBlock.h"
2+
#include "core/util/prelude.h"
23

34
namespace torch_tensorrt {
45
namespace core {
@@ -56,6 +57,24 @@ torch::jit::Value* SegmentedBlock::getOrAddInputForValue(torch::jit::Value* old_
5657
}
5758
}
5859

60+
std::vector<ir::Input> SegmentedBlock::construct_inputs_spec() const {
61+
std::vector<ir::Input> inputs;
62+
if (min_shapes_.size() == opt_shapes_.size() && opt_shapes_.size() == max_shapes_.size()) {
63+
for (uint64_t i = 0; i < opt_shapes_.size(); i++) {
64+
auto in = ir::Input(min_shapes_[i], opt_shapes_[i], max_shapes_[i]);
65+
in.dtype = util::ScalarTypeToTRTDataType(in_types_[i]);
66+
inputs.push_back(in);
67+
}
68+
} else {
69+
for (uint64_t i = 0; i < opt_shapes_.size(); i++) {
70+
auto in = ir::Input(opt_shapes_[i]);
71+
in.dtype = util::ScalarTypeToTRTDataType(in_types_[i]);
72+
inputs.push_back(in);
73+
}
74+
}
75+
return inputs;
76+
}
77+
5978
torch::jit::Node* SegmentedBlock::cloneNode(torch::jit::Node* node) {
6079
auto* block = g_->block();
6180
auto env = [&](torch::jit::Value* v) { return getOrAddInputForValue(v); };

core/partitioning/segmentedblock/SegmentedBlock.h

+21-5
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ struct SegmentedBlock {
3535
SegmentedBlock(BlockID id, SegmentedBlockTarget blk_target, const std::vector<torch::jit::Node*>& nodes);
3636

3737
torch::jit::Value* getOrAddInputForValue(torch::jit::Value* v);
38+
std::vector<ir::Input> construct_inputs_spec() const;
3839
torch::jit::Node* cloneNode(torch::jit::Node* node);
3940
void appendNode(torch::jit::Node* n) {
4041
cloneNode(n);
@@ -72,18 +73,31 @@ struct SegmentedBlock {
7273
bool contain_raw_value(torch::jit::Value* input) const {
7374
return old_to_new_.count(input);
7475
}
75-
void register_inshapes(std::vector<ir::Input>& in_shapes) {
76-
in_shapes_ = in_shapes;
76+
void register_inshapes(std::vector<std::vector<int64_t>>& in_shapes, const ir::ShapeMode& shape_mode) {
77+
if (shape_mode == ir::ShapeMode::kMIN) {
78+
min_shapes_ = in_shapes;
79+
} else if (shape_mode == ir::ShapeMode::kOPT) {
80+
opt_shapes_ = in_shapes;
81+
} else {
82+
max_shapes_ = in_shapes;
83+
}
84+
}
85+
const std::vector<std::vector<int64_t>> in_opt_shapes() const {
86+
return opt_shapes_;
7787
}
78-
const std::vector<ir::Input>& in_shapes() const {
79-
return in_shapes_;
88+
const std::vector<std::vector<int64_t>> in_min_shapes() const {
89+
return min_shapes_;
90+
}
91+
const std::vector<std::vector<int64_t>> in_max_shapes() const {
92+
return max_shapes_;
8093
}
8194
void register_intypes(std::vector<at::ScalarType>& in_types) {
8295
in_types_ = in_types;
8396
}
8497
const std::vector<at::ScalarType>& in_types() const {
8598
return in_types_;
8699
}
100+
87101
void update_id(BlockID new_id) {
88102
id_ = new_id;
89103
}
@@ -107,7 +121,9 @@ struct SegmentedBlock {
107121
private:
108122
BlockID id_;
109123
SegmentedBlockTarget target_;
110-
std::vector<ir::Input> in_shapes_;
124+
std::vector<std::vector<int64_t>> min_shapes_;
125+
std::vector<std::vector<int64_t>> opt_shapes_;
126+
std::vector<std::vector<int64_t>> max_shapes_;
111127
std::vector<at::ScalarType> in_types_;
112128
std::vector<torch::jit::Value*> inputs_;
113129
std::vector<torch::jit::Value*> outputs_;

core/partitioning/shape_analysis.cpp

+30-15
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,25 @@ namespace torch_tensorrt {
1010
namespace core {
1111
namespace partitioning {
1212

13-
at::Tensor generateSingleInput(ir::Input& input, c10::optional<at::ScalarType>& type_opt) {
14-
auto cur_shape = input.input_shape;
15-
std::vector<int64_t> shape;
13+
at::Tensor generateSingleInput(
14+
ir::Input& input,
15+
c10::optional<at::ScalarType>& type_opt,
16+
const ir::ShapeMode& shape_mode) {
17+
nvinfer1::Dims input_shape = input.input_shape;
18+
if (input.input_is_dynamic) {
19+
if (shape_mode == ir::ShapeMode::kMIN) {
20+
input_shape = input.min;
21+
} else if (shape_mode == ir::ShapeMode::kOPT) {
22+
input_shape = input.opt;
23+
} else {
24+
input_shape = input.max;
25+
}
26+
}
1627

1728
// Initialize min and max ranges for random number selection
1829
int LoValIncl = 0;
1930
int HiValExcl = 2;
2031

21-
shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims);
22-
2332
auto type = at::kFloat;
2433
if (type_opt) {
2534
type = type_opt.value();
@@ -29,14 +38,15 @@ at::Tensor generateSingleInput(ir::Input& input, c10::optional<at::ScalarType>&
2938

3039
// Make the value range for input tensor a uniform (float) distribution
3140
// over [LoValIncl, HiValExcl), then cast to the desired dtype
32-
auto in = ((HiValExcl - LoValIncl) * at::rand(shape, {at::kCUDA}) + LoValIncl).to(type);
41+
auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape), {at::kCUDA}) + LoValIncl).to(type);
3342

3443
return in;
3544
}
3645

3746
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
3847
std::unordered_map<const torch::jit::Value*, std::vector<ir::Input>>& inputs,
39-
std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>>& types) {
48+
std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>>& types,
49+
const ir::ShapeMode& shape_mode) {
4050
// generate random inputs for running pytorch segments
4151
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map;
4252

@@ -45,21 +55,21 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
4555
c10::TypePtr elementType = c10::TensorType::get();
4656
auto generic_list = c10::impl::GenericList(elementType);
4757
for (size_t i = 0; i < input.second.size(); i++) {
48-
auto in = generateSingleInput(input.second[i], types[input.first][i]);
58+
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
4959
generic_list.push_back(in.clone());
5060
}
5161
ivalue_map[input.first] = c10::IValue(generic_list);
5262
} else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) {
5363
// create tuple
5464
std::vector<torch::jit::IValue> list;
5565
for (size_t i = 0; i < input.second.size(); i++) {
56-
auto in = generateSingleInput(input.second[i], types[input.first][i]);
66+
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
5767
list.push_back(in.clone());
5868
}
5969
auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr
6070
ivalue_map[input.first] = c10::IValue(tuple);
6171
} else {
62-
auto in = generateSingleInput(input.second[0], types[input.first][0]);
72+
auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode);
6373
ivalue_map[input.first] = in.clone();
6474
}
6575
}
@@ -124,7 +134,8 @@ torch::jit::Node* createCastNode(SegmentedBlock& seg_block, size_t index, bool i
124134
void getSegmentsOutputByRunning(
125135
SegmentedBlock& seg_block,
126136
std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& ivalues_maps,
127-
const PartitioningInfo& partitioning_info) {
137+
const PartitioningInfo& partitioning_info,
138+
const ir::ShapeMode& shape_mode) {
128139
// create a module to run the graph
129140
auto g = seg_block.g();
130141
auto copy_g = g->copy();
@@ -235,7 +246,7 @@ void getSegmentsOutputByRunning(
235246
}
236247

237248
// set input shape for each segmented block so we will use it in the conversion process
238-
std::vector<ir::Input> input_shapes;
249+
std::vector<std::vector<int64_t>> input_shapes;
239250
std::vector<at::ScalarType> input_types;
240251
for (size_t i = 0; i < seg_block.inputs().size(); ++i) {
241252
if (ivalues_maps[seg_block.raw_inputs()[i]].isTensor()) {
@@ -270,15 +281,19 @@ void getSegmentsOutputByRunning(
270281
// TODO: tuple and list inputs in subgraph
271282
}
272283

273-
seg_block.register_inshapes(input_shapes);
284+
seg_block.register_inshapes(input_shapes, shape_mode);
274285
seg_block.register_intypes(input_types);
275286
}
276287

277-
void runShapeAnalysis(PartitioningCtx* ctx, torch::jit::Block* block, ExampleIValues& example_tensor_map) {
288+
void runShapeAnalysis(
289+
PartitioningCtx* ctx,
290+
torch::jit::Block* block,
291+
ExampleIValues& example_tensor_map,
292+
const ir::ShapeMode& shape_mode) {
278293
// register every segment's input shape, and it's running output IValues
279294
for (auto& seg_block : ctx->partitioned_blocks[block]) {
280295
torch::jit::ConstantPooling(seg_block.g());
281-
getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings);
296+
getSegmentsOutputByRunning(seg_block, example_tensor_map, ctx->settings, shape_mode);
282297
}
283298
return;
284299
}

0 commit comments

Comments
 (0)