@@ -138,7 +138,8 @@ partitioning::GraphAndMapping BuildHybridGraph(
     torch::jit::Block* block,
     CompileSpec cfg,
     ir::StaticParams static_params,
-    ir::CollectionTypeMap first_use_types) {
+    ir::CollectionTypeMap first_use_types,
+    bool expect_full_compilation = false) {
   auto convert_info = cfg.convert_info;
   auto partitioning_info = cfg.partitioning_info;
 
@@ -149,17 +150,20 @@ partitioning::GraphAndMapping BuildHybridGraph(
   // TODO: Combine this within partition call
   partitioning::populateInputIValues(&partitioning_ctx);
 
-  partitioning::partition(&partitioning_ctx);
+  partitioning::partition(&partitioning_ctx, expect_full_compilation);
 
   for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) {
     partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second;
+    int num_torch_segments = 0;
+    int num_trt_segments = 0;
 
     for (auto& seg_block : segmented_blocks) {
       LOG_INFO("Block segment:" << seg_block);
       std::ostringstream trt_engine_id;
       trt_engine_id << reinterpret_cast<const int*>(&seg_block);
 
      if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
+        num_trt_segments++;
        auto inputs = seg_block.construct_inputs_spec();
        // update the input ranges for each segment
        convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
@@ -180,8 +184,32 @@ partitioning::GraphAndMapping BuildHybridGraph(
             true);
 
         seg_block.update_graph(temp_g);
+      } else {
+        num_torch_segments++;
+
+        // If full compilation is expected, ensure that all operators in Torch blocks are
+        // for collections processing
+        if (expect_full_compilation) {
+          for (auto torch_node : seg_block.block()->nodes()) {
+            if (partitioning::CollectionNodeKinds.find(torch_node->kind()) == partitioning::CollectionNodeKinds.end()) {
+              TORCHTRT_THROW_ERROR(
+                  "Full compilation specified but node "
+                  << *torch_node
+                  << " is set to run in PyTorch due to either lack of support in TensorRT or graph partitioning rules."
+                  << " Try recompiling with require_full_compilation=False.");
+            }
+          }
+        }
       }
     }
+
+    // If full compilation is expected, cannot have more than 2 Torch segments
+    // (one for preprocessing inputs, one for post-processing outputs) and 1 TRT segment
+    if (expect_full_compilation && !(num_torch_segments <= 2 && num_trt_segments == 1)) {
+      TORCHTRT_THROW_ERROR(
+          "Full compilation was requested but unable to convert all operations to TensorRT."
+          << " Try recompiling with require_full_compilation=False.");
+    }
   }
 
   return partitioning::stitch(&partitioning_ctx, block);
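
To make the new invariant concrete, here is a minimal standalone sketch of the segment-count check added above. It is illustrative only; `satisfies_full_compilation` is a hypothetical name, not part of the Torch-TensorRT codebase.

// With collection I/O, a "fully compiled" partition may still contain up to
// two Torch segments: one that unpacks collection inputs ahead of the engine,
// and one that repacks collection outputs after it.
#include <cassert>

bool satisfies_full_compilation(int num_torch_segments, int num_trt_segments) {
  return num_torch_segments <= 2 && num_trt_segments == 1;
}

int main() {
  assert(satisfies_full_compilation(2, 1));  // unpack + engine + repack
  assert(satisfies_full_compilation(0, 1));  // single TRT engine, no glue code
  assert(!satisfies_full_compilation(1, 2)); // two engines imply a true fallback split
  return 0;
}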
@@ -191,7 +219,8 @@ ir::TypeMap MapInputsAndDetermineDTypes(
     CompileSpec& cfg,
     std::shared_ptr<torch::jit::Graph>& g,
     ir::StaticParams& static_params,
-    ir::CollectionTypeMap& first_use_type_map) {
+    ir::CollectionTypeMap& first_use_type_map,
+    bool requires_collection_handling = false) {
   cfg.convert_info.collection_input_spec_map =
       std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
   cfg.partitioning_info.collection_input_spec_map =
@@ -226,7 +255,7 @@ ir::TypeMap MapInputsAndDetermineDTypes(
             "Cannot infer input type from calculations in graph for input "
             << in->debugName() << ". Assuming it is Float32. If not, specify input type explicitly");
         spec[i].dtype = at::kFloat;
-      } else if (spec[i].dtype_is_user_defined && cfg.partitioning_info.enabled) {
+      } else if (spec[i].dtype_is_user_defined && (cfg.partitioning_info.enabled || requires_collection_handling)) {
        if (!est_type_opt[i]) {
          LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
          std::stringstream ss;
@@ -297,6 +326,11 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   return engine;
 }
 
+bool userRequestedFallback(CompileSpec& cfg) {
+  return cfg.lower_info.forced_fallback_modules.size() != 0 ||
+      cfg.partitioning_info.forced_fallback_operators.size() != 0;
+}
+
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");
 
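The new helper names the pair of checks that were previously inlined in CompileGraph (removed in the final hunk below). A minimal model of the predicate, using a hypothetical `FallbackSpec` stand-in since constructing a real CompileSpec is out of scope here:

// Illustrative only: FallbackSpec mirrors just the two fields the helper reads.
#include <cassert>
#include <string>
#include <vector>

struct FallbackSpec {
  std::vector<std::string> forced_fallback_modules;   // mirrors cfg.lower_info
  std::vector<std::string> forced_fallback_operators; // mirrors cfg.partitioning_info
};

bool user_requested_fallback(const FallbackSpec& cfg) {
  return !cfg.forced_fallback_modules.empty() || !cfg.forced_fallback_operators.empty();
}

int main() {
  FallbackSpec none;
  assert(!user_requested_fallback(none));

  FallbackSpec some;
  some.forced_fallback_operators.push_back("aten::upsample_nearest2d");
  assert(user_requested_fallback(some)); // any forced module or operator counts
  return 0;
}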
@@ -315,8 +349,17 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
   // Infer the type of an input from the weights of the calculation
   auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block());
 
+  // Determine if the block is convertible/has collection output, and based on the result,
+  // whether full compilation can be expected
+  auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
+  auto outputIsCollection = conversion::OutputIsCollection(g->block());
+  auto requires_collection_handling = (isBlockConvertible && outputIsCollection);
+
+  // Determine whether user specifications necessitate partitioning
+  auto isFallbackRequested = userRequestedFallback(cfg);
+
   // Extract map of IValue to DType
-  auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+  auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types, requires_collection_handling);
 
   // Check whether any of the input types are Long
   bool user_requested_long = false;
@@ -330,20 +373,28 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
     user_requested_long &= (casts_inserted > 0);
   }
 
-  auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
-  auto outputIsCollection = conversion::OutputIsCollection(g->block());
-  if (cfg.partitioning_info.enabled && !user_requested_long &&
-      (cfg.lower_info.forced_fallback_modules.size() == 0 &&
-       cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) &&
-      !outputIsCollection) {
+  // Partitioning is required if:
+  // 1. The user requested some modules/operators fall back to Torch
+  // 2. The block (graph) cannot be converted due to operator coverage
+  // 3. The output of the graph is a collection
+  // 4. The user requested a non-TRT data type input
+  auto isPartitioningRequired =
+      (isFallbackRequested || !isBlockConvertible || outputIsCollection || user_requested_long);
+
+  // The user did not require full compilation, but the model can be fully compiled
+  if (cfg.partitioning_info.enabled && !isPartitioningRequired) {
     LOG_INFO("Skipping partitioning since model is fully supported");
   }
 
-  if (cfg.partitioning_info.enabled &&
-      (!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
-         cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) ||
-       outputIsCollection || user_requested_long)) {
-    auto graph_and_mapping = BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types);
+  // The user did not require full compilation, and the model cannot be fully compiled,
+  // or the user required full compilation but the I/O of the graph uses collections
+  if ((cfg.partitioning_info.enabled && isPartitioningRequired) || requires_collection_handling) {
+    // If the model is fully compilable and the user has specified full compilation, run partitioning
+    // to generate collection-processing code in Torch
+    auto expect_full_compilation = (requires_collection_handling && !cfg.partitioning_info.enabled);
+
+    auto graph_and_mapping =
+        BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types, expect_full_compilation);
     new_g = graph_and_mapping.first;
     // rename the graph inputs after fallback to ensure PyTorch deserializes them correctly
     for (size_t i = 0; i < new_g->inputs().size(); ++i) {
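
The net effect of this control-flow change, reduced to plain booleans, is sketched below. This is illustrative only; `choose_path` and `Decision` are hypothetical names, not part of the Torch-TensorRT API.

#include <cassert>

struct Decision {
  bool run_partitioning;
  bool expect_full_compilation;
};

Decision choose_path(
    bool partitioning_enabled, // cfg.partitioning_info.enabled, i.e. require_full_compilation=False
    bool fallback_requested,   // userRequestedFallback(cfg)
    bool block_convertible,    // conversion::VerifyConverterSupportForBlock(...)
    bool output_is_collection, // conversion::OutputIsCollection(...)
    bool user_requested_long) {
  bool requires_collection_handling = block_convertible && output_is_collection;
  bool partitioning_required =
      fallback_requested || !block_convertible || output_is_collection || user_requested_long;

  Decision d;
  d.run_partitioning = (partitioning_enabled && partitioning_required) || requires_collection_handling;
  // Full compilation is only "expected" when partitioning runs solely to emit
  // collection-processing code in Torch.
  d.expect_full_compilation = requires_collection_handling && !partitioning_enabled;
  return d;
}

int main() {
  // require_full_compilation=True on a convertible graph with tuple/list outputs:
  // partitioning still runs, but only collection glue may remain in Torch.
  Decision d = choose_path(false, false, true, true, false);
  assert(d.run_partitioning && d.expect_full_compilation);
  return 0;
}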