pytorch
diff --git a/‎.circleci/config.yml
+17-2 b/‎.circleci/config.yml
+17-2
diff --git a/‎.github/workflows/build-test.yml
+36-33 b/‎.github/workflows/build-test.yml
+36-33
diff --git a/‎.pre-commit-config.yaml
+1-1 b/‎.pre-commit-config.yaml
+1-1
diff --git a/‎core/conversion/converters/impl/shuffle.cpp
+6-1 b/‎core/conversion/converters/impl/shuffle.cpp
+6-1
diff --git a/‎core/runtime/execute_engine.cpp
+3-3 b/‎core/runtime/execute_engine.cpp
+3-3
diff --git a/‎core/runtime/runtime.cpp
+22-5 b/‎core/runtime/runtime.cpp
+22-5
diff --git a/‎core/runtime/runtime.h
+3-1 b/‎core/runtime/runtime.h
+3-1
diff --git a/‎core/util/trt_util.cpp
+1-1 b/‎core/util/trt_util.cpp
+1-1
diff --git a/‎cpp/include/torch_tensorrt/torch_tensorrt.h
+2 b/‎cpp/include/torch_tensorrt/torch_tensorrt.h
+2
diff --git a/‎cpp/src/types.cpp
+7-1 b/‎cpp/src/types.cpp
+7-1
diff --git a/‎docker/WORKSPACE.ngc
+19-19 b/‎docker/WORKSPACE.ngc
+19-19
diff --git a/‎docs/_cpp_api/classtorch__tensorrt_1_1DataType.html
+11-2 b/‎docs/_cpp_api/classtorch__tensorrt_1_1DataType.html
+11-2
@@ -802,7 +802,7 @@ commands:
       - store_artifacts:
           path: /tmp/testlogs
 
-  test-dynamo-models_torch_export:
+  test-dynamo-models_export:
     description: "Test the Dynamo models via torch_export path"
     steps:
       - run:
@@ -818,6 +818,20 @@ commands:
       - store_artifacts:
           path: /tmp/testlogs
 
+  test-dynamo-export_serde:  # reusable command: runs test_export_serde.py under pytest, then uploads results and logs
+    description: "Test the export serialize/deserialize functionality for Dynamo models"
+    steps:
+      - run:
+          name: Run Dynamo models and test export serde with TRT compiled modules
+          command: |  # NOTE(review): the JUnit XML is written under .../dynamo/backend/ - confirm it cannot clash with another suite
+            cd tests/py/dynamo/models
+            pytest test_export_serde.py --junitxml=/tmp/artifacts/test_results/dynamo/backend/test_results.xml --ir dynamo
+
+      - store_test_results:  # path is a parent directory of the --junitxml location used above
+          path: /tmp/artifacts
+      - store_artifacts:
+          path: /tmp/testlogs
+
   test-dynamo-converters:
     description: "Test the Dynamo aten converters"
     steps:
@@ -1122,7 +1136,8 @@ jobs:
       - test-dynamo-backend
       - test-dynamo-shared_utilities
       - test-dynamo-models_torch_compile
-      - test-dynamo-models_torch_export
+      - test-dynamo-models_export
+      - test-dynamo-export_serde
 
   package-x86_64-linux:
     parameters:
 
@@ -54,39 +54,40 @@ jobs:
       AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
 
-  # tests-py-torchscript-fe:
-  #   name: Test torchscript frontend [Python]
-  #   needs: [generate-matrix, build]
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - repository: pytorch/tensorrt
-  #           package-name: torch_tensorrt
-  #           pre-script: packaging/pre_build_script.sh
-  #   uses: pytorch/tensorrt/.github/workflows/linux-test.yml@main
-  #   with:
-  #     job-name: tests-py-torchscript-fe
-  #     repository: "pytorch/tensorrt"
-  #     ref: ""
-  #     test-infra-repository: pytorch/test-infra
-  #     test-infra-ref: main
-  #     build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
-  #     pre-script: ${{ matrix.pre-script }}
-  #     script: |
-  #       export USE_HOST_DEPS=1
-  #       pushd .
-  #       cd tests/modules
-  #       ${CONDA_RUN} python -m pip install -r requirements.txt
-  #       ${CONDA_RUN} python hub.py
-  #       popd
-  #       pushd .
-  #       cd tests/py/ts
-  #       ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
-  #       ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
-  #       ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
-  #       ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
-  #       popd
+  tests-py-torchscript-fe:  # runs the TorchScript frontend Python tests through the reusable linux-test workflow
+    name: Test torchscript frontend [Python]
+    needs: [generate-matrix, build]  # the generate-matrix output is consumed by build-matrix below
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+    uses: pytorch/tensorrt/.github/workflows/linux-test.yml@main
+    with:
+      job-name: tests-py-torchscript-fe
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |  # installs deps and runs hub.py in tests/modules, then pytest on api/, models/, integrations/ in tests/py/ts
+        export USE_HOST_DEPS=1
+        export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
+        pushd .
+        cd tests/modules
+        ${CONDA_RUN} python -m pip install --pre -r requirements.txt --use-deprecated=legacy-resolver
+        ${CONDA_RUN} python hub.py
+        popd
+        pushd .
+        cd tests/py/ts
+        ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
+        ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
+        ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
+        ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
+        popd
 
   tests-py-dynamo-converters:
     name: Test dynamo converters [Python]
@@ -140,6 +141,8 @@ jobs:
         cd tests/py/dynamo
         ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
         ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py
+        ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
+        ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py
         popd
 
   tests-py-torch-compile-be:
 
@@ -40,7 +40,7 @@ repos:
     rev: 'v1.4.1'
     hooks:
     -   id: mypy
-        exclude: "^py/torch_tensorrt/fx|^examples|^tests|^tools|^docs|noxfile.py|setup.py|versions.py"
+        exclude: "^py/torch_tensorrt/fx|^examples|^tests|^py/torch_tensorrt/dynamo/_experimental|^tools|^docs|noxfile.py|setup.py|versions.py"
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.0.278
 
@@ -20,7 +20,12 @@ static auto shuffle_registrations TORCHTRT_UNUSED =
                auto in_shape = util::toVec(in->getDimensions());
                std::vector<int64_t> out_shape;
                if (ctx->input_is_dynamic) {
-                 end_dim = (end_dim == -1) ? in_shape.size() - 1 : end_dim;
+                 if (start_dim < 0) { // negative index: offset by the rank so it counts back from the end of in_shape
+                   start_dim = start_dim + in_shape.size();
+                 }
+                 if (end_dim < 0) { // same wrap-around for the end index (covers any negative value, not only -1)
+                   end_dim = end_dim + in_shape.size();
+                 }
                  int nbDynamicFlattenedDims = 0;
                  int nbDynamicUnflattenedDims = 0;
                  for (int i = 0; i < (int)in_shape.size(); i++) {
 
@@ -43,8 +43,8 @@ bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_devi
   return false;
 }
 
-RTDevice select_rt_device(const RTDevice& engine_device) {
-  auto new_target_device_opt = get_most_compatible_device(engine_device);
+RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device) {
+  auto new_target_device_opt = get_most_compatible_device(engine_device, curr_device);
 
   // REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD
   // TODO: I think this logic could be way simpler at execution time since if the tensors arent on the right
@@ -89,7 +89,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
     if (is_switch_required(curr_device, compiled_engine->device_info)) {
       // Scan through available CUDA devices and set the CUDA device context correctly
-      RTDevice device = select_rt_device(compiled_engine->device_info);
+      RTDevice device = select_rt_device(compiled_engine->device_info, curr_device);
       set_rt_device(device);
 
       // Target device is new device
 
@@ -7,9 +7,16 @@ namespace torch_tensorrt {
 namespace core {
 namespace runtime {
 
-c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device) {
+c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device, const RTDevice& curr_device) {
   LOG_DEBUG("Target Device: " << target_device);
   auto device_options = find_compatible_devices(target_device);
+  RTDevice current_device;
+  if (curr_device.id == -1) { // no caller-supplied device (assumes RTDevice() has id -1 -- TODO confirm)
+    current_device = get_current_device();
+  } else {
+    current_device = curr_device; // honor the device the caller passed in
+  }
+
   if (device_options.size() == 0) {
     return {};
   } else if (device_options.size() == 1) {
@@ -21,10 +28,20 @@ c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device
   dev_list << "[" << std::endl;
   for (auto device : device_options) {
     dev_list << "    " << device << ',' << std::endl;
-    if (device.device_name == target_device.device_name && best_match.device_name != target_device.device_name) {
-      best_match = device;
-    } else if (device.device_name == target_device.device_name && best_match.device_name == target_device.device_name) {
-      if (device.id == target_device.id && best_match.id != target_device.id) {
+    if (device.device_name == target_device.device_name) {
+      // First priority is selecting a candidate which agrees with the current device ID
+      // If such a device is found, we can select it and break out of the loop
+      if (device.id == current_device.id && best_match.id != current_device.id) {
+        best_match = device;
+        break;
+      }
+      // Second priority is selecting a candidate which agrees with the target device ID
+      // At deserialization time, the current device and target device may not agree
+      else if (device.id == target_device.id && best_match.id != target_device.id) {
+        best_match = device;
+      }
+      // If no such GPU ID is found, select the first available candidate GPU
+      else if (best_match.device_name != target_device.device_name) {
         best_match = device;
       }
     }
 
@@ -26,7 +26,9 @@ typedef enum {
   SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
 } SerializedInfoIndex;
 
-c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device);
+c10::optional<RTDevice> get_most_compatible_device(
+    const RTDevice& target_device,
+    const RTDevice& curr_device = RTDevice()); // curr_device may be omitted; it then is a default-constructed RTDevice
 std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device);
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);
 
@@ -216,7 +216,7 @@ nvinfer1::Dims squeezeDims(const nvinfer1::Dims& d, int pos, bool use_zeros, boo
       // Replace all instances of -1, indicating dynamic dimension
       // with 0, indicating copy the dimension from another tensor
       // (Generally used for reshape operations)
-      if (use_zeros && d.d[i] == -1) {
+      if (use_zeros && d.d[i] == -1 && i < pos) {
         dims.d[j] = 0;
         // If zeros already exist in the dimensions (empty tensor),
         // Replace all instances of 0, indicating empty dimension
 
@@ -60,6 +60,8 @@ class DataType {
   enum Value : int8_t {
     /// INT64
     kLong,
+    /// FP64
+    kDouble,
     /// FP32
     kFloat,
     /// FP16
 
@@ -97,6 +97,8 @@ at::ScalarType toAtenDataType(DataType value) {
       return at::kInt;
     case DataType::kLong:
       return at::kLong;
+    case DataType::kDouble:
+      return at::kDouble;
     case DataType::kBool:
       return at::kBool;
     case DataType::kFloat:
@@ -119,7 +121,8 @@ nvinfer1::TensorFormat toTRTTensorFormat(TensorFormat value) {
 
 DataType::DataType(c10::ScalarType t) {
   TORCHTRT_CHECK(
-      t == at::kHalf || t == at::kFloat || t == at::kChar || t == at::kLong || t == at::kInt || t == at::kBool,
+      t == at::kHalf || t == at::kFloat || t == at::kChar || t == at::kLong || t == at::kDouble || t == at::kInt ||
+          t == at::kBool,
       "Data type is unsupported (" << t << ")");
   switch (t) {
     case at::kHalf:
@@ -134,6 +137,9 @@ DataType::DataType(c10::ScalarType t) {
     case at::kLong:
       value = DataType::kLong;
       break;
+    case at::kDouble:
+      value = DataType::kDouble;
+      break;
     case at::kBool:
       value = DataType::kBool;
       break;
 
@@ -9,24 +9,28 @@ http_archive(
         sha256 = "778197e26c5fbeb07ac2a2c5ae405b30f6cb7ad1f5510ea6fdac03bded96cc6f",
     )
 
-load("@rules_python//python:pip.bzl", "pip_install")
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
 
 http_archive(
     name = "rules_pkg",
+    sha256 = "8f9ee2dc10c1ae514ee599a8b42ed99fa262b757058f65ad3c384289ff70c4b8",
     urls = [
-    	"https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
-	"https://github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.9.1/rules_pkg-0.9.1.tar.gz",
+        "https://github.com/bazelbuild/rules_pkg/releases/download/0.9.1/rules_pkg-0.9.1.tar.gz",
     ],
-    sha256 = "038f1caa773a7e35b3663865ffb003169c6a71dc995e39bf4815792f385d837d",
 )
+
 load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+
 rules_pkg_dependencies()
 
-git_repository(
+http_archive(
     name = "googletest",
-    remote = "https://github.com/google/googletest",
-    commit = "703bd9caab50b139428cea1aaff9974ebee5742e",
-    shallow_since = "1570114335 -0400"
+    sha256 = "755f9a39bc7205f5a0c428e920ddad092c33c8a1b46997def3f1d4a82aded6e1",
+    strip_prefix = "googletest-5ab508a01f9eb089207ee87fd547d290da39d015",
+    urls = ["https://github.com/google/googletest/archive/5ab508a01f9eb089207ee87fd547d290da39d015.zip"],
 )
 
 # External dependency for torch_tensorrt if you already have precompiled binaries.
@@ -80,17 +84,13 @@ new_local_repository(
 #########################################################################
 # Testing Dependencies (optional - comment out on aarch64)
 #########################################################################
-pip_install(
-    name = "torch_tensorrt_py_deps",
-    requirements = "//py:requirements.txt",
-)
+load("@rules_python//python:pip.bzl", "pip_parse")
 
-pip_install(
-    name = "py_test_deps",
-    requirements = "//tests/py:requirements.txt",
+pip_parse(
+    name = "devtools_deps",
+    requirements_lock = "//:requirements-dev.txt",
 )
 
-pip_install(
-    name = "pylinter_deps",
-    requirements = "//tools/linter:requirements.txt",
-)
+load("@devtools_deps//:requirements.bzl", "install_deps")
+
+install_deps()
@@ -10,7 +10,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
-  <title>Class DataType &mdash; Torch-TensorRT v2.0.0.dev0+1fec519 documentation</title>
+  <title>Class DataType &mdash; Torch-TensorRT v2.2.0.dev0+50ab2c1 documentation</title>
 
 
 
@@ -225,7 +225,7 @@
 
 
                 <div class="version">
-                  v2.0.0.dev0+1fec519
+                  v2.2.0.dev0+50ab2c1
                 </div>
 
 
@@ -269,6 +269,8 @@
 <li class="toctree-l1"><a class="reference internal" href="../user_guide/getting_started_with_fx_path.html">Torch-TensorRT (FX Frontend) User Guide</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../user_guide/ptq.html">Post Training Quantization (PTQ)</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../user_guide/runtime.html">Deploying Torch-TensorRT Programs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../user_guide/saving_models.html">Saving models compiled with Torch-TensorRT</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../user_guide/dynamic_shapes.html">Dynamic shapes with Torch-TensorRT</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../user_guide/use_from_pytorch.html">Using Torch-TensorRT Directly From PyTorch</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../user_guide/using_dla.html">DLA</a></li>
 </ul>
@@ -304,6 +306,7 @@
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../contributors/system_overview.html">System Overview</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../contributors/writing_converters.html">Writing Converters</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../contributors/writing_dynamo_aten_lowering_passes.html">Writing Dynamo ATen Lowering Passes</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../contributors/useful_links.html">Useful Links for Torch-TensorRT Development</a></li>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Indices</span></p>
@@ -414,6 +417,12 @@ <h2>Class Documentation<a class="headerlink" href="#class-documentation" title="
 <dd><p>INT64. </p>
 </dd></dl>
 
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N14torch_tensorrt8DataType5Value7kDoubleE">
+<span class="target" id="classtorch__tensorrt_1_1DataType_1a6335c0e206340d85a1382a5df17bf684aacf5b40b44995643185a977d2d1ce1bf"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kDouble</span></span></span><a class="headerlink" href="#_CPPv4N14torch_tensorrt8DataType5Value7kDoubleE" title="Permalink to this definition">¶</a><br /></dt>
+<dd><p>FP64. </p>
+</dd></dl>
+
 <dl class="cpp enumerator">
 <dt class="sig sig-object cpp" id="_CPPv4N14torch_tensorrt8DataType5Value6kFloatE">
 <span class="target" id="classtorch__tensorrt_1_1DataType_1a6335c0e206340d85a1382a5df17bf684a45ceda04c1ab50695a4a6aeaeae99817"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kFloat</span></span></span><a class="headerlink" href="#_CPPv4N14torch_tensorrt8DataType5Value6kFloatE" title="Permalink to this definition">¶</a><br /></dt>