
Commit 437151a

feat: Add hardware compatibility option in Dynamo

- Add support for hardware compatibility for Ampere and later architectures
- Add the necessary functions to support the modification throughout the stack, including the C++ and Python components
- Update the ABI version to reflect the new metadata format for TRT engines
- Update the engine serialization schema accordingly
- Add test cases to validate the feature

1 parent 1ff10a6 commit 437151a
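As a quick orientation before the diffs, here is a minimal usage sketch of the new option. The model and input shapes are hypothetical, and it assumes the top-level `torch_tensorrt.compile` entry point forwards keyword arguments to the Dynamo path, where this commit adds the `hardware_compatible` keyword:

```python
import torch
import torchvision.models as models  # hypothetical example model

import torch_tensorrt

model = models.resnet18().eval().cuda()
inputs = [torch.randn(1, 3, 224, 224).cuda()]

# Build a TensorRT engine that can also run on Ampere-or-newer GPUs
# other than the one it was built on
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    hardware_compatible=True,
)
```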

File tree

14 files changed, +173 -45 lines

core/runtime/TRTEngine.cpp (+17 -5)

```diff
@@ -32,24 +32,35 @@ TRTEngine::TRTEngine(
     const std::string& serialized_engine,
     const RTDevice& cuda_device,
     const std::vector<std::string>& _in_binding_names,
-    const std::vector<std::string>& _out_binding_names)
-    : TRTEngine("deserialized_trt", serialized_engine, cuda_device, _in_binding_names, _out_binding_names) {}
+    const std::vector<std::string>& _out_binding_names,
+    bool hardware_compatible)
+    : TRTEngine(
+          "deserialized_trt",
+          serialized_engine,
+          cuda_device,
+          _in_binding_names,
+          _out_binding_names,
+          hardware_compatible) {}
 
 TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
     : TRTEngine(
           serialized_info[NAME_IDX],
           serialized_info[ENGINE_IDX],
           RTDevice(serialized_info[DEVICE_IDX]),
           split(serialized_info[INPUT_BINDING_NAMES_IDX], BINDING_DELIM),
-          split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM)) {}
+          split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM),
+          static_cast<bool>(std::stoi(serialized_info[HARDWARE_COMPATIBLE]))) {}
 
 TRTEngine::TRTEngine(
     const std::string& mod_name,
     const std::string& serialized_engine,
     const RTDevice& cuda_device,
     const std::vector<std::string>& _in_binding_names,
-    const std::vector<std::string>& _out_binding_names) {
-  auto most_compatible_device = get_most_compatible_device(cuda_device);
+    const std::vector<std::string>& _out_binding_names,
+    bool hardware_compatible) {
+  this->hardware_compatible = hardware_compatible;
+
+  auto most_compatible_device = get_most_compatible_device(cuda_device, RTDevice(), hardware_compatible);
   TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");
   device_info = most_compatible_device.value();
   set_rt_device(device_info);
@@ -231,6 +242,7 @@ std::string TRTEngine::to_str() const {
   }
   ss << "  }" << std::endl;
   ss << "  Device: " << device_info << std::endl;
+  ss << "  Hardware Compatibility: " << (hardware_compatible ? "Enabled" : "Disabled") << std::endl;
   // clang-format on
   return ss.str();
 }
```

core/runtime/TRTEngine.h (+6 -2)

```diff
@@ -34,19 +34,23 @@ struct TRTEngine : torch::CustomClassHolder {
   std::vector<std::string> in_binding_names = {}; // ITO: PYT IDX
   std::vector<std::string> out_binding_names = {}; // ITO: PYT IDX
 
+  bool hardware_compatible = false; // Whether the engine was compiled in hardware compatible mode
+
   ~TRTEngine();
   TRTEngine(
       const std::string& serialized_engine,
       const RTDevice& cuda_device,
       const std::vector<std::string>& in_binding_names,
-      const std::vector<std::string>& out_binding_names);
+      const std::vector<std::string>& out_binding_names,
+      bool hardware_compatible = false);
   TRTEngine(std::vector<std::string> serialized_info);
   TRTEngine(
       const std::string& mod_name,
       const std::string& serialized_engine,
       const RTDevice& cuda_device,
       const std::vector<std::string>& in_binding_names,
-      const std::vector<std::string>& out_binding_names);
+      const std::vector<std::string>& out_binding_names,
+      bool hardware_compatible = false);
   TRTEngine& operator=(const TRTEngine& other);
   std::string to_str() const;
   static void verify_serialization_fmt(const std::vector<std::string>& serialized_info);
```

core/runtime/execute_engine.cpp (+7 -4)

```diff
@@ -43,8 +43,8 @@ bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_device
   return false;
 }
 
-RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device) {
-  auto new_target_device_opt = get_most_compatible_device(engine_device, curr_device);
+RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device, bool hardware_compatible) {
+  auto new_target_device_opt = get_most_compatible_device(engine_device, curr_device, hardware_compatible);
 
   // REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD
   // TODO: I think this logic could be way simpler at execution time since if the tensors arent on the right
@@ -59,7 +59,9 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
-  LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")");
+  LOG_DEBUG(
+      "Attempting to run engine (ID: " << compiled_engine->name
+                                       << "); Hardware Compatible: " << compiled_engine->hardware_compatible);
 
   if (compiled_engine->profile_execution) {
     std::stringstream ss;
@@ -89,7 +91,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
 
   if (is_switch_required(curr_device, compiled_engine->device_info)) {
     // Scan through available CUDA devices and set the CUDA device context correctly
-    RTDevice device = select_rt_device(compiled_engine->device_info, curr_device);
+    RTDevice device =
+        select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
     set_rt_device(device);
 
     // Target device is new device
```

core/runtime/register_jit_hooks.cpp (+3)

```diff
@@ -101,6 +101,9 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
           serialize_info[ENGINE_IDX] = base64_encode(trt_engine);
           serialize_info[INPUT_BINDING_NAMES_IDX] = serialize_bindings(self->in_binding_names);
           serialize_info[OUTPUT_BINDING_NAMES_IDX] = serialize_bindings(self->out_binding_names);
+          serialize_info[HARDWARE_COMPATIBLE] = self->hardware_compatible ? "1" : "0";
+
+          LOG_DEBUG("Serialized Hardware Compatibility: " << (self->hardware_compatible ? "Enabled" : "Disabled"));
 
           return serialize_info;
         },
```

core/runtime/runtime.cpp (+10 -5)

```diff
@@ -7,9 +7,12 @@ namespace torch_tensorrt {
 namespace core {
 namespace runtime {
 
-c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device, const RTDevice& curr_device) {
+c10::optional<RTDevice> get_most_compatible_device(
+    const RTDevice& target_device,
+    const RTDevice& curr_device,
+    bool hardware_compatible) {
   LOG_DEBUG("Target Device: " << target_device);
-  auto device_options = find_compatible_devices(target_device);
+  auto device_options = find_compatible_devices(target_device, hardware_compatible);
   RTDevice current_device;
   if (current_device.id == -1) {
     current_device = get_current_device();
@@ -28,7 +31,8 @@ c10::optional<RTDevice> get_most_compatible_device(
   dev_list << "[" << std::endl;
   for (auto device : device_options) {
     dev_list << "    " << device << ',' << std::endl;
-    if (device.device_name == target_device.device_name) {
+    // If the model is hardware compatible, any compatible device should be valid
+    if ((device.device_name == target_device.device_name) || hardware_compatible) {
       // First priority is selecting a candidate which agrees with the current device ID
       // If such a device is found, we can select it and break out of the loop
       if (device.id == current_device.id && best_match.id != current_device.id) {
@@ -58,7 +62,7 @@ c10::optional<RTDevice> get_most_compatible_device(
   }
 }
 
-std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device) {
+std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device, bool hardware_compatible) {
   auto dla_supported = get_dla_supported_SMs();
   auto device_list = get_available_device_list().get_devices();
 
@@ -74,7 +78,8 @@ std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device) {
   } else if (target_device.device_type == nvinfer1::DeviceType::kGPU) {
     auto target_dev_cc = target_device.getSMCapability();
     // If the SM Capabilities match, should be good enough to run
-    if (poss_dev_cc == target_dev_cc) {
+    // If hardware compatibility mode is enabled and the SM is at least 80, device is valid
+    if ((poss_dev_cc == target_dev_cc) || (hardware_compatible && std::stoi(poss_dev_cc) >= 8)) {
      compatible_devices.push_back(device.second);
    }
  } else {
```
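To make the widened check above concrete, here is an illustrative Python paraphrase of the new device predicate (the function name is mine; compute capabilities are strings such as "8.6", and `std::stoi` in the C++ extracts only the leading integer, so the `>= 8` comparison is a major-version floor at Ampere):

```python
def is_compatible_device(poss_dev_cc: str, target_dev_cc: str, hardware_compatible: bool) -> bool:
    # An exact SM capability match is always accepted, as before
    if poss_dev_cc == target_dev_cc:
        return True
    # With hardware compatibility enabled, any Ampere-or-newer GPU
    # (SM major version >= 8) is also accepted
    return hardware_compatible and int(poss_dev_cc.split(".")[0]) >= 8

assert is_compatible_device("8.9", "8.6", hardware_compatible=True)       # Ada runs an Ampere-built engine
assert not is_compatible_device("7.5", "8.6", hardware_compatible=True)   # Turing is below the SM 8.0 floor
assert not is_compatible_device("8.9", "8.6", hardware_compatible=False)  # strict matching without the flag
```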

core/runtime/runtime.h (+5 -3)

```diff
@@ -15,21 +15,23 @@ namespace core {
 namespace runtime {
 
 using EngineID = int64_t;
-const std::string ABI_VERSION = "4";
+const std::string ABI_VERSION = "5";
 typedef enum {
   ABI_TARGET_IDX = 0,
   NAME_IDX,
   DEVICE_IDX,
   ENGINE_IDX,
   INPUT_BINDING_NAMES_IDX,
   OUTPUT_BINDING_NAMES_IDX,
+  HARDWARE_COMPATIBLE,
   SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
 } SerializedInfoIndex;
 
 c10::optional<RTDevice> get_most_compatible_device(
     const RTDevice& target_device,
-    const RTDevice& curr_device = RTDevice());
-std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device);
+    const RTDevice& curr_device = RTDevice(),
+    bool hardware_compatible = false);
+std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device, bool hardware_compatible);
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);
```
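The new `HARDWARE_COMPATIBLE` slot grows the serialized payload from six to seven string fields, which is why `ABI_VERSION` is bumped from "4" to "5". An illustrative sketch of the layout, with placeholder values (the "1"/"0" encoding follows `register_jit_hooks.cpp` above, and the final cast mirrors the deserializing constructor in `TRTEngine.cpp`):

```python
serialized_info = [
    "5",                       # ABI_TARGET_IDX: bumped ABI version
    "deserialized_trt",        # NAME_IDX
    "<device metadata>",       # DEVICE_IDX (placeholder)
    "<base64 TRT engine>",     # ENGINE_IDX (placeholder)
    "<input binding names>",   # INPUT_BINDING_NAMES_IDX (placeholder)
    "<output binding names>",  # OUTPUT_BINDING_NAMES_IDX (placeholder)
    "1",                       # HARDWARE_COMPATIBLE: "1" enabled, "0" disabled
]
assert len(serialized_info) == 7  # SERIALIZATION_LEN

# Mirrors static_cast<bool>(std::stoi(...)) in the deserializing constructor
hardware_compatible = bool(int(serialized_info[6]))
```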

py/torch_tensorrt/dynamo/_compiler.py (+6 -1)

```diff
@@ -5,7 +5,6 @@
 from typing import Any, List, Optional, Sequence, Set, Tuple, Union
 
 import torch
-import torch_tensorrt
 from torch.export import ExportedProgram
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import (  # TODO: Should probably be the TRT EngineCapability Enum
@@ -17,6 +16,7 @@
     DEBUG,
     DEVICE,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    HARDWARE_COMPATIBLE,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     OPTIMIZATION_LEVEL,
@@ -43,6 +43,8 @@
     to_torch_tensorrt_device,
 )
 
+import torch_tensorrt
+
 logger = logging.getLogger(__name__)
 
 
@@ -75,6 +77,7 @@ def compile(
     use_python_runtime: bool = USE_PYTHON_RUNTIME,
     use_fast_partitioner: bool = USE_FAST_PARTITIONER,
     enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    hardware_compatible: bool = HARDWARE_COMPATIBLE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile a TorchScript module for NVIDIA GPUs using TensorRT
@@ -131,6 +134,7 @@ def compile(
         use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
         use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global partitioner (``False``) if looking for best performance
         enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT
+        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -199,6 +203,7 @@ def compile(
         "use_fast_partitioner": use_fast_partitioner,
         "enable_experimental_decompositions": enable_experimental_decompositions,
         "require_full_compilation": require_full_compilation,
+        "hardware_compatible": hardware_compatible,
     }
 
     settings = CompilationSettings(**compilation_options)
```

py/torch_tensorrt/dynamo/_defaults.py (+1)

```diff
@@ -15,6 +15,7 @@
 USE_FAST_PARTITIONER = True
 ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False
 REQUIRE_FULL_COMPILATION = False
+HARDWARE_COMPATIBLE = False
 
 
 def default_device() -> Device:
```

py/torch_tensorrt/dynamo/_settings.py (+3)

```diff
@@ -6,6 +6,7 @@
 from torch_tensorrt.dynamo._defaults import (
     DEBUG,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    HARDWARE_COMPATIBLE,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     OPTIMIZATION_LEVEL,
@@ -46,6 +47,7 @@ class CompilationSettings:
         device (Device): GPU to compile the model on
         require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT.
             Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path
+        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
     """
 
     precision: torch.dtype = PRECISION
@@ -63,3 +65,4 @@ class CompilationSettings:
     enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS
     device: Device = field(default_factory=default_device)
     require_full_compilation: bool = REQUIRE_FULL_COMPILATION
+    hardware_compatible: bool = HARDWARE_COMPATIBLE
```
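Because `CompilationSettings` is a dataclass whose fields all have defaults, the new flag composes with the existing options; a minimal sketch:

```python
from torch_tensorrt.dynamo._settings import CompilationSettings

settings = CompilationSettings(hardware_compatible=True)
assert settings.hardware_compatible
# Remaining fields keep their defaults, e.g. require_full_compilation is False
```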

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py (+26 -16)

```diff
@@ -4,6 +4,9 @@
 from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set
 
 import numpy as np
+
+# @manual=//deeplearning/trt/python:py_tensorrt
+import tensorrt as trt
 import torch
 import torch.fx
 from torch.fx.node import _get_qualified_name
@@ -23,8 +26,6 @@
 from torch_tensorrt.fx.observer import Observer
 from torch_tensorrt.fx.utils import Frameworks, unified_dtype_converter
 
-# @manual=//deeplearning/trt/python:py_tensorrt
-import tensorrt as trt
 from packaging import version
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -118,7 +119,6 @@ def validate_conversion(self) -> Set[str]:
 
     def run(
         self,
-        workspace_size: int = 0,
         precision: torch.dtype = torch.float32,  # TODO: @peri044 Needs to be expanded to set
         sparse_weights: bool = False,
         disable_tf32: bool = False,
@@ -128,14 +128,10 @@ def run(
         timing_cache: Optional[trt.ITimingCache] = None,
         profiling_verbosity: Optional[trt.ProfilingVerbosity] = None,
         tactic_sources: Optional[int] = None,
-        max_aux_streams: Optional[int] = None,
-        version_compatible: bool = False,
-        optimization_level: Optional[int] = None,
     ) -> TRTInterpreterResult:
         """
         Build TensorRT engine with some configs.
         Args:
-            workspace_size: Amount of memory used by TensorRT to store intermediate buffers within an operation.
             precision: the precision model layers are running on (TensorRT will choose the best performance precision).
             sparse_weights: allow the builder to examine weights and use optimized functions when weights have suitable sparsity
             force_fp32_output: force output to be fp32
@@ -172,9 +168,10 @@ def run(
 
         builder_config = self.builder.create_builder_config()
 
-        if workspace_size != 0:
+        if self.ctx.compilation_settings.workspace_size != 0:
             builder_config.set_memory_pool_limit(
-                trt.MemoryPoolType.WORKSPACE, workspace_size
+                trt.MemoryPoolType.WORKSPACE,
+                self.ctx.compilation_settings.workspace_size,
             )
 
         cache = None
@@ -193,15 +190,28 @@ def run(
         )
 
         if version.parse(trt.__version__) >= version.parse("8.6"):
-            if max_aux_streams is not None:
-                _LOGGER.info(f"Setting max aux streams to {max_aux_streams}")
-                builder_config.max_aux_streams = max_aux_streams
-            if version_compatible:
+            if self.ctx.compilation_settings.max_aux_streams is not None:
+                _LOGGER.info(
+                    f"Setting max aux streams to {self.ctx.compilation_settings.max_aux_streams}"
+                )
+                builder_config.max_aux_streams = (
+                    self.ctx.compilation_settings.max_aux_streams
+                )
+            if self.ctx.compilation_settings.version_compatible:
                 _LOGGER.info("Using version compatible")
                 builder_config.set_flag(trt.BuilderFlag.VERSION_COMPATIBLE)
-            if optimization_level is not None:
-                _LOGGER.info(f"Using optimization level {optimization_level}")
-                builder_config.builder_optimization_level = optimization_level
+            if self.ctx.compilation_settings.hardware_compatible:
+                _LOGGER.info("Using hardware compatible")
+                builder_config.hardware_compatibility_level = (
+                    trt.HardwareCompatibilityLevel.AMPERE_PLUS
+                )
+            if self.ctx.compilation_settings.optimization_level is not None:
+                _LOGGER.info(
+                    f"Using optimization level {self.ctx.compilation_settings.optimization_level}"
+                )
+                builder_config.builder_optimization_level = (
+                    self.ctx.compilation_settings.optimization_level
+                )
 
         if precision == torch.float16:
             builder_config.set_flag(trt.BuilderFlag.FP16)
```
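Isolated from the interpreter, the builder-configuration portion of this change reduces to the following sketch against the TensorRT Python API (network construction and the rest of the build are omitted; the version gate matters because hardware compatibility levels first appeared in TensorRT 8.6):

```python
import tensorrt as trt
from packaging import version

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
builder_config = builder.create_builder_config()

if version.parse(trt.__version__) >= version.parse("8.6"):
    # Restrict the engine to instructions available on Ampere and newer,
    # making the serialized engine portable across those architectures
    builder_config.hardware_compatibility_level = (
        trt.HardwareCompatibilityLevel.AMPERE_PLUS
    )
```

Hardware-compatible engines may give up some architecture-specific tuning, which is presumably why the flag is opt-in and defaults to `False`.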

py/torch_tensorrt/dynamo/conversion/_conversion.py (+2 -6)

```diff
@@ -3,15 +3,14 @@
 import io
 from typing import Sequence
 
+import tensorrt as trt
 import torch
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.conversion._TRTInterpreter import TRTInterpreter
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
 from torch_tensorrt.dynamo.utils import get_torch_inputs
 
-import tensorrt as trt
-
 
 def convert_module(
     module: torch.fx.GraphModule,
@@ -55,16 +54,12 @@ def convert_module(
         compilation_settings=settings,
     )
     interpreter_result = interpreter.run(
-        workspace_size=settings.workspace_size,
         precision=settings.precision,
         profiling_verbosity=(
             trt.ProfilingVerbosity.VERBOSE
             if settings.debug
             else trt.ProfilingVerbosity.LAYER_NAMES_ONLY
         ),
-        max_aux_streams=settings.max_aux_streams,
-        version_compatible=settings.version_compatible,
-        optimization_level=settings.optimization_level,
     )
 
     if settings.use_python_runtime:
@@ -86,4 +81,5 @@ def convert_module(
         input_binding_names=list(interpreter_result.input_names),
         output_binding_names=list(interpreter_result.output_names),
         target_device=settings.device,
+        hardware_compatible=settings.hardware_compatible,
     )
```
