support tiling optimization

zewenli98 · zewenli98 · commit fdc7e3c67303 · 2025-03-17T14:19:58.000-07:00
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -96,6 +96,8 @@ def cross_compile_for_windows(
     strip_engine_weights: bool = _defaults.STRIP_ENGINE_WEIGHTS,
     immutable_weights: bool = _defaults.IMMUTABLE_WEIGHTS,
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
+    tiling_optimization_level: Optional[int] = _defaults.TILING_OPTIMIZATION_LEVEL,
+    l2_limit_for_tiling: Optional[int] = _defaults.L2_LIMIT_FOR_TILING,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -169,6 +171,8 @@ def cross_compile_for_windows(
         strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
         immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
         enable_weight_streaming (bool): Enable weight streaming.
+        tiling_optimization_level (Optional[int]): The optimization level of tiling strategies. A Higher level allows TensorRT to spend more time searching for better optimization strategy. (We currently support [0, 1, 2, 3], default is 0)
+        l2_limit_for_tiling (Optional[int]): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -326,6 +330,8 @@ def cross_compile_for_windows(
         "immutable_weights": immutable_weights,
         "enable_cross_compile_for_windows": True,
         "enable_weight_streaming": enable_weight_streaming,
+        "tiling_optimization_level": tiling_optimization_level,
+        "l2_limit_for_tiling": l2_limit_for_tiling,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -413,6 +419,8 @@ def compile(
     strip_engine_weights: bool = _defaults.STRIP_ENGINE_WEIGHTS,
     immutable_weights: bool = _defaults.IMMUTABLE_WEIGHTS,
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
+    tiling_optimization_level: Optional[int] = _defaults.TILING_OPTIMIZATION_LEVEL,
+    l2_limit_for_tiling: Optional[int] = _defaults.L2_LIMIT_FOR_TILING,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -488,6 +496,8 @@ def compile(
         strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
         immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
         enable_weight_streaming (bool): Enable weight streaming.
+        tiling_optimization_level (Optional[int]): The optimization level of tiling strategies. A Higher level allows TensorRT to spend more time searching for better optimization strategy. (We currently support [0, 1, 2, 3], default is 0)
+        l2_limit_for_tiling (Optional[int]): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -662,6 +672,8 @@ def compile(
         "immutable_weights": immutable_weights,
         "enable_cross_compile_for_windows": False,
         "enable_weight_streaming": enable_weight_streaming,
+        "tiling_optimization_level": tiling_optimization_level,
+        "l2_limit_for_tiling": l2_limit_for_tiling,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -950,6 +962,8 @@ def convert_exported_program_to_serialized_trt_engine(
     strip_engine_weights: bool = _defaults.STRIP_ENGINE_WEIGHTS,
     immutable_weights: bool = _defaults.IMMUTABLE_WEIGHTS,
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
+    tiling_optimization_level: Optional[int] = _defaults.TILING_OPTIMIZATION_LEVEL,
+    l2_limit_for_tiling: Optional[int] = _defaults.L2_LIMIT_FOR_TILING,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1013,6 +1027,8 @@ def convert_exported_program_to_serialized_trt_engine(
         strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
         immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
         enable_weight_streaming (bool): Enable weight streaming.
+        tiling_optimization_level (Optional[int]): The optimization level of tiling strategies. A Higher level allows TensorRT to spend more time searching for better optimization strategy. (We currently support [0, 1, 2, 3], default is 0)
+        l2_limit_for_tiling (Optional[int]): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
     """
@@ -1129,6 +1145,8 @@ def convert_exported_program_to_serialized_trt_engine(
         "strip_engine_weights": strip_engine_weights,
         "immutable_weights": immutable_weights,
         "enable_weight_streaming": enable_weight_streaming,
+        "tiling_optimization_level": tiling_optimization_level,
+        "l2_limit_for_tiling": l2_limit_for_tiling,
     }
 
     settings = CompilationSettings(**compilation_options)
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -47,6 +47,8 @@
 ENABLE_WEIGHT_STREAMING = False
 ENABLE_CROSS_COMPILE_FOR_WINDOWS = False
 USE_AOT_JOINT_EXPORT = True
+TILING_OPTIMIZATION_LEVEL = 0
+L2_LIMIT_FOR_TILING = -1
 
 
 def default_device() -> Device:
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -20,6 +20,7 @@
     ENGINE_CAPABILITY,
     HARDWARE_COMPATIBLE,
     IMMUTABLE_WEIGHTS,
+    L2_LIMIT_FOR_TILING,
     LAZY_ENGINE_INIT,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
@@ -31,6 +32,7 @@
     REUSE_CACHED_ENGINES,
     SPARSE_WEIGHTS,
     STRIP_ENGINE_WEIGHTS,
+    TILING_OPTIMIZATION_LEVEL,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
     USE_AOT_JOINT_EXPORT,
@@ -93,6 +95,8 @@ class CompilationSettings:
         enable_cross_compile_for_windows (bool): By default this is False means TensorRT engines can only be executed on the same platform where they were built.
             True will enable cross-platform compatibility which allows the engine to be built on Linux and run on Windows
         use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors
+        tiling_optimization_level (Optional[int]): The optimization level of tiling strategies. A Higher level allows TensorRT to spend more time searching for better optimization strategy. (We currently support [0, 1, 2, 3], default is 0)
+        l2_limit_for_tiling (Optional[int]): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -134,6 +138,8 @@ class CompilationSettings:
     enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING
     enable_cross_compile_for_windows: bool = ENABLE_CROSS_COMPILE_FOR_WINDOWS
     use_aot_joint_export: bool = USE_AOT_JOINT_EXPORT
+    tiling_optimization_level: Optional[int] = TILING_OPTIMIZATION_LEVEL
+    l2_limit_for_tiling: Optional[int] = L2_LIMIT_FOR_TILING
 
 
 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
@@ -149,6 +155,8 @@ class CompilationSettings:
     "strip_engine_weights",  # TODO: @Evan to remove this after implementing caching weight-stripped engines as default?
     "immutable_weights",
     "enable_weight_streaming",
+    "tiling_optimization_level",
+    "l2_limit_for_tiling",
 )
 
 
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -329,6 +329,41 @@ def _populate_trt_builder_config(
         if self.compilation_settings.enable_weight_streaming:
             builder_config.set_flag(trt.BuilderFlag.WEIGHT_STREAMING)
 
+        if version.parse(trt.__version__) >= version.parse("10.8"):
+            if (
+                self.compilation_settings.tiling_optimization_level is None
+                or self.compilation_settings.tiling_optimization_level == 0
+            ):
+                builder_config.tiling_optimization_level = (
+                    trt.TilingOptimizationLevel.NONE
+                )
+            elif self.compilation_settings.tiling_optimization_level == 1:
+                builder_config.tiling_optimization_level = (
+                    trt.TilingOptimizationLevel.FAST
+                )
+            elif self.compilation_settings.tiling_optimization_level == 2:
+                builder_config.tiling_optimization_level = (
+                    trt.TilingOptimizationLevel.MODERATE
+                )
+            elif self.compilation_settings.tiling_optimization_level == 3:
+                builder_config.tiling_optimization_level = (
+                    trt.TilingOptimizationLevel.FULL
+                )
+            else:
+                raise ValueError(
+                    f"Invalid tiling optimization level: {self.compilation_settings.tiling_optimization_level}. A valid value should be in [0, 1, 2, 3]."
+                )
+
+            if (
+                self.compilation_settings.l2_limit_for_tiling is None
+                or self.compilation_settings.l2_limit_for_tiling == -1
+            ):
+                builder_config.l2_limit_for_tiling = -1
+            else:
+                builder_config.l2_limit_for_tiling = (
+                    self.compilation_settings.l2_limit_for_tiling
+                )
+
         return builder_config
 
     def _create_timing_cache(
diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
@@ -92,6 +92,8 @@ def __init__(
         dryrun: bool = _defaults.DRYRUN,
         hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
         timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
+        tiling_optimization_level: Optional[int] = _defaults.TILING_OPTIMIZATION_LEVEL,
+        l2_limit_for_tiling: Optional[int] = _defaults.L2_LIMIT_FOR_TILING,
         **kwargs: Any,
     ) -> None:
         """
@@ -133,6 +135,8 @@ def __init__(
             hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
             timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
             lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
+            tiling_optimization_level (Optional[int]): The optimization level of tiling strategies. A Higher level allows TensorRT to spend more time searching for better optimization strategy. (We currently support [0, 1, 2, 3], default is 0)
+            l2_limit_for_tiling (Optional[int]): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
             **kwargs: Any,
         Returns:
             MutableTorchTensorRTModule
@@ -193,6 +197,8 @@ def __init__(
             "dryrun": dryrun,
             "hardware_compatible": hardware_compatible,
             "timing_cache_path": timing_cache_path,
+            "tiling_optimization_level": tiling_optimization_level,
+            "l2_limit_for_tiling": l2_limit_for_tiling,
         }
         self.arg_dynamic_shapes: Optional[tuple[Any]] = None
         self.kwarg_dynamic_shapes: Optional[dict[Any, Any]] = None