Commit d2e4f6d

perf: Add lowering passes to improve TRT conversion
- Focus on variance and sum converters, reducing instances of extraneous layers from unnecessary reshapes
- Add test cases to validate new additions
1 parent 5de208f commit d2e4f6d

14 files changed: +322 −23

docsrc/contributors/writing_dynamo_aten_lowering_passes.rst (+5 −5)

@@ -12,7 +12,7 @@ Lowering Pass Requirements
 ------------
 
 An ATen lowering pass function in Torch-TRT must satisfy two requirements:
-- The function must take as input a single `torch.fx.GraphModule` and return the lowered `torch.fx.GraphModule`
+- The function must take as input a `torch.fx.GraphModule` and a sequence of torch Tensors, `Sequence[torch.Tensor]`, and return the lowered `torch.fx.GraphModule`
 - The function must leave the graph in a valid and invoke-able state, including performing any necessary linting and recompilation
 
 See this link for information on `Graph Manipulations <https://pytorch.org/docs/stable/fx.html#graph-manipulation>`_ in FX. See below for an example of a lowering pass which repairs graphs that have inputs which are also outputs, a disallowed configuration for TRT Engines.
@@ -22,7 +22,7 @@ Example Lowering Pass
 
 .. code-block:: python
 
-    def repair_input_as_output(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    def repair_input_as_output(gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]) -> torch.fx.GraphModule:
         """Repair scenarios where inputs are also outputs of the graph
 
         TRT does not allow such cases, so we insert a clone (identity) layer
@@ -82,15 +82,15 @@ For instance, to insert the pass at the default location (end of the list), the
 .. code-block:: python
 
     @_aten_lowering_pass
-    def my_custom_pass(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    def my_custom_pass(gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]) -> torch.fx.GraphModule:
         ...
 
 Alternatively, to insert the pass at a custom index (such as the front of the list) in the passlist, the following code can be used:
 
 .. code-block:: python
 
     @_aten_lowering_pass(index=0)
-    def my_custom_pass(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    def my_custom_pass(gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]) -> torch.fx.GraphModule:
         ...
 
 There are also provided utilities in `torch_tensorrt.dynamo.lowering.passes` for displaying the currently-available lowering pass list, applying those passes to an arbitrary `torch.fx.GraphModule`, and removing the lowering pass at a specific index.
@@ -101,7 +101,7 @@ There are also provided utilities in `torch_tensorrt.dynamo.lowering.passes` for
     print(dump_lowering_passes())
 
     # Apply lowering passes to a GraphModule
-    apply_lowering_passes(graph_module)
+    apply_lowering_passes(graph_module, sample_inputs)
 
     # Remove the lowering pass at index 1
     _remove_lowering_pass(index=1)
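
As a quick end-to-end illustration of the new two-argument signature, the following is a minimal sketch that registers a trivial pass and applies the registry to a toy module. It assumes `_aten_lowering_pass` and `apply_lowering_passes` are importable from `torch_tensorrt.dynamo.lowering.passes` as the utilities above suggest; `shape_logging_pass` is a hypothetical pass name, not part of this commit.

    from typing import Sequence

    import torch

    from torch_tensorrt.dynamo.lowering.passes import (
        _aten_lowering_pass,
        apply_lowering_passes,
    )


    @_aten_lowering_pass
    def shape_logging_pass(
        gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
    ) -> torch.fx.GraphModule:
        # Sample inputs are now threaded through every pass, enabling shape-aware rewrites
        print([tuple(t.shape) for t in sample_inputs])
        return gm


    gm = torch.fx.symbolic_trace(torch.nn.ReLU())
    lowered_gm = apply_lowering_passes(gm, [torch.randn(2, 4)])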

py/torch_tensorrt/dynamo/aten_tracer.py (+1 −1)

@@ -28,6 +28,6 @@ def trace(
         "torch._export.DECOMP_TABLE", get_decompositions(experimental_decompositions)
     ):
         graph_module = export(model, tuple(inputs)).module()
-        graph_module = apply_lowering_passes(graph_module)
+        graph_module = apply_lowering_passes(graph_module, inputs)
     logger.debug("Post export graph: " + str(graph_module.graph))
     return graph_module

py/torch_tensorrt/dynamo/backend/backends.py (+1 −1)

@@ -87,7 +87,7 @@ def _pretraced_backend(
 
     logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))
 
-    gm = apply_lowering_passes(gm)
+    gm = apply_lowering_passes(gm, sample_inputs)
 
     trt_compiled = compile_module(
         gm,

py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py (+3 −0)

@@ -149,6 +149,7 @@
     aten.special_log_ndtr,
     aten.special_xlog1py,
     aten.stack,
+    aten.std,
     aten.t,
     aten.tanh_backward,
     aten.threshold,
@@ -163,6 +164,8 @@
     aten.upsample_bilinear2d,
     aten.upsample_bilinear2d.vec,
     aten.upsample_nearest2d_backward,
+    aten.var,
+    aten.var_mean,
     aten.xlogy,
     aten.zero,
     aten.zero_,

py/torch_tensorrt/dynamo/lowering/_decompositions.py (+49 −1)

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 from torch._decomp import register_decomposition
@@ -135,6 +135,54 @@ def reciprocal_replacement(
     return torch.div(1, input_)
 
 
+@register_torch_trt_decomposition(
+    torch.ops.prims.var.default, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def var_decomposition(
+    input_tensor: torch.Tensor,
+    dims: Optional[List[int]],
+    correction: int,
+    output_dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    if dims is None:
+        dims = []
+
+    # If the dimensions are empty, variance is taken over all dimensions
+    if isinstance(dims, (tuple, list)) and len(dims) == 0:
+        N = input_tensor.numel()
+    # Otherwise, the number of samples is the product of the dimensions reduced over
+    else:
+        N = 1
+        for dim_i in dims:
+            N *= input_tensor.shape[dim_i]
+
+    # Compute the mean, difference, and correction term as per the formula:
+    # https://pytorch.org/docs/stable/generated/torch.var.html
+
+    # Additionally, prims does not support keepdim, and so we only keep dimensions
+    # on the first reduction, then remove it for the second
+    sample_mean = torch.mean(input_tensor, dims, keepdim=True)
+    diff = input_tensor - sample_mean
+    squared_diff = diff * diff
+    variance_unnormalized = torch.sum(squared_diff, dims, keepdim=False)
+
+    if correction is None:
+        correction_term = float(N - 1)
+    elif isinstance(correction, int):
+        correction_term = float(N - correction)
+    elif isinstance(correction, float):
+        correction_term = float(N) - correction
+    else:
+        raise RuntimeError("correction must be int or float")
+
+    if correction_term <= 0:
+        raise RuntimeError(f"correction term was non-positive, got: {correction_term}")
+
+    variance = variance_unnormalized / correction_term
+
+    return variance
+
+
 def get_decompositions(
     enable_experimental_decompositions: bool = False,
 ) -> Dict[OpOverload, Callable[[Any], Any]]:
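
The decomposition mirrors the documented formula Var(x) = sum((x − mean(x))²) / (N − correction). As a sanity check, a minimal eager-mode sketch comparing it against `torch.var`, assuming `var_decomposition` remains directly callable after decoration and is importable from this module:

    import torch

    from torch_tensorrt.dynamo.lowering._decompositions import var_decomposition

    x = torch.randn(4, 8)

    # Reduce over dim 1 with Bessel's correction (correction=1), the torch.var default
    decomposed = var_decomposition(x, dims=[1], correction=1)
    reference = torch.var(x, dim=1, correction=1, keepdim=False)

    assert torch.allclose(decomposed, reference, atol=1e-6)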

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py (+13 −5)

@@ -1,9 +1,10 @@
 import logging
-from typing import Callable, Optional
+from typing import Callable, Optional, Sequence, Union
 
 import torch
 
 from .constant_folding import constant_fold
+from .fuse_prims_broadcast import fuse_prims_broadcast
 from .pass_manager import DynamoPassManager
 from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones
 from .repair_input_as_output import repair_input_as_output
@@ -13,19 +14,24 @@
         remove_input_alias_fixing_clones,
         constant_fold,
         repair_input_as_output,
+        fuse_prims_broadcast,
     ]
 )
 
 logger = logging.getLogger(__name__)
 
 
-LoweringPassSignature = Callable[[torch.fx.GraphModule], torch.fx.GraphModule]
+LoweringPassSignature = Callable[
+    [torch.fx.GraphModule, Sequence[torch.Tensor]], torch.fx.GraphModule
+]
 
 
 def _aten_lowering_pass(
     *args: LoweringPassSignature,
     index: Optional[int] = None,
-) -> LoweringPassSignature:
+) -> Union[
+    LoweringPassSignature, Callable[[LoweringPassSignature], LoweringPassSignature]
+]:
     """Adds a lowering pass to the registry, at a specified index if desired
 
     If no index is specified, the lowering pass is inserted at the end of the list
@@ -65,12 +71,14 @@ def _remove_lowering_pass(*, index: int) -> None:
     return
 
 
-def apply_lowering_passes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+def apply_lowering_passes(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
     """Applies the lowering passes to a graph module, returns the modified GraphModule"""
     logging.debug(
         f"Invoking DynamoPassManager and applying lowering passes: {ATEN_LOWERING_PASSES}"
     )
-    return ATEN_LOWERING_PASSES(gm)
+    return ATEN_LOWERING_PASSES(gm, sample_inputs)
 
 
 def dump_lowering_passes() -> str:

py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py (+4 −1)

@@ -1,4 +1,5 @@
 import logging
+from typing import Sequence
 
 import torch
 from torch_tensorrt._utils import sanitized_torch_version
@@ -21,7 +22,9 @@
 
 
 @torch.utils._python_dispatch._disable_current_modes()  # type: ignore
-def constant_fold(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+def constant_fold(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
     """Adapted from:
     https://github.com/pytorch/pytorch/blob/3a79621c9dce17f77fbddc06aab21f6bc477f313/torch/_inductor/freezing.py#L178-L197
py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py (new file, +82 −0)

@@ -0,0 +1,82 @@
+import logging
+from typing import Sequence
+
+import torch
+from torch.fx.passes.shape_prop import ShapeProp
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    clean_up_graph_after_modifications,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: Add relevant prims to this fusion
+def fuse_prims_broadcast(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
+    """Fuses prim nodes which are effectively the ATen equivalents with keep_dim=True"""
+    modified_graph = False
+
+    # Propagate shapes through the graph to determine if broadcast can be resolved
+    try:
+        ShapeProp(gm).propagate(*sample_inputs)
+    except (RuntimeError, AssertionError):
+        logger.warning(
+            "Shape Propagation Failed on Graph, skipping fuse_prims_broadcast lowering pass",
+            exc_info=True,
+        )
+        return gm
+
+    for node in gm.graph.nodes:
+        # If the node is a sum prims operator, with broadcast_in_dim being the only consumer,
+        # it is a candidate for fusing
+        if (
+            node.target in (torch.ops.prims.sum.default,)
+            and len(node.users) == 1
+            and list(node.users)[0].target == torch.ops.prims.broadcast_in_dim.default
+        ):
+            # Get broadcasted shape, reduced dimensions, and original tensor shape
+            broadcast_node = list(node.users)[0]
+            broadcasted_shape = broadcast_node.args[1]
+            reduced_dims = node.args[1]
+            original_shape = node.args[0].meta["tensor_meta"].shape
+
+            # If the rank of the broadcasted shape is the same as the original,
+            # the broadcasts are all singletons for the reduced dimensions,
+            # and all of the non-reduced dimensions are identical to the originals,
+            # then the broadcast is effectively performing a "keep_dim=True" operation
+            if (
+                len(broadcasted_shape) == len(original_shape)
+                and all(broadcasted_shape[i] == 1 for i in reduced_dims)
+                and all(
+                    broadcasted_shape[j] == original_shape[j]
+                    for j in range(len(original_shape))
+                    if j not in reduced_dims
+                )
+            ):
+                # Fuse the operator to its convertible alternative
+                with gm.graph.inserting_after(broadcast_node):
+                    modified_graph = True
+
+                    if node.target == torch.ops.prims.sum.default:
+                        fused_node = gm.graph.call_function(
+                            torch.ops.aten.sum.dim_IntList,
+                            args=(node.args[0], reduced_dims, True),
+                        )
+
+                # Replace all uses of the broadcast node with the fused node
+                broadcast_node.replace_all_uses_with(
+                    fused_node,
+                )
+
+                # Erase the broadcast node and the original reduction node
+                gm.graph.erase_node(broadcast_node)
+                gm.graph.erase_node(node)
+
+    if modified_graph:
+        gm = clean_up_graph_after_modifications(gm)
+        logger.debug(f"Fused prims-broadcast paradigm:\n{gm.graph}")
+
+    return gm
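
The rewrite relies on a simple equivalence: a `prims.sum` whose sole consumer is a `broadcast_in_dim` restoring singleton dimensions computes the same result as a single ATen sum with `keepdim=True`, eliminating the extra reshape/broadcast layer in TRT. A minimal sketch of that equivalence, assuming these prims ops are callable in eager mode:

    import torch

    x = torch.randn(2, 3, 4)
    reduced_dims = [1]

    # prims form: reduce (dropping dim 1), then broadcast back to rank 3 with a singleton
    reduced = torch.ops.prims.sum.default(x, reduced_dims)  # shape (2, 4)
    prims_style = torch.ops.prims.broadcast_in_dim.default(
        reduced, [2, 1, 4], [0, 2]  # shape (2, 1, 4)
    )

    # fused ATen form emitted by the pass above
    aten_style = torch.ops.aten.sum.dim_IntList(x, reduced_dims, True)

    assert torch.allclose(prims_style, aten_style)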
py/torch_tensorrt/dynamo/lowering/passes/pass_manager.py (+22 −6)

@@ -1,4 +1,4 @@
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, List, Optional, Sequence
 
 import torch
 from torch.fx.passes.pass_manager import PassManager
@@ -8,22 +8,34 @@ class DynamoPassManager(PassManager):  # type: ignore[misc]
     def __init__(
         self,
         passes: Optional[
-            List[Callable[[torch.fx.GraphModule], torch.fx.GraphModule]]
+            List[
+                Callable[
+                    [torch.fx.GraphModule, Sequence[torch.Tensor]], torch.fx.GraphModule
+                ]
+            ]
         ] = None,
     ):
         super().__init__(passes)
 
     @classmethod
     def build_from_passlist(
         cls,
-        passes: Optional[List[Callable[[torch.fx.GraphModule], torch.fx.GraphModule]]],
+        passes: Optional[
+            List[
+                Callable[
+                    [torch.fx.GraphModule, Sequence[torch.Tensor]], torch.fx.GraphModule
+                ]
+            ]
+        ],
     ) -> Any:
         pm = DynamoPassManager(passes)
         return pm
 
     def add_pass_with_index(
         self,
-        lowering_pass: Callable[[torch.fx.GraphModule], torch.fx.GraphModule],
+        lowering_pass: Callable[
+            [torch.fx.GraphModule, Sequence[torch.Tensor]], torch.fx.GraphModule
+        ],
         index: Optional[int] = None,
     ) -> None:
         if index is None:
@@ -35,8 +47,12 @@ def add_pass_with_index(
     def remove_pass_with_index(self, index: int) -> None:
         del self.passes[index]
 
-    def __call__(self, source: Any) -> Any:
-        return super().__call__(source)
+    def __call__(self, gm: Any, sample_inputs: Any) -> Any:
+        self.validate()
+        out, example_inputs = gm, sample_inputs
+        for _pass in self.passes:
+            out = _pass(out, example_inputs)
+        return out
 
     def __str__(self) -> str:
         return str(self.passes)
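
A minimal sketch of the updated calling convention, where `__call__` now takes both the module and its sample inputs and threads them through each registered pass. Here `noop_pass` is a hypothetical stand-in, not part of this commit:

    from typing import Sequence

    import torch

    from torch_tensorrt.dynamo.lowering.passes.pass_manager import DynamoPassManager


    def noop_pass(
        gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
    ) -> torch.fx.GraphModule:
        # A pass that makes no changes, illustrating the (gm, sample_inputs) signature
        return gm


    pm = DynamoPassManager.build_from_passlist([noop_pass])

    gm = torch.fx.symbolic_trace(torch.nn.ReLU())
    lowered_gm = pm(gm, [torch.randn(8)])  # invokes __call__(gm, sample_inputs)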

py/torch_tensorrt/dynamo/lowering/passes/remove_input_alias_fixing_clones.py (+4 −1)

@@ -1,4 +1,5 @@
 import logging
+from typing import Sequence
 
 import torch
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
@@ -9,7 +10,9 @@
 
 
 # TODO: Delete this lowering pass once aot_export_joint_simple is patched
-def remove_input_alias_fixing_clones(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+def remove_input_alias_fixing_clones(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
     """Remove the auxiliary clone nodes inserted to fix input aliasing
 
     See: https://github.com/pytorch/pytorch/issues/108079

py/torch_tensorrt/dynamo/lowering/passes/repair_input_as_output.py (+4 −1)

@@ -1,4 +1,5 @@
 import logging
+from typing import Sequence
 
 import torch
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
@@ -9,7 +10,9 @@
 logger = logging.getLogger(__name__)
 
 
-def repair_input_as_output(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+def repair_input_as_output(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
     """Repair scenarios where inputs are also outputs of the graph
 
     TRT does not allow such cases, so we insert a clone (identity) layer
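
For context, a minimal sketch of the disallowed pattern this pass repairs: a placeholder that is returned directly as a graph output. `InputAsOutput` is a hypothetical example module:

    import torch


    class InputAsOutput(torch.nn.Module):
        def forward(self, x: torch.Tensor):
            # `x` is both a graph input and a graph output, which TRT engines reject;
            # the pass reroutes the returned `x` through a clone (identity) node
            return x, x + 1


    gm = torch.fx.symbolic_trace(InputAsOutput())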
