
Commit a43552b

feat: Add maxpool indices lowering pass
- Add lowering pass to switch `indices` variants to non-`indices` variants
- Add testing for lowering passes
- Remove unused directory
1 parent 8ebf24d commit a43552b

File tree

13 files changed: +249 -20 lines

.pre-commit-config.yaml

+1 -1

@@ -40,7 +40,7 @@ repos:
     rev: 'v1.4.1'
     hooks:
       - id: mypy
-        exclude: "^py/torch_tensorrt/fx|^examples|^tests|^tools|^docs|noxfile.py|setup.py|versions.py"
+        exclude: "^py/torch_tensorrt/fx|^examples|^tests|^py/torch_tensorrt/dynamo/_experimental|^tools|^docs|noxfile.py|setup.py|versions.py"
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.0.278

py/torch_tensorrt/dynamo/lowering/substitutions/einsum.py renamed to py/torch_tensorrt/dynamo/_experimental/einsum.py

+2 -1

@@ -3,11 +3,12 @@
 import torch
 import torch._custom_ops as library
 from torch.fx.node import Argument, Target
-from torch_tensorrt.dynamo.lowering._pre_aot_lowering import register_substitution
 from torch_tensorrt.fx.converter_registry import tensorrt_converter
 from torch_tensorrt.fx.converters.converter_utils import set_layer_name
 from torch_tensorrt.fx.types import TRTNetwork, TRTTensor
 
+from ._pre_aot_lowering import register_substitution
+
 library.custom_op(
     "tensorrt::einsum",
     "(str equation, Tensor[] tensors) -> Tensor",

py/torch_tensorrt/dynamo/lowering/substitutions/maxpool1d.py renamed to py/torch_tensorrt/dynamo/_experimental/maxpool1d.py

+2 -1

@@ -3,11 +3,12 @@
 import torch
 import torch._custom_ops as library
 from torch.fx.node import Argument, Target
-from torch_tensorrt.dynamo.lowering._pre_aot_lowering import register_substitution
 from torch_tensorrt.fx.converter_registry import tensorrt_converter
 from torch_tensorrt.fx.converters import acc_ops_converters
 from torch_tensorrt.fx.types import TRTNetwork, TRTTensor
 
+from ._pre_aot_lowering import register_substitution
+
 # This file serves as an example and a tutorial for excluding custom modules from
 # torch.compile tracing. Each required step is labeled with a number indicating the
 # preferable implementation order.

tests/py/dynamo/backend/test_pre_aot_lowering.py renamed to py/torch_tensorrt/dynamo/_experimental/test_pre_aot_lowering.py

+3 -3

@@ -2,7 +2,7 @@
 import torch_tensorrt
 from torch.testing._internal.common_utils import TestCase, run_tests
 
-from ..testing_utilities import lower_graph_testing
+from .....tests.py.dynamo.testing_utilities import lower_graph_testing
 
 
 class TestMaxPool1D(TestCase):

@@ -52,7 +52,7 @@ def forward(self, x):
 
         max_diff = torch.max(torch.abs(optimized_model_results - torch_model_results))
         self.assertAlmostEqual(
-            max_diff, 0, f"Maxpool1d TRT outputs don't match with the original model."
+            max_diff, 0, "Maxpool1d TRT outputs don't match with the original model."
         )
 
 

@@ -102,7 +102,7 @@ def forward(self, x, y):
 
         max_diff = torch.max(torch.abs(optimized_model_results - torch_model_results))
         self.assertAlmostEqual(
-            max_diff, 0, f"Einsum TRT outputs don't match with the original model."
+            max_diff, 0, "Einsum TRT outputs don't match with the original model."
         )
 
 

py/torch_tensorrt/dynamo/backend/backends.py

-4

@@ -15,7 +15,6 @@
     get_decompositions,
     repair_input_aliasing,
 )
-from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs, set_log_level
 
 logger = logging.getLogger(__name__)

@@ -64,9 +63,6 @@ def _pretraced_backend(
     try:
         logger.debug("Pre-AOT Autograd graph:\n" + str(gm.graph))
 
-        # Perform Pre-AOT Lowering for Module-Level Replacement
-        gm = pre_aot_substitutions(gm)
-
         fake_mode = detect_fake_mode(sample_inputs)
 
         # Place backend tracing within FakeTensor context allowing nonfake Tensors
py/torch_tensorrt/dynamo/lowering/__init__.py

-3

@@ -1,7 +1,4 @@
 from ._decompositions import get_decompositions  # noqa: F401
 from ._fusers import *  # noqa: F401
-from ._pre_aot_lowering import SUBSTITUTION_REGISTRY  # noqa: F401
-from ._pre_aot_lowering import register_substitution  # noqa: F401
 from ._repair_input_aliasing import repair_input_aliasing
 from .passes import apply_lowering_passes
-from .substitutions import *  # noqa: F401

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

+2

@@ -9,6 +9,7 @@
 from .pass_manager import DynamoPassManager
 from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones
 from .repair_input_as_output import repair_input_as_output
+from .replace_max_pool_with_indices import replace_max_pool_with_indices
 
 ATEN_LOWERING_PASSES = DynamoPassManager.build_from_passlist(
     [

@@ -17,6 +18,7 @@
         repair_input_as_output,
         lower_efficient_attention,
         fuse_prims_broadcast,
+        replace_max_pool_with_indices,
     ]
 )

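For reference, each entry in this pass list shares one contract: a callable taking the FX `GraphModule` and the sample inputs, and returning the (possibly rewritten) module. A minimal sketch of that contract follows; the pass name and body are hypothetical, for illustration only:

```python
from typing import Sequence

import torch


def inspect_graph_pass(  # hypothetical pass, illustrating the signature only
    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
) -> torch.fx.GraphModule:
    # A real pass would rewrite gm.graph here and clean up afterwards;
    # this sketch merely logs each node and returns the module unchanged
    for node in gm.graph.nodes:
        print(node.op, node.target)
    return gm
```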
py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py

+60 (new file)

@@ -0,0 +1,60 @@
+import logging
+import operator
+from typing import Sequence
+
+import torch
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    clean_up_graph_after_modifications,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def replace_max_pool_with_indices(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
+    """Replace MaxPool nodes which return unused indices"""
+    replacement_dict = {
+        torch.ops.aten.max_pool1d_with_indices.default: torch.ops.aten.max_pool1d.default,
+        torch.ops.aten.max_pool2d_with_indices.default: torch.ops.aten.max_pool2d.default,
+        torch.ops.aten.max_pool3d_with_indices.default: torch.ops.aten.max_pool3d.default,
+    }
+
+    modified_graph = False
+
+    for node in gm.graph.nodes:
+        # If the node is a max-pool variant returning indices, and its only
+        # user is a getitem node selecting output 0 (the pooled values), the
+        # indices output is unused and the op can be swapped for the plain variant
+        if (
+            node.target in replacement_dict
+            and len(node.users) == 1
+            and list(node.users)[0].target == operator.getitem
+            and list(node.users)[0].args[1] == 0
+        ):
+            modified_graph = True
+
+            # Replace all uses of the getitem node with the fused maxpool node
+            getitem_node = list(node.users)[0]
+
+            with gm.graph.inserting_after(getitem_node):
+                maxpool_fused = gm.graph.call_function(
+                    replacement_dict[node.target],
+                    args=node.args,
+                    kwargs=node.kwargs,
+                )
+
+            logger.debug(
+                f"Replacing nodes {node} and {getitem_node} with fused maxpool node "
+                f"{maxpool_fused}, as the indices output of the maxpool was unused."
+            )
+
+            getitem_node.replace_all_uses_with(maxpool_fused)
+            gm.graph.erase_node(getitem_node)
+            gm.graph.erase_node(node)
+
+    if modified_graph:
+        gm = clean_up_graph_after_modifications(gm)
+        logger.debug(f"Graph after fusing maxpool operators with indices:\n{gm.graph}")
+
+    return gm
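
To see the pass in action, here is a minimal sketch (assuming torch and torch_tensorrt are installed; the module, kernel size, and input shape are illustrative): tracing produces a `max_pool2d_with_indices` node whose indices output is discarded via `getitem`, and the pass collapses the pair into a single `max_pool2d` call.

```python
import torch
import torch.fx
from torch_tensorrt.dynamo.lowering.passes.replace_max_pool_with_indices import (
    replace_max_pool_with_indices,
)


class Pool(torch.nn.Module):
    def forward(self, x):
        # Returns (values, indices); indexing [0] discards the indices,
        # which traces to a getitem node with argument 0
        return torch.ops.aten.max_pool2d_with_indices.default(x, [3, 3])[0]


gm = torch.fx.symbolic_trace(Pool())
print(gm.graph)  # shows max_pool2d_with_indices followed by getitem

gm = replace_max_pool_with_indices(gm, [torch.randn(1, 3, 8, 8)])
print(gm.graph)  # both nodes replaced by a single max_pool2d call
```

The same rewrite applies to the 1D and 3D variants via the replacement dictionary above.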

py/torch_tensorrt/dynamo/lowering/substitutions/__init__.py

-2
This file was deleted.

setup.py

-2

@@ -391,7 +391,6 @@ def run(self):
     "torch_tensorrt.dynamo.conversion.impl.slice",
     "torch_tensorrt.dynamo.conversion.impl.unary",
     "torch_tensorrt.dynamo.lowering",
-    "torch_tensorrt.dynamo.lowering.substitutions",
     "torch_tensorrt.dynamo.lowering.passes",
     "torch_tensorrt.dynamo.partitioning",
     "torch_tensorrt.dynamo.runtime",

@@ -419,7 +418,6 @@ def run(self):
     "torch_tensorrt.dynamo.conversion.impl.slice": "py/torch_tensorrt/dynamo/conversion/impl/slice",
     "torch_tensorrt.dynamo.conversion.impl.unary": "py/torch_tensorrt/dynamo/conversion/impl/unary",
     "torch_tensorrt.dynamo.lowering": "py/torch_tensorrt/dynamo/lowering",
-    "torch_tensorrt.dynamo.lowering.substitutions": "py/torch_tensorrt/dynamo/lowering/substitutions",
     "torch_tensorrt.dynamo.lowering.passes": "py/torch_tensorrt/dynamo/lowering/passes",
     "torch_tensorrt.dynamo.partitioning": "py/torch_tensorrt/dynamo/partitioning",
     "torch_tensorrt.dynamo.runtime": "py/torch_tensorrt/dynamo/runtime",

tests/py/dynamo/lowering/test_decompositions.py

+179

@@ -313,6 +313,185 @@ def forward(self, x):
             f"Var TRT outputs don't match with the original model.",
         )
 
+    def test_lowering_maxpool1d_functional(self):
+        class MaxPool1d(torch.nn.Module):
+            def forward(self, x):
+                y = torch.nn.functional.max_pool1d(x, 3)
+                return y
+
+        # Operations expected (and not expected) in the traced graph after lowering
+        expected_ops = {torch.ops.aten.max_pool2d.default}
+        unexpected_ops = {
+            torch.ops.aten.max_pool1d_with_indices.default,
+            torch.ops.aten.max_pool2d_with_indices.default,
+        }
+
+        inputs = [torch.randn(4, 8, 27).cuda()]
+
+        fx_graph = torch.fx.symbolic_trace(MaxPool1d())
+        unexpected_ops_seen, expected_ops_unseen = lower_graph_testing(
+            fx_graph,
+            inputs,
+            expected_ops=expected_ops,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEquals(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            "MaxPool1d TRT outputs don't match with the original model.",
+        )
+
+    def test_lowering_maxpool_2d_module(self):
+        class MaxPool2d(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+                self.maxpool = torch.nn.MaxPool2d((5, 3), stride=(2, 1))
+
+            def forward(self, x):
+                y = self.maxpool(x)
+                return y
+
+        # Operations expected (and not expected) in the traced graph after lowering
+        expected_ops = {torch.ops.aten.max_pool2d.default}
+        unexpected_ops = {torch.ops.aten.max_pool2d_with_indices.default}
+
+        inputs = [torch.randn(1, 3, 25, 30).cuda()]
+
+        fx_graph = torch.fx.symbolic_trace(MaxPool2d())
+        unexpected_ops_seen, expected_ops_unseen = lower_graph_testing(
+            fx_graph,
+            inputs,
+            expected_ops=expected_ops,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEquals(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            "MaxPool2d TRT outputs don't match with the original model.",
+        )
+
+    def test_lowering_maxpool_3d_module(self):
+        class MaxPool3d(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+                self.maxpool = torch.nn.MaxPool3d(3)
+
+            def forward(self, x):
+                y = self.maxpool(x)
+                return y
+
+        # Operations expected (and not expected) in the traced graph after lowering
+        expected_ops = {torch.ops.aten.max_pool3d.default}
+        unexpected_ops = {torch.ops.aten.max_pool3d_with_indices.default}
+
+        inputs = [torch.randn(4, 8, 27, 72, 96).cuda()]
+
+        fx_graph = torch.fx.symbolic_trace(MaxPool3d())
+        unexpected_ops_seen, expected_ops_unseen = lower_graph_testing(
+            fx_graph,
+            inputs,
+            expected_ops=expected_ops,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEquals(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            "MaxPool3d TRT outputs don't match with the original model.",
+        )
+
 
 if __name__ == "__main__":
     run_tests()

tests/py/dynamo/testing_utilities.py

-3

@@ -12,7 +12,6 @@
     get_decompositions,
     repair_input_aliasing,
 )
-from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 
 DECIMALS_OF_AGREEMENT = 4
 

@@ -35,8 +34,6 @@ def fx_dynamo_testing_backend(
         use_fast_partitioner=use_fast_partitioner,
    )
 
-    gm = pre_aot_substitutions(gm)
-
     fake_mode = detect_fake_mode(sample_inputs)
 
     # Place backend tracing within FakeTensor context allowing nonfake Tensors
