Commit 6cbb24e

perf: Add efficient attention lowering pass

1 parent d2e4f6d commit 6cbb24e

File tree

8 files changed (+257, -10 lines changed)

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

+15
@@ -1500,3 +1500,18 @@ def aten_ops_max_pool(
         dilation=args_bounds_check(args, 4, replacement=1),
         ceil_mode=args_bounds_check(args, 5, replacement=False),
     )
+
+
+@dynamo_tensorrt_converter(
+    torch.nn.functional.scaled_dot_product_attention,
+)  # type: ignore[misc]
+def tensorrt_scaled_dot_product_attention(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    return impl.attention.scaled_dot_product_attention(
+        network, target, SourceIR.ATEN, name, args[0], args[1], args[2]
+    )

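Note on the registration key: the converter above is registered against the Python function torch.nn.functional.scaled_dot_product_attention rather than an ATen overload, because the lowering pass added later in this commit rewrites the fused ATen op into exactly that functional call, and FX records the call with the function object itself as the node target. A minimal standalone sketch (not part of the commit) illustrating that behavior:

import torch
import torch.nn.functional as F


class SDPA(torch.nn.Module):
    def forward(self, q, k, v):
        return F.scaled_dot_product_attention(q, k, v)


# symbolic_trace keeps the functional call as a single call_function node whose
# target is the same function object the converter decorator is keyed on
gm = torch.fx.symbolic_trace(SDPA())
print(any(node.target is F.scaled_dot_product_attention for node in gm.graph.nodes))  # True
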
py/torch_tensorrt/dynamo/conversion/impl/__init__.py

+1
@@ -2,6 +2,7 @@
 
 from . import (
     activation,
+    attention,
     cast,
     condition,
     conv,
py/torch_tensorrt/dynamo/conversion/impl/attention.py

+49

@@ -0,0 +1,49 @@
+import math
+from typing import Optional, Union
+
+import tensorrt as trt
+from torch.fx.node import Target
+from torch_tensorrt.dynamo.conversion import impl
+from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR
+from torch_tensorrt.fx.types import TRTNetwork, TRTTensor
+
+
+def scaled_dot_product_attention(
+    network: TRTNetwork,
+    target: Union[Target, str],
+    source_ir: Optional[SourceIR],
+    name: str,
+    query: TRTTensor,
+    key: TRTTensor,
+    value: TRTTensor,
+) -> TRTTensor:
+    mm = impl.matmul.matrix_multiply(
+        network,
+        target,
+        source_ir,
+        name + "_mm",
+        query,
+        key,
+        other_matrix_op=trt.MatrixOperation.TRANSPOSE,
+    )
+    div = impl.elementwise.div(
+        network,
+        target,
+        source_ir,
+        name + "_scale",
+        mm,
+        math.sqrt(query.shape[-1]),
+    )
+    softmax = impl.normalization.softmax(
+        network, target, source_ir, name + "_softmax", div, -1
+    )
+    out = impl.matmul.matrix_multiply(
+        network,
+        target,
+        source_ir,
+        name + "_out",
+        softmax,
+        value,
+    )
+
+    return out

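The converter above builds attention from four TRT layer groups: Q·Kᵀ matmul, division by sqrt(d_k), softmax over the last dimension, and a final matmul with V. For reference, a minimal PyTorch sketch (not part of the commit) of the same decomposition, checked against the fused functional it stands in for; the tensor shapes are borrowed from the unit tests below:

import math

import torch


def sdpa_reference(query, key, value):
    # (Q @ K^T) / sqrt(d_k) -> softmax over the last dim -> matmul with V,
    # mirroring the impl.matmul / impl.elementwise / impl.normalization calls above
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.shape[-1])
    return torch.matmul(torch.softmax(scores, dim=-1), value)


q, k, v = torch.rand(8, 4, 5, 4), torch.rand(8, 4, 2, 4), torch.rand(8, 4, 2, 4)
ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
print(torch.max(torch.abs(sdpa_reference(q, k, v) - ref)))  # ~0, up to float rounding
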
py/torch_tensorrt/dynamo/lowering/_decompositions.py

-5
@@ -83,11 +83,6 @@ def inplace_op(*args, **kwargs):  # type: ignore
 replace_inplace_op(aten.scatter_reduce_, aten.scatter_reduce)
 
 
-@register_torch_trt_decomposition(aten.std, registry=TORCH_TRT_DECOMPOSITIONS)
-def std_replacement(*args, **kwargs) -> torch.Tensor:  # type: ignore
-    return torch.sqrt(torch.var(*args, **kwargs))
-
-
 @register_torch_trt_decomposition(aten.rsqrt, registry=TORCH_TRT_DECOMPOSITIONS)
 def rsqrt_replacement(*args, **kwargs) -> torch.Tensor:  # type: ignore
     return torch.reciprocal(torch.sqrt(*args, **kwargs))

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

+2
@@ -5,6 +5,7 @@
 
 from .constant_folding import constant_fold
 from .fuse_prims_broadcast import fuse_prims_broadcast
+from .lower_efficient_attention import lower_efficient_attention
 from .pass_manager import DynamoPassManager
 from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones
 from .repair_input_as_output import repair_input_as_output
@@ -14,6 +15,7 @@
         remove_input_alias_fixing_clones,
         constant_fold,
         repair_input_as_output,
+        lower_efficient_attention,
         fuse_prims_broadcast,
     ]
 )

py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py

+1 -1

@@ -77,6 +77,6 @@ def fuse_prims_broadcast(
 
     if modified_graph:
         gm = clean_up_graph_after_modifications(gm)
-        logger.debug(f"Fused prims-broadcast paradigm:\n{gm.graph}")
+        logger.debug(f"Graph after fusing prims-broadcast paradigm:\n{gm.graph}")
 
     return gm
py/torch_tensorrt/dynamo/lowering/passes/lower_efficient_attention.py

+74

@@ -0,0 +1,74 @@
+import logging
+import operator
+from typing import Callable, Sequence, Tuple
+
+import torch
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    clean_up_graph_after_modifications,
+    get_tensor_placeholders,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def lower_efficient_attention(
+    gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor]
+) -> torch.fx.GraphModule:
+    """Replace a specific version of scaled_dot_product_attention with an equivalent
+    implementation which can be easily converted to TRT
+    """
+    orig, replacement = efficient_attention_replacement()
+
+    if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement):
+        gm = clean_up_graph_after_modifications(gm)
+        logger.debug(
+            f"Graph after lowering _scaled_dot_product_efficient_attention:\n{gm.graph}"
+        )
+
+    return gm
+
+
+def efficient_attention_replacement() -> (
+    Tuple[
+        torch.fx.GraphModule,
+        Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor],
+    ]
+):
+    """Constructs the original and replacement functions for efficient attention"""
+
+    # Empty boilerplate function taking in three Tensors and returning one
+    def boilerplate(
+        query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
+    ) -> torch.Tensor:
+        ...
+
+    # Trace boilerplate function and extract placeholder and output nodes
+    orig = torch.fx.symbolic_trace(boilerplate)
+    q, k, v = get_tensor_placeholders(orig)
+    output = [node for node in orig.graph.nodes if node.op == "output"][0]
+
+    # Graph types to replace are those which use the _scaled_dot_product_efficient_attention
+    # function and extract only the first element
+    with orig.graph.inserting_before(output):
+        att = orig.graph.call_function(
+            torch.ops.aten._scaled_dot_product_efficient_attention.default,
+            args=(q, k, v, None, False),
+        )
+        out = orig.graph.call_function(
+            operator.getitem,
+            args=(att, 0),
+        )
+
+    # Assign the output of the graph to be the single getitem output
+    output.args = (out,)
+
+    orig.graph.lint()
+    orig.recompile()
+
+    # Replacement graph consists of the functional version of scaled_dot_product_attention
+    def replacement(
+        query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
+    ) -> torch.Tensor:
+        return torch.nn.functional.scaled_dot_product_attention(query, key, value)
+
+    return orig, replacement

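As a rough illustration of the pass in action (standalone sketch, not part of the commit; the import path below assumes the py/torch_tensorrt/dynamo/lowering/passes/lower_efficient_attention.py location inferred above), tracing a module that calls the fused ATen op and running the pass replaces the (_scaled_dot_product_efficient_attention, getitem) pair with a single functional call that the new converter can handle:

import torch

from torch_tensorrt.dynamo.lowering.passes.lower_efficient_attention import (
    lower_efficient_attention,
)


class EfficientAttention(torch.nn.Module):
    def forward(self, q, k, v):
        attn = torch.ops.aten._scaled_dot_product_efficient_attention.default(
            q, k, v, None, False
        )
        return attn[0]


gm = torch.fx.symbolic_trace(EfficientAttention())
print([n.target for n in gm.graph.nodes if n.op == "call_function"])
# roughly: [aten._scaled_dot_product_efficient_attention.default, operator.getitem]

# sample_inputs is unused by this pass, so an empty list is enough here
gm = lower_efficient_attention(gm, sample_inputs=[])
print([n.target for n in gm.graph.nodes if n.op == "call_function"])
# roughly: [torch.nn.functional.scaled_dot_product_attention]
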
tests/py/dynamo/lowering/test_aten_lowering_passes.py

+115 -4

@@ -92,8 +92,8 @@ def identity_pass(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
 
 
 class TestPrimBroadcastFusion(TestCase):
-    def test_input_as_output(self):
-        class InputAsOutput(torch.nn.Module):
+    def test_broadcast_fusion(self):
+        class BroadcastFusion(torch.nn.Module):
             def forward(self, x):
                 return torch.var_mean(x, keepdim=True)[1]
 
@@ -104,7 +104,7 @@ def forward(self, x):
             ).cuda(),
         ]
 
-        fx_graph = torch.fx.symbolic_trace(InputAsOutput())
+        fx_graph = torch.fx.symbolic_trace(BroadcastFusion())
         expected_ops = {torch.ops.aten.sum.dim_IntList}
         unexpected_ops = {torch.ops.aten.var.default, torch.ops.prims.var.default}
 
@@ -151,7 +151,118 @@ def forward(self, x):
             max_diff,
             0,
             DECIMALS_OF_AGREEMENT,
-            msg=f"InputAsOutput TRT outputs don't match with the original model.",
+            msg=f"BroadcastFusion TRT outputs don't match with the original model.",
+        )
+        torch._dynamo.reset()
+
+
+class TestLowerEfficientAttention(TestCase):
+    def test_lower_efficient_attention(self):
+        class EfficientAttention(torch.nn.Module):
+            def forward(self, q, k, v):
+                attn = torch.ops.aten._scaled_dot_product_efficient_attention.default(
+                    q, k, v, None, False
+                )
+                return attn[0]
+
+        inputs = [
+            torch.rand(8, 4, 5, 4).cuda(),
+            torch.rand(8, 4, 2, 4).cuda(),
+            torch.rand(8, 4, 2, 4).cuda(),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(EfficientAttention())
+        expected_ops = {torch.nn.functional.scaled_dot_product_attention}
+        unexpected_ops = {
+            torch.ops.aten._scaled_dot_product_efficient_attention.default
+        }
+
+        unexpected_ops_seen, expected_ops_unseen = lower_graph_testing(
+            fx_graph,
+            inputs,
+            expected_ops=expected_ops,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEquals(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = torch.cat(
+            [tensor.detach().cpu() for tensor in optimized_model(*inputs)]
+        )
+        torch_model_results = torch.cat(
+            [tensor.detach().cpu() for tensor in fx_graph(*inputs)]
+        )
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            msg=f"EfficientAttention TRT outputs don't match with the original model.",
+        )
+        torch._dynamo.reset()
+
+    def test_efficient_attention_converter(self):
+        class EfficientAttention(torch.nn.Module):
+            def forward(self, q, k, v):
+                attn = torch.ops.aten._scaled_dot_product_efficient_attention.default(
+                    q, k, v, None, False
+                )
+                return attn[0]
+
+        inputs = [
+            torch.rand(1, 3, 6, 4).cuda(),
+            torch.rand(1, 3, 2, 4).cuda(),
+            torch.rand(1, 3, 2, 4).cuda(),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(EfficientAttention())
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = torch.cat(
+            [tensor.detach().cpu() for tensor in optimized_model(*inputs)]
+        )
+        torch_model_results = torch.cat(
+            [tensor.detach().cpu() for tensor in fx_graph(*inputs)]
+        )
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            msg=f"EfficientAttention TRT outputs don't match with the original model.",
         )
         torch._dynamo.reset()
 
0 commit comments