
Commit a208df5

feat: Add preliminary support for freezing tensors in Dynamo
1 parent: 65277c5

File tree: 6 files changed, +249 -6 lines changed

6 files changed

+249
-6
lines changed
@@ -0,0 +1,127 @@ (new file)

import torch
import torch.utils._pytree as pytree
from torch import nn
from typing import Callable, Optional, Dict
from torch._functorch.aot_autograd import (
    AOT_COUNTER,
    create_functional_call,
    create_aot_dispatcher_function,
    AOTConfig,
)
from torch._subclasses import FakeTensor
from torch._functorch.partitioners import default_partition


def aot_module(
    mod: nn.Module,
    args,
    fw_compiler: Callable,
    partition_fn: Callable = default_partition,
    decompositions: Optional[Dict] = None,
    keep_inference_input_mutations=False,
) -> nn.Module:
    """
    Adapted from:
    https://github.com/pytorch/pytorch/blob/cce2b7e3c95a7505b41bdfc53939d84d56e31260/torch/_functorch/aot_autograd.py#L3656-L3776

    This is the simplified or low overhead version of aot_module. For frontends
    like TorchDynamo, the input functions/modules to AOT are static and have
    unpacked inputs/outputs. This gives us an opportunity to remove the
    (1) pytree overhead to parse inputs/outputs,
    (2) AOT Autograd cache,
    (3) Reading of params/buffers in every forward call

    :func:`aot_module_simplified` removes these overheads.
    """

    params = {
        **dict(mod.named_parameters(remove_duplicate=False)),
        **dict(mod.named_buffers(remove_duplicate=False)),
    }
    params_flat, params_spec = pytree.tree_flatten(params)
    params_flat = list(params_flat)
    params_len = len(params_flat)

    functional_call = create_functional_call(mod, params_spec, params_len)

    seen_sources = set()

    full_args = []
    # First, the params
    full_args.extend(params_flat)

    if torch._guards.TracingContext.get():
        torch._guards.TracingContext.get().params_flat = params_flat

    aot_autograd_arg_pos_to_source = None
    # Then, the params 1:1 mapped sources, if relevant.
    if hasattr(mod, "_param_name_to_source"):
        aot_autograd_arg_pos_to_source = []
        # We now know this came from dynamo, and (1) we care about guards,
        # so setting up aot_autograd_arg_pos_to_source for downstream dedup guards
        # can now be done safely. (2) Dynamo logic protects the 1:1 sizing below.
        for name in params.keys():
            assert name in mod._param_name_to_source, f"{name} not found."
            source = mod._param_name_to_source[name]
            assert source not in seen_sources, source
            seen_sources.add(source)
            aot_autograd_arg_pos_to_source.append(source)

    # Next, the input args
    full_args.extend(args)

    if hasattr(mod, "graph"):
        # Non dynamo entrypoints can get to here...
        for i, node in enumerate(mod.graph.nodes):
            if node.op == "placeholder":
                if hasattr(node, "_dynamo_source"):
                    # ... but not here!
                    if aot_autograd_arg_pos_to_source is None:
                        aot_autograd_arg_pos_to_source = []
                    source = node._dynamo_source
                    assert source not in seen_sources, source
                    seen_sources.add(source)
                    aot_autograd_arg_pos_to_source.append(source)

    if aot_autograd_arg_pos_to_source is not None:
        assert len(full_args) == len(aot_autograd_arg_pos_to_source)

    dynamic_shapes = False
    for x in full_args:
        if isinstance(x, FakeTensor):
            dynamic_shapes = x.fake_mode.shape_env is not None
            break

    aot_config = AOTConfig(
        fw_compiler=fw_compiler,
        bw_compiler=fw_compiler,
        inference_compiler=fw_compiler,
        partition_fn=partition_fn,
        decompositions=decompositions,
        num_params_buffers=params_len,
        aot_id=next(AOT_COUNTER),
        keep_inference_input_mutations=keep_inference_input_mutations,
        dynamic_shapes=dynamic_shapes,
        aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source,
        is_export=False,
        no_tangents=False,
    )

    compiled_fn = create_aot_dispatcher_function(
        functional_call,
        full_args,
        aot_config,
    )

    def forward(*runtime_args):
        full_args = []
        full_args.extend(runtime_args)
        return compiled_fn(full_args)

    # Just for convenience
    forward.zero_grad = mod.zero_grad
    forward.named_parameters = mod.named_parameters
    forward.named_buffers = mod.named_buffers

    return forward

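The adapted aot_module hoists every parameter and buffer into the argument list once, so the compiled function never re-reads them from the module at call time. A minimal sketch of just that flattening step on a toy module (the module and the print are illustrative, not part of the commit):

import torch
import torch.utils._pytree as pytree
from torch import nn

mod = nn.Sequential(nn.Linear(4, 4), nn.BatchNorm1d(4))

# Collect parameters and buffers under their qualified names, as aot_module does.
params = {
    **dict(mod.named_parameters(remove_duplicate=False)),
    **dict(mod.named_buffers(remove_duplicate=False)),
}
params_flat, params_spec = pytree.tree_flatten(params)

# The dispatcher later sees [*params_flat, *runtime_args] as the full argument list.
print(len(params_flat))  # 4 parameters + 3 batchnorm buffers = 7 tensors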
py/torch_tensorrt/dynamo/backend/backends.py (+16 -4)
@@ -3,6 +3,7 @@
 import torch
 from functools import partial
 import torch._dynamo as td
+from torch._guards import TracingContext

 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo.lowering._decompositions import (
@@ -15,10 +16,12 @@
     partition,
     get_submod_inputs,
 )
+from torch_tensorrt.dynamo.lowering._freeze_aot_graph import freeze_autograd_gm
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs
 from torch_tensorrt.dynamo.conversion import convert_module

-from torch._functorch.aot_autograd import aot_module_simplified, make_boxed_compiler
+from torch._functorch.aot_autograd import make_boxed_compiler
+from .aot_module import aot_module


 logger = logging.getLogger(__name__)
@@ -30,6 +33,8 @@ def torch_tensorrt_backend(
 ):
     DEFAULT_BACKEND = aot_torch_tensorrt_aten_backend

+    TracingContext.get().fake_mode.allow_non_fake_inputs = True
+
     return DEFAULT_BACKEND(gm, sample_inputs, **kwargs)


@@ -48,7 +53,7 @@ def aot_torch_tensorrt_aten_backend(
     gm = pre_aot_substitutions(gm)

     # Invoke AOTAutograd to translate operators to aten
-    return aot_module_simplified(
+    return aot_module(
         gm,
         sample_inputs,
         fw_compiler=make_boxed_compiler(custom_backend),
@@ -73,9 +78,16 @@ def _pretraced_backend(
     try:
         logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))

+        frozen_gm, unfrozen_indices = freeze_autograd_gm(gm, sample_inputs)
+        nonfrozen_inputs = [sample_inputs[idx] for idx in unfrozen_indices]
+
+        frozen_gm.graph.eliminate_dead_code()
+        frozen_gm.graph.lint()
+        frozen_gm.recompile()
+
         trt_compiled = _compile_module(
-            gm,
-            sample_inputs,
+            frozen_gm,
+            nonfrozen_inputs,
             settings=settings,
         )
         return trt_compiled

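Taken together, the backend now freezes the AOT Autograd graph before conversion and hands only the surviving non-parameter inputs to _compile_module. A hedged sketch of how the backend is typically driven from user code; the backend name "torch_tensorrt" and CUDA availability are assumptions, not something this diff shows:

import torch
import torch_tensorrt  # noqa: F401  (importing registers the Dynamo backend; assumed)
from torch import nn

# Toy model; after freezing, its weights become graph constants rather than TRT inputs.
model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU()).eval().cuda()

# Backend name is illustrative only.
compiled = torch.compile(model, backend="torch_tensorrt")

with torch.no_grad():
    out = compiled(torch.randn(1, 3, 32, 32, device="cuda"))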
py/torch_tensorrt/dynamo/conversion/trt_interpreter.py (+30)
@@ -22,6 +22,8 @@
     unified_dtype_converter,
     Frameworks,
 )
+from torch.utils._python_dispatch import _disable_current_modes
+

 _LOGGER: logging.Logger = logging.getLogger(__name__)

@@ -296,6 +298,21 @@ def call_function(self, target, args, kwargs):
         assert self._cur_node_name is not None
         return converter(self.network, target, args, kwargs, self._cur_node_name)

+    def get_attr(self, target, args, kwargs):
+        with _disable_current_modes():
+            from torch_tensorrt.fx.converters import to_numpy
+
+            frozen_attr = self.fetch_attr(target)
+
+            if isinstance(frozen_attr, torch.nn.Parameter):
+                constant_tensor = frozen_attr.data
+            else:
+                constant_tensor = frozen_attr
+
+            network_constant = to_numpy(constant_tensor)
+
+        return network_constant
+
     def call_method(self, target, args, kwargs):
         assert isinstance(target, str)
         converter = CONVERTERS.get(target)
@@ -317,6 +334,17 @@ def output(self, target, args, kwargs):
         else:
             outputs = (args[0],)

+        for output_idx in range(len(outputs)):
+            from torch_tensorrt.fx.converters import get_trt_tensor
+
+            output = outputs[output_idx]
+
+            if not isinstance(output, trt.tensorrt.ITensor):
+                new_output = get_trt_tensor(self.network, output, target)
+                outputs = (
+                    outputs[:output_idx] + (new_output,) + outputs[output_idx + 1 :]
+                )
+
         if not all(isinstance(output, trt.tensorrt.ITensor) for output in outputs):
             raise RuntimeError("TensorRT requires all outputs to be Tensor!")

@@ -356,3 +384,5 @@ def output(self, target, args, kwargs):
             elif self.output_fp16 and output.dtype == trt.float32:
                 output.dtype = trt.float16
             self._output_names.append(name)
+
+        return list(outputs)

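The new get_attr override lets the interpreter resolve frozen weights to plain numpy constants instead of treating them as network inputs. Below is a self-contained toy analogue of that pattern using a bare torch.fx.Interpreter; the class and module names are made up for illustration, and the TRT-specific to_numpy and network plumbing is intentionally left out:

import torch
import torch.fx as fx


class ConstantGetAttrInterpreter(fx.Interpreter):
    """Toy interpreter: get_attr nodes evaluate to numpy constants."""

    def get_attr(self, target, args, kwargs):
        frozen_attr = self.fetch_attr(target)
        # Parameters carry autograd state; use the underlying tensor data.
        tensor = frozen_attr.data if isinstance(frozen_attr, torch.nn.Parameter) else frozen_attr
        return tensor.detach().cpu().numpy()


class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(3))

    def forward(self, x):
        return x + self.weight


gm = fx.symbolic_trace(Toy())
interp = ConstantGetAttrInterpreter(gm)
print(interp.get_attr("weight", (), {}))  # numpy array [1. 1. 1.], no TRT input needed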
py/torch_tensorrt/dynamo/lowering/__init__.py (+1)
@@ -8,3 +8,4 @@
 from ._partition import partition, get_submod_inputs, DEFAULT_SINGLE_NODE_PARTITIONS
 from .substitutions import *
 from ._fusers import *
+from ._freeze_aot_graph import *
@@ -0,0 +1,73 @@ (new file)

import torch
from typing import List, Tuple
from torch._inductor.freezing import replace_params_with_constants, constant_fold
from torch._inductor.compile_fx import fake_tensor_prop
from torch._functorch.compile_utils import fx_graph_cse
import torch.fx.traceback as fx_traceback
from torch._dynamo.utils import detect_fake_mode
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.passes.tools_common import legalize_graph
import unittest


def freeze_autograd_gm(
    aot_autograd_gm: torch.fx.GraphModule,
    example_inputs: List[torch._subclasses.FakeTensor],
) -> Tuple[torch.fx.GraphModule, List[int]]:
    """
    Adapted from:
    https://github.com/pytorch/pytorch/blob/750b9b359f06cb8b8c2d5b6118bba636e2112cbb/torch/_inductor/freezing.py#L186-L243

    Inlines parameters that are not mutated into constants and optimizes the graph through constant propagation
    and other techniques. If enabled, the function also discards the original parameters of the module for memory efficiency.

    Assumes that this function is run in dynamo tracing post aot_autograd.

    Args:
        aot_autograd_gm (torch.fx.GraphModule): The aot_autograd constructed GraphModule to be frozen.
        example_inputs (List[torch.Tensor]): A list of example input tensors to be used in the freezing process.

    Returns:
        Tuple[torch.fx.GraphModule, List[int]]: A tuple containing the frozen GraphModule and a list of indices
        of the inputs that were preserved (not turned into constants).
    """
    # Extract necessary metadata and parameters
    fw_metadata = torch._guards.TracingContext.get().fw_metadata
    params_flat = torch._guards.TracingContext.get().params_flat
    assert fw_metadata is not None and params_flat is not None

    # Replace placeholders with get_attr nodes
    preserved_arg_indices = replace_params_with_constants(
        aot_autograd_gm, params_flat, fw_metadata
    )

    constant_fold(aot_autograd_gm)

    fake_mode = detect_fake_mode(example_inputs)

    # constant params will be real tensors, not fake
    # TODO: fake_mode should enable py dispatcher if it's symbolic?
    with unittest.mock.patch.object(
        fake_mode, "allow_non_fake_inputs", True
    ), fake_mode:
        args = [e for i, e in enumerate(example_inputs) if i in preserved_arg_indices]
        with fx_traceback.preserve_node_meta():
            aot_autograd_gm = make_fx(aot_autograd_gm, _allow_non_fake_inputs=True)(
                *args
            )

    # TODO - further restrict cse? right now needed to dedup aliasing ops
    cse_graph = fx_graph_cse(aot_autograd_gm.graph)
    aot_autograd_gm.graph = cse_graph
    aot_autograd_gm.recompile()

    # Make sure meta['val'] is properly set up (weight conversion
    # or decompose_unfused_batchnorms lost meta['val']).
    aot_example_inputs = [example_inputs[ind] for ind in preserved_arg_indices]
    fake_tensor_prop(aot_autograd_gm, aot_example_inputs, True)

    # TODO - apply legalization in pattern matcher
    legalize_graph(aot_autograd_gm)
    constant_fold(aot_autograd_gm)

    return aot_autograd_gm, preserved_arg_indices

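freeze_autograd_gm delegates the heavy lifting to Inductor's replace_params_with_constants and constant_fold. The sketch below is a from-scratch toy of the core idea, rewriting a parameter placeholder into a get_attr on a registered constant; it does not use the upstream passes, and names such as freeze_first_input and _frozen_param0 are invented for the example:

import torch
import torch.fx as fx


def freeze_first_input(gm: fx.GraphModule, value: torch.Tensor) -> fx.GraphModule:
    """Replace the first placeholder with a get_attr on a registered constant."""
    placeholders = [n for n in gm.graph.nodes if n.op == "placeholder"]
    target = placeholders[0]
    # Store the constant on the module so a get_attr node can reference it.
    gm.register_buffer("_frozen_param0", value)
    with gm.graph.inserting_before(target):
        const = gm.graph.get_attr("_frozen_param0")
    target.replace_all_uses_with(const)
    gm.graph.erase_node(target)
    gm.graph.lint()
    gm.recompile()
    return gm


def f(w, x):
    return w * x + 1


gm = fx.symbolic_trace(f)
gm = freeze_first_input(gm, torch.full((3,), 2.0))
print(gm(torch.ones(3)))  # tensor([3., 3., 3.]); w is now baked into the graph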
py/torch_tensorrt/dynamo/lowering/_partition.py (+2 -2)
@@ -125,8 +125,8 @@ def is_node_supported(

         if (
             node.target in CONVERTERS.keys()
-            and node_name not in self.torch_executed_ops
-        ):
+            or (node.op == "get_attr" and "frozen" in node_name)
+        ) and node_name not in self.torch_executed_ops:
             # If node is a proper, supported computational node, store the operator
             if not node.is_impure():
                 self.supported_operators.add(node_name)

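The partitioner tweak adjusts operator precedence: a node now qualifies if it either has a registered converter or is a frozen-constant get_attr, and in both cases the torch_executed_ops exclusion still applies. Restated as a standalone predicate with simplified arguments (illustrative only):

def node_is_supported(
    has_converter: bool, node_op: str, node_name: str, torch_executed_ops: set
) -> bool:
    # Frozen weights surface as get_attr nodes whose names contain "frozen".
    return (
        has_converter or (node_op == "get_attr" and "frozen" in node_name)
    ) and node_name not in torch_executed_ops


print(node_is_supported(False, "get_attr", "_frozen_param0", set()))  # True
print(node_is_supported(True, "call_function", "aten.add", {"aten.add"}))  # False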