
Commit de15e74

anijain2305 authored and pytorchmergebot committed
[dynamo] Activation checkpointing as higher order op (pytorch#101028)
Pull Request resolved: pytorch#101028 Approved by: https://github.com/voznesenskym, https://github.com/zou3519
1 parent c5c75aa commit de15e74

File tree: 8 files changed, +269 -9 lines changed

test/dynamo/test_higher_order_ops.py (+167)

@@ -1,12 +1,20 @@
 # Owner(s): ["module: dynamo"]
+import functools
 import unittest

 import torch

 import torch._dynamo.test_case
+import torch._functorch.config
+import torch.utils.checkpoint
+from torch._dynamo.backends.common import aot_autograd
 from torch._dynamo.testing import CompileCounter, CompileCounterWithBackend
 from torch._dynamo.utils import counters
 from torch._higher_order_ops.wrap import wrap
+from torch.testing._internal.inductor_utils import HAS_CUDA
+
+
+requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")


 # Equivalent to backend="eager", but also records graphs that
@@ -20,6 +28,11 @@ def __call__(self, gm: torch.fx.GraphModule, example_inputs):
         return gm


+def count_ops(gm, args, freq, op):
+    assert [node.target for node in gm.graph.nodes].count(op) == freq
+    return gm
+
+
 global_var = torch.randn(3)
 global_num = 3.14

@@ -406,6 +419,160 @@ def f(x):
         self._test_wrap_simple(f, (x,), 3, expected_opcount=2)


+class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
+    def _validate(self, fn, backend, *args, skip_check=False, fullgraph=True):
+        cloned_args = []
+        for arg in args:
+            cloned_args.append(arg.clone().detach().requires_grad_(arg.requires_grad))
+
+        expected = fn(*args)
+        expected.sum().backward()
+
+        result = torch.compile(fn, fullgraph=fullgraph, backend=backend)(*cloned_args)
+        result.sum().backward()
+
+        if not skip_check:
+            self.assertEqual(result, expected)
+            for arg, cloned_arg in zip(args, cloned_args):
+                self.assertEqual(arg.grad, cloned_arg.grad)
+
+    @requires_cuda()
+    @torch._functorch.config.patch(functionalize_rng_ops=True)
+    def test_function(self):
+        def gn(x, y):
+            return torch.sigmoid(torch.matmul(x, y))
+
+        def fn(x, y):
+            return torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y)
+
+        x = torch.randn(4, 4, requires_grad=True)
+        y = torch.randn(4, 4, requires_grad=True)
+
+        fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
+        bw_compiler = functools.partial(
+            count_ops, freq=3, op=torch.ops.aten.mm.default
+        )  # mm recomputed in the bwd
+        backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
+        self._validate(fn, backend, x, y)
+
+    @requires_cuda()
+    @torch._functorch.config.patch(functionalize_rng_ops=True)
+    def test_function_with_kwargs(self):
+        def gn(x, y):
+            return torch.sigmoid(torch.matmul(x, y))
+
+        def fn(x, y):
+            return torch.utils.checkpoint.checkpoint(
+                gn, torch.sin(x), y, use_reentrant=True, preserve_rng_state=False
+            )
+
+        x = torch.randn(4, 4, requires_grad=True)
+        y = torch.randn(4, 4, requires_grad=True)
+
+        fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
+        bw_compiler = functools.partial(
+            count_ops, freq=3, op=torch.ops.aten.mm.default
+        )  # mm recomputed in the bwd
+        backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
+        self._validate(fn, backend, x, y)
+
+    @requires_cuda()
+    @torch._functorch.config.patch(functionalize_rng_ops=True)
+    def test_dropout(self):
+        def gn(x, y):
+            return torch.nn.functional.dropout(torch.matmul(x, y), p=0.2)
+
+        def fn(x, y):
+            return torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y)
+
+        x = torch.randn(4, 4, device="cuda", requires_grad=True)
+        y = torch.randn(4, 4, device="cuda", requires_grad=True)
+
+        fw_compiler = functools.partial(
+            count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
+        )
+        bw_compiler = functools.partial(
+            count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
+        )
+        backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
+        self._validate(
+            fn, backend, x, y, skip_check=True
+        )  # dropout decomp is known to diverge with eager
+
+    @requires_cuda()
+    @torch._functorch.config.patch(functionalize_rng_ops=True)
+    def test_fallback(self):
+        def gn(x, y):
+            torch._dynamo.graph_break()
+            return torch.sigmoid(torch.matmul(x, y))
+
+        def fn(x, y):
+            return torch.cos(torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y))
+
+        x = torch.randn(4, 4, requires_grad=True)
+        y = torch.randn(4, 4, requires_grad=True)
+        args = (x, y)
+
+        backend = EagerAndRecordGraphs()
+        cnt = CompileCounterWithBackend(backend)
+
+        expected = fn(*args)
+        result = torch.compile(fn, backend=cnt)(*args)
+
+        self.assertEqual(result, expected)
+
+        # One graph for torch.sin on the input, and other for torch.cos.
+        self.assertEqual(cnt.frame_count, 2)
+        self.assertEqual(cnt.op_count, 2)
+        self.assertEqual(len(backend.graphs), 2)
+
+    def test_without_functionalization_turned_on(self):
+        def gn(x, y):
+            return torch.sigmoid(torch.matmul(x, y))
+
+        def fn(x, y):
+            return torch.cos(torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y))
+
+        x = torch.randn(4, 4, requires_grad=True)
+        y = torch.randn(4, 4, requires_grad=True)
+        args = (x, y)
+
+        backend = EagerAndRecordGraphs()
+        cnt = CompileCounterWithBackend(backend)
+
+        expected = fn(*args)
+        result = torch.compile(fn, backend=cnt)(*args)
+
+        self.assertEqual(result, expected)
+
+    # Higher order op does not support nn.Modules yet
+    @unittest.expectedFailure
+    @requires_cuda()
+    @torch._functorch.config.patch(functionalize_rng_ops=True)
+    def test_module(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 10)
+
+            def forward(self, x):
+                return torch.sigmoid(self.linear(x))
+
+        mod = MockModule()
+
+        def fn(x):
+            return torch.utils.checkpoint.checkpoint(mod, torch.sin(x))
+
+        x = torch.randn(10, 10, requires_grad=True)
+
+        fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
+        bw_compiler = functools.partial(
+            count_ops, freq=3, op=torch.ops.aten.mm.default
+        )  # mm recomputed in the bwd
+        backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
+        self._validate(fn, backend, x)
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests

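Distilled from the tests above, a minimal end-to-end sketch of the intended usage (a sketch rather than code from this commit; it assumes a CUDA-capable build, since RNG functionalization currently targets CUDA, and it uses the stock "aot_eager" backend in place of the op-counting aot_autograd backend the tests construct):

import torch
import torch._functorch.config
import torch.utils.checkpoint


def gn(x, y):
    return torch.sigmoid(torch.matmul(x, y))


def fn(x, y):
    # Same shape as the tests: checkpoint the inner function gn.
    return torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y)


x = torch.randn(4, 4, device="cuda", requires_grad=True)
y = torch.randn(4, 4, device="cuda", requires_grad=True)


# The flag has to be flipped manually for now (see the warning added in
# torch/_dynamo/utils.py below); without it, the checkpointed region falls
# back to eager instead of being captured as a higher-order op.
@torch._functorch.config.patch(functionalize_rng_ops=True)
def run():
    out = torch.compile(fn, backend="aot_eager")(x, y)
    out.sum().backward()


run()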
torch/_dynamo/eval_frame.py (+13)

@@ -23,6 +23,7 @@
 import torch
 import torch.fx
 import torch.utils._pytree as pytree
+import torch.utils.checkpoint
 from torch import _guards
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo
@@ -1270,6 +1271,18 @@ def patch():
             # disable future hooking
             opt.step.hooked = True

+        # TorchDynamo does not step inside utils.checkpoint function. The flow
+        # looks likes this
+        #  1) TorchDynamo tries to wrap utils.checkpoint in a HigherOrderOp by
+        #     speculatively checking if the forward function is safe to trace.
+        #  2) If yes, then Dynamo-generated Fx graph has the wrapped higher
+        #     order op. As a result, TorchDynamo does not look inside utils.checkpoint.
+        #  3) If not, then TorchDynamo falls back to eager by performing a graph
+        #     break. And here, the following disable wrapper ensures that
+        #     TorchDynamo does not trigger again on the frames created by
+        #     utils.checkpoint innards.
+        torch.utils.checkpoint.checkpoint = disable(torch.utils.checkpoint.checkpoint)
+
     @staticmethod
     def suppress_torch_distributed_warnings(fn):
         def inner_fn(*args, **kwargs):

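To make step 3 above concrete, here is a sketch mirroring the new test_fallback case (not code from this commit): the explicit graph break inside the checkpointed function makes the speculative trace fail, so Dynamo compiles the surrounding torch.sin/torch.cos while the disable() wrapper installed here keeps it from re-entering the frames created inside utils.checkpoint.

import torch
import torch.utils.checkpoint


def gn(x, y):
    torch._dynamo.graph_break()  # body is unsafe to trace, forcing the fallback path
    return torch.sigmoid(torch.matmul(x, y))


def fn(x, y):
    return torch.cos(torch.utils.checkpoint.checkpoint(gn, torch.sin(x), y))


x = torch.randn(4, 4, requires_grad=True)
y = torch.randn(4, 4, requires_grad=True)

# torch.sin and torch.cos land in separate compiled graphs; the checkpointed
# body itself runs in eager mode.
out = torch.compile(fn, backend="eager")(x, y)
out.sum().backward()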
torch/_dynamo/utils.py (+47)

@@ -48,13 +48,17 @@
 import importlib

 import torch
+import torch._functorch.config
+import torch._higher_order_ops.wrap
 import torch.fx.experimental.symbolic_shapes
+import torch.utils.checkpoint
 from torch import fx
 from torch._dispatch.python import enable_python_dispatcher
 from torch._subclasses.fake_tensor import FakeTensor
 from torch.nn.modules.lazy import LazyModuleMixin
 from torch.utils._pytree import tree_map

+
 counters = collections.defaultdict(collections.Counter)
 troubleshooting_url = "https://pytorch.org/docs/master/compile/troubleshooting.html"
 nnmodule_doc_url = "https://pytorch.org/docs/master/compile/nn-module.html"
@@ -1620,3 +1624,46 @@ def defake(x):
     )
     y.zero_()
     return y
+
+
+# NB: The dictionary has to be created lazily after TorchPatcher is called so
+# that we pick up the disabled torch.utils.checkpoint wrapper. Therefore, it is
+# sitting in a separate function.
+@functools.lru_cache(None)
+def higher_order_op_converter():
+    return {
+        torch.utils.checkpoint.checkpoint: torch._higher_order_ops.wrap.wrap_activation_checkpoint,
+    }
+
+
+def requires_higher_order_op(obj):
+    return obj in higher_order_op_converter()
+
+
+def get_higher_order_op(obj):
+    if (
+        obj is torch.utils.checkpoint.checkpoint
+        and not torch._functorch.config.functionalize_rng_ops
+    ):
+        from .exc import unimplemented
+
+        # TODO - functionalize_rng_ops flags cannot be turned ON by default
+        # because 1) Performance concerns - seed and offset are read and passed
+        # to each AOT graph 2) Inductor has rand-specific optimizations and
+        # there is work remaining to compose them together with
+        # functionalization.
+        #
+        # Until we make it ON by default, we will have to ask users to turn on
+        # this flag manually. TODO - Revisit if there is a simpler way to
+        # resolve this problem.
+        torch._logging.warning_once(
+            log,
+            "torch.compile on activation checkpointing is an experimental feature. "
+            "Please manually set torch._functorch.config.functionalize_rng_ops=True "
+            "to run torch.compile with activation checkpointing. Without this flag, "
+            "checkpointed function will not get compiled and fallback to eager.",
+        )
+        unimplemented(
+            "torch.compile requires functioanlization of rng ops to be turned on"
+        )
+    return higher_order_op_converter().get(obj)

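A rough illustration of how the variable builders below consult these helpers (a sketch, not code from this commit; note that the lru_cache is intentionally lazy so the converter keys on the disable()-wrapped checkpoint installed by TorchPatcher, and calling the helpers before Dynamo has patched anything can therefore cache a stale key):

import torch
import torch._functorch.config
import torch.utils.checkpoint
from torch._dynamo.utils import get_higher_order_op, requires_higher_order_op

# Ordinary torch callables are left alone ...
assert not requires_higher_order_op(torch.sin)


# ... but torch.utils.checkpoint.checkpoint is listed in the converter table,
# so Dynamo rewrites it instead of skipping or inlining it.
@torch._functorch.config.patch(functionalize_rng_ops=True)
def resolve():
    # With the flag on this returns the wrapping higher-order op; without it,
    # get_higher_order_op raises Unsupported, which Dynamo turns into a
    # graph break.
    return get_higher_order_op(torch.utils.checkpoint.checkpoint)


print(resolve())  # the wrap_activation_checkpoint higher-order op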
torch/_dynamo/variables/__init__.py (+1 -1)

@@ -46,7 +46,7 @@
     TensorVariable,
     UnspecializedPythonVariable,
 )
-from .torch import TorchVariable
+from .torch import TorchHigherOrderOperatorVariable, TorchVariable
 from .user_defined import UserDefinedClassVariable, UserDefinedObjectVariable

 __all__ = [

torch/_dynamo/variables/builder.py (+4 -2)

@@ -53,6 +53,7 @@
     np,
     odict_values,
     preserve_rng_state,
+    requires_higher_order_op,
     tensor_always_has_static_shape,
     torch_np,
     tuple_iterator,
@@ -103,7 +104,7 @@
 from .torch import (
     tensor_dunder_fns,
     torch_special_class_types,
-    TorchHigherOrderOperator,
+    TorchHigherOrderOperatorVariable,
     TorchVariable,
 )
 from .user_defined import UserDefinedClassVariable, UserDefinedObjectVariable
@@ -425,6 +426,7 @@ def index_source(key):
             istype(value, (type, types.FunctionType))
             and skipfiles.check(getfile(value), allow_torch=True)
             and not inspect.getattr_static(value, "_torchdynamo_inline", False)
+            and not requires_higher_order_op(value)
         ):
             return SkipFilesVariable(
                 value,
@@ -489,7 +491,7 @@ def index_source(key):
                 value, guards=make_guards(GuardBuilder.TYPE_MATCH)
             )
         elif isinstance(value, HigherOrderOperator):
-            return TorchHigherOrderOperator(
+            return TorchHigherOrderOperatorVariable(
                 value,
                 guards=self.make_guards(
                     GuardBuilder.TYPE_MATCH, GuardBuilder.NAME_MATCH

torch/_dynamo/variables/builtin.py (+8 -1)

@@ -19,8 +19,10 @@
 from ..utils import (
     check_constant_args,
     check_unspec_python_args,
+    get_higher_order_op,
     istype,
     proxy_args_kwargs,
+    requires_higher_order_op,
     specialize_args_kwargs,
 )
 from .base import MutableLocal, typestr, VariableTracker
@@ -992,6 +994,7 @@ def call_getattr(
             ConstantVariable,
             GetAttrVariable,
             PythonModuleVariable,
+            TorchHigherOrderOperatorVariable,
             TorchVariable,
             UserFunctionVariable,
         )
@@ -1059,7 +1062,11 @@ def call_getattr(
             return GetAttrVariable(obj, name, **options)
         elif isinstance(obj, TorchVariable):
             member = getattr(obj.value, name)
-            if is_allowed(member):
+            if requires_higher_order_op(member):
+                return TorchHigherOrderOperatorVariable(
+                    get_higher_order_op(member), **options
+                )
+            elif is_allowed(member):
                 return TorchVariable(member, **options)
             elif ConstantVariable.is_literal(member):
                 return ConstantVariable(member, **options)
