
Commit 1c3d9e8

Merge branch 'main' into svekars-patch-22
2 parents 21849f9 + c5c0a9a commit 1c3d9e8

File tree

2 files changed: +118 / -52 lines changed


intermediate_source/scaled_dot_product_attention_tutorial.py

Lines changed: 80 additions & 17 deletions
@@ -86,29 +86,24 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
 print(f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds")

 # Lets explore the speed of each of the 3 implementations
-from torch.backends.cuda import sdp_kernel, SDPBackend
+from torch.nn.attention import SDPBackend, sdpa_kernel

-# Helpful arguments mapper
-backend_map = {
-    SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False},
-    SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False},
-    SDPBackend.EFFICIENT_ATTENTION: {
-        "enable_math": False, "enable_flash": False, "enable_mem_efficient": True}
-}

-with sdp_kernel(**backend_map[SDPBackend.MATH]):
-    print(f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds")
+with sdpa_kernel(SDPBackend.MATH):
+    math_time = benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value)
+    print(f"The math implementation runs in {math_time:.3f} microseconds")

-
-with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
     try:
-        print(f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds")
+        flash_time = benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value)
+        print(f"The flash attention implementation runs in {flash_time:.3f} microseconds")
     except RuntimeError:
         print("FlashAttention is not supported. See warnings for reasons.")

-with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
+with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
     try:
-        print(f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds")
+        efficient_time = benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value)
+        print(f"The memory efficient implementation runs in {efficient_time:.3f} microseconds")
     except RuntimeError:
         print("EfficientAttention is not supported. See warnings for reasons.")
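
For quick reference, a minimal standalone sketch of the new ``sdpa_kernel`` context manager used in the hunk above (not part of the commit; assumes PyTorch >= 2.3 and a CUDA device, with illustrative tensor shapes):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    # (batch, num_heads, seq_len, head_dim) -- arbitrary sizes for illustration
    query = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
    key = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
    value = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)

    # Restrict SDPA to a single backend (replaces the old dict of enable_* flags).
    with sdpa_kernel(SDPBackend.MATH):
        out = F.scaled_dot_product_attention(query, key, value)

    # Recent releases also accept a list of allowed backends; verify this against
    # your installed PyTorch version before relying on it.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]):
        out = F.scaled_dot_product_attention(query, key, value)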

@@ -239,7 +234,7 @@ def generate_rand_batch(
 # Currently the fused implementations don't support ``NestedTensor`` for training
 model.eval()

-with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
     try:
         print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds")
         print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds")
@@ -328,14 +323,82 @@ def generate_rand_batch(
 # the Shakespeare dataset.
 #

+######################################################################
+# Using SDPA with attn_bias subclasses
+# ==========================================
+#
+# As of PyTorch 2.3, we have added a new submodule that contains tensor subclasses
+# designed to be used with ``torch.nn.functional.scaled_dot_product_attention``.
+# The module is named ``torch.nn.attention.bias`` and contains the following two
+# utilities for generating causal attention variants:
+#
+# - ``torch.nn.attention.bias.causal_upper_left``
+# - ``torch.nn.attention.bias.causal_lower_right``
+#
+# .. note::
+#    The current argument ``is_causal`` in ``torch.nn.functional.scaled_dot_product_attention``
+#    is the same as using ``torch.nn.attention.bias.causal_upper_left``.
+#
+
+from torch.nn.attention.bias import causal_lower_right, causal_upper_left
+
+batch_size = 32
+sequence_length_q = 2
+sequence_length_kv = 10
+num_heads = 16
+embed_dimension = 32
+
+dtype = torch.float16
+
+query = torch.rand(batch_size, num_heads, sequence_length_q, embed_dimension, device=device, dtype=dtype)
+key = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype)
+value = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype)
+
+upper_left_bias = causal_upper_left(sequence_length_q, sequence_length_kv)
+lower_right_bias = causal_lower_right(sequence_length_q, sequence_length_kv)
+
+print(type(upper_left_bias))
+print(type(lower_right_bias))
+
+assert type(upper_left_bias) == type(lower_right_bias)
+assert issubclass(type(upper_left_bias), torch.Tensor)
+
+# As you can see from the previous output, both objects are of the same type,
+# ``torch.nn.attention.bias.CausalBias``, and both subclass ``torch.Tensor``.
+
+# Let's see what these tensors look like
+print(upper_left_bias)
+print(lower_right_bias)
+
+# Upper-left bias aligns the causal attention mask to the upper-left corner of the attention scores matrix.
+# This only has an impact when the attention scores matrix is not square, which is common for decoding use cases.
+# Another way of thinking about this concept: with upper-left bias, the 0th token in the query is aligned
+# to the 0th token in the key. Assuming the attention score matrix is two dimensional, ``attn_score[0][0]``
+# is the attention score between the 0th token in the query and the 0th token in the key.
+# With lower-right bias, the query sequence is instead aligned so that its last token lines up with the
+# last token in the key (for example, ``attn_score[-1][-1]`` is always unmasked, since the last token in q
+# sits at the same position as the last token in k, even when the sequence lengths of q and k differ).
+
+# These objects are intended to be used with sdpa
+out_upper_left = F.scaled_dot_product_attention(query, key, value, upper_left_bias)
+out_lower_right = F.scaled_dot_product_attention(query, key, value, lower_right_bias)
+out_is_causal = F.scaled_dot_product_attention(query, key, value, is_causal=True)
+
+assert torch.allclose(out_upper_left, out_is_causal)
+assert not torch.allclose(out_upper_left, out_lower_right)
+
+# These attention biases should also be compatible with torch.compile
+compiled_sdpa = torch.compile(F.scaled_dot_product_attention, fullgraph=True)
+out_upper_left = compiled_sdpa(query, key, value, upper_left_bias)

 ######################################################################
 # Conclusion
 # ==========
 #
 # In this tutorial, we have demonstrated the basic usage of
 # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how
-# the ``sdp_kernel`` context manager can be used to assert a certain
+# the ``sdpa_kernel`` context manager can be used to assert a certain
 # implementation is used on GPU. As well, we built a simple
 # ``CausalSelfAttention`` module that works with ``NestedTensor`` and is torch
 # compilable. In the process we have shown how to the profiling tools can
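
A minimal sketch of the alignment difference described in the added comments above, using plain boolean masks built with ``torch.tril`` (not part of the commit; shapes and names are illustrative only):

    import torch

    seq_q, seq_kv = 2, 10

    # Upper-left alignment: query token 0 lines up with key token 0,
    # so query row i may attend to keys 0..i.
    upper_left_mask = torch.tril(torch.ones(seq_q, seq_kv, dtype=torch.bool))

    # Lower-right alignment: the last query token lines up with the last key token,
    # so query row i may attend to keys 0..(seq_kv - seq_q + i).
    lower_right_mask = torch.tril(
        torch.ones(seq_q, seq_kv, dtype=torch.bool),
        diagonal=seq_kv - seq_q,
    )

    print(upper_left_mask)   # row 0 attends to key 0 only; row 1 to keys 0-1
    print(lower_right_mask)  # row 0 attends to keys 0-8; row 1 to all 10 keys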

recipes_source/distributed_checkpoint_recipe.rst

Lines changed: 38 additions & 35 deletions
@@ -1,7 +1,7 @@
 Getting Started with Distributed Checkpoint (DCP)
 =====================================================

-**Author**: `Iris Zhang <https://github.com/wz337>`__, `Rodrigo Kumpera <https://github.com/kumpera>`__, `Chien-Chin Huang <https://github.com/fegin>`__
+**Author**: `Iris Zhang <https://github.com/wz337>`__, `Rodrigo Kumpera <https://github.com/kumpera>`__, `Chien-Chin Huang <https://github.com/fegin>`__, `Lucas Pasqualin <https://github.com/lucasllc>`__

 .. note::
    |edit| View and edit this tutorial in `github <https://github.com/pytorch/tutorials/blob/main/recipes_source/distributed_checkpoint_recipe.rst>`__.
@@ -22,8 +22,12 @@ In this tutorial, we show how to use DCP APIs with a simple FSDP wrapped model.
 How DCP works
 --------------

-:func:`torch.distributed.checkpoint` enables saving and loading models from multiple ranks in parallel.
-In addition, checkpointing automatically handles fully-qualified-name (FQN) mappings across models and optimizers, enabling load-time resharding across differing cluster topologies.
+:func:`torch.distributed.checkpoint` enables saving and loading models from multiple ranks in parallel. You can use this module to save on any number of ranks in parallel,
+and then re-shard across differing cluster topologies at load time.
+
+Additionally, through the use of modules in :func:`torch.distributed.checkpoint.state_dict`,
+DCP offers support for gracefully handling ``state_dict`` generation and loading in distributed settings.
+This includes managing fully-qualified-name (FQN) mappings across models and optimizers, and setting default parameters for PyTorch-provided parallelisms.

 DCP is different from :func:`torch.save` and :func:`torch.load` in a few significant ways:

@@ -42,19 +46,20 @@ Here we use a toy model wrapped with FSDP for demonstration purposes. Similarly,
 Saving
 ~~~~~~

-Now, lets create a toy module, wrap it with FSDP, feed it with some dummy input data, and save it.
+Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input data, and save it.

 .. code-block:: python

     import os

     import torch
     import torch.distributed as dist
-    import torch.distributed.checkpoint as DCP
+    import torch.distributed.checkpoint as dcp
     import torch.multiprocessing as mp
     import torch.nn as nn

     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+    from torch.distributed.checkpoint.state_dict import get_state_dict
     from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

     CHECKPOINT_DIR = "checkpoint"
@@ -99,20 +104,14 @@ Now, let’s create a toy module, wrap it with FSDP, feed it with some dummy inp
     model(torch.rand(8, 16, device="cuda")).sum().backward()
     optimizer.step()

-    # set FSDP StateDictType to SHARDED_STATE_DICT so we can use DCP to checkpoint sharded model state dict
-    # note that we do not support FSDP StateDictType.LOCAL_STATE_DICT
-    FSDP.set_state_dict_type(
-        model,
-        StateDictType.SHARDED_STATE_DICT,
-    )
+    # this line automatically manages FSDP FQNs, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT
+    model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer)
     state_dict = {
-        "model": model.state_dict(),
+        "model": model_state_dict,
+        "optimizer": optimizer_state_dict
     }
+    dcp.save(state_dict, checkpoint_id=CHECKPOINT_DIR)

-    DCP.save_state_dict(
-        state_dict=state_dict,
-        storage_writer=DCP.FileSystemWriter(CHECKPOINT_DIR),
-    )

     cleanup()
@@ -152,12 +151,12 @@ The reason that we need the ``state_dict`` prior to loading is:

     import torch
     import torch.distributed as dist
-    import torch.distributed.checkpoint as DCP
+    import torch.distributed.checkpoint as dcp
+    from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
     import torch.multiprocessing as mp
     import torch.nn as nn

     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

     CHECKPOINT_DIR = "checkpoint"
@@ -194,21 +193,23 @@ The reason that we need the ``state_dict`` prior to loading is:
     model = ToyModel().to(rank)
     model = FSDP(model)

-    FSDP.set_state_dict_type(
-        model,
-        StateDictType.SHARDED_STATE_DICT,
-    )
-    # different from ``torch.load()``, DCP requires model state_dict prior to loading to get
-    # the allocated storage and sharding information.
+    # generates the state dict we will load into
+    model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer)
     state_dict = {
-        "model": model.state_dict(),
+        "model": model_state_dict,
+        "optimizer": optimizer_state_dict
     }
-
-    DCP.load_state_dict(
+    dcp.load(
         state_dict=state_dict,
-        storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR),
+        checkpoint_id=CHECKPOINT_DIR,
+    )
+    # sets our state dicts on the model and optimizer, now that we've loaded
+    set_state_dict(
+        model,
+        optimizer,
+        model_state_dict=model_state_dict,
+        optim_state_dict=optimizer_state_dict
     )
-    model.load_state_dict(state_dict["model"])

     cleanup()
@@ -224,7 +225,8 @@ The reason that we need the ``state_dict`` prior to loading is:
     )

 If you would like to load the saved checkpoint into a non-FSDP wrapped model in a non-distributed setup, perhaps for inference, you can also do that with DCP.
-By default, DCP saves and loads a distributed ``state_dict`` in Single Program Multiple Data(SPMD) style. To load without a distributed setup, please set ``no_dist`` to ``True`` when loading with DCP.
+By default, DCP saves and loads a distributed ``state_dict`` in Single Program Multiple Data (SPMD) style. However, if no process group is initialized, DCP infers
+the intent is to save or load in "non-distributed" style, meaning entirely in the current process.

 .. note::
    Distributed checkpoint support for Multi-Program Multi-Data is still under development.
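
The updated paragraph above notes that the same no-process-group inference applies to saving as well as loading; a minimal sketch of that case (not part of the recipe; the module and path names are arbitrary):

    import torch.distributed.checkpoint as dcp
    import torch.nn as nn

    model = nn.Linear(16, 16)
    # no dist.init_process_group() call, so DCP runs entirely in this process
    dcp.save({"model": model.state_dict()}, checkpoint_id="checkpoint")
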
@@ -259,11 +261,10 @@ By default, DCP saves and loads a distributed ``state_dict`` in Single Program M
         "model": model.state_dict(),
     }

-    # turn no_dist to be true to load in non-distributed setting
-    DCP.load_state_dict(
+    # since no process group is initialized, DCP will disable any collectives.
+    dcp.load(
         state_dict=state_dict,
-        storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR),
-        no_dist=True,
+        checkpoint_id=CHECKPOINT_DIR,
     )
     model.load_state_dict(state_dict["model"])
@@ -274,7 +275,9 @@ By default, DCP saves and loads a distributed ``state_dict`` in Single Program M

 Conclusion
 ----------
-In conclusion, we have learned how to use DCP's :func:`save_state_dict` and :func:`load_state_dict` APIs, as well as how they are different form :func:`torch.save` and :func:`torch.load`.
+In conclusion, we have learned how to use DCP's :func:`save` and :func:`load` APIs, as well as how they are different from :func:`torch.save` and :func:`torch.load`.
+Additionally, we've learned how to use :func:`get_state_dict` and :func:`set_state_dict` to automatically manage parallelism-specific FQNs and defaults during state dict
+generation and loading.

 For more information, please see the following: