
Commit e11a51a

[wip] attention refactor
1 parent 125d783 commit e11a51a

File tree: 9 files changed, +417 −92 lines

Diff for: scripts/convert_deprecated_attention_block.py (+116)

@@ -0,0 +1,116 @@
import argparse

from torch import nn

from diffusers import DiffusionPipeline
from diffusers.models.attention import AttentionBlock, assert_no_deprecated_attention_blocks
from diffusers.models.autoencoder_kl import AutoencoderKL
from diffusers.models.unet_2d import UNet2DModel
from diffusers.models.unet_2d_blocks import (
    AttnDownBlock2D,
    AttnDownEncoderBlock2D,
    AttnSkipDownBlock2D,
    AttnSkipUpBlock2D,
    AttnUpBlock2D,
    AttnUpDecoderBlock2D,
    UNetMidBlock2D,
)
from diffusers.models.vq_model import VQModel


# model classes whose config gains the `attention_block_type` flag
MODULES = [AutoencoderKL, VQModel, UNet2DModel]

UNET_BLOCKS = [
    UNetMidBlock2D,
    AttnDownBlock2D,
    AttnDownEncoderBlock2D,
    AttnSkipDownBlock2D,
    AttnUpBlock2D,
    AttnUpDecoderBlock2D,
    AttnSkipUpBlock2D,
]


unet_blocks_to_convert = []


# Patch the attention-bearing UNet block constructors so every instance created
# while the pipeline loads is recorded for conversion.
def patch_unet_block(unet_block_class):
    orig_constructor = unet_block_class.__init__

    def new_constructor(self, *args, **kwargs):
        orig_constructor(self, *args, **kwargs)
        unet_blocks_to_convert.append(self)

    def convert_attention_blocks(self):
        new_attentions = []

        for attention_block in self.attentions:
            if isinstance(attention_block, AttentionBlock):
                new_attention_block = attention_block.as_cross_attention()
            else:
                new_attention_block = attention_block

            new_attentions.append(new_attention_block)

        self.attentions = nn.ModuleList(new_attentions)

    unet_block_class.__init__ = new_constructor
    unet_block_class.convert_attention_blocks = convert_attention_blocks


for unet_block_class in UNET_BLOCKS:
    patch_unet_block(unet_block_class)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--pipeline",
        default=None,
        type=str,
        required=True,
        help="Pipeline to convert the deprecated `AttentionBlock` to `CrossAttention`",
    )

    parser.add_argument(
        "--dump_path", default=None, type=str, required=True, help="Path to save the converted pipeline."
    )

    args = parser.parse_args()

    print(f"loading original pipeline {args.pipeline}")

    pipe = DiffusionPipeline.from_pretrained(args.pipeline)

    any_converted = False

    for attr_name in dir(pipe):
        attr = getattr(pipe, attr_name)

        for module in MODULES:
            if isinstance(attr, module):
                print(
                    f"converting `DiffusionPipeline.from_pretrained({args.pipeline}).{attr_name}.attention_block_type`"
                )
                attr.register_to_config(attention_block_type="CrossAttention")
                any_converted = True

    for unet_block in unet_blocks_to_convert:
        print(f"converting {unet_block.__class__}.attentions")
        unet_block.convert_attention_blocks()
        any_converted = True

    if not any_converted:
        print(f"`DiffusionPipeline.from_pretrained({args.pipeline})` did not have any deprecated attention blocks")
    else:
        print(f"Saving converted pipeline to {args.dump_path}")

        pipe.save_pretrained(args.dump_path)

        print("Checking converted pipeline has no deprecated attention blocks")

        with assert_no_deprecated_attention_blocks():
            pipe = DiffusionPipeline.from_pretrained(args.dump_path)

        print(f"Converted pipeline saved to {args.dump_path}")

Diff for: src/diffusers/models/attention.py (+63, −63)

@@ -11,15 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
+from contextlib import ContextDecorator
 from typing import Callable, Optional

 import torch
 import torch.nn.functional as F
 from torch import nn

+from ..utils import deprecate
 from ..utils.import_utils import is_xformers_available
-from .cross_attention import CrossAttention
+from .cross_attention import CrossAttention, SpatialAttnProcessor, XFormersSpatialAttnProcessor
 from .embeddings import CombinedTimestepLabelEmbeddings

@@ -57,6 +58,20 @@ def __init__(
         eps: float = 1e-5,
     ):
         super().__init__()
+
+        if _assert_no_deprecated_attention_blocks > 0:
+            raise AssertionError(
+                "Deprecated `AttentionBlock` created while `assert_no_deprecated_attention_blocks` context manager"
+                " active."
+            )
+
+        deprecation_message = (
+            "AttentionBlock has been deprecated and will be replaced with CrossAttention. TODO add upgrade"
+            " instructions"
+        )
+
+        deprecate("AttentionBlock", "1.0.0", deprecation_message, standard_warn=True)
+
         self.channels = channels

         self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
@@ -74,20 +89,6 @@ def __init__(
         self._use_memory_efficient_attention_xformers = False
         self._attention_op = None

-    def reshape_heads_to_batch_dim(self, tensor):
-        batch_size, seq_len, dim = tensor.shape
-        head_size = self.num_heads
-        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
-        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
-        return tensor
-
-    def reshape_batch_dim_to_heads(self, tensor):
-        batch_size, seq_len, dim = tensor.shape
-        head_size = self.num_heads
-        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
-        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
-        return tensor
-
     def set_use_memory_efficient_attention_xformers(
         self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
     ):
@@ -119,59 +120,43 @@ def set_use_memory_efficient_attention_xformers(
             self._attention_op = attention_op

     def forward(self, hidden_states):
-        residual = hidden_states
-        batch, channel, height, width = hidden_states.shape
-
-        # norm
-        hidden_states = self.group_norm(hidden_states)
-
-        hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
-
-        # proj to q, k, v
-        query_proj = self.query(hidden_states)
-        key_proj = self.key(hidden_states)
-        value_proj = self.value(hidden_states)
-
-        scale = 1 / math.sqrt(self.channels / self.num_heads)
+        attn = self.as_cross_attention()
+        hidden_states = attn(hidden_states)

-        query_proj = self.reshape_heads_to_batch_dim(query_proj)
-        key_proj = self.reshape_heads_to_batch_dim(key_proj)
-        value_proj = self.reshape_heads_to_batch_dim(value_proj)
+        return hidden_states

-        if self._use_memory_efficient_attention_xformers:
-            # Memory efficient attention
-            hidden_states = xformers.ops.memory_efficient_attention(
-                query_proj, key_proj, value_proj, attn_bias=None, op=self._attention_op
-            )
-            hidden_states = hidden_states.to(query_proj.dtype)
+    def as_cross_attention(self):
+        if self._attention_op is None:
+            processor = SpatialAttnProcessor()
         else:
-            attention_scores = torch.baddbmm(
-                torch.empty(
-                    query_proj.shape[0],
-                    query_proj.shape[1],
-                    key_proj.shape[1],
-                    dtype=query_proj.dtype,
-                    device=query_proj.device,
-                ),
-                query_proj,
-                key_proj.transpose(-1, -2),
-                beta=0,
-                alpha=scale,
-            )
-            attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
-            hidden_states = torch.bmm(attention_probs, value_proj)
+            processor = XFormersSpatialAttnProcessor(self._attention_op)

-        # reshape hidden_states
-        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
-
-        # compute next hidden_states
-        hidden_states = self.proj_attn(hidden_states)
+        if self.num_head_size is None:
+            # When `self.num_head_size` is None, there is a single attention head
+            # of all the channels
+            dim_head = self.channels
+        else:
+            dim_head = self.num_head_size
+
+        attn = CrossAttention(
+            self.channels,
+            heads=self.num_heads,
+            dim_head=dim_head,
+            bias=True,
+            upcast_softmax=True,
+            norm_num_groups=self.group_norm.num_groups,
+            processor=processor,
+            eps=self.group_norm.eps,
+            rescale_output_factor=self.rescale_output_factor,
+        )

-        hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
+        attn.group_norm = self.group_norm
+        attn.to_q = self.query
+        attn.to_k = self.key
+        attn.to_v = self.value
+        attn.to_out[0] = self.proj_attn

-        # res connect and rescale
-        hidden_states = (hidden_states + residual) / self.rescale_output_factor
-        return hidden_states
+        return attn


 class BasicTransformerBlock(nn.Module):
@@ -480,3 +465,18 @@ def forward(self, x, timestep, class_labels, hidden_dtype=None):
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
         x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
         return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+
+
+# tracks the number of `assert_no_deprecated_attention_blocks` decorators
+_assert_no_deprecated_attention_blocks = 0
+
+
+class assert_no_deprecated_attention_blocks(ContextDecorator):
+    def __enter__(self):
+        global _assert_no_deprecated_attention_blocks
+        _assert_no_deprecated_attention_blocks += 1
+        return self
+
+    def __exit__(self, *args):
+        global _assert_no_deprecated_attention_blocks
+        _assert_no_deprecated_attention_blocks -= 1
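With this change, `AttentionBlock.forward` builds an equivalent `CrossAttention` module via `as_cross_attention()` (reusing its own group norm and query/key/value/output projections) and calls it, while constructing an `AttentionBlock` inside the new `assert_no_deprecated_attention_blocks` guard raises. A small sketch of the intended behaviour, assuming only the signature shown above (the channel counts are illustrative):

# Sketch only: constructor arguments are illustrative; behaviour follows the diff above.
import torch
from diffusers.models.attention import AttentionBlock, assert_no_deprecated_attention_blocks

block = AttentionBlock(channels=32, num_head_channels=8)  # emits a deprecation warning

# forward() now builds an equivalent CrossAttention (sharing this block's weights) and calls it
sample = torch.randn(1, 32, 16, 16)
out = block(sample)
print(out.shape)  # expected torch.Size([1, 32, 16, 16])

# the standalone converted module can also be extracted, as the conversion script does
converted = block.as_cross_attention()

# inside the guard, constructing the deprecated block raises
with assert_no_deprecated_attention_blocks():
    try:
        AttentionBlock(channels=32)
    except AssertionError as err:
        print(err)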

Diff for: src/diffusers/models/autoencoder_kl.py (+3)

@@ -79,6 +79,7 @@ def __init__(
         norm_num_groups: int = 32,
         sample_size: int = 32,
         scaling_factor: float = 0.18215,
+        attention_block_type: str = "AttentionBlock",
     ):
         super().__init__()

@@ -92,6 +93,7 @@ def __init__(
             act_fn=act_fn,
             norm_num_groups=norm_num_groups,
             double_z=True,
+            attention_block_type=attention_block_type,
         )

         # pass init params to Decoder
@@ -103,6 +105,7 @@ def __init__(
             layers_per_block=layers_per_block,
             norm_num_groups=norm_num_groups,
             act_fn=act_fn,
+            attention_block_type=attention_block_type,
         )

         self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
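The VAE change is pure config plumbing: `attention_block_type` defaults to "AttentionBlock" and is forwarded to the Encoder and Decoder, whose handling of the flag lives in other files of this commit. A hedged sketch of opting a freshly constructed model into the new path, assuming those companion changes are in place:

# Sketch only: relies on the Encoder/Decoder changes elsewhere in this commit
# actually honoring attention_block_type.
from diffusers import AutoencoderKL

vae = AutoencoderKL(attention_block_type="CrossAttention")
print(vae.config.attention_block_type)  # "CrossAttention"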

0 commit comments