
Commit c50b9bf

switch to my newer diffusers cross-attn API
1 parent b9a9cf5 commit c50b9bf

5 files changed: +54 -28 lines changed

Diff for: scripts/play.py

+26 -2
@@ -1,4 +1,5 @@
 import os, fnmatch
+from diffusers.models.attention_utils import mask_to_bias
 from diffusers.models.autoencoder_kl import AutoencoderKLOutput
 
 # monkey-patch _randn to use CPU random before k-diffusion uses it
@@ -20,6 +21,7 @@
 
 import torch
 from torch import Tensor, FloatTensor, BoolTensor, LongTensor, no_grad, zeros, tensor, arange, linspace, lerp
+from torch.nn.functional import pad
 from diffusers.models import UNet2DConditionModel, AutoencoderKL
 from diffusers.models.cross_attention import AttnProcessor2_0
 from diffusers.utils.import_utils import is_xformers_available
@@ -414,12 +416,34 @@
     # xformers attn_bias is only implemented for Triton + A100 GPU
     # https://github.com/facebookresearch/xformers/issues/576
     # chunked attention *can* be made to support masks, but I didn't implement it yet
-    case AttentionMode.Xformers | AttentionMode.Chunked | AttentionMode.ScaledDPAttn:
+    case AttentionMode.Xformers:
+        from packaging import version
+        from xformers import __version__ as xformers_version
+        # attn bias support was/will be added in 0.0.17:
+        # https://github.com/facebookresearch/xformers/blob/main/CHANGELOG.md
+        if version.parse(xformers_version) >= version.parse('0.0.17'):
+            # cutlassF is our best bet, but currently only supports token lengths which are multiples of 8
+            # https://gist.github.com/Birch-san/0c36d228e1d4b881a06d1c6e5289d569
+            # strictly speaking we should worry that making the key slightly longer, slightly
+            # affects the softmax averaging. oh well.
+            # https://github.com/lllyasviel/ControlNet/discussions/12
+            mask_length = mask_denorm.shape[-1]
+            extra_tokens_needed = 8 - (mask_length % 8)
+            # 0-pad mask to multiple of 8 tokens
+            mask_denorm = pad(mask_denorm, (0, extra_tokens_needed))
+            # replicate-pad embedding to multiple of 8 tokens (mask will hide the extra tokens)
+            embedding_denorm = pad(embedding_denorm, (0, 0, 0, extra_tokens_needed,), 'replicate')
+        else:
+            # if you're older than that, then we discard the masks
+            mask_denorm = None
+    case AttentionMode.Chunked:
         mask_denorm = None
+
+cross_attention_bias: Optional[FloatTensor] = None if mask_denorm is None else mask_to_bias(mask_denorm, unet.dtype)
 
 denoiser: Denoiser = denoiser_factory(
     cross_attention_conds=embedding_denorm,
-    cross_attention_mask=mask_denorm,
+    cross_attention_bias=cross_attention_bias,
     conds_per_prompt=conds_per_prompt_tensor,
     cond_weights=cond_weights,
     uncond_ixs=uncond_ixs,

Diff for: src/diffusers

Submodule diffusers updated 512 files

Diff for: src/helpers/attention/multi_head_attention/multi_head_attention.py

+9 -7
@@ -1,4 +1,4 @@
-from torch import nn, Tensor
+from torch import nn, Tensor, FloatTensor
 from typing import Optional
 from ..attn_compatible import CrossAttnCompatible
 
@@ -29,20 +29,22 @@ def forward(
         hidden_states: Tensor,
         encoder_hidden_states: Optional[Tensor] = None,
         attention_mask: Optional[Tensor] = None,
-        cross_attn_mask: Optional[Tensor] = None,
         **cross_attention_kwargs,
     ) -> Tensor:
         kv = hidden_states if encoder_hidden_states is None else encoder_hidden_states
-        if cross_attn_mask is not None:
-            cross_attn_mask = cross_attn_mask.repeat_interleave(self.num_heads, dim=0)
-            cross_attn_mask = cross_attn_mask.unsqueeze(-2)
+        if encoder_hidden_states is not None and 'encoder_attention_bias' in cross_attention_kwargs:
+            encoder_attention_bias: FloatTensor = cross_attention_kwargs['encoder_attention_bias']
+            encoder_attention_bias = encoder_attention_bias.repeat_interleave(self.num_heads, dim=0)
+            encoder_attention_bias = encoder_attention_bias.unsqueeze(-2)
             _, vision_tokens, _ = hidden_states.shape
-            cross_attn_mask = cross_attn_mask.expand(-1, vision_tokens, -1)
+            encoder_attention_bias = encoder_attention_bias.expand(-1, vision_tokens, -1)
+        else:
+            encoder_attention_bias = None
         out, _ = super().forward(
             query=hidden_states,
             key=kv,
             value=kv,
             need_weights=False,
-            attn_mask=cross_attn_mask,
+            attn_mask=encoder_attention_bias,
         )
         return out
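For context, here is a self-contained sketch of the broadcasting this hunk performs, using toy shapes and a plain nn.MultiheadAttention rather than the repo's CrossAttnCompatible subclass: repeat the per-sequence bias once per head, add a query dimension, expand it over the vision tokens, and hand it to attn_mask, which PyTorch adds to the attention logits when the mask is a float tensor. The specific bias values (hiding the last two text tokens) are purely illustrative.

import torch
from torch import nn

batch, vision_tokens, text_tokens, num_heads, dim = 2, 16, 8, 4, 64
attn = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, batch_first=True)

hidden_states = torch.randn(batch, vision_tokens, dim)        # queries: image tokens
encoder_hidden_states = torch.randn(batch, text_tokens, dim)  # keys/values: text tokens
bias = torch.zeros(batch, text_tokens)                        # additive bias, one value per key token
bias[:, -2:] = torch.finfo(bias.dtype).min                    # e.g. hide the last two text tokens

# same broadcasting as the diff above
bias = bias.repeat_interleave(num_heads, dim=0)               # (batch*num_heads, text_tokens)
bias = bias.unsqueeze(-2).expand(-1, vision_tokens, -1)       # (batch*num_heads, vision_tokens, text_tokens)

out, _ = attn(
    hidden_states, encoder_hidden_states, encoder_hidden_states,
    need_weights=False,
    attn_mask=bias,                                           # float attn_mask: added to the logits
)
print(out.shape)  # torch.Size([2, 16, 64])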

Diff for: src/helpers/batch_denoiser.py

+6 -6
@@ -18,7 +18,7 @@ def __call__(
 class AbstractBatchDenoiser(PostInitMixin, ABC, Denoiser):
     denoiser: DiffusersSDDenoiser
     cross_attention_conds: FloatTensor
-    cross_attention_mask: Optional[BoolTensor]
+    cross_attention_bias: Optional[FloatTensor]
     conds_per_prompt: LongTensor
     cond_weights: FloatTensor
     center_denoise_outputs: Optional[BoolTensor]
@@ -54,7 +54,7 @@ def __call__(
             input=noised_latents_in,
             sigma=sigma_in,
             encoder_hidden_states=self.cross_attention_conds,
-            attention_mask=self.cross_attention_mask,
+            cross_attention_bias=self.cross_attention_bias,
         )
         del noised_latents_in, sigma_in
         if self.center_denoise_outputs is not None:
@@ -163,7 +163,7 @@ def __call__(
             input=noised_latents_in,
             sigma=sigma_in,
             encoder_hidden_states=self.cross_attention_conds,
-            attention_mask=self.cross_attention_mask,
+            cross_attention_bias=self.cross_attention_bias,
         )
         if self.center_denoise_outputs is not None:
             denoised_latents = where(
@@ -194,7 +194,7 @@ class BatchDenoiserFactory():
     def __call__(
         self,
         cross_attention_conds: FloatTensor,
-        cross_attention_mask: Optional[BoolTensor],
+        cross_attention_bias: Optional[FloatTensor],
        conds_per_prompt: LongTensor,
        cond_weights: FloatTensor,
        uncond_ixs: Optional[LongTensor],
@@ -208,15 +208,15 @@ def __call__(
            return BatchNoCFGDenoiser(
                denoiser=self.denoiser,
                cross_attention_conds=cross_attention_conds,
-               cross_attention_mask=cross_attention_mask,
+               cross_attention_bias=cross_attention_bias,
                conds_per_prompt=conds_per_prompt,
                cond_weights=cond_weights,
                center_denoise_outputs=center_denoise_outputs,
            )
        return BatchCFGDenoiser(
            denoiser=self.denoiser,
            cross_attention_conds=cross_attention_conds,
-           cross_attention_mask=cross_attention_mask,
+           cross_attention_bias=cross_attention_bias,
            conds_per_prompt=conds_per_prompt,
            cond_weights=cond_weights,
            center_denoise_outputs=center_denoise_outputs,

Diff for: src/helpers/diffusers_denoiser.py

+12 -12
@@ -1,4 +1,4 @@
-from torch import Tensor, FloatTensor, BoolTensor
+from torch import Tensor, FloatTensor
 from diffusers.models import UNet2DConditionModel
 from diffusers.models.unet_2d_condition import UNet2DConditionOutput
 from k_diffusion.external import DiscreteEpsDDPMDenoiser, DiscreteVDDPMDenoiser
@@ -17,21 +17,21 @@ def get_eps(
         sample: FloatTensor,
         timestep: Union[Tensor, float, int],
         encoder_hidden_states: Tensor,
+        cross_attention_bias: Optional[FloatTensor] = None,
         return_dict: bool = True,
-        attention_mask: Optional[BoolTensor] = None,
     ) -> Tensor:
-        # cross_attn_mask is a proposal from my xattn_mask_2 branch of diffusers:
+        # encoder_attention_bias is a proposal from my cross_attn_mask_3 branch of diffusers:
         # https://github.com/huggingface/diffusers/issues/1890
         # don't pass it in if we don't have to, to ensure compatibility with main branch of diffusers
-        attn_kwargs = {} if attention_mask is None else {
-            'cross_attn_mask': attention_mask,
+        cross_attention_kwargs = {} if cross_attention_bias is None else {
+            'encoder_attention_bias': cross_attention_bias,
         }
-        out: UNet2DConditionOutput = self.inner_model(
+        out: UNet2DConditionOutput = self.inner_model.forward(
            sample.to(self.inner_model.dtype),
            timestep.to(self.inner_model.dtype),
            encoder_hidden_states=encoder_hidden_states.to(self.inner_model.dtype),
            return_dict=return_dict,
-           **attn_kwargs,
+           cross_attention_kwargs=cross_attention_kwargs,
        )
        return out.sample.to(self.sampling_dtype)
 
@@ -50,21 +50,21 @@ def get_v(
         sample: FloatTensor,
         timestep: Union[Tensor, float, int],
         encoder_hidden_states: Tensor,
+        cross_attention_bias: Optional[FloatTensor] = None,
         return_dict: bool = True,
-        attention_mask: Optional[BoolTensor] = None,
     ) -> Tensor:
-        # cross_attn_mask is a proposal from my xattn_mask_2 branch of diffusers:
+        # encoder_attention_bias is a proposal from my cross_attn_mask_3 branch of diffusers:
        # https://github.com/huggingface/diffusers/issues/1890
        # don't pass it in if we don't have to, to ensure compatibility with main branch of diffusers
-        attn_kwargs = {} if attention_mask is None else {
-            'cross_attn_mask': attention_mask,
+        cross_attention_kwargs = {} if cross_attention_bias is None else {
+            'encoder_attention_bias': cross_attention_bias,
        }
        out: UNet2DConditionOutput = self.inner_model(
            sample.to(self.inner_model.dtype),
            timestep.to(self.inner_model.dtype),
            encoder_hidden_states=encoder_hidden_states.to(self.inner_model.dtype),
            return_dict=return_dict,
-           **attn_kwargs,
+           cross_attention_kwargs=cross_attention_kwargs,
        )
        return out.sample.to(self.sampling_dtype)
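The net effect on the diffusers side: get_eps and get_v now forward the bias through cross_attention_kwargs, which UNet2DConditionModel passes down to its attention processors. A minimal sketch of that call, assuming the cross_attn_mask_3 branch of diffusers (whose processors accept an 'encoder_attention_bias' entry; mainline diffusers would not recognise that key), with 'runwayml/stable-diffusion-v1-5' standing in as an example checkpoint id:

import torch
from diffusers.models import UNet2DConditionModel

# example checkpoint id; any SD 1.x UNet with cross_attention_dim=768 would do
unet: UNet2DConditionModel = UNet2DConditionModel.from_pretrained(
    'runwayml/stable-diffusion-v1-5', subfolder='unet',
)

sample = torch.randn(1, 4, 64, 64)               # latents
timestep = torch.tensor(999)
encoder_hidden_states = torch.randn(1, 77, 768)  # CLIP text embeddings
bias = torch.zeros(1, 77)                        # additive bias over the 77 text tokens; 0 = keep
bias[:, 20:] = torch.finfo(torch.float32).min    # hypothetical: ignore tokens from index 20 onward

with torch.no_grad():
    out = unet(
        sample,
        timestep,
        encoder_hidden_states=encoder_hidden_states,
        # forwarded to every attention processor on the branch
        cross_attention_kwargs={'encoder_attention_bias': bias},
    ).sample                                     # (1, 4, 64, 64) predicted noise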
