@@ -574,8 +574,8 @@ def guidance_scale(self):
         return self._guidance_scale
 
     @property
-    def attention_kwargs(self):
-        return self._attention_kwargs
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
 
     @property
     def do_classifier_free_guidance(self):
@@ -613,7 +613,7 @@ def __call__(
         return_dict: bool = True,
         clean_caption: bool = True,
         use_resolution_binning: bool = True,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 300,
@@ -686,7 +686,7 @@ def __call__(
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
-            attention_kwargs: TODO
+            cross_attention_kwargs: TODO
             clean_caption (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                 be installed. If the dependencies are not installed, the embeddings will be created from the raw
@@ -747,7 +747,7 @@ def __call__(
         )
 
         self._guidance_scale = guidance_scale
-        self._attention_kwargs = attention_kwargs
+        self._cross_attention_kwargs = cross_attention_kwargs
         self._interrupt = False
 
         # 2. Default height and width to transformer
@@ -759,7 +759,9 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         device = self._execution_device
-        lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
 
         # 3. Encode input prompt
         (
@@ -829,7 +831,7 @@ def __call__(
                     encoder_attention_mask=prompt_attention_mask,
                     timestep=timestep,
                     return_dict=False,
-                    attention_kwargs=self.attention_kwargs,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
                 )[0]
                 noise_pred = noise_pred.float()
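For context, a minimal usage sketch of the renamed argument from the caller's side. This assumes a diffusers-style text-to-image pipeline loaded through `DiffusionPipeline.from_pretrained`; the checkpoint id and prompt below are placeholders, not taken from this diff. As the `lora_scale` hunk above shows, the pipeline now reads the LoRA scale from `cross_attention_kwargs["scale"]` and forwards the whole dict to the transformer at each denoising step.

```python
# Hypothetical usage sketch; checkpoint id and prompt are placeholders.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "path/to/checkpoint",  # placeholder checkpoint id
    torch_dtype=torch.float16,
).to("cuda")

# The per-call LoRA scale is picked up from cross_attention_kwargs["scale"]
# inside __call__ (see the `lora_scale = ...` hunk in the diff above); the
# same dict is passed through to the transformer on every denoising step.
image = pipe(
    prompt="a photo of an astronaut riding a horse",
    cross_attention_kwargs={"scale": 0.5},
).images[0]
```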