```diff
@@ -13,14 +13,14 @@
 # limitations under the License.

 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Callable, List, Optional

 import torch

 from ..utils import get_logger
 from ..utils.torch_utils import unwrap_module
-from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS
-from ._helpers import TransformerBlockRegistry
+from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS, _ATTENTION_CLASSES, _FEEDFORWARD_CLASSES
+from ._helpers import AttentionProcessorRegistry, TransformerBlockRegistry
 from .hooks import HookRegistry, ModelHook


@@ -44,9 +44,50 @@ class LayerSkipConfig:

     indices: List[int]
     fqn: str = "auto"
+    skip_attention: bool = True
+    skip_attention_scores: bool = False
+    skip_ff: bool = True


-class LayerSkipHook(ModelHook):
+class AttentionScoreSkipFunctionMode(torch.overrides.TorchFunctionMode):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def __torch_function__(self, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+        if func is torch.nn.functional.scaled_dot_product_attention:
+            value = kwargs.get("value", None)
+            if value is None:
+                value = args[2]
+            return value
+        return func(*args, **kwargs)
+
+
+class AttentionProcessorSkipHook(ModelHook):
+    def __init__(self, skip_processor_output_fn: Callable, skip_attention_scores: bool = False):
+        self.skip_processor_output_fn = skip_processor_output_fn
+        self.skip_attention_scores = skip_attention_scores
+
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
+        if self.skip_attention_scores:
+            with AttentionScoreSkipFunctionMode():
+                return self.fn_ref.original_forward(*args, **kwargs)
+        else:
+            return self.skip_processor_output_fn(module, *args, **kwargs)
+
+
+class FeedForwardSkipHook(ModelHook):
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
+        output = kwargs.get("hidden_states", None)
+        if output is None:
+            output = kwargs.get("x", None)
+        if output is None and len(args) > 0:
+            output = args[0]
+        return output
+
+
+class TransformerBlockSkipHook(ModelHook):
     def initialize_hook(self, module):
         self._metadata = TransformerBlockRegistry.get(unwrap_module(module).__class__)
         return module
```
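The `AttentionScoreSkipFunctionMode` added above intercepts `torch.nn.functional.scaled_dot_product_attention` via `torch.overrides.TorchFunctionMode` and returns the `value` tensor untouched, so the attention output behaves as if the score matrix were the identity. A minimal standalone sketch of that interception mechanism (not part of the patch; the class name `_SkipSDPA` and the tensor shape are illustrative only):

```python
import torch


class _SkipSDPA(torch.overrides.TorchFunctionMode):
    """Toy function mode mirroring AttentionScoreSkipFunctionMode: SDPA calls return `value` directly."""

    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func is torch.nn.functional.scaled_dot_product_attention:
            # Positional signature is (query, key, value, ...), so fall back to args[2].
            return kwargs.get("value", args[2] if len(args) > 2 else None)
        # Every other torch function dispatches normally.
        return func(*args, **kwargs)


query = key = value = torch.randn(1, 4, 8, 16)
with _SkipSDPA():
    out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
assert torch.equal(out, value)  # the value tensor is passed through unchanged
```

The remaining hunks wire the new hooks into `_apply_layer_skip_hook`: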
```diff
@@ -81,6 +122,9 @@ def apply_layer_skip(module: torch.nn.Module, config: LayerSkipConfig) -> None:
 def _apply_layer_skip_hook(module: torch.nn.Module, config: LayerSkipConfig, name: Optional[str] = None) -> None:
     name = name or _LAYER_SKIP_HOOK

+    if config.skip_attention and config.skip_attention_scores:
+        raise ValueError("Cannot set both `skip_attention` and `skip_attention_scores` to True. Please choose one.")
+
     if config.fqn == "auto":
         for identifier in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS:
             if hasattr(module, identifier):
@@ -101,10 +145,38 @@ def _apply_layer_skip_hook(module: torch.nn.Module, config: LayerSkipConfig, nam
     if len(config.indices) == 0:
         raise ValueError("Layer index list is empty. Please provide a non-empty list of layer indices to skip.")

+    blocks_found = False
     for i, block in enumerate(transformer_blocks):
         if i not in config.indices:
             continue
-        logger.debug(f"Apply LayerSkipHook to '{config.fqn}.{i}'")
-        registry = HookRegistry.check_if_exists_or_initialize(block)
-        hook = LayerSkipHook()
-        registry.register_hook(hook, name)
+        blocks_found = True
+        if config.skip_attention and config.skip_ff:
+            logger.debug(f"Applying TransformerBlockSkipHook to '{config.fqn}.{i}'")
+            registry = HookRegistry.check_if_exists_or_initialize(block)
+            hook = TransformerBlockSkipHook()
+            registry.register_hook(hook, name)
+        elif config.skip_attention or config.skip_attention_scores:
+            for submodule_name, submodule in block.named_modules():
+                if isinstance(submodule, _ATTENTION_CLASSES) and not submodule.is_cross_attention:
+                    logger.debug(f"Applying AttentionProcessorSkipHook to '{config.fqn}.{i}.{submodule_name}'")
+                    output_fn = AttentionProcessorRegistry.get(submodule.processor.__class__).skip_processor_output_fn
+                    registry = HookRegistry.check_if_exists_or_initialize(submodule)
+                    hook = AttentionProcessorSkipHook(output_fn, config.skip_attention_scores)
+                    registry.register_hook(hook, name)
+        elif config.skip_ff:
+            for submodule_name, submodule in block.named_modules():
+                if isinstance(submodule, _FEEDFORWARD_CLASSES):
+                    logger.debug(f"Applying FeedForwardSkipHook to '{config.fqn}.{i}.{submodule_name}'")
+                    registry = HookRegistry.check_if_exists_or_initialize(submodule)
+                    hook = FeedForwardSkipHook()
+                    registry.register_hook(hook, name)
+        else:
+            raise ValueError(
+                "At least one of `skip_attention`, `skip_attention_scores`, or `skip_ff` must be set to True."
+            )
+
+    if not blocks_found:
+        raise ValueError(
+            f"Could not find any transformer blocks matching the provided indices {config.indices} and "
+            f"fully qualified name '{config.fqn}'. Please check the indices and fqn for correctness."
+        )
```
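For reference, a hedged usage sketch of the new flags (not part of the patch). It assumes `LayerSkipConfig` and `apply_layer_skip` are importable from `diffusers.hooks` as in the surrounding module layout, and that `transformer` is an already-loaded denoiser whose block and attention-processor classes are registered in the corresponding registries; the indices are illustrative.

```python
from diffusers.hooks import LayerSkipConfig, apply_layer_skip  # import path assumed

# transformer = ...  # placeholder: e.g. `pipe.transformer` from a supported pipeline

# Default behaviour: skip the whole block (attention + feed-forward) at the given indices.
full_skip = LayerSkipConfig(indices=[2, 5])
apply_layer_skip(transformer, full_skip)

# Alternative: keep the blocks but skip only the attention score computation, so SDPA
# returns the `value` tensor directly. `skip_attention` must be disabled explicitly,
# otherwise the new validation check raises.
scores_only = LayerSkipConfig(
    indices=[2, 5],
    skip_attention=False,
    skip_attention_scores=True,
    skip_ff=False,
)
# apply_layer_skip(transformer, scores_only)  # use instead of the full-block config above
```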