
Commit 658e24e

a-r-r-o-w, oahzxl, stevhliu, and DN6 authored
[core] Pyramid Attention Broadcast (#9562)
* start pyramid attention broadcast
* add coauthor
  Co-Authored-By: Xuanlei Zhao <[email protected]>
* update
* make style
* update
* make style
* add docs
* add tests
* update
* Update docs/source/en/api/pipelines/cogvideox.md
  Co-authored-by: Steven Liu <[email protected]>
* Update docs/source/en/api/pipelines/cogvideox.md
  Co-authored-by: Steven Liu <[email protected]>
* Pyramid Attention Broadcast rewrite + introduce hooks (#9826)
* rewrite implementation with hooks
* make style
* update
* merge pyramid-attention-rewrite-2
* make style
* remove changes from latte transformer
* revert docs changes
* better debug message
* add todos for future
* update tests
* make style
* cleanup
* fix
* improve log message; fix latte test
* refactor
* update
* update
* update
* revert changes to tests
* update docs
* update tests
* Apply suggestions from code review
  Co-authored-by: Steven Liu <[email protected]>
* update
* fix flux test
* reorder
* refactor
* make fix-copies
* update docs
* fixes
* more fixes
* make style
* update tests
* update code example
* make fix-copies
* refactor based on reviews
* use maybe_free_model_hooks
* CacheMixin
* make style
* update
* add current_timestep property; update docs
* make fix-copies
* update
* improve tests
* try circular import fix
* apply suggestions from review
* address review comments
* Apply suggestions from code review
* refactor hook implementation
* add test suite for hooks
* PAB Refactor (#10667)
* update
* update
* update

---------

Co-authored-by: DN6 <[email protected]>

* update
* fix remove hook behaviour

---------

Co-authored-by: Xuanlei Zhao <[email protected]>
Co-authored-by: Steven Liu <[email protected]>
Co-authored-by: DN6 <[email protected]>
1 parent fb42066 commit 658e24e

32 files changed: +1256 −67 lines

docs/source/en/_toctree.yml (+2)

@@ -598,6 +598,8 @@
   title: Attention Processor
 - local: api/activations
   title: Custom activation functions
+- local: api/cache
+  title: Caching methods
 - local: api/normalization
   title: Custom normalization layers
 - local: api/utilities

docs/source/en/api/cache.md (new file, +49)

@@ -0,0 +1,49 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# Caching methods
+
+## Pyramid Attention Broadcast
+
+[Pyramid Attention Broadcast](https://huggingface.co/papers/2408.12588) from Xuanlei Zhao, Xiaolong Jin, Kai Wang, Yang You.
+
+Pyramid Attention Broadcast (PAB) is a method that speeds up inference in diffusion models by systematically skipping attention computations between successive inference steps and reusing cached attention states. The attention states are not very different between successive inference steps. The difference is most prominent in the spatial attention blocks, less so in the temporal attention blocks, and smallest in the cross attention blocks. Therefore, many cross attention computation blocks can be skipped, followed by the temporal and spatial attention blocks. By combining other techniques like sequence parallelism and classifier-free guidance parallelism, PAB achieves near real-time video generation.
+
+Enable PAB with [`~PyramidAttentionBroadcastConfig`] on any pipeline. For some benchmarks, refer to [this](https://github.com/huggingface/diffusers/pull/9562) pull request.
+
+```python
+import torch
+from diffusers import CogVideoXPipeline, PyramidAttentionBroadcastConfig
+
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+# Increasing the value of `spatial_attention_timestep_skip_range[0]` or decreasing the value of
+# `spatial_attention_timestep_skip_range[1]` will decrease the interval in which pyramid attention
+# broadcast is active, leading to slower inference speeds. However, large intervals can lead to
+# poorer quality of generated videos.
+config = PyramidAttentionBroadcastConfig(
+    spatial_attention_block_skip_range=2,
+    spatial_attention_timestep_skip_range=(100, 800),
+    current_timestep_callback=lambda: pipe.current_timestep,
+)
+pipe.transformer.enable_cache(config)
+```
+
+### CacheMixin
+
+[[autodoc]] CacheMixin
+
+### PyramidAttentionBroadcastConfig
+
+[[autodoc]] PyramidAttentionBroadcastConfig
+
+[[autodoc]] apply_pyramid_attention_broadcast
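Editorial note (not part of the diff): the new file also autodocs `apply_pyramid_attention_broadcast`, the functional entry point exported alongside the config. The following is a minimal, hedged sketch of how that function relates to `enable_cache`; it assumes the function takes the denoiser module and a `PyramidAttentionBroadcastConfig`, which should be verified against the autodoc entry above.

```python
# Hedged sketch: applying PAB via the functional API instead of CacheMixin.enable_cache.
# Assumption: apply_pyramid_attention_broadcast(module, config) attaches the PAB hooks
# directly to the given denoiser module.
import torch
from diffusers import CogVideoXPipeline, PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.to("cuda")

config = PyramidAttentionBroadcastConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
    current_timestep_callback=lambda: pipe.current_timestep,
)
# Attach the PAB hooks to the transformer without going through pipe.transformer.enable_cache.
apply_pyramid_attention_broadcast(pipe.transformer, config)
```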

src/diffusers/__init__.py (+11)

@@ -28,6 +28,7 @@

 _import_structure = {
     "configuration_utils": ["ConfigMixin"],
+    "hooks": [],
     "loaders": ["FromOriginalModelMixin"],
     "models": [],
     "pipelines": [],
@@ -75,6 +76,13 @@
     _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]

 else:
+    _import_structure["hooks"].extend(
+        [
+            "HookRegistry",
+            "PyramidAttentionBroadcastConfig",
+            "apply_pyramid_attention_broadcast",
+        ]
+    )
     _import_structure["models"].extend(
         [
             "AllegroTransformer3DModel",
@@ -90,6 +98,7 @@
             "AutoencoderKLTemporalDecoder",
             "AutoencoderOobleck",
             "AutoencoderTiny",
+            "CacheMixin",
             "CogVideoXTransformer3DModel",
             "CogView3PlusTransformer2DModel",
             "ConsisIDTransformer3DModel",
@@ -588,6 +597,7 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummy_pt_objects import *  # noqa F403
     else:
+        from .hooks import HookRegistry, PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast
         from .models import (
             AllegroTransformer3DModel,
             AsymmetricAutoencoderKL,
@@ -602,6 +612,7 @@
             AutoencoderKLTemporalDecoder,
             AutoencoderOobleck,
             AutoencoderTiny,
+            CacheMixin,
             CogVideoXTransformer3DModel,
             CogView3PlusTransformer2DModel,
             ConsisIDTransformer3DModel,
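Editorial note (not part of the diff): with these entries in place, the new names become importable from the top-level `diffusers` namespace. A minimal sanity-check sketch, assuming torch is installed so the lazy-import path resolves to the real objects rather than `dummy_pt_objects`:

```python
# The public names added by this diff, imported from the top-level package.
from diffusers import (
    CacheMixin,
    HookRegistry,
    PyramidAttentionBroadcastConfig,
    apply_pyramid_attention_broadcast,
)

print(PyramidAttentionBroadcastConfig)  # should resolve without ImportError
```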

src/diffusers/hooks/__init__.py (+2)

@@ -2,4 +2,6 @@


 if is_torch_available():
+    from .hooks import HookRegistry, ModelHook
     from .layerwise_casting import apply_layerwise_casting, apply_layerwise_casting_hook
+    from .pyramid_attention_broadcast import PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast

src/diffusers/hooks/hooks.py (+78 −30)

@@ -30,6 +30,9 @@ class ModelHook:

     _is_stateful = False

+    def __init__(self):
+        self.fn_ref: "HookFunctionReference" = None
+
     def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
         r"""
         Hook that is executed when a model is initialized.
@@ -48,8 +51,6 @@ def deinitalize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
             module (`torch.nn.Module`):
                 The module attached to this hook.
         """
-        module.forward = module._old_forward
-        del module._old_forward
         return module

     def pre_forward(self, module: torch.nn.Module, *args, **kwargs) -> Tuple[Tuple[Any], Dict[str, Any]]:
@@ -99,6 +100,29 @@ def reset_state(self, module: torch.nn.Module):
         return module


+class HookFunctionReference:
+    def __init__(self) -> None:
+        """A container class that maintains mutable references to forward pass functions in a hook chain.
+
+        Its mutable nature allows the hook system to modify the execution chain dynamically without rebuilding the
+        entire forward pass structure.
+
+        Attributes:
+            pre_forward: A callable that processes inputs before the main forward pass.
+            post_forward: A callable that processes outputs after the main forward pass.
+            forward: The current forward function in the hook chain.
+            original_forward: The original forward function, stored when a hook provides a custom new_forward.
+
+        The class enables hook removal by allowing updates to the forward chain through reference modification rather
+        than requiring reconstruction of the entire chain. When a hook is removed, only the relevant references need to
+        be updated, preserving the execution order of the remaining hooks.
+        """
+        self.pre_forward = None
+        self.post_forward = None
+        self.forward = None
+        self.original_forward = None
+
+
 class HookRegistry:
     def __init__(self, module_ref: torch.nn.Module) -> None:
         super().__init__()
@@ -107,51 +131,71 @@ def __init__(self, module_ref: torch.nn.Module) -> None:

         self._module_ref = module_ref
         self._hook_order = []
+        self._fn_refs = []

     def register_hook(self, hook: ModelHook, name: str) -> None:
         if name in self.hooks.keys():
-            logger.warning(f"Hook with name {name} already exists, replacing it.")
-
-        if hasattr(self._module_ref, "_old_forward"):
-            old_forward = self._module_ref._old_forward
-        else:
-            old_forward = self._module_ref.forward
-            self._module_ref._old_forward = self._module_ref.forward
+            raise ValueError(
+                f"Hook with name {name} already exists in the registry. Please use a different name or "
+                f"first remove the existing hook and then add a new one."
+            )

         self._module_ref = hook.initialize_hook(self._module_ref)

-        if hasattr(hook, "new_forward"):
-            rewritten_forward = hook.new_forward
-
+        def create_new_forward(function_reference: HookFunctionReference):
             def new_forward(module, *args, **kwargs):
-                args, kwargs = hook.pre_forward(module, *args, **kwargs)
-                output = rewritten_forward(module, *args, **kwargs)
-                return hook.post_forward(module, output)
-        else:
+                args, kwargs = function_reference.pre_forward(module, *args, **kwargs)
+                output = function_reference.forward(*args, **kwargs)
+                return function_reference.post_forward(module, output)

-            def new_forward(module, *args, **kwargs):
-                args, kwargs = hook.pre_forward(module, *args, **kwargs)
-                output = old_forward(*args, **kwargs)
-                return hook.post_forward(module, output)
+            return new_forward
+
+        forward = self._module_ref.forward

+        fn_ref = HookFunctionReference()
+        fn_ref.pre_forward = hook.pre_forward
+        fn_ref.post_forward = hook.post_forward
+        fn_ref.forward = forward
+
+        if hasattr(hook, "new_forward"):
+            fn_ref.original_forward = forward
+            fn_ref.forward = functools.update_wrapper(
+                functools.partial(hook.new_forward, self._module_ref), hook.new_forward
+            )
+
+        rewritten_forward = create_new_forward(fn_ref)
         self._module_ref.forward = functools.update_wrapper(
-            functools.partial(new_forward, self._module_ref), old_forward
+            functools.partial(rewritten_forward, self._module_ref), rewritten_forward
         )

+        hook.fn_ref = fn_ref
         self.hooks[name] = hook
         self._hook_order.append(name)
+        self._fn_refs.append(fn_ref)

     def get_hook(self, name: str) -> Optional[ModelHook]:
-        if name not in self.hooks.keys():
-            return None
-        return self.hooks[name]
+        return self.hooks.get(name, None)

     def remove_hook(self, name: str, recurse: bool = True) -> None:
         if name in self.hooks.keys():
+            num_hooks = len(self._hook_order)
             hook = self.hooks[name]
+            index = self._hook_order.index(name)
+            fn_ref = self._fn_refs[index]
+
+            old_forward = fn_ref.forward
+            if fn_ref.original_forward is not None:
+                old_forward = fn_ref.original_forward
+
+            if index == num_hooks - 1:
+                self._module_ref.forward = old_forward
+            else:
+                self._fn_refs[index + 1].forward = old_forward
+
             self._module_ref = hook.deinitalize_hook(self._module_ref)
             del self.hooks[name]
-            self._hook_order.remove(name)
+            self._hook_order.pop(index)
+            self._fn_refs.pop(index)

         if recurse:
             for module_name, module in self._module_ref.named_modules():
@@ -161,7 +205,7 @@ def remove_hook(self, name: str, recurse: bool = True) -> None:
                     module._diffusers_hook.remove_hook(name, recurse=False)

     def reset_stateful_hooks(self, recurse: bool = True) -> None:
-        for hook_name in self._hook_order:
+        for hook_name in reversed(self._hook_order):
             hook = self.hooks[hook_name]
             if hook._is_stateful:
                 hook.reset_state(self._module_ref)
@@ -180,9 +224,13 @@ def check_if_exists_or_initialize(cls, module: torch.nn.Module) -> "HookRegistry":
         return module._diffusers_hook

     def __repr__(self) -> str:
-        hook_repr = ""
+        registry_repr = ""
         for i, hook_name in enumerate(self._hook_order):
-            hook_repr += f" ({i}) {hook_name} - ({self.hooks[hook_name].__class__.__name__})"
+            if self.hooks[hook_name].__class__.__repr__ is not object.__repr__:
+                hook_repr = self.hooks[hook_name].__repr__()
+            else:
+                hook_repr = self.hooks[hook_name].__class__.__name__
+            registry_repr += f" ({i}) {hook_name} - {hook_repr}"
             if i < len(self._hook_order) - 1:
-                hook_repr += "\n"
-        return f"HookRegistry(\n{hook_repr}\n)"
+                registry_repr += "\n"
+        return f"HookRegistry(\n{registry_repr}\n)"
