From d15d090d554927f28db35761926c9b0811edadd4 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Mon, 19 May 2025 21:50:57 +0200
Subject: [PATCH 01/15] initial support

---
 src/diffusers/__init__.py                     |   4 +
 src/diffusers/loaders/peft.py                 |   1 +
 src/diffusers/models/__init__.py              |   2 +
 src/diffusers/models/transformers/__init__.py |   1 +
 .../models/transformers/transformer_wan.py    |   2 +-
 .../transformers/transformer_wan_vace.py      | 389 ++++++++
 src/diffusers/pipelines/__init__.py           |   4 +-
 src/diffusers/pipelines/wan/__init__.py       |   2 +
 .../pipelines/wan/pipeline_wan_vace.py        | 848 ++++++++++++++++++
 9 files changed, 1250 insertions(+), 3 deletions(-)
 create mode 100644 src/diffusers/models/transformers/transformer_wan_vace.py
 create mode 100644 src/diffusers/pipelines/wan/pipeline_wan_vace.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 9ab973351c86..5d7a9d9d232e 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -215,6 +215,7 @@
             "UVit2DModel",
             "VQModel",
             "WanTransformer3DModel",
+            "WanVACETransformer3DModel",
         ]
     )
     _import_structure["optimization"] = [
@@ -526,6 +527,7 @@
             "VQDiffusionPipeline",
             "WanImageToVideoPipeline",
             "WanPipeline",
+            "WanVACEPipeline",
             "WanVideoToVideoPipeline",
             "WuerstchenCombinedPipeline",
             "WuerstchenDecoderPipeline",
@@ -819,6 +821,7 @@
             UVit2DModel,
             VQModel,
             WanTransformer3DModel,
+            WanVACETransformer3DModel,
         )
         from .optimization import (
             get_constant_schedule,
@@ -1109,6 +1112,7 @@
             VQDiffusionPipeline,
             WanImageToVideoPipeline,
             WanPipeline,
+            WanVACEPipeline,
             WanVideoToVideoPipeline,
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py
index 7a970c5c5153..208034a0fce1 100644
--- a/src/diffusers/loaders/peft.py
+++ b/src/diffusers/loaders/peft.py
@@ -58,6 +58,7 @@
     "CogView4Transformer2DModel": lambda model_cls, weights: weights,
     "HiDreamImageTransformer2DModel": lambda model_cls, weights: weights,
     "HunyuanVideoFramepackTransformer3DModel": lambda model_cls, weights: weights,
+    "WanVACETransformer3DModel": lambda model_cls, weights: weights,
 }
 
 
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 58322800332a..8723fbca2187 100755
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -89,6 +89,7 @@
     _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
     _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
     _import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
+    _import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
     _import_structure["unets.unet_1d"] = ["UNet1DModel"]
     _import_structure["unets.unet_2d"] = ["UNet2DModel"]
     _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
@@ -178,6 +179,7 @@
             Transformer2DModel,
             TransformerTemporalModel,
             WanTransformer3DModel,
+            WanVACETransformer3DModel,
         )
         from .unets import (
             I2VGenXLUNet,
diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py
index 86094104bd1c..e7b8ba55ca61 100755
--- a/src/diffusers/models/transformers/__init__.py
+++ b/src/diffusers/models/transformers/__init__.py
@@ -32,3 +32,4 @@
     from .transformer_sd3 import SD3Transformer2DModel
     from .transformer_temporal import TransformerTemporalModel
     from .transformer_wan import WanTransformer3DModel
+    from .transformer_wan_vace import WanVACETransformer3DModel
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
index c78d72dc4a2c..ba7aa06d0613 100644
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -340,7 +340,7 @@ class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
 
     _supports_gradient_checkpointing = True
     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
-    _no_split_modules = ["WanTransformerBlock"]
+    _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
 
diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
new file mode 100644
index 000000000000..e6b25672c1f8
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -0,0 +1,389 @@
+# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ..attention import FeedForward
+from ..attention_processor import Attention
+from ..cache_utils import CacheMixin
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import FP32LayerNorm
+from .transformer_wan import WanAttnProcessor2_0, WanRotaryPosEmbed, WanTimeTextImageEmbedding, WanTransformerBlock
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class WanVACETransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        ffn_dim: int,
+        num_heads: int,
+        qk_norm: str = "rms_norm_across_heads",
+        cross_attn_norm: bool = False,
+        eps: float = 1e-6,
+        added_kv_proj_dim: Optional[int] = None,
+        apply_input_projection: bool = False,
+        apply_output_projection: bool = False,
+    ):
+        r"""
+        Build one VACE block: self-attention, cross-attention and a feed-forward network, with optional `dim -> dim`
+        linear projections on the way in (`apply_input_projection`) and out (`apply_output_projection`).
+
+        Each attention layer uses `num_heads` heads of `dim // num_heads` channels, and `ffn_dim` is the inner width
+        of the feed-forward network. `qk_norm` and `eps` are forwarded to both `Attention` layers, while
+        `added_kv_proj_dim` goes to the cross-attention layer only. `cross_attn_norm` selects an affine
+        `FP32LayerNorm` before cross-attention; otherwise `nn.Identity` is used.
+        """
+        super().__init__()
+
+        # Settings common to the self- and cross-attention layers.
+        shared_attention_kwargs = {
+            "query_dim": dim,
+            "heads": num_heads,
+            "kv_heads": num_heads,
+            "dim_head": dim // num_heads,
+            "qk_norm": qk_norm,
+            "eps": eps,
+            "bias": True,
+            "cross_attention_dim": None,
+            "out_bias": True,
+        }
+
+        # 1. Input projection (only when requested)
+        self.proj_in = nn.Linear(dim, dim) if apply_input_projection else None
+
+        # 2. Self-attention
+        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+        self.attn1 = Attention(**shared_attention_kwargs, processor=WanAttnProcessor2_0())
+
+        # 3. Cross-attention
+        self.attn2 = Attention(
+            **shared_attention_kwargs,
+            added_kv_proj_dim=added_kv_proj_dim,
+            added_proj_bias=True,
+            processor=WanAttnProcessor2_0(),
+        )
+        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+        # 4. Feed-forward
+        self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+        # 5. Output projection (only when requested)
+        self.proj_out = nn.Linear(dim, dim) if apply_output_projection else None
+
+        # Learned table of six modulation rows that `forward` offsets with `temb` and splits with `chunk(6)`.
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        control_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        rotary_emb: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        r"""
+        Run one VACE block and return `(hidden_states, control_hidden_states)`. With `proj_in`, the block processes
+        `hidden_states + proj_in(control_hidden_states)`; otherwise it processes `control_hidden_states` and ignores
+        `hidden_states`. The returned control states are `proj_out(hidden_states)` whenever `proj_out` exists.
+        """
+        if self.proj_in is not None:
+            control_hidden_states = self.proj_in(control_hidden_states)
+            hidden_states = hidden_states + control_hidden_states
+        else:
+            hidden_states = control_hidden_states
+        modulation = (self.scale_shift_table + temb.float()).chunk(6, dim=1)
+        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = modulation
+
+        # 1. Self-attention
+        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
+        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+        # 2. Cross-attention
+        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+        hidden_states = hidden_states + attn_output
+
+        # 3. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa
+        ff_output = self.ffn(norm_hidden_states.type_as(hidden_states))
+        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+        if self.proj_out is not None:
+            control_hidden_states = self.proj_out(hidden_states)
+        return hidden_states, control_hidden_states
+
+
+class WanVACETransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+    r"""
+    A Transformer model for video-like data used in the Wan VACE model.
+
+    Args:
+        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+        num_attention_heads (`int`, defaults to `40`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output.
+        text_dim (`int`, defaults to `4096`):
+            Input dimension for text embeddings.
+        freq_dim (`int`, defaults to `256`):
+            Dimension for sinusoidal time embeddings.
+        ffn_dim (`int`, defaults to `13824`):
+            Intermediate dimension in feed-forward network.
+        num_layers (`int`, defaults to `40`):
+            The number of layers of transformer blocks to use.
+        vace_layers (`List[int]`, defaults to `[0, 5, 10, 15, 20, 25, 30, 35]`):
+            Indices of the transformer blocks after which a VACE control hint is added. Must include `0`.
+        cross_attn_norm (`bool`, defaults to `True`):
+            Enable cross-attention normalization.
+        qk_norm (`str`, *optional*, defaults to `"rms_norm_across_heads"`):
+            Query/key normalization mode passed to the attention layers of every block.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        vace_in_channels (`int`, defaults to `96`):
+            The number of channels in the control input embedded by `vace_patch_embedding`.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+    _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]  # both block types are built here
+    _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+    _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: Tuple[int] = (1, 2, 2),
+        num_attention_heads: int = 40,
+        attention_head_dim: int = 128,
+        in_channels: int = 16,
+        out_channels: int = 16,
+        text_dim: int = 4096,
+        freq_dim: int = 256,
+        ffn_dim: int = 13824,
+        num_layers: int = 40,
+        cross_attn_norm: bool = True,
+        qk_norm: Optional[str] = "rms_norm_across_heads",
+        eps: float = 1e-6,
+        image_dim: Optional[int] = None,
+        added_kv_proj_dim: Optional[int] = None,
+        rope_max_seq_len: int = 1024,
+        pos_embed_seq_len: Optional[int] = None,
+        vace_layers: List[int] = [0, 5, 10, 15, 20, 25, 30, 35],
+        vace_in_channels: int = 96,
+    ) -> None:
+        r"""
+        Build the patch embeddings, condition embedder, transformer blocks and output head.
+
+        Besides the `num_layers` regular `WanTransformerBlock`s, one `WanVACETransformerBlock` is created for every
+        entry of `vace_layers`. Control inputs with `vace_in_channels` channels get their own patch embedding
+        (`vace_patch_embedding`), which uses the same `patch_size` as the main one.
+
+        Raises:
+            ValueError: If `vace_layers` does not contain `0`, or if any of its entries is not smaller than
+                `num_layers`.
+
+        NOTE: the list default of `vace_layers` is only read here, never mutated, so sharing it between instances
+        is harmless within this method.
+        """
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+
+        # Check for layer 0 first so that an empty `vace_layers` gets a clear error instead of failing in `max()`.
+        if 0 not in vace_layers:
+            raise ValueError("VACE layers must include layer 0.")
+        if max(vace_layers) >= num_layers:
+            raise ValueError(f"VACE layers {vace_layers} exceed the number of transformer layers {num_layers}.")
+
+        # 1. Patch & position embedding
+        self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+        self.vace_patch_embedding = nn.Conv3d(vace_in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        # 2. Condition embeddings
+        # image_embedding_dim=1280 for I2V model
+        self.condition_embedder = WanTimeTextImageEmbedding(
+            dim=inner_dim,
+            time_freq_dim=freq_dim,
+            time_proj_dim=inner_dim * 6,
+            text_embed_dim=text_dim,
+            image_embed_dim=image_dim,
+            pos_embed_seq_len=pos_embed_seq_len,
+        )
+
+        # 3. Transformer blocks
+        block_args = (inner_dim, ffn_dim, num_attention_heads, qk_norm, cross_attn_norm, eps, added_kv_proj_dim)
+        self.blocks = nn.ModuleList([WanTransformerBlock(*block_args) for _ in range(num_layers)])
+
+        # One VACE block per entry of `vace_layers`; only the first projects the control input, all project out.
+        self.vace_blocks = nn.ModuleList(
+            [
+                WanVACETransformerBlock(*block_args, apply_input_projection=i == 0, apply_output_projection=True)
+                for i in range(len(vace_layers))
+            ]
+        )
+
+        # 4. Output norm & projection
+        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        control_hidden_states: torch.Tensor = None,
+        control_hidden_states_scale: torch.Tensor = None,
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        r"""
+        Denoise `hidden_states` with the Wan blocks, adding one VACE control hint after each layer in `vace_layers`.
+
+        `control_hidden_states` is required (`ValueError` if `None`). `control_hidden_states_scale` holds one
+        multiplier per VACE block, in `vace_blocks` order, and defaults to all ones. `attention_kwargs` may carry a
+        LoRA `scale`. Returns a `Transformer2DModelOutput`, or the 1-tuple `(sample,)` when `return_dict=False`.
+        """
+        if control_hidden_states is None:
+            raise ValueError("Control hidden states must be provided for VACE models.")
+
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.config.patch_size
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p_h
+        post_patch_width = width // p_w
+
+        if control_hidden_states_scale is None:
+            control_hidden_states_scale = control_hidden_states.new_ones(len(self.config.vace_layers))
+
+        # 1. Rotary position embedding
+        rotary_emb = self.rope(hidden_states)
+
+        # 2. Patch embedding
+        hidden_states = self.patch_embedding(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        control_hidden_states = self.vace_patch_embedding(control_hidden_states)
+        control_hidden_states = control_hidden_states.flatten(2).transpose(1, 2)
+        control_hidden_states_padding = control_hidden_states.new_zeros(
+            batch_size, hidden_states.size(1) - control_hidden_states.size(1), control_hidden_states.size(2)
+        )
+        control_hidden_states = torch.cat([control_hidden_states, control_hidden_states_padding], dim=1)
+
+        # 3. Time embedding
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep, encoder_hidden_states, encoder_hidden_states_image
+        )
+        timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+        # 4. Image embedding
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 5. Transformer blocks
+        use_checkpointing = torch.is_grad_enabled() and self.gradient_checkpointing
+
+        # Prepare VACE hints. Hint `idx` comes from `vace_blocks[idx]`, so it is paired with scale `idx`: the scale
+        # tensor has one entry per VACE block and must not be indexed with the (larger) transformer layer index.
+        # NOTE(review): blocks after the first consume the previous block's `proj_out` output as their input -
+        # confirm this chaining matches the reference VACE implementation.
+        control_hints = []
+        vace_hidden_states = hidden_states
+        for idx, block in enumerate(self.vace_blocks):
+            block_args = (vace_hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb)
+            if use_checkpointing:
+                vace_hidden_states, control_hidden_states = self._gradient_checkpointing_func(block, *block_args)
+            else:
+                vace_hidden_states, control_hidden_states = block(*block_args)
+            control_hints.append((control_hidden_states, control_hidden_states_scale[idx]))
+        control_hints.reverse()  # `pop()` below then yields the hints in block order
+
+        for i, block in enumerate(self.blocks):
+            if use_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+                )
+            else:
+                hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+            if i in self.config.vace_layers:
+                control_hint, hint_scale = control_hints.pop()
+                hidden_states = hidden_states + control_hint * hint_scale
+
+        # 6. Output norm, projection & unpatchify
+        shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
+
+        # Move the shift and scale tensors to the same device as hidden_states.
+        # When using multi-GPU inference via accelerate these will be on the
+        # first device rather than the last device, which hidden_states ends up
+        # on.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+
+        hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+        )
+        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+        output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 4debb868d9dc..17ba9e2d0df7 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -366,7 +366,7 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"]
+    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline", "WanVACEPipeline"]
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()
@@ -734,7 +734,7 @@
             UniDiffuserTextDecoder,
         )
         from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline
-        from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
+        from .wan import WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
         from .wuerstchen import (
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
diff --git a/src/diffusers/pipelines/wan/__init__.py b/src/diffusers/pipelines/wan/__init__.py
index 80916a8a1e10..bb96372b1db2 100644
--- a/src/diffusers/pipelines/wan/__init__.py
+++ b/src/diffusers/pipelines/wan/__init__.py
@@ -24,6 +24,7 @@
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
+    _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
     _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -35,6 +36,7 @@
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_vace import WanVACEPipeline
         from .pipeline_wan_video2video import WanVideoToVideoPipeline
 
 else:
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
new file mode 100644
index 000000000000..235e247cbab8
--- /dev/null
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -0,0 +1,848 @@
+# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import PIL.Image
+import regex as re
+import torch
+from transformers import AutoTokenizer, UMT5EncoderModel
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...loaders import WanLoraLoaderMixin
+from ...models import AutoencoderKLWan, WanTransformer3DModel, WanVACETransformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import WanPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers.utils import export_to_video
+        >>> from diffusers import AutoencoderKLWan, WanPipeline
+        >>> from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+
+        >>> # Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+        >>> model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+        >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+        >>> pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+        >>> flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
+        >>> pipe.to("cuda")
+
+        >>> prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
+        >>> negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
+
+        >>> output = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     height=720,
+        ...     width=1280,
+        ...     num_frames=81,
+        ...     guidance_scale=5.0,
+        ... ).frames[0]
+        >>> export_to_video(output, "output.mp4", fps=16)
+        ```
+"""
+
+
+def basic_clean(text):
+    """Repair the text with `ftfy.fix_text`, HTML-unescape it twice, and strip surrounding whitespace."""
+    unescaped = html.unescape(html.unescape(ftfy.fix_text(text)))
+    return unescaped.strip()
+
+
+def whitespace_clean(text):
+    """Collapse every run of whitespace into a single space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip()
+
+
+def prompt_clean(text):
+    """Normalize a prompt by applying `basic_clean` and then `whitespace_clean`."""
+    return whitespace_clean(basic_clean(text))
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin):
+    r"""
+    Pipeline for controllable generation using Wan.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`T5Tokenizer`]):
+            Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
+            specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        text_encoder ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        transformer ([`WanVACETransformer3DModel`]):
+            Conditional Transformer to denoise the input latents.
+        scheduler ([`UniPCMultistepScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKLWan`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        text_encoder: UMT5EncoderModel,
+        transformer: WanVACETransformer3DModel,
+        vae: AutoencoderKLWan,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        """Register the pipeline components and derive the VAE scale factors used by the video processor."""
+        super().__init__()
+
+        self.register_modules(
+            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, transformer=transformer, scheduler=scheduler
+        )
+
+        # Scale factors come from the VAE's `temperal_downsample` attribute: the sum of its entries
+        # gives the temporal exponent and its length the spatial exponent (both base 2). When no VAE
+        # is registered, fall back to 4 (temporal) and 8 (spatial).
+        self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Tokenize and encode `prompt` with the text encoder, zeroing every position past each prompt's real length.
+
+        Args:
+            prompt: A single prompt or a list of prompts; each one is passed through `prompt_clean` first.
+            num_videos_per_prompt: How many times each prompt embedding is repeated in the output batch.
+            max_sequence_length: Token length the prompts are padded/truncated to; also the output sequence length.
+            device: Target device; defaults to the pipeline's execution device.
+            dtype: Target dtype; defaults to the text encoder's dtype.
+
+        Returns:
+            `torch.Tensor` of shape `(len(prompt) * num_videos_per_prompt, max_sequence_length, hidden_dim)`.
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt = [prompt_clean(u) for u in prompt]
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+        # Number of attended (non-padding) tokens per prompt.
+        seq_lens = mask.gt(0).sum(dim=1).long()
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        # Drop the embeddings of padding positions, then re-pad with exact zeros back to `max_sequence_length`.
+        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+        prompt_embeds = torch.stack(
+            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+        )
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            max_sequence_length (`int`, *optional*, defaults to 226):
+                Token length that prompts are padded or truncated to before encoding.
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on. Defaults to the pipeline's execution device.
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype of the generated embeddings.
+
+        Returns:
+            `tuple`: `(prompt_embeds, negative_prompt_embeds)`. `negative_prompt_embeds` is returned exactly as
+            passed in (possibly `None`) when `do_classifier_free_guidance` is `False`.
+
+        Raises:
+            TypeError: If `negative_prompt` and `prompt` are of different types.
+            ValueError: If `negative_prompt` does not have the same batch size as `prompt`.
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            # A missing negative prompt becomes the empty string; a single string is repeated for every prompt.
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            # `prompt` was normalized to a list above, so this compares list against list.
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        height,
+        width,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        video=None,
+        mask=None,
+        reference_images=None,
+    ):
+        r"""
+        Validate the arguments of `__call__` and raise on the first inconsistency found.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                Text prompt. Exactly one of `prompt` and `prompt_embeds` must be given.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                Negative prompt. Cannot be combined with `negative_prompt_embeds`.
+            height (`int`), width (`int`):
+                Requested output size. Both must be divisible by `vae_scale_factor_spatial * patch_size`.
+            prompt_embeds (`torch.Tensor`, *optional*), negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed embeddings, see above.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                Every entry must be listed in `self._callback_tensor_inputs`.
+            video (*optional*):
+                Conditioning video. `mask` and `reference_images` are only validated when it is passed.
+            mask (*optional*):
+                Conditioning mask. Must have the same length as `video` and cannot be passed without it.
+            reference_images (*optional*):
+                A `PIL.Image.Image`, a list of them, or a list containing exactly one list of them.
+
+        Raises:
+            ValueError: If any of the constraints above is violated.
+        """
+        # NOTE(review): this product assumes `patch_size` is a scalar -- confirm against the transformer config.
+        base = self.vae_scale_factor_spatial * self.transformer.config.patch_size
+        if height % base != 0 or width % base != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by {base} but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif negative_prompt is not None and not isinstance(negative_prompt, (str, list)):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        if video is not None:
+            if mask is not None:
+                if len(video) != len(mask):
+                    raise ValueError(
+                        f"Length of `video` {len(video)} and `mask` {len(mask)} do not match. Please make sure that"
+                        " they have the same length."
+                    )
+            if reference_images is not None:
+                is_pil_image = isinstance(reference_images, PIL.Image.Image)
+                is_list_of_pil_images = isinstance(reference_images, list) and all(
+                    isinstance(ref_img, PIL.Image.Image) for ref_img in reference_images
+                )
+                is_list_of_list_of_pil_images = isinstance(reference_images, list) and all(
+                    isinstance(ref_img, list) and all(isinstance(ref_img_, PIL.Image.Image) for ref_img_ in ref_img)
+                    for ref_img in reference_images
+                )
+                if not (is_pil_image or is_list_of_pil_images or is_list_of_list_of_pil_images):
+                    # The second fragment must be an f-string, otherwise the offending type is never interpolated.
+                    raise ValueError(
+                        "`reference_images` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image`, or "
+                        f"`list` of `list` of `PIL.Image.Image`, but is {type(reference_images)}"
+                    )
+                if is_list_of_list_of_pil_images and len(reference_images) != 1:
+                    raise ValueError(
+                        "The pipeline only supports generating one video at a time at the moment. When passing a list "
+                        "of list of reference images, where the outer list corresponds to the batch size and the inner "
+                        "list corresponds to list of conditioning images per video, please make sure to only pass "
+                        "one inner list of reference images (i.e., `[[<image1>, <image2>, ...]]`)"
+                    )
+        elif mask is not None:
+            raise ValueError("`mask` can only be passed if `video` is passed as well.")
+
+    def preprocess_conditions(
+        self,
+        video: Optional[List[PipelineImageInput]] = None,
+        mask: Optional[List[PipelineImageInput]] = None,
+        reference_images: Optional[List[PipelineImageInput]] = None,
+        batch_size: int = 1,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        r"""
+        Convert the raw conditioning inputs into tensors on `device` with `dtype`.
+
+        Args:
+            video: Conditioning video. When `None`, an all-zero tensor is used instead.
+            mask: Conditioning mask. When `None`, an all-ones tensor shaped like the video is used instead.
+            reference_images: A single reference image or a list of them. Each one is resized to fit inside the
+                video frame size while keeping its aspect ratio, then centered on a canvas filled with ones.
+            batch_size: Batch dimension of the placeholder video and of each reference canvas.
+            height, width: Size of the placeholder video and target size used to preprocess `mask`.
+            num_frames: Frame count of the placeholder video.
+            dtype, device: Target dtype/device of the returned tensors.
+
+        Returns:
+            `(video, mask, reference_images_preprocessed)`, where the last element is a list with one canvas of
+            shape `(batch_size, 1, 3, *image_size)` per reference image (empty when none were passed).
+        """
+        if video is not None:
+            video = self.video_processor.preprocess_video(video, None, None)  # Use the height/width of video
+            image_size = tuple(video.shape[-2:])
+        else:
+            # NOTE(review): the placeholder is laid out as (batch, frames, channels, H, W) -- confirm this matches
+            # the layout returned by `preprocess_video` in the branch above.
+            video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=dtype, device=device)
+            image_size = (height, width)  # Use the height/width provided by user
+
+        if mask is not None:
+            # NOTE(review): the mask is resized to `height`/`width` while a user video keeps its own size, so the
+            # two can differ spatially -- confirm this is intended.
+            mask = self.video_processor.preprocess_video(mask, height, width)
+        else:
+            mask = torch.ones_like(video, dtype=dtype, device=device)
+
+        video = video.to(dtype=dtype, device=device)
+        mask = mask.to(dtype=dtype, device=device)
+
+        reference_images_preprocessed = []
+        if reference_images is not None:
+            if not isinstance(reference_images, list):
+                reference_images = [reference_images]
+            # The loop index `i` is not used in the body.
+            for i, image in enumerate(reference_images):
+                image = self.video_processor.preprocess(image, None, None)  # Use the height/width of image
+
+                # Largest uniform scale at which the image still fits inside `image_size`.
+                img_height, img_width = image.shape[-2:]
+                scale = min(image_size[0] / img_height, image_size[1] / img_width)
+                new_height, new_width = int(img_height * scale), int(img_width * scale)
+                # NOTE(review): `unsqueeze(1)` adds a dimension before a "bilinear" resize with a 2-element size --
+                # presumably `image` is expected to be 3-dimensional here; verify against `preprocess`.
+                resized_image = torch.nn.functional.interpolate(
+                    image.unsqueeze(1), size=(new_height, new_width), mode="bilinear", align_corners=False
+                ).squeeze(1)
+
+                # Center the resized image; the uncovered border of the canvas keeps the fill value 1.
+                top = (image_size[0] - new_height) // 2
+                left = (image_size[1] - new_width) // 2
+                canvas = torch.ones(batch_size, 1, 3, *image_size, device=device, dtype=dtype)
+                canvas[:, :, :, top : top + new_height, left : left + new_width] = resized_image
+                reference_images_preprocessed.append(canvas)
+
+        return video, mask, reference_images_preprocessed
+
+    def prepare_video_latents(
+        self,
+        video: torch.Tensor,
+        mask: torch.Tensor,
+        reference_images: Optional[List[torch.Tensor]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ) -> List[torch.Tensor]:
+        r"""
+        Encode the conditioning video, and optional reference images, into VAE latents.
+
+        When `mask` is given it is binarized at 0.5 and the video is split into the part where the mask is 0 and
+        the part where it is 1. Both parts are encoded and concatenated per video along dim 0. Latents of reference
+        images are extended with zeros of the same shape along dim 0 and prepended to the video latents along dim 1.
+
+        Args:
+            video: Batched conditioning video; only a batch size of 1 is accepted.
+            mask: Batched conditioning mask, or `None` to encode the video as is.
+            reference_images: One tensor of reference images per video. `None` or an empty list means that no
+                reference latents are prepended.
+            generator: Forwarded to `retrieve_latents`; a list of generators is rejected.
+
+        Returns:
+            A list with one latent tensor per video.
+
+        Raises:
+            ValueError: For a list of generators, a batch size other than 1, or a `reference_images` list whose
+                length differs from the batch size.
+        """
+        if isinstance(generator, list):
+            # TODO: support this
+            raise ValueError("Passing a list of generators is not yet supported. This may be supported in the future.")
+
+        if reference_images is None or len(reference_images) == 0:
+            # One `None` per video marks "no reference images". The placeholder has to be `None` itself (not a list
+            # wrapping it) so that the `is not None` check in the loop below can skip it. An empty list is treated
+            # the same way because that is what `preprocess_conditions` returns without reference images.
+            reference_images = [None for _ in range(video.shape[0])]
+        else:
+            if video.shape[0] != len(reference_images):
+                raise ValueError(
+                    f"Batch size of `video` {video.shape[0]} and length of `reference_images` {len(reference_images)} does not match."
+                )
+
+        if video.shape[0] != 1:
+            # TODO: support this
+            raise ValueError(
+                "Generating with more than one video is not yet supported. This may be supported in the future."
+            )
+
+        vae_dtype = self.vae.dtype
+        video = video.to(dtype=vae_dtype)
+
+        if mask is None:
+            latents = retrieve_latents(self.vae.encode(video), generator, sample_mode="argmax").unbind(0)
+        else:
+            mask = mask.to(dtype=vae_dtype)
+            mask = [torch.where(m > 0.5, 1.0, 0.0) for m in mask]
+            inactive = [v * (1 - m) for v, m in zip(video, mask)]
+            reactive = [v * m for v, m in zip(video, mask)]
+            # NOTE(review): `inactive`/`reactive` are Python lists here -- confirm `vae.encode` accepts them.
+            inactive = retrieve_latents(self.vae.encode(inactive), generator, sample_mode="argmax")
+            reactive = retrieve_latents(self.vae.encode(reactive), generator, sample_mode="argmax")
+            latents = [torch.cat([i, r], dim=0) for i, r in zip(inactive, reactive)]
+
+        latent_list = []
+        for latent, ref_images in zip(latents, reference_images):
+            if ref_images is not None:
+                ref_images = ref_images.to(dtype=vae_dtype)
+                ref_latents = retrieve_latents(self.vae.encode(ref_images), generator, sample_mode="argmax")
+                ref_latents = [torch.cat([r, torch.zeros_like(r)], dim=0) for r in ref_latents]
+                # Only concatenate when reference latents exist; `ref_latents` is undefined otherwise.
+                latent = torch.cat([*ref_latents, latent], dim=1)
+            latent_list.append(latent)
+        return latent_list
+
+    def prepare_masks(
+        self,
+        mask: torch.Tensor,
+        reference_images: Optional[List[torch.Tensor]] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ) -> List[torch.Tensor]:
+        r"""
+        Bring the conditioning mask to the resolution of the video latents.
+
+        For each video, the first channel of the mask is taken and every
+        `vae_scale_factor_spatial x vae_scale_factor_spatial` pixel block is moved into the leading dimension. The
+        result is then resized with nearest-exact interpolation to the latent frame count and latent spatial size.
+        For every reference image one all-zero frame is prepended, matching `prepare_video_latents`, which puts the
+        reference latents in front of the video latents.
+
+        Args:
+            mask: Tensor of shape `(batch, num_frames, channels, height, width)`; only a batch size of 1 is
+                accepted and `height`/`width` must be divisible by `vae_scale_factor_spatial`.
+            reference_images: One tensor per video whose first dimension is the number of reference images.
+                `None` or an empty list means that no frames are prepended.
+            generator: Only checked for being a list, which is rejected.
+
+        Returns:
+            A list with one tensor of shape
+            `(vae_scale_factor_spatial ** 2, num_ref_images + latent_frames, latent_height, latent_width)` per video.
+
+        Raises:
+            ValueError: For a list of generators, a batch size other than 1, or a `reference_images` list whose
+                length differs from the batch size.
+        """
+        if isinstance(generator, list):
+            # TODO: support this
+            raise ValueError("Passing a list of generators is not yet supported. This may be supported in the future.")
+
+        if reference_images is None or len(reference_images) == 0:
+            # One `None` per video marks "no reference images"; it must be `None` itself so the check below works.
+            reference_images = [None for _ in range(mask.shape[0])]
+        else:
+            if mask.shape[0] != len(reference_images):
+                raise ValueError(
+                    f"Batch size of `mask` {mask.shape[0]} and length of `reference_images` {len(reference_images)} does not match."
+                )
+
+        if mask.shape[0] != 1:
+            # TODO: support this
+            raise ValueError(
+                "Generating with more than one video is not yet supported. This may be supported in the future."
+            )
+
+        mask_list = []
+        spatial_factor = self.vae_scale_factor_spatial
+        temporal_factor = self.vae_scale_factor_temporal
+        # NOTE(review): the arithmetic below assumes `patch_size` is a scalar -- confirm against the config.
+        transformer_patch_size = self.transformer.config.patch_size
+        for mask_, ref_images in zip(mask, reference_images):
+            num_frames, _, height, width = mask_.shape
+            new_num_frames = (num_frames + temporal_factor - 1) // temporal_factor
+            new_height = height // (spatial_factor * transformer_patch_size) * transformer_patch_size
+            new_width = width // (spatial_factor * transformer_patch_size) * transformer_patch_size
+            mask_ = mask_[:, 0, :, :]
+            # Split H and W into (blocks, pixels-per-block). The element count has to stay the same, so the block
+            # counts are `height // factor` and `width // factor`, not `height` and `width`.
+            mask_ = mask_.reshape(
+                num_frames, height // spatial_factor, spatial_factor, width // spatial_factor, spatial_factor
+            )
+            # Move the in-block offsets to the front and merge only those two, keeping (frames, H, W) separate so
+            # the 3-D resize below can operate on them.
+            mask_ = mask_.permute(2, 4, 0, 1, 3).flatten(0, 1)
+            mask_ = torch.nn.functional.interpolate(
+                mask_.unsqueeze(0), size=(new_num_frames, new_height, new_width), mode="nearest-exact"
+            ).squeeze(0)
+            if ref_images is not None:
+                num_ref_images = ref_images.size(0)
+                # Build the padding from the per-video mask (not the batched input) so that only the frame
+                # dimension differs, and put it in front to line up with the prepended reference latents.
+                mask_padding = mask_.new_zeros(mask_.size(0), num_ref_images, *mask_.shape[2:])
+                mask_ = torch.cat([mask_padding, mask_], dim=1)
+            mask_list.append(mask_)
+        return mask_list
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""
+        Return the initial latents for the denoising loop.
+
+        User-supplied `latents` are only moved to `device`/`dtype`. Otherwise noise is drawn with `randn_tensor`
+        in the shape `(batch_size, num_channels_latents, latent_frames, latent_height, latent_width)`, where the
+        latent sizes follow from the VAE temporal and spatial scale factors.
+
+        Raises:
+            ValueError: If `generator` is a list whose length differs from `batch_size` (only checked when noise
+                has to be sampled).
+        """
+        if latents is None:
+            temporal_factor = self.vae_scale_factor_temporal
+            spatial_factor = self.vae_scale_factor_spatial
+            noise_shape = (
+                batch_size,
+                num_channels_latents,
+                (num_frames - 1) // temporal_factor + 1,
+                int(height) // spatial_factor,
+                int(width) // spatial_factor,
+            )
+
+            # A list of generators must provide exactly one generator per sample.
+            if isinstance(generator, list):
+                if len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+            return randn_tensor(noise_shape, generator=generator, device=device, dtype=dtype)
+
+        return latents.to(device=device, dtype=dtype)
+
+    @property
+    def guidance_scale(self):
+        """Guidance scale stored by `__call__` from its `guidance_scale` argument."""
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        """Whether classifier-free guidance is active, i.e. the stored guidance scale is greater than 1.0."""
+        return self._guidance_scale > 1.0
+
+    @property
+    def num_timesteps(self):
+        """Number of scheduler timesteps, stored by `__call__` right before the denoising loop."""
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        """Timestep of the denoising step being processed; `__call__` sets it to `None` before and after the loop."""
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        """Flag checked at the start of every denoising step; while it is truthy the step is skipped."""
+        return self._interrupt
+
+    @property
+    def attention_kwargs(self):
+        """Attention keyword arguments stored by `__call__` from its `attention_kwargs` argument."""
+        return self._attention_kwargs
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        video: Optional[List[PipelineImageInput]] = None,
+        mask: Optional[List[PipelineImageInput]] = None,
+        reference_images: Optional[List[PipelineImageInput]] = None,
+        conditioning_scale: Union[float, List[float], torch.Tensor] = 1.0,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass
+                `prompt_embeds` instead. At the moment anything other than a single `str` is rejected.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the generation. Only used when `guidance_scale > 1`.
+            video (`List[PipelineImageInput]`, *optional*):
+                The conditioning video. If not provided, an all-zero video with `num_frames` frames of size
+                `height` x `width` is used.
+            mask (`List[PipelineImageInput]`, *optional*):
+                The conditioning mask. Can only be passed together with `video` and must have the same length. If
+                not provided, an all-ones mask is used. Values are binarized at `0.5` before use.
+            reference_images (`PIL.Image.Image`, `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`, *optional*):
+                Reference images used as additional conditioning. When passing a list of lists, only a single
+                inner list is supported.
+            conditioning_scale (`float`, `List[float]` or `torch.Tensor`, defaults to `1.0`):
+                Scale passed to the transformer as `control_hidden_states_scale`. A single number is repeated for
+                every entry of `transformer.config.vace_layers`; a list or tensor must have that many entries.
+            height (`int`, defaults to `480`):
+                The height in pixels of the generated video.
+            width (`int`, defaults to `832`):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `81`):
+                The number of frames in the generated video. If `num_frames - 1` is not divisible by the VAE temporal
+                scale factor, a warning is logged and the value is replaced by
+                `num_frames // factor * factor + 1`.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to `5.0`):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt. At the moment only `1` is accepted.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic. A list of generators is currently rejected when the conditioning inputs
+                are encoded.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. If not provided, they are generated from the
+                `negative_prompt` input argument when guidance is enabled.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. With `"latent"` the denoised latents are returned without
+                decoding; any other value is forwarded to the video processor's `postprocess_video`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `512`):
+                Token length that prompts are padded or truncated to before text encoding.
+
+        Examples:
+
+        Returns:
+            [`~WanPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned whose
+                only element is the generated video (or the latents when `output_type="latent"`).
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # Simplification of implementation for now
+        # NOTE(review): this check also rejects `prompt=None`, so `prompt_embeds` cannot be used on its own even
+        # though `check_inputs` and the batch-size logic below allow for it -- confirm this is intended.
+        if not isinstance(prompt, str):
+            raise ValueError("Passing a list of prompts is not yet supported. This may be supported in the future.")
+        if num_videos_per_prompt != 1:
+            raise ValueError(
+                "Generating multiple videos per prompt is not yet supported. This may be supported in the future."
+            )
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            video,
+            mask,
+            reference_images,
+        )
+
+        # Snap `num_frames` to a value of the form `k * temporal_factor + 1`.
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        # State exposed through the read-only properties of the pipeline.
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        vae_dtype = self.vae.dtype
+        transformer_dtype = self.transformer.dtype
+
+        # Normalize `conditioning_scale` to a tensor with one entry per VACE layer:
+        # number -> list -> tensor, validating the length at the list and tensor stages.
+        if isinstance(conditioning_scale, (int, float)):
+            conditioning_scale = [conditioning_scale] * len(self.transformer.config.vace_layers)
+        if isinstance(conditioning_scale, list):
+            if len(conditioning_scale) != len(self.transformer.config.vace_layers):
+                raise ValueError(
+                    f"Length of `conditioning_scale` {len(conditioning_scale)} does not match number of layers {len(self.transformer.config.vace_layers)}."
+                )
+            conditioning_scale = torch.tensor(conditioning_scale)
+        if isinstance(conditioning_scale, torch.Tensor):
+            if conditioning_scale.size(0) != len(self.transformer.config.vace_layers):
+                raise ValueError(
+                    f"Length of `conditioning_scale` {conditioning_scale.size(0)} does not match number of layers {len(self.transformer.config.vace_layers)}."
+                )
+            conditioning_scale = conditioning_scale.to(device=device, dtype=transformer_dtype)
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        # Conditioning inputs are preprocessed in float32 and cast to the transformer dtype after encoding.
+        video, mask, reference_images = self.preprocess_conditions(
+            video,
+            mask,
+            reference_images,
+            batch_size,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+        )
+
+        conditioning_latents = self.prepare_video_latents(video, mask, reference_images, generator)
+        conditioning_latents = [c.to(transformer_dtype) for c in conditioning_latents]
+
+        mask = self.prepare_masks(mask, reference_images, generator)
+        mask = [m.to(transformer_dtype) for m in mask]
+
+        # NOTE(review): `c` and `m` are per-video tensors without a batch dimension, so `dim=1` is their second
+        # axis -- confirm this is the intended concatenation axis.
+        conditioning_latents = [torch.cat([c, m], dim=1) for c, m in zip(conditioning_latents, mask)]
+
+        # The noise latents are kept in float32; they are cast per step for the transformer call below.
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # While interrupted, the remaining steps are skipped without touching the latents.
+                if self.interrupt:
+                    continue
+
+                self._current_timestep = t
+                latent_model_input = latents.to(transformer_dtype)
+                timestep = t.expand(latents.shape[0])
+
+                # Conditional prediction.
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    control_hidden_states=conditioning_latents,
+                    control_hidden_states_scale=conditioning_scale,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if self.do_classifier_free_guidance:
+                    # Second forward pass with the negative prompt, then the usual guidance extrapolation.
+                    noise_uncond = self.transformer(
+                        hidden_states=latent_model_input,
+                        timestep=timestep,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        control_hidden_states=conditioning_latents,
+                        control_hidden_states_scale=conditioning_scale,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                    noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    # Collect the requested tensors by name from the local scope and let the callback replace them.
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # advance the progress bar on the last step and on every completed scheduler-order step after warmup
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if not output_type == "latent":
+            latents = latents.to(vae_dtype)
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            # `latents_std` holds the reciprocal of the configured std, so the division below multiplies by the std.
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return WanPipelineOutput(frames=video)

From f9865ff1ffbba5ceea49635877ce47a5aec9cc4f Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Mon, 19 May 2025 21:51:16 +0200
Subject: [PATCH 02/15] make fix-copies

---
 src/diffusers/utils/dummy_pt_objects.py           | 15 +++++++++++++++
 .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 97bc3f317b32..24b3c3d7be59 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -1150,6 +1150,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class WanVACETransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 def get_constant_schedule(*args, **kwargs):
     requires_backends(get_constant_schedule, ["torch"])
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 4ab6091c6dfc..72c21a187dae 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -2882,6 +2882,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class WanVACEPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class WanVideoToVideoPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 

From 32ab1c969c7ef6131a19e1fe8af0acac3e9735cc Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Mon, 19 May 2025 21:53:18 +0200
Subject: [PATCH 03/15] fix no split modules

---
 src/diffusers/models/transformers/transformer_wan.py      | 2 +-
 src/diffusers/models/transformers/transformer_wan_vace.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
index ba7aa06d0613..c78d72dc4a2c 100644
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -340,7 +340,7 @@ class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
 
     _supports_gradient_checkpointing = True
     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
-    _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
+    _no_split_modules = ["WanTransformerBlock"]
     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
 
diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index e6b25672c1f8..34c05424d557 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -176,7 +176,7 @@ class WanVACETransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromO
 
     _supports_gradient_checkpointing = True
     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
-    _no_split_modules = ["WanTransformerBlock"]
+    _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
 

From 834bfc6448d879db48a94e410a375673097a054b Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Mon, 19 May 2025 21:56:11 +0200
Subject: [PATCH 04/15] add conversion script

---
 scripts/convert_wan_to_diffusers.py | 143 +++++++++++++++++++++++++---
 1 file changed, 130 insertions(+), 13 deletions(-)

diff --git a/scripts/convert_wan_to_diffusers.py b/scripts/convert_wan_to_diffusers.py
index ef91e9e6c180..6d25cde071b1 100644
--- a/scripts/convert_wan_to_diffusers.py
+++ b/scripts/convert_wan_to_diffusers.py
@@ -1,6 +1,6 @@
 import argparse
 import pathlib
-from typing import Any, Dict
+from typing import Any, Dict, Tuple
 
 import torch
 from accelerate import init_empty_weights
@@ -14,6 +14,8 @@
     WanImageToVideoPipeline,
     WanPipeline,
     WanTransformer3DModel,
+    WanVACEPipeline,
+    WanVACETransformer3DModel,
 )
 
 
@@ -59,7 +61,52 @@
     "attn2.norm_k_img": "attn2.norm_added_k",
 }
 
+VACE_TRANSFORMER_KEYS_RENAME_DICT = {
+    "time_embedding.0": "condition_embedder.time_embedder.linear_1",
+    "time_embedding.2": "condition_embedder.time_embedder.linear_2",
+    "text_embedding.0": "condition_embedder.text_embedder.linear_1",
+    "text_embedding.2": "condition_embedder.text_embedder.linear_2",
+    "time_projection.1": "condition_embedder.time_proj",
+    "head.modulation": "scale_shift_table",
+    "head.head": "proj_out",
+    "modulation": "scale_shift_table",
+    "ffn.0": "ffn.net.0.proj",
+    "ffn.2": "ffn.net.2",
+    # Hack to swap the layer names
+    # The original model calls the norms in following order: norm1, norm3, norm2
+    # We convert it to: norm1, norm2, norm3
+    "norm2": "norm__placeholder",
+    "norm3": "norm2",
+    "norm__placeholder": "norm3",
+    # # For the I2V model
+    # "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
+    # "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
+    # "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
+    # "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
+    # # for the FLF2V model
+    # "img_emb.emb_pos": "condition_embedder.image_embedder.pos_embed",
+    # Add attention component mappings
+    "self_attn.q": "attn1.to_q",
+    "self_attn.k": "attn1.to_k",
+    "self_attn.v": "attn1.to_v",
+    "self_attn.o": "attn1.to_out.0",
+    "self_attn.norm_q": "attn1.norm_q",
+    "self_attn.norm_k": "attn1.norm_k",
+    "cross_attn.q": "attn2.to_q",
+    "cross_attn.k": "attn2.to_k",
+    "cross_attn.v": "attn2.to_v",
+    "cross_attn.o": "attn2.to_out.0",
+    "cross_attn.norm_q": "attn2.norm_q",
+    "cross_attn.norm_k": "attn2.norm_k",
+    "attn2.to_k_img": "attn2.add_k_proj",
+    "attn2.to_v_img": "attn2.add_v_proj",
+    "attn2.norm_k_img": "attn2.norm_added_k",
+    "before_proj": "proj_in",
+    "after_proj": "proj_out",
+}
+
 TRANSFORMER_SPECIAL_KEYS_REMAP = {}
+VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
 
 
 def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
@@ -74,7 +121,7 @@ def load_sharded_safetensors(dir: pathlib.Path):
     return state_dict
 
 
-def get_transformer_config(model_type: str) -> Dict[str, Any]:
+def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
     if model_type == "Wan-T2V-1.3B":
         config = {
             "model_id": "StevenZhang/Wan2.1-T2V-1.3B-Diff",
@@ -94,6 +141,8 @@ def get_transformer_config(model_type: str) -> Dict[str, Any]:
                 "text_dim": 4096,
             },
         }
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
     elif model_type == "Wan-T2V-14B":
         config = {
             "model_id": "StevenZhang/Wan2.1-T2V-14B-Diff",
@@ -113,6 +162,8 @@ def get_transformer_config(model_type: str) -> Dict[str, Any]:
                 "text_dim": 4096,
             },
         }
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
     elif model_type == "Wan-I2V-14B-480p":
         config = {
             "model_id": "StevenZhang/Wan2.1-I2V-14B-480P-Diff",
@@ -133,6 +184,8 @@ def get_transformer_config(model_type: str) -> Dict[str, Any]:
                 "text_dim": 4096,
             },
         }
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
     elif model_type == "Wan-I2V-14B-720p":
         config = {
             "model_id": "StevenZhang/Wan2.1-I2V-14B-720P-Diff",
@@ -153,6 +206,8 @@ def get_transformer_config(model_type: str) -> Dict[str, Any]:
                 "text_dim": 4096,
             },
         }
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
     elif model_type == "Wan-FLF2V-14B-720P":
         config = {
             "model_id": "ypyp/Wan2.1-FLF2V-14B-720P",  # This is just a placeholder
@@ -175,11 +230,60 @@ def get_transformer_config(model_type: str) -> Dict[str, Any]:
                 "pos_embed_seq_len": 257 * 2,
             },
         }
-    return config
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
+    elif model_type == "Wan-VACE-1.3B":
+        config = {
+            "model_id": "Wan-AI/Wan2.1-VACE-1.3B",
+            "diffusers_config": {
+                "added_kv_proj_dim": None,
+                "attention_head_dim": 128,
+                "cross_attn_norm": True,
+                "eps": 1e-06,
+                "ffn_dim": 8960,
+                "freq_dim": 256,
+                "in_channels": 16,
+                "num_attention_heads": 12,
+                "num_layers": 30,
+                "out_channels": 16,
+                "patch_size": [1, 2, 2],
+                "qk_norm": "rms_norm_across_heads",
+                "text_dim": 4096,
+                "vace_layers": [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
+                "vace_in_channels": 96,
+            },
+        }
+        RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
+    elif model_type == "Wan-VACE-14B":
+        config = {
+            "model_id": "Wan-AI/Wan2.1-VACE-14B",
+            "diffusers_config": {
+                "added_kv_proj_dim": None,
+                "attention_head_dim": 128,
+                "cross_attn_norm": True,
+                "eps": 1e-06,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "in_channels": 16,
+                "num_attention_heads": 40,
+                "num_layers": 40,
+                "out_channels": 16,
+                "patch_size": [1, 2, 2],
+                "qk_norm": "rms_norm_across_heads",
+                "text_dim": 4096,
+                "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
+                "vace_in_channels": 96,
+            },
+        }
+        RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
+    return config, RENAME_DICT, SPECIAL_KEYS_REMAP
 
 
 def convert_transformer(model_type: str):
-    config = get_transformer_config(model_type)
+    config, RENAME_DICT, SPECIAL_KEYS_REMAP = get_transformer_config(model_type)
+
     diffusers_config = config["diffusers_config"]
     model_id = config["model_id"]
     model_dir = pathlib.Path(snapshot_download(model_id, repo_type="model"))
@@ -187,16 +291,19 @@ def convert_transformer(model_type: str):
     original_state_dict = load_sharded_safetensors(model_dir)
 
     with init_empty_weights():
-        transformer = WanTransformer3DModel.from_config(diffusers_config)
+        if "VACE" not in model_type:
+            transformer = WanTransformer3DModel.from_config(diffusers_config)
+        else:
+            transformer = WanVACETransformer3DModel.from_config(diffusers_config)
 
     for key in list(original_state_dict.keys()):
         new_key = key[:]
-        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
+        for replace_key, rename_key in RENAME_DICT.items():
             new_key = new_key.replace(replace_key, rename_key)
         update_state_dict_(original_state_dict, key, new_key)
 
     for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
+        for special_key, handler_fn_inplace in SPECIAL_KEYS_REMAP.items():
             if special_key not in key:
                 continue
             handler_fn_inplace(key, original_state_dict)
@@ -412,7 +519,7 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_type", type=str, default=None)
     parser.add_argument("--output_path", type=str, required=True)
-    parser.add_argument("--dtype", default="fp32")
+    parser.add_argument("--dtype", default="fp32", choices=["fp32", "fp16", "bf16", "none"])
     return parser.parse_args()
 
 
@@ -426,18 +533,20 @@ def get_args():
 if __name__ == "__main__":
     args = get_args()
 
-    transformer = None
-    dtype = DTYPE_MAPPING[args.dtype]
-
-    transformer = convert_transformer(args.model_type).to(dtype=dtype)
+    transformer = convert_transformer(args.model_type)
     vae = convert_vae()
-    text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl")
+    text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
     tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
     flow_shift = 16.0 if "FLF2V" in args.model_type else 3.0
     scheduler = UniPCMultistepScheduler(
         prediction_type="flow_prediction", use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=flow_shift
     )
 
+    # If user has specified "none", we keep the original dtypes of the state dict without any conversion
+    if args.dtype != "none":
+        dtype = DTYPE_MAPPING[args.dtype]
+        transformer.to(dtype)
+
     if "I2V" in args.model_type or "FLF2V" in args.model_type:
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
@@ -452,6 +561,14 @@ def get_args():
             image_encoder=image_encoder,
             image_processor=image_processor,
         )
+    elif "VACE" in args.model_type:
+        pipe = WanVACEPipeline(
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            vae=vae,
+            scheduler=scheduler,
+        )
     else:
         pipe = WanPipeline(
             transformer=transformer,

From ea301df80b36ea3d35ca70857ac593e5ea7585ff Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Tue, 20 May 2025 21:35:23 +0200
Subject: [PATCH 05/15] refactor

---
 .../transformers/transformer_wan_vace.py      |  29 ++--
 .../pipelines/wan/pipeline_wan_vace.py        | 134 ++++++++++++------
 2 files changed, 105 insertions(+), 58 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index 34c05424d557..a5ad84477624 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -175,7 +175,7 @@ class WanVACETransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromO
     """
 
     _supports_gradient_checkpointing = True
-    _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+    _skip_layerwise_casting_patterns = ["patch_embedding", "vace_patch_embedding", "condition_embedder", "norm"]
     _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
@@ -273,9 +273,6 @@ def forward(
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
-        if control_hidden_states is None:
-            raise ValueError("Control hidden states must be provided for VACE models.")
-
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
@@ -299,6 +296,12 @@ def forward(
 
         if control_hidden_states_scale is None:
             control_hidden_states_scale = control_hidden_states.new_ones(len(self.config.vace_layers))
+        control_hidden_states_scale = torch.unbind(control_hidden_states_scale)
+        if len(control_hidden_states_scale) != len(self.config.vace_layers):
+            raise ValueError(
+                f"Length of `control_hidden_states_scale` {len(control_hidden_states_scale)} should be "
+                f"equal to {len(self.config.vace_layers)}."
+            )
 
         # 1. Rotary position embedding
         rotary_emb = self.rope(hidden_states)
@@ -306,9 +309,11 @@ def forward(
         # 2. Patch embedding
         hidden_states = self.patch_embedding(hidden_states)
         hidden_states = hidden_states.flatten(2).transpose(1, 2)
+        print("hidden_states", hidden_states.shape)
 
         control_hidden_states = self.vace_patch_embedding(control_hidden_states)
         control_hidden_states = control_hidden_states.flatten(2).transpose(1, 2)
+        print("control_hidden_states", control_hidden_states.shape)
         control_hidden_states_padding = control_hidden_states.new_zeros(
             batch_size, hidden_states.size(1) - control_hidden_states.size(1), control_hidden_states.size(2)
         )
@@ -329,11 +334,11 @@ def forward(
             # Prepare VACE hints
             control_hidden_states_list = []
             vace_hidden_states = hidden_states
-            for block in self.vace_blocks:
+            for i, block in enumerate(self.vace_blocks):
                 vace_hidden_states, control_hidden_states = self._gradient_checkpointing_func(
                     block, vace_hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
                 )
-                control_hidden_states_list.append(control_hidden_states)
+                control_hidden_states_list.append((control_hidden_states, control_hidden_states_scale[i]))
             control_hidden_states_list = control_hidden_states_list[::-1]
 
             for i, block in enumerate(self.blocks):
@@ -341,24 +346,24 @@ def forward(
                     block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
                 )
                 if i in self.config.vace_layers:
-                    control_hint = control_hidden_states_list.pop()
-                    hidden_states = hidden_states + control_hint * control_hidden_states_scale[i]
+                    control_hint, scale = control_hidden_states_list.pop()
+                    hidden_states = hidden_states + control_hint * scale
         else:
             # Prepare VACE hints
             control_hidden_states_list = []
             vace_hidden_states = hidden_states
-            for block in self.vace_blocks:
+            for i, block in enumerate(self.vace_blocks):
                 vace_hidden_states, control_hidden_states = block(
                     vace_hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
                 )
-                control_hidden_states_list.append(control_hidden_states)
+                control_hidden_states_list.append((control_hidden_states, control_hidden_states_scale[i]))
             control_hidden_states_list = control_hidden_states_list[::-1]
 
             for i, block in enumerate(self.blocks):
                 hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
                 if i in self.config.vace_layers:
-                    control_hint = control_hidden_states_list.pop()
-                    hidden_states = hidden_states + control_hint * control_hidden_states_scale[i]
+                    control_hint, scale = control_hidden_states_list.pop()
+                    hidden_states = hidden_states + control_hint * scale
 
         # 6. Output norm, projection & unpatchify
         shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 235e247cbab8..82e9c05bfe87 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -292,7 +292,7 @@ def check_inputs(
         mask=None,
         reference_images=None,
     ):
-        base = self.vae_scale_factor_spatial * self.transformer.config.patch_size
+        base = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
         if height % base != 0 or width % base != 0:
             raise ValueError(f"`height` and `width` have to be divisible by {base} but are {height} and {width}.")
 
@@ -368,39 +368,78 @@ def preprocess_conditions(
         device: Optional[torch.device] = None,
     ):
         if video is not None:
-            video = self.video_processor.preprocess_video(video, None, None)  # Use the height/width of video
-            image_size = tuple(video.shape[-2:])
+            base = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
+            video_height, video_width = self.video_processor.get_default_height_width(video[0])
+
+            if video_height * video_width > height * width:
+                scale = min(width / video_width, height / video_height)
+                video_height, video_width = int(video_height * scale), int(video_width * scale)
+
+            if video_height % base != 0 or video_width % base != 0:
+                logger.warning(
+                    f"Video height and width should be divisible by {base}, but got {video_height} and {video_width}. "
+                )
+                video_height = (video_height // base) * base
+                video_width = (video_width // base) * base
+
+            assert video_height * video_width <= height * width
+
+            video = self.video_processor.preprocess_video(video, video_height, video_width)
+            image_size = (video_height, video_width)  # Use the height/width of video (with possible rescaling)
         else:
-            video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=dtype, device=device)
+            video = torch.zeros(batch_size, 3, num_frames, height, width, dtype=dtype, device=device)
             image_size = (height, width)  # Use the height/width provider by user
 
         if mask is not None:
-            mask = self.video_processor.preprocess_video(mask, height, width)
+            mask = self.video_processor.preprocess_video(mask, image_size[0], image_size[1])
         else:
-            mask = torch.ones_like(video, dtype=dtype, device=device)
+            mask = torch.ones_like(video)
 
         video = video.to(dtype=dtype, device=device)
         mask = mask.to(dtype=dtype, device=device)
 
-        reference_images_preprocessed = []
-        if reference_images is not None:
-            if not isinstance(reference_images, list):
-                reference_images = [reference_images]
-            for i, image in enumerate(reference_images):
-                image = self.video_processor.preprocess(image, None, None)  # Use the height/width of image
+        # Make a list of list of images where the outer list corresponds to video batch size and the inner list
+        # corresponds to list of conditioning images per video
+        if reference_images is None or isinstance(reference_images, PIL.Image.Image):
+            reference_images = [[reference_images] for _ in range(video.shape[0])]
+        elif isinstance(reference_images, (list, tuple)) and isinstance(next(iter(reference_images)), PIL.Image.Image):
+            reference_images = [reference_images]
+        elif (
+            isinstance(reference_images, (list, tuple))
+            and isinstance(next(iter(reference_images)), list)
+            and isinstance(next(iter(reference_images[0])), PIL.Image.Image)
+        ):
+            reference_images = reference_images
+        else:
+            raise ValueError(
+                "`reference_images` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image`, or "
+                f"`list` of `list` of `PIL.Image.Image`, but is {type(reference_images)}"
+            )
+
+        if video.shape[0] != len(reference_images):
+            raise ValueError(
+                f"Batch size of `video` {video.shape[0]} and length of `reference_images` {len(reference_images)} does not match."
+            )
 
+        reference_images_preprocessed = []
+        for i, reference_images_batch in enumerate(reference_images):
+            preprocessed_images = []
+            for j, image in enumerate(reference_images_batch):
+                if image is None:
+                    continue
+                image = self.video_processor.preprocess(image, None, None)
                 img_height, img_width = image.shape[-2:]
                 scale = min(image_size[0] / img_height, image_size[1] / img_width)
                 new_height, new_width = int(img_height * scale), int(img_width * scale)
                 resized_image = torch.nn.functional.interpolate(
-                    image.unsqueeze(1), size=(new_height, new_width), mode="bilinear", align_corners=False
-                ).squeeze(1)
-
+                    image, size=(new_height, new_width), mode="bilinear", align_corners=False
+                ).squeeze(0)  # [C, H, W]
                 top = (image_size[0] - new_height) // 2
                 left = (image_size[1] - new_width) // 2
-                canvas = torch.ones(batch_size, 1, 3, *image_size, device=device, dtype=dtype)
-                canvas[:, :, :, top : top + new_height, left : left + new_width] = resized_image
-                reference_images_preprocessed.append(canvas)
+                canvas = torch.ones(3, *image_size, device=device, dtype=dtype)
+                canvas[:, top : top + new_height, left : left + new_width] = resized_image
+                preprocessed_images.append(canvas)
+            reference_images_preprocessed.append(preprocessed_images)
 
         return video, mask, reference_images_preprocessed
 
@@ -408,7 +447,7 @@ def prepare_video_latents(
         self,
         video: torch.Tensor,
         mask: torch.Tensor,
-        reference_images: Optional[List[torch.Tensor]] = None,
+        reference_images: Optional[List[List[torch.Tensor]]] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
     ) -> torch.Tensor:
         if isinstance(generator, list):
@@ -416,7 +455,8 @@ def prepare_video_latents(
             raise ValueError("Passing a list of generators is not yet supported. This may be supported in the future.")
 
         if reference_images is None:
-            # For each batch of video, we set no reference image (as one or more can be passed by user)
+            # For each video in the batch, default to no reference image (the user may pass one
+            # or more reference images per video)
             reference_images = [[None] for _ in range(video.shape[0])]
         else:
             if video.shape[0] != len(reference_images):
@@ -437,22 +477,24 @@ def prepare_video_latents(
             latents = retrieve_latents(self.vae.encode(video), generator, sample_mode="argmax").unbind(0)
         else:
             mask = mask.to(dtype=vae_dtype)
-            mask = [torch.where(m > 0.5, 1.0, 0.0) for m in mask]
-            inactive = [v * (1 - m) for v, m in zip(video, mask)]
-            reactive = [v * m for v, m in zip(video, mask)]
+            mask = torch.where(mask > 0.5, 1.0, 0.0)
+            inactive = video * (1 - mask)
+            reactive = video * mask
             inactive = retrieve_latents(self.vae.encode(inactive), generator, sample_mode="argmax")
             reactive = retrieve_latents(self.vae.encode(reactive), generator, sample_mode="argmax")
-            latents = [torch.cat([i, r], dim=0) for i, r in zip(inactive, reactive)]
+            latents = torch.cat([inactive, reactive], dim=1)
 
         latent_list = []
-        for latent, ref_images in zip(latents, reference_images):
-            if ref_images is not None:
-                ref_images = ref_images.to(dtype=vae_dtype)
-                ref_latents = retrieve_latents(self.vae.encode(ref_images), generator, sample_mode="argmax")
-                ref_latents = [torch.cat([r, torch.zeros_like(r)], dim=0) for r in ref_latents]
-            latent = torch.cat([*ref_latents, latent], dim=1)
+        for latent, reference_images_batch in zip(latents, reference_images):
+            for reference_image in reference_images_batch:
+                assert reference_image.ndim == 3
+                reference_image = reference_image.to(dtype=vae_dtype)
+                reference_image = reference_image[None, :, None, :, :]  # [1, C, 1, H, W]
+                reference_latent = retrieve_latents(self.vae.encode(reference_image), generator, sample_mode="argmax")
+                reference_latent = torch.cat([reference_latent, torch.zeros_like(reference_latent)], dim=1)
+                latent = torch.cat([reference_latent.squeeze(0), latent], dim=1)  # Concat across frame dimension
             latent_list.append(latent)
-        return latent_list
+        return torch.stack(latent_list)
 
     def prepare_masks(
         self,
@@ -479,25 +521,28 @@ def prepare_masks(
                 "Generating with more than one video is not yet supported. This may be supported in the future."
             )
 
+        transformer_patch_size = self.transformer.config.patch_size[1]
+
         mask_list = []
-        transformer_patch_size = self.transformer.config.patch_size
-        for mask_, ref_images in zip(mask, reference_images):
-            num_frames, num_channels, height, width = mask_.shape
+        for mask_, reference_images_batch in zip(mask, reference_images):
+            num_channels, num_frames, height, width = mask_.shape
             new_num_frames = (num_frames + self.vae_scale_factor_temporal - 1) // self.vae_scale_factor_temporal
             new_height = height // (self.vae_scale_factor_spatial * transformer_patch_size) * transformer_patch_size
             new_width = width // (self.vae_scale_factor_spatial * transformer_patch_size) * transformer_patch_size
-            mask_ = mask_[:, 0, :, :]
-            mask_ = mask_.view(num_frames, height, self.vae_scale_factor_spatial, width, self.vae_scale_factor_spatial)
-            mask_ = mask_.permute(2, 4, 0, 1, 3).flatten(2, 4).flatten(0, 1)
+            mask_ = mask_[0, :, :, :]
+            mask_ = mask_.view(
+                num_frames, new_height, self.vae_scale_factor_spatial, new_width, self.vae_scale_factor_spatial
+            )
+            mask_ = mask_.permute(2, 4, 0, 1, 3).flatten(0, 1)  # [8x8, num_frames, new_height, new_width]
             mask_ = torch.nn.functional.interpolate(
                 mask_.unsqueeze(0), size=(new_num_frames, new_height, new_width), mode="nearest-exact"
             ).squeeze(0)
-            if ref_images is not None:
-                num_ref_images = ref_images.size(0)
-                mask_padding = torch.zeros_like(mask[:num_ref_images, :, :, :])
+            num_ref_images = len(reference_images_batch)
+            if num_ref_images > 0:
+                mask_padding = torch.zeros_like(mask_[:, :num_ref_images, :, :])
                 mask_ = torch.cat([mask_, mask_padding], dim=1)
             mask_list.append(mask_)
-        return mask_list
+        return torch.stack(mask_list)
 
     def prepare_latents(
         self,
@@ -746,12 +791,9 @@ def __call__(
         )
 
         conditioning_latents = self.prepare_video_latents(video, mask, reference_images, generator)
-        conditioning_latents = [c.to(transformer_dtype) for c in conditioning_latents]
-
         mask = self.prepare_masks(mask, reference_images, generator)
-        mask = [m.to(transformer_dtype) for m in mask]
-
-        conditioning_latents = [torch.cat([c, m], dim=1) for c, m in zip(conditioning_latents, mask)]
+        conditioning_latents = torch.cat([conditioning_latents, mask], dim=1)
+        conditioning_latents = conditioning_latents.to(transformer_dtype)
 
         num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(

From 4b14ddd80b360818e31ba4c154d81e74e195f208 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Tue, 20 May 2025 21:35:30 +0200
Subject: [PATCH 06/15] add pipeline test

---
 tests/pipelines/wan/test_wan_vace.py | 189 +++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 tests/pipelines/wan/test_wan_vace.py

diff --git a/tests/pipelines/wan/test_wan_vace.py b/tests/pipelines/wan/test_wan_vace.py
new file mode 100644
index 000000000000..44e036d93d36
--- /dev/null
+++ b/tests/pipelines/wan/test_wan_vace.py
@@ -0,0 +1,189 @@
+# Copyright 2024 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler, WanVACEPipeline, WanVACETransformer3DModel
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class WanVACEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = WanVACEPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        transformer = WanVACETransformer3DModel(
+            patch_size=(1, 2, 2),
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=16,
+            out_channels=16,
+            text_dim=32,
+            freq_dim=256,
+            ffn_dim=32,
+            num_layers=3,
+            cross_attn_norm=True,
+            qk_norm="rms_norm_across_heads",
+            rope_max_seq_len=32,
+            vace_layers=[0, 2],
+            vace_in_channels=96,
+        )
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        num_frames = 17
+        height = 16
+        width = 16
+
+        video = [Image.new("RGB", (height, width))] * num_frames
+        mask = [Image.new("L", (height, width), 0)] * num_frames
+
+        inputs = {
+            "video": video,
+            "mask": mask,
+            "prompt": "dance monkey",
+            "negative_prompt": "negative",  # TODO
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 16,
+            "width": 16,
+            "num_frames": num_frames,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
+        expected_video = torch.randn(17, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_inference_with_single_reference_image(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["reference_images"] = Image.new("RGB", (16, 16))
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
+        expected_video = torch.randn(17, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_inference_with_multiple_reference_image(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["reference_images"] = [[Image.new("RGB", (16, 16))] * 2]
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
+        expected_video = torch.randn(17, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip("Errors out because passing multiple prompts at once is not yet supported by this pipeline.")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("Batching is not yet supported with this pipeline")
+    def test_inference_batch_consistent(self):
+        pass
+
+    @unittest.skip("Batching is not yet supported with this pipeline")
+    def test_inference_batch_single_identical(self):
+        return super().test_inference_batch_single_identical()

From 694dcf2cfdadfd2b5f81599b35e36e624e88a556 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Wed, 21 May 2025 00:52:15 +0200
Subject: [PATCH 07/15] refactor

---
 .../transformers/transformer_wan_vace.py      | 43 +++++++++----------
 .../pipelines/wan/pipeline_wan_vace.py        | 32 ++++++++++++--
 2 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index a5ad84477624..1a6f2af59a87 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -106,35 +106,38 @@ def forward(
     ) -> torch.Tensor:
         if self.proj_in is not None:
             control_hidden_states = self.proj_in(control_hidden_states)
-            hidden_states = hidden_states + control_hidden_states
-        else:
-            hidden_states = control_hidden_states
+            control_hidden_states = control_hidden_states + hidden_states
 
         shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
             self.scale_shift_table + temb.float()
         ).chunk(6, dim=1)
 
         # 1. Self-attention
-        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        norm_hidden_states = (self.norm1(control_hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(
+            control_hidden_states
+        )
         attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
-        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+        control_hidden_states = (control_hidden_states.float() + attn_output * gate_msa).type_as(control_hidden_states)
 
         # 2. Cross-attention
-        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        norm_hidden_states = self.norm2(control_hidden_states.float()).type_as(control_hidden_states)
         attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
-        hidden_states = hidden_states + attn_output
+        control_hidden_states = control_hidden_states + attn_output
 
         # 3. Feed-forward
-        norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
-            hidden_states
+        norm_hidden_states = (self.norm3(control_hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+            control_hidden_states
         )
         ff_output = self.ffn(norm_hidden_states)
-        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+        control_hidden_states = (control_hidden_states.float() + ff_output.float() * c_gate_msa).type_as(
+            control_hidden_states
+        )
 
+        conditioning_states = None
         if self.proj_out is not None:
-            control_hidden_states = self.proj_out(hidden_states)
+            conditioning_states = self.proj_out(control_hidden_states)
 
-        return hidden_states, control_hidden_states
+        return conditioning_states, control_hidden_states
 
 
 class WanVACETransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
@@ -309,11 +312,9 @@ def forward(
         # 2. Patch embedding
         hidden_states = self.patch_embedding(hidden_states)
         hidden_states = hidden_states.flatten(2).transpose(1, 2)
-        print("hidden_states", hidden_states.shape)
 
         control_hidden_states = self.vace_patch_embedding(control_hidden_states)
         control_hidden_states = control_hidden_states.flatten(2).transpose(1, 2)
-        print("control_hidden_states", control_hidden_states.shape)
         control_hidden_states_padding = control_hidden_states.new_zeros(
             batch_size, hidden_states.size(1) - control_hidden_states.size(1), control_hidden_states.size(2)
         )
@@ -333,12 +334,11 @@ def forward(
         if torch.is_grad_enabled() and self.gradient_checkpointing:
             # Prepare VACE hints
             control_hidden_states_list = []
-            vace_hidden_states = hidden_states
             for i, block in enumerate(self.vace_blocks):
-                vace_hidden_states, control_hidden_states = self._gradient_checkpointing_func(
-                    block, vace_hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
+                conditioning_states, control_hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
                 )
-                control_hidden_states_list.append((control_hidden_states, control_hidden_states_scale[i]))
+                control_hidden_states_list.append((conditioning_states, control_hidden_states_scale[i]))
             control_hidden_states_list = control_hidden_states_list[::-1]
 
             for i, block in enumerate(self.blocks):
@@ -351,12 +351,11 @@ def forward(
         else:
             # Prepare VACE hints
             control_hidden_states_list = []
-            vace_hidden_states = hidden_states
             for i, block in enumerate(self.vace_blocks):
-                vace_hidden_states, control_hidden_states = block(
-                    vace_hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
+                conditioning_states, control_hidden_states = block(
+                    hidden_states, encoder_hidden_states, control_hidden_states, timestep_proj, rotary_emb
                 )
-                control_hidden_states_list.append((control_hidden_states, control_hidden_states_scale[i]))
+                control_hidden_states_list.append((conditioning_states, control_hidden_states_scale[i]))
             control_hidden_states_list = control_hidden_states_list[::-1]
 
             for i, block in enumerate(self.blocks):
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 82e9c05bfe87..ae93c57a188a 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -23,7 +23,7 @@
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
 from ...loaders import WanLoraLoaderMixin
-from ...models import AutoencoderKLWan, WanTransformer3DModel
+from ...models import AutoencoderKLWan, WanVACETransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
@@ -137,7 +137,7 @@ def __init__(
         self,
         tokenizer: AutoTokenizer,
         text_encoder: UMT5EncoderModel,
-        transformer: WanTransformer3DModel,
+        transformer: WanVACETransformer3DModel,
         vae: AutoencoderKLWan,
         scheduler: FlowMatchEulerDiscreteScheduler,
     ):
@@ -421,6 +421,13 @@ def preprocess_conditions(
                 f"Batch size of `video` {video.shape[0]} and length of `reference_images` {len(reference_images)} does not match."
             )
 
+        ref_images_lengths = [len(reference_images_batch) for reference_images_batch in reference_images]
+        if any(l != ref_images_lengths[0] for l in ref_images_lengths):
+            raise ValueError(
+                f"All batches of `reference_images` should have the same length, but got {ref_images_lengths}. Support for this "
+                "may be added in the future."
+            )
+
         reference_images_preprocessed = []
         for i, reference_images_batch in enumerate(reference_images):
             preprocessed_images = []
@@ -449,7 +456,10 @@ def prepare_video_latents(
         mask: torch.Tensor,
         reference_images: Optional[List[List[torch.Tensor]]] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        device: Optional[torch.device] = None,
     ) -> torch.Tensor:
+        device = device or self._execution_device
+
         if isinstance(generator, list):
             # TODO: support this
             raise ValueError("Passing a list of generators is not yet supported. This may be supported in the future.")
@@ -473,8 +483,16 @@ def prepare_video_latents(
         vae_dtype = self.vae.dtype
         video = video.to(dtype=vae_dtype)
 
+        latents_mean = torch.tensor(self.vae.config.latents_mean, device=device, dtype=torch.float32).view(
+            1, self.vae.config.z_dim, 1, 1, 1
+        )
+        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std, device=device, dtype=torch.float32).view(
+            1, self.vae.config.z_dim, 1, 1, 1
+        )
+
         if mask is None:
             latents = retrieve_latents(self.vae.encode(video), generator, sample_mode="argmax").unbind(0)
+            latents = ((latents.float() - latents_mean) * latents_std).to(vae_dtype)
         else:
             mask = mask.to(dtype=vae_dtype)
             mask = torch.where(mask > 0.5, 1.0, 0.0)
@@ -482,6 +500,8 @@ def prepare_video_latents(
             reactive = video * mask
             inactive = retrieve_latents(self.vae.encode(inactive), generator, sample_mode="argmax")
             reactive = retrieve_latents(self.vae.encode(reactive), generator, sample_mode="argmax")
+            inactive = ((inactive.float() - latents_mean) * latents_std).to(vae_dtype)
+            reactive = ((reactive.float() - latents_mean) * latents_std).to(vae_dtype)
             latents = torch.cat([inactive, reactive], dim=1)
 
         latent_list = []
@@ -491,6 +511,7 @@ def prepare_video_latents(
                 reference_image = reference_image.to(dtype=vae_dtype)
                 reference_image = reference_image[None, :, None, :, :]  # [1, C, 1, H, W]
                 reference_latent = retrieve_latents(self.vae.encode(reference_image), generator, sample_mode="argmax")
+                reference_latent = ((reference_latent.float() - latents_mean) * latents_std).to(vae_dtype)
                 reference_latent = torch.cat([reference_latent, torch.zeros_like(reference_latent)], dim=1)
                 latent = torch.cat([reference_latent.squeeze(0), latent], dim=1)  # Concat across frame dimension
             latent_list.append(latent)
@@ -790,7 +811,7 @@ def __call__(
             device,
         )
 
-        conditioning_latents = self.prepare_video_latents(video, mask, reference_images, generator)
+        conditioning_latents = self.prepare_video_latents(video, mask, reference_images, generator, device)
         mask = self.prepare_masks(mask, reference_images, generator)
         conditioning_latents = torch.cat([conditioning_latents, mask], dim=1)
         conditioning_latents = conditioning_latents.to(transformer_dtype)
@@ -808,6 +829,11 @@ def __call__(
             latents,
         )
 
+        if conditioning_latents.shape[2] != latents.shape[2]:
+            logger.warning(
+                "The number of frames in the conditioning latents does not match the number of frames to be generated. Generation quality may be affected."
+            )
+
         # 6. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         self._num_timesteps = len(timesteps)

From 23f6bc1201402afc887925ef3c0e670dbbc043a9 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Wed, 21 May 2025 02:19:30 +0200
Subject: [PATCH 08/15] fix bug with mask

---
 src/diffusers/pipelines/wan/pipeline_wan_vace.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index ae93c57a188a..31805e31e4aa 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -392,6 +392,7 @@ def preprocess_conditions(
 
         if mask is not None:
             mask = self.video_processor.preprocess_video(mask, image_size[0], image_size[1])
+            mask = torch.clamp((mask + 1) / 2, min=0, max=1)
         else:
             mask = torch.ones_like(video)
 

From 5218baef6847856b3e0469831893eae45d7facd7 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Thu, 29 May 2025 13:52:31 +0200
Subject: [PATCH 09/15] fix for reference images

---
 src/diffusers/pipelines/wan/pipeline_wan_vace.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 31805e31e4aa..242b261a1b7d 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -359,7 +359,7 @@ def preprocess_conditions(
         self,
         video: Optional[List[PipelineImageInput]] = None,
         mask: Optional[List[PipelineImageInput]] = None,
-        reference_images: Optional[List[PipelineImageInput]] = None,
+        reference_images: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], List[List[PIL.Image.Image]]]] = None,
         batch_size: int = 1,
         height: int = 480,
         width: int = 832,
@@ -513,8 +513,9 @@ def prepare_video_latents(
                 reference_image = reference_image[None, :, None, :, :]  # [1, C, 1, H, W]
                 reference_latent = retrieve_latents(self.vae.encode(reference_image), generator, sample_mode="argmax")
                 reference_latent = ((reference_latent.float() - latents_mean) * latents_std).to(vae_dtype)
-                reference_latent = torch.cat([reference_latent, torch.zeros_like(reference_latent)], dim=1)
-                latent = torch.cat([reference_latent.squeeze(0), latent], dim=1)  # Concat across frame dimension
+                reference_latent = reference_latent.squeeze(0)  # [C, 1, H, W]
+                reference_latent = torch.cat([reference_latent, torch.zeros_like(reference_latent)], dim=0)
+                latent = torch.cat([reference_latent.squeeze(0), latent], dim=1)
             latent_list.append(latent)
         return torch.stack(latent_list)
 
@@ -811,6 +812,7 @@ def __call__(
             torch.float32,
             device,
         )
+        num_reference_images = len(reference_images[0])
 
         conditioning_latents = self.prepare_video_latents(video, mask, reference_images, generator, device)
         mask = self.prepare_masks(mask, reference_images, generator)
@@ -823,7 +825,7 @@ def __call__(
             num_channels_latents,
             height,
             width,
-            num_frames,
+            num_frames + num_reference_images * self.vae_scale_factor_temporal,
             torch.float32,
             device,
             generator,
@@ -893,6 +895,8 @@ def __call__(
         self._current_timestep = None
 
         if not output_type == "latent":
+            print(latents.shape, num_reference_images)
+            latents = latents[:, :, num_reference_images:]
             latents = latents.to(vae_dtype)
             latents_mean = (
                 torch.tensor(self.vae.config.latents_mean)

From 1da4a55f4c62604b90b727eb1c98e5a21b0cc077 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Thu, 29 May 2025 14:00:18 +0200
Subject: [PATCH 10/15] remove print

---
 src/diffusers/pipelines/wan/pipeline_wan_vace.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 242b261a1b7d..f44b67c580b6 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -895,7 +895,6 @@ def __call__(
         self._current_timestep = None
 
         if not output_type == "latent":
-            print(latents.shape, num_reference_images)
             latents = latents[:, :, num_reference_images:]
             latents = latents.to(vae_dtype)
             latents_mean = (

From f4310a25e8e8c07f48fc418c20da443c9df0fc72 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Fri, 30 May 2025 13:04:34 +0200
Subject: [PATCH 11/15] update docs

---
 docs/source/en/api/pipelines/wan.md | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
index 09503125f5c5..2ada80caf715 100644
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -20,7 +20,16 @@
 
 [Wan 2.1](https://github.com/Wan-Video/Wan2.1) by the Alibaba Wan Team.
 
-<!-- TODO(aryan): update abstract once paper is out -->
+*This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at [this https URL](https://github.com/Wan-Video/Wan2.1).*
+
+The following Wan models are supported in Diffusers:
+- [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
+- [Wan 2.1 T2V 14B](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers)
+- [Wan 2.1 I2V 14B - 480P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers)
+- [Wan 2.1 I2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers)
+- [Wan 2.1 FLF2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers)
+- [Wan 2.1 VACE 1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) (an unofficial Diffusers-format checkpoint is currently available [here](https://huggingface.co/a-r-r-o-w/Wan-VACE-1.3B-diffusers))
+- [Wan 2.1 VACE 14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) (an unofficial Diffusers-format checkpoint is currently available [here](https://huggingface.co/linoyts/Wan-VACE-14B-diffusers))
 
 ## Generating Videos with Wan 2.1
 
@@ -227,6 +236,19 @@ output = pipe(
 export_to_video(output, "wan-v2v.mp4", fps=16)
 ```
 
+### Any-to-Video Controllable Generation
+
+Wan VACE supports various generation techniques which achieve controllable video generation. Some of the capabilities include:
+- Control to Video (Depth, Pose, Sketch, Flow, Grayscale, Scribble, Layout, Bounding Box, etc.). Recommended library for preprocessing videos to obtain control videos: [huggingface/controlnet_aux](https://github.com/huggingface/controlnet_aux)
+- Image/Video to Video (first frame, last frame, starting clip, ending clip, random clips)
+- Inpainting and Outpainting
+- Subject to Video (faces, object, characters, etc.)
+- Composition to Video (reference anything, animate anything, swap anything, expand anything, move anything, etc.)
+
+The code snippets in [this pull request](https://github.com/huggingface/diffusers/pull/11582) show examples of how videos can be generated with different control signals.
+
+The general rule of thumb when preparing inputs for the VACE pipeline is that any input image, or any frame of a video, that you want to use for conditioning should have a corresponding black mask. A black mask signifies that the model will not generate new content for that area and will only use it to condition the generation process. For regions or frames that the model should generate, the mask should be white.
+
 ## Memory Optimizations for Wan 2.1
 
 Base inference with the large 14B Wan 2.1 models can take up to 35GB of VRAM when generating videos at 720p resolution. We'll outline a few memory optimizations we can apply to reduce the VRAM required to run the model.

From c164d8420ab32645058a30b4c91ddf5773ed5d7e Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Fri, 6 Jun 2025 10:07:12 +0200
Subject: [PATCH 12/15] update slices

---
 tests/pipelines/wan/test_wan_vace.py | 52 +++++++++++++++++-----------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/tests/pipelines/wan/test_wan_vace.py b/tests/pipelines/wan/test_wan_vace.py
index 44e036d93d36..31b787a0f927 100644
--- a/tests/pipelines/wan/test_wan_vace.py
+++ b/tests/pipelines/wan/test_wan_vace.py
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team.
+# Copyright 2025 The HuggingFace Team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -107,7 +107,7 @@ def get_dummy_inputs(self, device, seed=0):
             "video": video,
             "mask": mask,
             "prompt": "dance monkey",
-            "negative_prompt": "negative",  # TODO
+            "negative_prompt": "negative",
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 6.0,
@@ -128,13 +128,17 @@ def test_inference(self):
         pipe.set_progress_bar_config(disable=None)
 
         inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-        generated_video = video[0]
+        video = pipe(**inputs).frames[0]
+        self.assertEqual(video.shape, (17, 3, 16, 16))
 
-        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
-        expected_video = torch.randn(17, 3, 16, 16)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+        # fmt: off
+        expected_slice = [0.4523, 0.45198, 0.44872, 0.45326, 0.45211, 0.45258, 0.45344, 0.453, 0.52431, 0.52572, 0.50701, 0.5118, 0.53717, 0.53093, 0.50557, 0.51402]
+        # fmt: on
+
+        video_slice = video.flatten()
+        video_slice = torch.cat([video_slice[:8], video_slice[-8:]])
+        video_slice = [round(x, 5) for x in video_slice.tolist()]
+        self.assertTrue(np.allclose(video_slice, expected_slice, atol=1e-3))
 
     def test_inference_with_single_reference_image(self):
         device = "cpu"
@@ -146,13 +150,17 @@ def test_inference_with_single_reference_image(self):
 
         inputs = self.get_dummy_inputs(device)
         inputs["reference_images"] = Image.new("RGB", (16, 16))
-        video = pipe(**inputs).frames
-        generated_video = video[0]
+        video = pipe(**inputs).frames[0]
+        self.assertEqual(video.shape, (17, 3, 16, 16))
+
+        # fmt: off
+        expected_slice = [0.45247, 0.45214, 0.44874, 0.45314, 0.45171, 0.45299, 0.45428, 0.45317, 0.51378, 0.52658, 0.53361, 0.52303, 0.46204, 0.50435, 0.52555, 0.51342]
+        # fmt: on
 
-        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
-        expected_video = torch.randn(17, 3, 16, 16)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+        video_slice = video.flatten()
+        video_slice = torch.cat([video_slice[:8], video_slice[-8:]])
+        video_slice = [round(x, 5) for x in video_slice.tolist()]
+        self.assertTrue(np.allclose(video_slice, expected_slice, atol=1e-3))
 
     def test_inference_with_multiple_reference_image(self):
         device = "cpu"
@@ -164,13 +172,17 @@ def test_inference_with_multiple_reference_image(self):
 
         inputs = self.get_dummy_inputs(device)
         inputs["reference_images"] = [[Image.new("RGB", (16, 16))] * 2]
-        video = pipe(**inputs).frames
-        generated_video = video[0]
+        video = pipe(**inputs).frames[0]
+        self.assertEqual(video.shape, (17, 3, 16, 16))
+
+        # fmt: off
+        expected_slice = [0.45321, 0.45221, 0.44818, 0.45375, 0.45268, 0.4519, 0.45271, 0.45253, 0.51244, 0.52223, 0.51253, 0.51321, 0.50743, 0.51177, 0.51626, 0.50983]
+        # fmt: on
 
-        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
-        expected_video = torch.randn(17, 3, 16, 16)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+        video_slice = video.flatten()
+        video_slice = torch.cat([video_slice[:8], video_slice[-8:]])
+        video_slice = [round(x, 5) for x in video_slice.tolist()]
+        self.assertTrue(np.allclose(video_slice, expected_slice, atol=1e-3))
 
     @unittest.skip("Test not supported")
     def test_attention_slicing_forward_pass(self):

From 1b3c85ad023b2c3617222cfed33d4b409d332f05 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Fri, 6 Jun 2025 10:28:07 +0200
Subject: [PATCH 13/15] lowercase "diffusers" suffix in example model ids

---
 src/diffusers/pipelines/wan/pipeline_wan_vace.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index f44b67c580b6..09b12b265240 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -53,8 +53,8 @@
         >>> from diffusers import AutoencoderKLWan, WanPipeline
         >>> from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
 
-        >>> # Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-        >>> model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+        >>> # Available models: Wan-AI/Wan2.1-T2V-14B-diffusers, Wan-AI/Wan2.1-T2V-1.3B-diffusers
+        >>> model_id = "Wan-AI/Wan2.1-T2V-14B-diffusers"
         >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
         >>> pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
         >>> flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P

From 9ad3e31b7053ad4f8382dae273afb23346ac22aa Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Fri, 6 Jun 2025 10:32:05 +0200
Subject: [PATCH 14/15] docs: add Wan2.1 paper link to wan.md

---
 docs/source/en/api/pipelines/wan.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
index 9848b0087967..efc4dff2de98 100644
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -22,6 +22,8 @@
 
 # Wan2.1
 
+[Wan-2.1](https://huggingface.co/papers/2503.20314) by the Wan Team.
+
 *This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at [this https URL](https://github.com/Wan-Video/Wan2.1).*
 
 You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.

From 62101a495233d361ca00b9fe37d9cc2b1270ebba Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Fri, 6 Jun 2025 11:43:49 +0200
Subject: [PATCH 15/15] update example

---
 .../pipelines/wan/pipeline_wan_vace.py        | 52 +++++++++++++++----
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 09b12b265240..e029006aa175 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -49,28 +49,58 @@
     Examples:
         ```python
         >>> import torch
-        >>> from diffusers.utils import export_to_video
-        >>> from diffusers import AutoencoderKLWan, WanPipeline
+        >>> import PIL.Image
+        >>> from diffusers import AutoencoderKLWan, WanVACEPipeline
         >>> from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
-
-        >>> # Available models: Wan-AI/Wan2.1-T2V-14B-diffusers, Wan-AI/Wan2.1-T2V-1.3B-diffusers
-        >>> model_id = "Wan-AI/Wan2.1-T2V-14B-diffusers"
+        >>> from diffusers.utils import export_to_video, load_image
+        >>> def prepare_video_and_mask(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
+        ...     first_img = first_img.resize((width, height))
+        ...     last_img = last_img.resize((width, height))
+        ...     frames = []
+        ...     frames.append(first_img)
+        ...     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
+        ...     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
+        ...     # match the original code.
+        ...     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
+        ...     frames.append(last_img)
+        ...     mask_black = PIL.Image.new("L", (width, height), 0)
+        ...     mask_white = PIL.Image.new("L", (width, height), 255)
+        ...     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
+        ...     return frames, mask
+
+        >>> # Available checkpoints: Wan-AI/Wan2.1-VACE-1.3B-diffusers, Wan-AI/Wan2.1-VACE-14B-diffusers
+        >>> model_id = "Wan-AI/Wan2.1-VACE-1.3B-diffusers"
         >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-        >>> pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
-        >>> flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
+        >>> pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+        >>> flow_shift = 3.0  # 5.0 for 720P, 3.0 for 480P
         >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
         >>> pipe.to("cuda")
 
-        >>> prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
+        >>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
         >>> negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
+        >>> first_frame = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
+        ... )
+        >>> last_frame = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
+        ... )
+
+        >>> height = 512
+        >>> width = 512
+        >>> num_frames = 81
+        >>> video, mask = prepare_video_and_mask(first_frame, last_frame, height, width, num_frames)
 
         >>> output = pipe(
+        ...     video=video,
+        ...     mask=mask,
         ...     prompt=prompt,
         ...     negative_prompt=negative_prompt,
-        ...     height=720,
-        ...     width=1280,
-        ...     num_frames=81,
+        ...     height=height,
+        ...     width=width,
+        ...     num_frames=num_frames,
+        ...     num_inference_steps=30,
         ...     guidance_scale=5.0,
+        ...     generator=torch.Generator().manual_seed(42),
         ... ).frames[0]
         >>> export_to_video(output, "output.mp4", fps=16)
         ```