Wan VACE #11582


Draft · wants to merge 11 commits into main from integrations/wan-vace

Conversation

@a-r-r-o-w (Member) commented on May 19, 2025

Checkpoints (temporary, until the official weights are hosted):
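The snippets below load from a local conversion path (`/raid/aryan/diffusers-wan-vace-1.3b/`). If you do not have such a path, the hosted 14B conversion used in the Random-to-V example further down can be substituted purely as an illustration until the official weights are published:

import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline

# Hosted 14B conversion referenced later in this PR; swap in the official repository
# (or your own local conversion) once the weights are published.
model_id = "linoyts/Wan-VACE-14B-diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)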

T2V

import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=480,
    width=832,
    num_frames=81,
    num_inference_steps=30,
    guidance_scale=5.0,
    conditioning_scale=0.0,
    # conditioning_scale=1.0,
    generator=torch.Generator().manual_seed(0),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
Comparison: `conditioning_scale=0` → output2.mp4 · `conditioning_scale=1` → output.mp4

I2V

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image


def prepare_video_and_mask(img: PIL.Image.Image, height: int, width: int, num_frames: int):
    img = img.resize((width, height))
    frames = [img]
    # Ideally this value would be 127.5 to match the original code, but the original operates on
    # numpy arrays whereas we pass PIL images here (which need integer fill values). If you pass
    # numpy arrays instead, use 127.5 to match the original exactly.
    frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 1))
    mask_black = PIL.Image.new("L", (width, height), 0)
    mask_white = PIL.Image.new("L", (width, height), 255)
    mask = [mask_black, *[mask_white] * (num_frames - 1)]
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "An astronaut emerging from a cracked, otherworldly egg on the barren surface of the Moon—his form silhouetted against the stark lunar dust, as if being born into silence. The vast darkness of space looms behind, punctuated by distant stars, capturing the immense depth and isolation of the cosmos. The scene is rendered in ultra-realistic, cinematic detail, with dramatic lighting and a breath-taking, movie-like camera angle that evokes awe and mystery—blending themes of rebirth, exploration, and the uncanny."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg")

height = 480
width = 832
num_frames = 81
video, mask = prepare_video_and_mask(image, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
output.mp4

V2LF

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image


def prepare_video_and_mask(img: PIL.Image.Image, height: int, width: int, num_frames: int):
    img = img.resize((width, height))
    frames = []
    # Ideally this value would be 127.5 to match the original code, but the original operates on
    # numpy arrays whereas we pass PIL images here (which need integer fill values). If you pass
    # numpy arrays instead, use 127.5 to match the original exactly.
    frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 1))
    frames.append(img)
    mask_black = PIL.Image.new("L", (width, height), 0)
    mask_white = PIL.Image.new("L", (width, height), 255)
    mask = [*[mask_white] * (num_frames - 1), mask_black]
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "An astronaut emerging from a cracked, otherworldly egg on the barren surface of the Moon—his form silhouetted against the stark lunar dust, as if being born into silence. The vast darkness of space looms behind, punctuated by distant stars, capturing the immense depth and isolation of the cosmos. The scene is rendered in ultra-realistic, cinematic detail, with dramatic lighting and a breath-taking, movie-like camera angle that evokes awe and mystery—blending themes of rebirth, exploration, and the uncanny."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg")

height = 480
width = 832
num_frames = 81
video, mask = prepare_video_and_mask(image, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
output.mp4

FLF2V

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image


def prepare_video_and_mask(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
    first_img = first_img.resize((width, height))
    last_img = last_img.resize((width, height))
    frames = []
    frames.append(first_img)
    # Ideally this value would be 127.5 to match the original code, but the original operates on
    # numpy arrays whereas we pass PIL images here (which need integer fill values). If you pass
    # numpy arrays instead, use 127.5 to match the original exactly.
    frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
    frames.append(last_img)
    mask_black = PIL.Image.new("L", (width, height), 0)
    mask_white = PIL.Image.new("L", (width, height), 255)
    mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")

height = 512
width = 512
num_frames = 81
video, mask = prepare_video_and_mask(first_frame, last_frame, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
output.mp4

Random-to-V

Ideally, you should use similar-looking images for consistent video generation; the completely unrelated images used here are just for testing purposes. A small helper for spacing the keyframes evenly is sketched after the example.

from typing import List

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image


def prepare_video_and_mask(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
    images = [img.resize((width, height)) for img in images]
    # Ideally this value would be 127.5 to match the original code, but the original operates on
    # numpy arrays whereas we pass PIL images here (which need integer fill values). If you pass
    # numpy arrays instead, use 127.5 to match the original exactly.
    frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
    
    mask_black = PIL.Image.new("L", (width, height), 0)
    mask_white = PIL.Image.new("L", (width, height), 255)
    mask = [mask_white] * num_frames
    
    for img, idx in zip(images, frame_indices):
        assert idx < num_frames
        frames[idx] = img
        mask[idx] = mask_black
    
    return frames, mask


model_id = "linoyts/Wan-VACE-14B-diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "Various different characters appear and disappear in a fast transition video showcasting their unique features and personalities. The video is about showcasing different dance styles, with each character performing a distinct dance move. The background is a vibrant, colorful stage with dynamic lighting that changes with each dance style. The camera captures close-ups of the dancers' expressions and movements. Highly dynamic, fast-paced music video, with quick cuts and transitions between characters, cinematic, vibrant colors"
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

image1 = load_image("inputs/framepack/1.png")
image2 = load_image("inputs/framepack/5.png")
image3 = load_image("inputs/framepack/12.png")
frame_indices = [0, 45, 70]  # Some random indices for the frames

height = 832
width = 480
num_frames = 81
video, mask = prepare_video_and_mask([image1, image2, image3], frame_indices, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
output.mp4
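
If you prefer evenly spaced keyframes over the hand-picked `frame_indices` above, a tiny helper like the following (purely illustrative, not part of the example) does the spacing:

from typing import List


def evenly_spaced_indices(num_images: int, num_frames: int) -> List[int]:
    # Pin the first image to frame 0 and the last to the final frame, spacing the rest evenly.
    if num_images == 1:
        return [0]
    step = (num_frames - 1) / (num_images - 1)
    return [round(i * step) for i in range(num_images)]


# e.g. evenly_spaced_indices(3, 81) -> [0, 40, 80]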

Inpaint

Ideally, you should use a mask prepared with a segmentation model for the best editing results (a rough sketch of that appears after this example); the example below simply uses a fixed rectangular mask.

from typing import List

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_video


def prepare_video_and_mask(video: List[PIL.Image.Image], height: int, width: int, num_frames: int):
    assert len(video) == num_frames
    frames = [frame.resize((width, height)) for frame in video]
    mask_black = PIL.Image.new("L", (width, height), 0)
    # Make the mask white between top=0, bottom=height, left=width/2 - d, right=width/2 + d
    d = 80
    mask_white = PIL.Image.new("L", (2 * d, height), 255)
    mask_black.paste(mask_white, (width // 2 - d, 0))
    mask = [mask_black] * num_frames
    for i in range(num_frames):
        new_frame = PIL.Image.new("RGB", (width, height), (128, 128, 128))
        mask_inverse = mask[i].point(lambda p: 255 - p)
        new_frame.paste(frames[i], mask=mask_inverse)
        frames[i] = new_frame
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "Shrek, the ogre, walks out of a building in a happy mood. He is wearing black pants and a black coat, formal attire with the coat open. The background is a busy street with people walking by. He looks joyful and is smiling and dancing with some crazy moves. The scene is bright. The lighting is warm and inviting, creating a cheerful atmosphere. The camera angle is slightly low, capturing the character from below."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

height = 480
width = 832
num_frames = 81
video = load_video("inputs/peter-dance.mp4")[::2][:81]  # Load the video and take every second frame, limiting to 81 frames
video, mask = prepare_video_and_mask(video, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output2.mp4", fps=16)
peter-dance.mp4
output2.mp4
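
As noted above, a segmentation model produces much cleaner masks than the fixed rectangle used in this example. The following is only a rough sketch of that idea; it assumes the transformers image-segmentation pipeline with a placeholder SegFormer checkpoint and a hypothetical target label, none of which are part of this PR:

from typing import List, Optional

import PIL.Image
from transformers import pipeline


def prepare_segmentation_masks(video: List[PIL.Image.Image], target_label: str) -> List[PIL.Image.Image]:
    # Placeholder checkpoint; any transformers image-segmentation model can be used the same way.
    segmenter = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")
    masks = []
    for frame in video:
        results = segmenter(frame)
        # Each result carries a binary PIL mask (255 inside the region) for one predicted label.
        # White = area the pipeline should regenerate, matching the convention used above.
        mask: Optional[PIL.Image.Image] = next((r["mask"] for r in results if r["label"] == target_label), None)
        if mask is None:
            # Label not found in this frame: an all-black mask keeps the frame unchanged.
            mask = PIL.Image.new("L", frame.size, 0)
        masks.append(mask.convert("L"))
    return masks

The resulting masks can then replace the rectangular ones, with the masked region of each frame filled with gray (128, 128, 128) exactly as in `prepare_video_and_mask` above.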

Outpaint

from typing import List

import torch
import PIL.Image
import PIL.ImageDraw
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image


def prepare_video_and_mask(img: PIL.Image.Image, directions: List[str], expand_ratio: float, height: int, width: int, num_frames: int, mask_blur: float = 0):
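    # Note: `height` and `width` are accepted for symmetry with the other examples but are not
    # used; the returned frames and masks keep the source image's original size.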
    image_width, image_height = img.size
    left = int(expand_ratio * image_width) if "left" in directions else 0
    right = int(expand_ratio * image_width) if "right" in directions else 0
    top = int(expand_ratio * image_height) if "up" in directions else 0
    bottom = int(expand_ratio * image_height) if "down" in directions else 0

    crop_left = left
    crop_right = image_width - right
    crop_top = top
    crop_bottom = image_height - bottom
    crop_box = (crop_left, crop_top, crop_right, crop_bottom)
    cropped_image = img.crop(crop_box)
    new_image = PIL.Image.new("RGB", (image_width, image_height), (128, 128, 128))
    new_image.paste(cropped_image, (left, top))
    new_image.save("output.png")

    mask = PIL.Image.new("L", (image_width, image_height), 255)
    draw = PIL.ImageDraw.Draw(mask)
    x0 = left + (mask_blur * 2 if left > 0 else 0)
    y0 = top + (mask_blur * 2 if top > 0 else 0)
    x1 = left + cropped_image.width - (mask_blur * 2 if right > 0 else 0)
    y1 = top + cropped_image.height - (mask_blur * 2 if bottom > 0 else 0)
    draw.rectangle((x0, y0, x1, y1), fill="black")
    mask.save("mask.png")

    frames = [new_image]
    frames.extend([PIL.Image.new("RGB", (image_width, image_height), (128, 128, 128))] * (num_frames - 1))

    mask_white = PIL.Image.new("L", (image_width, image_height), 255)
    mask = [mask] + [mask_white] * (num_frames - 1)
    
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "A cute plushie dog sitting on the bed surrounded by more cute plushie puppies, with a soft and fluffy appearance. The dog has a light brown fur coat, with darker patches around its ears and eyes. The plushies look soft and cuddly. The plushie dogs are excitedly playing together, with their tails wagging and their eyes sparkling with joy."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
image = load_image("inputs/plushie-dog-on-bed.png")

height = 512
width = 512
num_frames = 81
directions = ["left", "right"]
expand_ratio = 0.25
video, mask = prepare_video_and_mask(image, directions, expand_ratio, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)

output.png · mask.png

output.mp4

OpenPose

from controlnet_aux import OpenposeDetector
from diffusers.utils import load_video, export_to_video

open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
open_pose.to("cuda")

video = load_video("inputs/man-contemporary-dance.mp4")[::3][:81]
video = [frame.convert("RGB").resize((832, 480)) for frame in video]
openpose_video = [open_pose(frame) for frame in video]

export_to_video(openpose_video, "openpose-man-contemporary-dance.mp4", fps=30)

import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_video


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "An alien-like creature with a resemblance of leaves, branches and twigs is dancing gracefully in a post-apocalyptic world. The creature has a humanoid shape, with long, flowing limbs that resemble branches. Its skin is textured like bark, and its eyes glow softly. The background is a desolate landscape with remnants of a once-thriving city, now overgrown with vegetation. The lighting is soft and ethereal, casting a magical glow on the scene. The camera captures the creature from a low angle, emphasizing its height and gracefulness as it moves fluidly through the air."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

height = 480
width = 832
num_frames = 81
video = load_video("openpose-man-contemporary-dance.mp4")[:num_frames]
video = [frame.convert("RGB").resize((width, height)) for frame in video]

output = pipe(
    video=video,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)
openpose-man-contemporary-dance.mp4
output.mp4

Inpaint with reference image

from typing import List

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image, load_video


def prepare_video_and_mask(video: List[PIL.Image.Image], height: int, width: int, num_frames: int):
    assert len(video) == num_frames
    frames = [frame.resize((width, height)) for frame in video]
    mask_black = PIL.Image.new("L", (width, height), 0)
    # Make the mask white between top=0, bottom=height, left=width/2 - d, right=width/2 + d
    d = 80
    mask_white = PIL.Image.new("L", (2 * d, height), 255)
    mask_black.paste(mask_white, (width // 2 - d, 0))
    mask = [mask_black] * num_frames
    for i in range(num_frames):
        new_frame = PIL.Image.new("RGB", (width, height), (128, 128, 128))
        mask_inverse = mask[i].point(lambda p: 255 - p)
        new_frame.paste(frames[i], mask=mask_inverse)
        frames[i] = new_frame
    return frames, mask


model_id = "/raid/aryan/diffusers-wan-vace-1.3b/"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "A character walks out of a building in a happy mood. The background is a busy street with people walking by. He looks joyful and is smiling and dancing with some crazy moves. The scene is bright. The lighting is warm and inviting, creating a cheerful atmosphere. The camera angle is slightly low, capturing the character from below."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

height = 480
num_frames = 81
width = 832
reference_image = load_image("inputs/framepack/1.png")
video = load_video("inputs/peter-dance.mp4")[::2][:81]  # Load the video and take every second frame, limiting to 81 frames
video, mask = prepare_video_and_mask(video, height, width, num_frames)

output = pipe(
    video=video,
    mask=mask,
    reference_images=reference_image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output2.mp4", fps=16)
output.mp4

250413_171501_349_5557

@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

@a-r-r-o-w a-r-r-o-w force-pushed the integrations/wan-vace branch from 4a4b058 to 50b1216 Compare May 29, 2025 11:52
@nitinmukesh

Thank you x 100
