Skip to content

[LoRA] allow big CUDA tests to run properly for LoRA (and others) #9845

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 45 commits into main from allow-flux-lora-tests
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
8696fbb
allow big lora tests to run on the CI.
sayakpaul Nov 2, 2024
06b3919
print
sayakpaul Nov 2, 2024
b062bd9
print.
sayakpaul Nov 2, 2024
360935c
print
sayakpaul Nov 2, 2024
1d1248a
print
sayakpaul Nov 2, 2024
f5550e3
print
sayakpaul Nov 2, 2024
4cd5a3c
print
sayakpaul Nov 2, 2024
a901420
more
sayakpaul Nov 2, 2024
d659f1c
print
sayakpaul Nov 2, 2024
96d27ff
remove print.
sayakpaul Nov 2, 2024
8510f98
remove print
sayakpaul Nov 2, 2024
9fe7b91
directly place on cuda.
sayakpaul Nov 2, 2024
286af0e
remove pipeline.
sayakpaul Nov 2, 2024
741a44f
remove
sayakpaul Nov 2, 2024
e818907
fix
sayakpaul Nov 2, 2024
3ed98a1
fix
sayakpaul Nov 2, 2024
9124f28
spaces
sayakpaul Nov 2, 2024
a938831
quality
sayakpaul Nov 2, 2024
bd94852
updates
sayakpaul Nov 3, 2024
1760419
directly place flux controlnet pipeline on cuda.
sayakpaul Nov 3, 2024
021f0de
torch_device instead of cuda.
sayakpaul Nov 3, 2024
ee662cf
style
sayakpaul Nov 3, 2024
c46331f
device placement.
sayakpaul Nov 3, 2024
ad4d7a7
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 5, 2024
207579b
fixes
sayakpaul Nov 5, 2024
7f27d2d
fixes
sayakpaul Nov 5, 2024
876dfdb
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 5, 2024
1740d83
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 18, 2024
48c3bd3
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 20, 2024
295315c
add big gpu marker for mochi; rename test correctly
a-r-r-o-w Nov 20, 2024
704392a
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 22, 2024
77cfc79
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 23, 2024
75e3850
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Nov 27, 2024
e10caf4
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 2, 2024
58b79f2
address feedback
sayakpaul Dec 2, 2024
c7a64b8
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 4, 2024
8b952cb
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 8, 2024
79ca87e
resolve conflicts.
sayakpaul Dec 20, 2024
168aff7
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 25, 2024
008f34f
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 25, 2024
44db423
fix
sayakpaul Dec 25, 2024
50e59dc
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Dec 27, 2024
f1ab9f3
resolve conflicts.
sayakpaul Jan 9, 2025
03e11f1
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Jan 9, 2025
2d2c972
Merge branch 'main' into allow-flux-lora-tests
sayakpaul Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions tests/lora/test_lora_layers_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import unittest

import numpy as np
import pytest
import safetensors.torch
import torch
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
Expand All @@ -29,6 +30,7 @@
is_peft_available,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_peft_backend,
require_torch_gpu,
slow,
Expand Down Expand Up @@ -169,8 +171,8 @@ def test_modify_padding_mode(self):
@nightly
@require_torch_gpu
@require_peft_backend
@unittest.skip("We cannot run inference on this model with the current CI hardware")
# TODO (DN6, sayakpaul): move these tests to a beefier GPU
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class FluxLoRAIntegrationTests(unittest.TestCase):
"""internal note: The integration slices were obtained on audace.

Expand All @@ -192,14 +194,18 @@ def setUp(self):
def tearDown(self):
super().tearDown()

del self.pipeline
gc.collect()
torch.cuda.empty_cache()

def test_flux_the_last_ben(self):
self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
self.pipeline.fuse_lora()
self.pipeline.unload_lora_weights()
self.pipeline.enable_model_cpu_offload()
# Instead of calling `enable_model_cpu_offload()`, we place the pipeline directly on `torch_device`
# because the CI runner supports it. The runner has only about 34GB of RAM, so the test gets killed
# when run with `enable_model_cpu_offload()`. We repeat this for the other tests, too.
self.pipeline = self.pipeline.to(torch_device)

prompt = "jon snow eating pizza with ketchup"

Expand All @@ -221,7 +227,7 @@ def test_flux_kohya(self):
self.pipeline.load_lora_weights("Norod78/brain-slug-flux")
self.pipeline.fuse_lora()
self.pipeline.unload_lora_weights()
self.pipeline.enable_model_cpu_offload()
self.pipeline = self.pipeline.to(torch_device)

prompt = "The cat with a brain slug earring"
out = self.pipeline(
Expand All @@ -243,7 +249,7 @@ def test_flux_kohya_with_text_encoder(self):
self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors")
self.pipeline.fuse_lora()
self.pipeline.unload_lora_weights()
self.pipeline.enable_model_cpu_offload()
self.pipeline = self.pipeline.to(torch_device)

prompt = "optimus is cleaning the house with broomstick"
out = self.pipeline(
Expand All @@ -265,7 +271,7 @@ def test_flux_xlabs(self):
self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors")
self.pipeline.fuse_lora()
self.pipeline.unload_lora_weights()
self.pipeline.enable_model_cpu_offload()
self.pipeline = self.pipeline.to(torch_device)

prompt = "A blue jay standing on a large basket of rainbow macarons, disney style"

Expand Down
17 changes: 14 additions & 3 deletions tests/lora/test_lora_layers_sd3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import unittest

import numpy as np
import pytest
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel

Expand All @@ -30,9 +31,12 @@
from diffusers.utils.import_utils import is_accelerate_available
from diffusers.utils.testing_utils import (
is_peft_available,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_peft_backend,
require_torch_gpu,
slow,
torch_device,
)

Expand Down Expand Up @@ -130,9 +134,13 @@ def test_modify_padding_mode(self):
pass


@slow
@nightly
@require_torch_gpu
@require_peft_backend
class LoraSD3IntegrationTests(unittest.TestCase):
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class SD3LoraIntegrationTests(unittest.TestCase):
pipeline_class = StableDiffusion3Img2ImgPipeline
repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"

Expand Down Expand Up @@ -166,14 +174,17 @@ def get_inputs(self, device, seed=0):

def test_sd3_img2img_lora(self):
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2", weight_name="pytorch_lora_weights.safetensors")
pipe.enable_sequential_cpu_offload()
pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
pipe.fuse_lora()
pipe.unload_lora_weights()
pipe = pipe.to(torch_device)

inputs = self.get_inputs(torch_device)

image = pipe(**inputs).images[0]
image_slice = image[0, -3:, -3:]
expected_slice = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153])

max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"
Expand Down
9 changes: 5 additions & 4 deletions tests/pipelines/controlnet_flux/test_controlnet_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
enable_full_determinism,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
slow,
Expand Down Expand Up @@ -183,6 +184,7 @@ def test_xformers_attention_forwardGenerator_pass(self):


@slow
@nightly
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class FluxControlNetPipelineSlowTests(unittest.TestCase):
Expand All @@ -208,8 +210,7 @@ def test_canny(self):
text_encoder_2=None,
controlnet=controlnet,
torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
).to(torch_device)
pipe.set_progress_bar_config(disable=None)

generator = torch.Generator(device="cpu").manual_seed(0)
Expand All @@ -219,12 +220,12 @@ def test_canny(self):

prompt_embeds = torch.load(
hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
)
).to(torch_device)
pooled_prompt_embeds = torch.load(
hf_hub_download(
repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
)
)
).to(torch_device)

output = pipe(
prompt_embeds=prompt_embeds,
Expand Down
14 changes: 6 additions & 8 deletions tests/pipelines/flux/test_pipeline_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers.utils.testing_utils import (
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
slow,
Expand Down Expand Up @@ -193,6 +194,7 @@ def test_fused_qkv_projections(self):


@slow
@nightly
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class FluxPipelineSlowTests(unittest.TestCase):
Expand All @@ -210,19 +212,16 @@ def tearDown(self):
torch.cuda.empty_cache()

def get_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device="cpu").manual_seed(seed)
generator = torch.Generator(device="cpu").manual_seed(seed)

prompt_embeds = torch.load(
hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
)
).to(torch_device)
pooled_prompt_embeds = torch.load(
hf_hub_download(
repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
)
)
).to(torch_device)
return {
"prompt_embeds": prompt_embeds,
"pooled_prompt_embeds": pooled_prompt_embeds,
Expand All @@ -236,8 +235,7 @@ def get_inputs(self, device, seed=0):
def test_flux_inference(self):
pipe = self.pipeline_class.from_pretrained(
self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
)
pipe.enable_model_cpu_offload()
).to(torch_device)

inputs = self.get_inputs(torch_device)

Expand Down
10 changes: 8 additions & 2 deletions tests/pipelines/mochi/test_mochi.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@
import unittest

import numpy as np
import pytest
import torch
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_torch_gpu,
slow,
torch_device,
Expand Down Expand Up @@ -261,7 +264,10 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):


@slow
@nightly
@require_torch_gpu
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class MochiPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."

Expand All @@ -275,7 +281,7 @@ def tearDown(self):
gc.collect()
torch.cuda.empty_cache()

def test_cogvideox(self):
def test_mochi(self):
generator = torch.Generator("cpu").manual_seed(0)

pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16)
Expand All @@ -293,7 +299,7 @@ def test_cogvideox(self):
).frames

video = videos[0]
expected_video = torch.randn(1, 16, 480, 848, 3).numpy()
expected_video = torch.randn(1, 19, 480, 848, 3).numpy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching!


max_diff = numpy_cosine_similarity_distance(video, expected_video)
assert max_diff < 1e-3, f"Max diff is too high. got {video}"
Loading