@@ -20,12 +20,12 @@
 import numpy as np
 import PIL.Image
 import torch
-from torch import device, nn
+from torch import nn
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
 from ...models.controlnet import ControlNetOutput
-from ...models.modeling_utils import get_parameter_device, get_parameter_dtype
+from ...models.modeling_utils import ModelMixin
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     PIL_INTERPOLATION,
@@ -89,7 +89,7 @@
 """


-class MultiControlNet(nn.Module):
+class MultiControlNetModel(ModelMixin):
     r"""
     Multiple `ControlNetModel` wrapper class for Multi-ControlNet

@@ -102,25 +102,10 @@ class MultiControlNet(nn.Module):
             `ControlNetModel` as a list.
     """

-    def __init__(self, controlnets: List[ControlNetModel]):
+    def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
         super().__init__()
         self.nets = nn.ModuleList(controlnets)

-    @property
-    def device(self) -> device:
-        """
-        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
-        device).
-        """
-        return get_parameter_device(self)
-
-    @property
-    def dtype(self) -> torch.dtype:
-        """
-        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
-        """
-        return get_parameter_dtype(self)
-
     def forward(
         self,
         sample: torch.FloatTensor,
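
The hunk truncates `forward()` after its first arguments. For readers of the diff, a minimal sketch of the residual-summation behavior the new docstring describes might look like the following; the argument names beyond `sample` and the exact `ControlNetModel.forward` return shape are assumptions, not part of this change:

```python
from torch import nn


class MultiControlNetModelSketch(nn.Module):
    """Sketch of the wrapper's forward pass, not the actual diffusers implementation."""

    def __init__(self, controlnets):
        super().__init__()
        self.nets = nn.ModuleList(controlnets)

    def forward(self, sample, timestep, encoder_hidden_states, controlnet_cond, conditioning_scale):
        # Assumes each wrapped net returns (down_block_res_samples, mid_block_res_sample).
        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
            down_samples, mid_sample = controlnet(
                sample, timestep, encoder_hidden_states, image, scale, return_dict=False
            )
            if i == 0:
                down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
            else:
                # Outputs from each ControlNet are added together into one combined conditioning.
                down_block_res_samples = [
                    prev + curr for prev, curr in zip(down_block_res_samples, down_samples)
                ]
                mid_block_res_sample = mid_block_res_sample + mid_sample
        return down_block_res_samples, mid_block_res_sample
```
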
@@ -180,8 +165,9 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
-            Provides additional conditioning to the unet during the denoising process. You can set multiple
-            `ControlNetModel` as a list.
+            Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+            as a list, the outputs from each ControlNet are added together to create one combined additional
+            conditioning.
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
@@ -199,7 +185,7 @@ def __init__(
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        controlnet: Union[ControlNetModel, List[ControlNetModel]],
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
@@ -224,9 +210,7 @@ def __init__(
             )

         if isinstance(controlnet, (list, tuple)):
-            controlnet = MultiControlNet(controlnet)
-        else:
-            controlnet = controlnet
+            controlnet = MultiControlNetModel(controlnet)

         self.register_modules(
             vae=vae,
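
Passing a list (or tuple) of ControlNets to the constructor now wraps them in a `MultiControlNetModel` transparently. A usage sketch, with two publicly available checkpoints chosen purely for illustration:

```python
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

# Illustrative checkpoints; any set of compatible ControlNets works.
pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")

# The list is wrapped into a MultiControlNetModel inside __init__.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=[pose, canny]
)
```
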
@@ -507,12 +491,14 @@ def prepare_extra_step_kwargs(self, generator, eta):
     def check_inputs(
         self,
         prompt,
+        image,
         height,
         width,
         callback_steps,
         negative_prompt=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
+        controlnet_conditioning_scale=1.0,
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -551,6 +537,40 @@ def check_inputs(
                 f" {negative_prompt_embeds.shape}."
             )

+        # Check `image`
+
+        if isinstance(self.controlnet, ControlNetModel):
+            self.check_image(image, prompt, prompt_embeds)
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            if len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    "For multiple controlnets: `image` must have the same length as the number of controlnets."
+                )
+
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+
+        if isinstance(self.controlnet, ControlNetModel):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            if isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            assert False
+
     def check_image(self, image, prompt, prompt_embeds):
         image_is_pil = isinstance(image, PIL.Image.Image)
         image_is_tensor = isinstance(image, torch.Tensor)
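
Under these checks, a multi-ControlNet call must supply exactly one conditioning image per net, and a list-valued scale must match the number of nets. A sketch of accepted and rejected inputs, assuming `pipe` wraps two ControlNets and the image variables are hypothetical:

```python
# Accepted: one conditioning image per ControlNet, one scale per ControlNet.
pipe(prompt, image=[pose_image, canny_image], controlnet_conditioning_scale=[1.0, 0.5])

# Rejected with TypeError: multi-ControlNet requires `image` to be a list.
pipe(prompt, image=pose_image)

# Rejected with ValueError: two nets, but three images.
pipe(prompt, image=[pose_image, canny_image, extra_image])
```
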
@@ -637,7 +657,10 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         return latents

     def _default_height_width(self, height, width, image):
-        if isinstance(image, list):
+        # NOTE: It is possible that each image in a list has different
+        # dimensions, so just checking the first image is not _exactly_
+        # correct, but it is simple.
+        while isinstance(image, list):
             image = image[0]

         if height is None:
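
The `while` loop lets the default-size logic descend through the new nested-list form of `image`. A small illustration with hypothetical PIL images:

```python
image = [[pil_a, pil_b], [pil_c, pil_d]]  # one sub-list per ControlNet (hypothetical images)
while isinstance(image, list):
    image = image[0]
# image is now pil_a; its height and width supply the defaults
```
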
@@ -658,41 +681,6 @@ def _default_height_width(self, height, width, image):

         return height, width

-    def _prepare_images(self, image):
-        if isinstance(self.controlnet, ControlNetModel):
-            return [image]  # convert to array for internal use
-        else:  # Multi-Controlnet
-            if not isinstance(image, list):
-                raise ValueError("The `image` argument needs to be specified in a `list`.")
-
-            num_controlnets = len(self.controlnet.nets)
-            if len(image) % num_controlnets != 0:
-                raise ValueError(
-                    "The length of the `image` argument list needs to be a multiple of the number of Multi-ControlNet."
-                )
-
-            image_per_control = len(image) // num_controlnets
-
-            # let's split images over controlnets
-            return [image[i : i + image_per_control] for i in range(0, len(image), image_per_control)]
-
-    def _prepare_controlnet_conditioning_scale(self, controlnet_conditioning_scale):
-        if isinstance(self.controlnet, ControlNetModel):
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise ValueError("The `controlnet_conditioning_scale` argument needs to be specified as a `float`.")
-            return controlnet_conditioning_scale
-        else:  # Multi-Controlnet
-            num_controlnets = len(self.controlnet.nets)
-            if isinstance(controlnet_conditioning_scale, list):
-                if len(controlnet_conditioning_scale) != num_controlnets:
-                    raise ValueError(
-                        "The length of the `controlnet_conditioning_scale` list does not match the number of Multi-ControlNet. "
-                        "If specified in `list`, it needs to have the same length as the number of Multi-ControlNet."
-                    )
-            else:
-                controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_controlnets
-            return controlnet_conditioning_scale
-
     # override DiffusionPipeline
     def save_pretrained(
         self,
@@ -736,12 +724,14 @@ def __call__(
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+                `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
                 The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
                 the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
-                also be accepted as an image. The control image is automatically resized to fit the output image. If
-                multiple ControlNets are specified in init, you need to set the corresponding images in the form of a
-                list of `List[torch.FloatTensor]` or `List[PIL.Image.Image]`.
+                also be accepted as an image. The dimensions of the output image default to `image`'s dimensions. If
+                height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are
+                specified in init, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single controlnet.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -807,21 +797,21 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-
-        # prepare `images` and `controlnet_conditioning_scale` for both a ControlNet and Multi-Controlnet
-        # `images` here is a list where each element is a conditioning image for each ControlNet.
-        images = self._prepare_images(image)
-        controlnet_conditioning_scale = self._prepare_controlnet_conditioning_scale(controlnet_conditioning_scale)
-
         # 0. Default height and width to unet
-        height, width = self._default_height_width(height, width, images[0])
+        height, width = self._default_height_width(height, width, image)

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+            prompt,
+            image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
         )
-        for image in images:
-            self.check_image(image, prompt, prompt_embeds)

         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
@@ -837,6 +827,9 @@ def __call__(
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0

+        if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+
         # 3. Encode input prompt
         prompt_embeds = self._encode_prompt(
             prompt,
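
This broadcast means a scalar scale now applies uniformly across all nets. A worked micro-example, assuming two ControlNets:

```python
# A float scale is expanded to one entry per net before the denoising loop.
controlnet_conditioning_scale = 0.5
num_nets = 2  # stands in for len(self.controlnet.nets)
controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_nets  # -> [0.5, 0.5]
```
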
@@ -849,8 +842,8 @@ def __call__(
         )

         # 4. Prepare image
-        images = [
-            self.prepare_image(
+        if isinstance(self.controlnet, ControlNetModel):
+            image = self.prepare_image(
                 image=image,
                 width=width,
                 height=height,
@@ -860,8 +853,26 @@ def __call__(
                 dtype=self.controlnet.dtype,
                 do_classifier_free_guidance=do_classifier_free_guidance,
             )
-            for image in images
-        ]
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            images = []
+
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                )
+
+                images.append(image_)
+
+            image = images
+        else:
+            assert False

         # 5. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -896,7 +907,7 @@ def __call__(
                     latent_model_input,
                     t,
                     encoder_hidden_states=prompt_embeds,
-                    controlnet_cond=images[0] if len(images) == 1 else images,
+                    controlnet_cond=image,
                     conditioning_scale=controlnet_conditioning_scale,
                     return_dict=False,
                 )
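
Taken together, an end-to-end call under the new interface might look like the following sketch; the checkpoints, prompt, and input URLs are illustrative, not part of the diff:

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image

# Illustrative conditioning inputs prepared elsewhere (e.g. a pose map and an edge map).
pose_image = load_image("https://example.com/pose.png")
canny_image = load_image("https://example.com/canny.png")

pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)

pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=[pose, canny], torch_dtype=torch.float16
).to("cuda")

# One conditioning image per ControlNet; a float scale would be broadcast instead.
result = pipe(
    "a dancer on the beach, best quality",
    image=[pose_image, canny_image],
    controlnet_conditioning_scale=[1.0, 0.5],
).images[0]
result.save("multi_controlnet.png")
```
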