Make T2I Adapters work with any resolution supported by the models #7215

Merged · 9 commits · Nov 1, 2024
28 changes: 19 additions & 9 deletions invokeai/app/invocations/denoise_latents.py
@@ -622,7 +622,7 @@ def run_t2i_adapters(
for t2i_adapter_field in t2i_adapter:
t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
image = context.images.get_pil(t2i_adapter_field.image.image_name)
image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")

# The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
@@ -640,29 +640,39 @@ def run_t2i_adapters(
with t2i_adapter_loaded_model as t2i_adapter_model:
total_downscale_factor = t2i_adapter_model.total_downscale_factor

# Resize the T2I-Adapter input image.
# We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
# result will match the latent image's dimensions after max_unet_downscale is applied.
t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor

# Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
# a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
# T2I-Adapter model.
#
# Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
# of the same requirements (e.g. preserving binary masks during resize).

# Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
_, _, latent_height, latent_width = latents_shape
control_height_resize = latent_height * LATENT_SCALE_FACTOR
control_width_resize = latent_width * LATENT_SCALE_FACTOR
t2i_image = prepare_control_image(
image=image,
do_classifier_free_guidance=False,
width=t2i_input_width,
height=t2i_input_height,
width=control_width_resize,
height=control_height_resize,
num_channels=t2i_adapter_model.config["in_channels"], # mypy treats this as a FrozenDict
device=t2i_adapter_model.device,
dtype=t2i_adapter_model.dtype,
resize_mode=t2i_adapter_field.resize_mode,
)

# Resize the T2I-Adapter input image.
# We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
# result will match the latent image's dimensions after max_unet_downscale is applied.
# We crop the image to this size so that feature positions stay aligned with the input image at non-standard resolutions.
t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
t2i_image = t2i_image[
:, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
]

adapter_state = t2i_adapter_model(t2i_image)

if do_classifier_free_guidance:
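For readers following the new sizing logic above, a hypothetical worked example may help. It assumes SD1.5-style constants (LATENT_SCALE_FACTOR = 8, max_unet_downscale = 8) and a typical adapter total_downscale_factor of 64; the 1016x1000 px input is likewise an illustrative assumption, not a value taken from the PR.

```python
# Worked example (illustrative values only) of the resize-then-crop sizing above.
LATENT_SCALE_FACTOR = 8       # assumed pixel-to-latent scale
max_unet_downscale = 8        # SD1.5 value used in run_t2i_adapters
total_downscale_factor = 64   # typical total downscale for an SD1.5 T2I-Adapter

latents_shape = (1, 4, 125, 127)  # e.g. from a 1016x1000 px (WxH) image
_, _, latent_height, latent_width = latents_shape

# Step 1: resize the control image to the full pixel-space size of the latents.
control_height_resize = latent_height * LATENT_SCALE_FACTOR  # 1000
control_width_resize = latent_width * LATENT_SCALE_FACTOR     # 1016

# Step 2: crop to the largest size the adapter can downscale evenly.
t2i_input_height = latent_height // max_unet_downscale * total_downscale_factor  # 960
t2i_input_width = latent_width // max_unet_downscale * total_downscale_factor    # 960

# The resized 1016x1000 image is therefore cropped to 960x960 before the adapter runs;
# the zero-padding added in diffusers_pipeline.py (next file) makes up the difference.
```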
16 changes: 16 additions & 0 deletions invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -499,6 +499,22 @@ def step(
for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
accum_adapter_state[idx] += value * t2i_adapter_weight

# Hack: force compatibility with irregular resolutions by padding the feature map with zeros
for idx, tensor in enumerate(accum_adapter_state):
# The tensor size is expected to be the latents size downscaled by some integer factor.
# Internally, the unet pads the latents before downscaling between levels when their size is no longer divisible by its downscale factor.
# If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled, padded latents later on.
scale_factor = latents.size()[-1] // tensor.size()[-1]
required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
tensor = torch.nn.functional.pad(
tensor,
(0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
mode="constant",
value=0,
)
accum_adapter_state[idx] = tensor

down_intrablock_additional_residuals = accum_adapter_state

# Handle inpainting models.
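To make the padding step concrete, here is a hypothetical continuation of the example above. The 1280-channel count and the 15x15 feature map are illustrative assumptions (the deepest feature of a 960x960 adapter input at a total downscale of 64), not values taken from the diff.

```python
import math

import torch

# Deepest accumulated adapter feature map vs. the latents from the example above.
latents = torch.zeros(1, 4, 125, 127)
tensor = torch.zeros(1, 1280, 15, 15)  # 960 / 64 == 15 in both dimensions

scale_factor = latents.size()[-1] // tensor.size()[-1]  # 127 // 15 == 8
required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]   # 16 - 15 == 1
required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]  # 16 - 15 == 1

# Zero-pad on the right and bottom only, so existing feature positions are unchanged.
tensor = torch.nn.functional.pad(
    tensor, (0, required_padding_width, 0, required_padding_height), mode="constant", value=0
)
assert tensor.shape[-2:] == (16, 16)  # matches the UNet's internally padded, downscaled latents
```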
40 changes: 0 additions & 40 deletions invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts
@@ -202,46 +202,6 @@ const createSelector = (
if (controlLayer.controlAdapter.model?.base !== model?.base) {
problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
}
// T2I Adapters require images to have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
if (controlLayer.controlAdapter.type === 't2i_adapter') {
const multiple = model?.base === 'sdxl' ? 32 : 64;
if (bbox.scaleMethod === 'none') {
if (bbox.rect.width % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
multiple,
width: bbox.rect.width,
}),
});
}
if (bbox.rect.height % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
multiple,
height: bbox.rect.height,
}),
});
}
} else {
if (bbox.scaledSize.width % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
multiple,
width: bbox.scaledSize.width,
}),
});
}
if (bbox.scaledSize.height % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
multiple,
height: bbox.scaledSize.height,
}),
});
}
}
}

if (problems.length) {
const content = upperFirst(problems.join(', '));
reasons.push({ prefix, content });