Make T2I Adapters work with any resolution supported by the models #7215

Merged · 9 commits · Nov 1, 2024
28 changes: 19 additions & 9 deletions invokeai/app/invocations/denoise_latents.py
@@ -622,7 +622,7 @@ def run_t2i_adapters(
for t2i_adapter_field in t2i_adapter:
t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
image = context.images.get_pil(t2i_adapter_field.image.image_name)
image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")

# The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
@@ -640,29 +640,39 @@ def run_t2i_adapters(
with t2i_adapter_loaded_model as t2i_adapter_model:
total_downscale_factor = t2i_adapter_model.total_downscale_factor

# Resize the T2I-Adapter input image.
# We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
# result will match the latent image's dimensions after max_unet_downscale is applied.
t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor

# Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
# a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
# T2I-Adapter model.
#
# Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
# of the same requirements (e.g. preserving binary masks during resize).

# Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
_, _, latent_height, latent_width = latents_shape
control_height_resize = latent_height * LATENT_SCALE_FACTOR
control_width_resize = latent_width * LATENT_SCALE_FACTOR
t2i_image = prepare_control_image(
image=image,
do_classifier_free_guidance=False,
width=t2i_input_width,
height=t2i_input_height,
width=control_width_resize,
height=control_height_resize,
num_channels=t2i_adapter_model.config["in_channels"], # mypy treats this as a FrozenDict
device=t2i_adapter_model.device,
dtype=t2i_adapter_model.dtype,
resize_mode=t2i_adapter_field.resize_mode,
)

# Resize the T2I-Adapter input image.
# We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
# result will match the latent image's dimensions after max_unet_downscale is applied.
# We crop the image to this size so that feature positions stay aligned with the input image at non-standard resolutions.
t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
t2i_image = t2i_image[
:, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
]

adapter_state = t2i_adapter_model(t2i_image)

if do_classifier_free_guidance:
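For readers following the new sizing logic above, a hypothetical worked example may help. It assumes SD1.5-style constants (LATENT_SCALE_FACTOR = 8, max_unet_downscale = 8) and a typical adapter total_downscale_factor of 64; the 1016x1000 px input is likewise an illustrative assumption, not a value taken from the PR.

```python
# Worked example (illustrative values only) of the resize-then-crop sizing above.
LATENT_SCALE_FACTOR = 8       # assumed pixel-to-latent scale
max_unet_downscale = 8        # SD1.5 value used in run_t2i_adapters
total_downscale_factor = 64   # typical total downscale for an SD1.5 T2I-Adapter

latents_shape = (1, 4, 125, 127)  # e.g. from a 1016x1000 px (WxH) image
_, _, latent_height, latent_width = latents_shape

# Step 1: resize the control image to the full pixel-space size of the latents.
control_height_resize = latent_height * LATENT_SCALE_FACTOR  # 1000
control_width_resize = latent_width * LATENT_SCALE_FACTOR     # 1016

# Step 2: crop to the largest size the adapter can downscale evenly.
t2i_input_height = latent_height // max_unet_downscale * total_downscale_factor  # 960
t2i_input_width = latent_width // max_unet_downscale * total_downscale_factor    # 960

# The resized 1016x1000 image is therefore cropped to 960x960 before the adapter runs;
# the zero-padding added in diffusers_pipeline.py (next file) makes up the difference.
```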
16 changes: 16 additions & 0 deletions invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -499,6 +499,22 @@ def step(
for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
accum_adapter_state[idx] += value * t2i_adapter_weight

# Hack: force compatibility with irregular resolutions by padding the feature map with zeros
for idx, tensor in enumerate(accum_adapter_state):
# The tensor size is expected to be the latents size downscaled by some integer factor.
# Internally, the unet pads the latents before downscaling between levels when their size is no longer divisible by its downscale factor.
# If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled, padded latents later on.
scale_factor = latents.size()[-1] // tensor.size()[-1]
required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
tensor = torch.nn.functional.pad(
tensor,
(0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
mode="constant",
value=0,
)
accum_adapter_state[idx] = tensor

down_intrablock_additional_residuals = accum_adapter_state

# Handle inpainting models.
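To make the padding step concrete, here is a hypothetical continuation of the example above. The 1280-channel count and the 15x15 feature map are illustrative assumptions (the deepest feature of a 960x960 adapter input at a total downscale of 64), not values taken from the diff.

```python
import math

import torch

# Deepest accumulated adapter feature map vs. the latents from the example above.
latents = torch.zeros(1, 4, 125, 127)
tensor = torch.zeros(1, 1280, 15, 15)  # 960 / 64 == 15 in both dimensions

scale_factor = latents.size()[-1] // tensor.size()[-1]  # 127 // 15 == 8
required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]   # 16 - 15 == 1
required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]  # 16 - 15 == 1

# Zero-pad on the right and bottom only, so existing feature positions are unchanged.
tensor = torch.nn.functional.pad(
    tensor, (0, required_padding_width, 0, required_padding_height), mode="constant", value=0
)
assert tensor.shape[-2:] == (16, 16)  # matches the UNet's internally padded, downscaled latents
```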
40 changes: 0 additions & 40 deletions invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts
@@ -202,46 +202,6 @@ const createSelector = (
if (controlLayer.controlAdapter.model?.base !== model?.base) {
problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
}
// T2I Adapters require images to have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
if (controlLayer.controlAdapter.type === 't2i_adapter') {
const multiple = model?.base === 'sdxl' ? 32 : 64;
if (bbox.scaleMethod === 'none') {
if (bbox.rect.width % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
multiple,
width: bbox.rect.width,
}),
});
}
if (bbox.rect.height % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
multiple,
height: bbox.rect.height,
}),
});
}
} else {
if (bbox.scaledSize.width % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
multiple,
width: bbox.scaledSize.width,
}),
});
}
if (bbox.scaledSize.height % 16 !== 0) {
reasons.push({
content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
multiple,
height: bbox.scaledSize.height,
}),
});
}
}
}

if (problems.length) {
const content = upperFirst(problems.join(', '));
reasons.push({ prefix, content });