Add Kandinsky 2.1 #3308
Conversation
@isamu-isozaki thanks!
Very cool @yiyixuxu! Can you tell me how you tested whether the prior model pipeline is working and whether the weights were loading? It would be cool to have a temporary Jupyter notebook handy for testing the pipeline and its individual components, checking whether the weights load, hacky debugging, etc.
For the MOVQ, it is practically the same as the VQVAE model already in diffusers (https://github.com/huggingface/diffusers/blob/kandinsky/src/diffusers/models/vq_model.py): the Encoder and the VectorQuantizer are exactly the same. The decoder differs only in that it uses a custom normalization layer (SpatialNorm), which takes an extra embedding as input, instead of the GroupNorm used in the VQVAE. The rest of the decoder implementation is also identical, so the changes reduce to the attention/resnet building blocks, which again match the ones already in diffusers except for the normalization layer (they use GroupNorm today and would need SpatialNorm here). We can either parametrize the attention/resnet building blocks and the VQVAE in diffusers to support a different normalization layer and an additional embedding input, or copy them with minimal changes into the Kandinsky pipeline if we feel the normalization layer is not general enough to justify changing the existing implementations. Would love to hear opinions on this!
@ayushtues let's add SpatialNorm to the blocks
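For reference, here is a minimal sketch of what such a SpatialNorm-style layer could look like, following the MoVQ idea of modulating a GroupNorm output with the quantized latent z_q (the group count and the 1x1 convolutions are assumptions for illustration, not necessarily the final diffusers implementation):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SpatialNorm(nn.Module):
    """Spatially conditioned normalization: GroupNorm on the feature map f,
    then a per-pixel scale/shift predicted from the quantized latent z_q."""

    def __init__(self, f_channels: int, zq_channels: int):
        super().__init__()
        self.norm_layer = nn.GroupNorm(num_groups=32, num_channels=f_channels, eps=1e-6, affine=True)
        self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1)
        self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1)

    def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
        # resize z_q to the feature map's spatial size before predicting scale/shift
        zq = F.interpolate(zq, size=f.shape[-2:], mode="nearest")
        norm_f = self.norm_layer(f)
        return norm_f * self.conv_y(zq) + self.conv_b(zq)
```

The existing resnet/attention blocks would then accept the extra z_q argument and call this layer wherever they currently call GroupNorm.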
@ayushtues this is an example script I use to do a quick compare along the way. Note that this might not work for you, because I had to go into the original repo and hardcode a few things to make sure we can reproduce the results (including changing the noise construction to match diffusers' and passing a generator down; I don't think you will need to do this for the decoder). This is just an example so that you can use a similar process:

```python
import numpy as np
import torch

from kandinsky2 import get_kandinsky2

model = get_kandinsky2('cuda', task_type='text2img', model_version='2.1', use_flash_attention=False)

prompt = "red cat, 4k photo"
batch_size = 1
guidance_scale = 4
prior_cf_scale = 4
prior_steps = "5"
negative_prior_prompt = ""

# generate clip embeddings with the original codebase
image_emb = model.generate_clip_emb(
    prompt,
    batch_size=batch_size,
    prior_cf_scale=prior_cf_scale,
    prior_steps=prior_steps,
    negative_prior_prompt=negative_prior_prompt,
)
print(f"image_emb:{image_emb.shape},{image_emb.sum()}")

# diffusers
from diffusers import KandinskyPipeline, PriorTransformer
import diffusers

pipe_prior = KandinskyPipeline.from_pretrained("YiYiXu/test-kandinsky")
pipe_prior.to("cuda")

generator = torch.Generator(device="cuda").manual_seed(0)
image_emb_d = pipe_prior(
    prompt,
    generator=generator,
)
print(f"image_embeddings:{image_emb_d.shape},{image_emb_d.sum()}")

# compare the two results
print("compare results:")
print(np.max(np.abs(image_emb_d.detach().cpu().numpy() - image_emb.detach().cpu().numpy())))
```
Started a PR #3330 for adding the decoder; I was able to load the pretrained weights of the MOVQ model into the diffusers-based VQModel with minimal changes. Next I need to make sure the forward passes are also the same.
Okay, the outputs of the forward pass are within 1e-4 of each other for the MOVQ decoder and within 1e-5 for the MOVQ encoder, and the results look similar, so it should be okay. I can integrate it into the pipeline next. Meanwhile I added a PR for the weights in the diffusers model repo, @yiyixuxu.
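For anyone reproducing this kind of check, the comparison being described is roughly the following (the tensors here are placeholders for illustration; in practice `original_out` and `diffusers_out` come from running the original MOVQ and the converted VQModel on the same input):

```python
import numpy as np
import torch

# placeholder tensors standing in for the outputs of the original MOVQ decoder
# and the converted diffusers VQModel on the same input
original_out = torch.randn(1, 3, 64, 64)
diffusers_out = original_out + 1e-5 * torch.randn_like(original_out)

max_abs_diff = np.max(np.abs((original_out - diffusers_out).detach().cpu().numpy()))
print(f"max abs diff: {max_abs_diff:.2e}")

# decoder outputs matched within 1e-4, encoder outputs within 1e-5
assert torch.allclose(original_out, diffusers_out, atol=1e-4)
```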
Thanks for adding the decoder so fast! Super awesome job! 😇🤗👏👍 I think we can wrap up Kandinsky soon! A few tasks are left (ranked from easy to difficult based on my subjective judgment 😂) - let me know if you are interested in taking any of these. I will help you as much as you need, of course :)

```python
import torch
from transformers import CLIPVisionModelWithProjection

clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "openai/clip-vit-large-patch14", torch_dtype=torch.float16
).to("cuda")
```
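For reference, one way to exercise that image encoder and get projected CLIP image embeddings (the image path is a hypothetical placeholder; CLIPImageProcessor handles resizing and normalization):

```python
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")

# hypothetical local image, just for illustration
image = Image.open("cat.png").convert("RGB")
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    # .image_embeds is the projected CLIP image embedding the prior/decoder consume
    image_embeds = image_encoder(**inputs).image_embeds

print(image_embeds.shape)  # (1, 768) for ViT-L/14
```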
@@ -0,0 +1,77 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
Cool!
@yiyixuxu I can take up 1 and 2, and later help with 4 when the parts are ready to be combined in the pipeline. I'm not so familiar with how schedulers integrate into diffusers, so I'll leave 3 to you, but I definitely want to review it and learn how they fit into the pipeline.
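(For context on the scheduler item, here is a generic sketch of how a scheduler drives the denoising loop in diffusers, with a stand-in for the UNet; this is the general pattern, not the Kandinsky-specific code:)

```python
import torch
from diffusers import DDPMScheduler

# stand-in for the UNet: any callable returning a noise prediction with the latents' shape
def fake_unet(latents: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    return torch.zeros_like(latents)

scheduler = DDPMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps=10)

latents = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
for t in scheduler.timesteps:
    noise_pred = fake_unet(latents, t)
    # the scheduler owns the update rule that turns the noise prediction
    # into a slightly less noisy latent
    latents = scheduler.step(noise_pred, t, latents).prev_sample

print(latents.shape)
```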
This reverts commit fee1bba.
@ayushtues great!
@yiyixuxu where do you think we should put the MultilingualCLIP model, since it's not directly available in HF - should we add it in a separate file in […]?
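For reference, the M-CLIP checkpoints are essentially an XLM-Roberta encoder plus a linear projection with mask-weighted mean pooling, so wherever it ends up living, a wrapper along these lines should work (a sketch with assumed dimensions, not the final class):

```python
import torch
from transformers import XLMRobertaConfig, XLMRobertaModel


class MultilingualCLIP(torch.nn.Module):
    """Sketch of an M-CLIP-style text encoder: XLM-R plus a linear projection."""

    def __init__(self, config: XLMRobertaConfig, transformer_dim: int = 1024, proj_dim: int = 768):
        super().__init__()
        self.transformer = XLMRobertaModel(config)
        self.linear_transformation = torch.nn.Linear(transformer_dim, proj_dim)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        # last hidden states from XLM-R
        embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
        # mask-weighted mean pooling over tokens
        pooled = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
        # project to the CLIP text-embedding dimension used by the prior/decoder
        return self.linear_transformation(pooled), embs
```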
Meanwhile, I started another PR for tasks 1 and 2 - #3373
Co-authored-by: Sayak Paul <[email protected]>
Good to merge! |
Can this use ControlNet?
We should try training ControlNet on it! |
add kandinsky2.1

---------

Co-authored-by: yiyixuxu <yixu310@gmail,com>
Co-authored-by: Ayush Mangal <[email protected]>
Co-authored-by: ayushmangal <[email protected]>
Co-authored-by: Patrick von Platen <[email protected]>
Co-authored-by: Sayak Paul <[email protected]>
This PR adds Kandinsky 2.1 to diffusers.
#2985
original codebase: https://github.com/ai-forever/Kandinsky-2
to-do:
- prior_tokenizer, prior_text_encoder, prior_scheduler
- image_encoder, text_encoder, tokenizer
- use inpainting pipeline to add a hat
- image-to-image generation
- image mixing (see the sketch after this list)
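As a rough illustration of the image-mixing idea (placeholder names and shapes, not the final pipeline API): the mix boils down to interpolating the CLIP image embeddings before handing them to the decoder.

```python
import torch


def mix_image_embeds(emb_a: torch.Tensor, emb_b: torch.Tensor, alpha: float = 0.5) -> torch.Tensor:
    """Linearly interpolate two CLIP image embeddings; the blend is what the
    decoder conditions on instead of a single image/text embedding."""
    return alpha * emb_a + (1.0 - alpha) * emb_b


# hypothetical usage: in practice both embeddings come from the CLIP image encoder / prior
emb_a = torch.randn(1, 768)
emb_b = torch.randn(1, 768)
mixed = mix_image_embeds(emb_a, emb_b, alpha=0.3)
print(mixed.shape)
```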