[Pipeline] Wuerstchen v3 aka Stable Cascade pipeline #6487
Merged
Changes from 9 commits (123 commits in total)
- 6185da3 initial diffNext v3 (kashif)
- 6fd8639 move to v3 folder (kashif)
- 86e2bcd imports (kashif)
- e77db10 dry up the unets (kashif)
- 644dc5d Merge branch 'main' into wuerstchen-v3 (kashif)
- 1380b95 no switch_level (kashif)
- 2bca122 fix init (kashif)
- d4d0bc1 add switch_level tp config (kashif)
- 0db9e4d Fixed some things (dome272)
- 87e5577 Added pooled text embeddings (dome272)
- 38f9f35 Initial work on adding image encoder (dome272)
- dc3f47e changes from @dome272 (kashif)
- 5c6635f Stuff for the image encoder processing and variable naming in decoder (dome272)
- 3d41b2a fix arg name (kashif)
- 2012e71 inference fixes (dome272)
- add164a inference fixes (dome272)
- f6035c6 Merge branch 'main' into wuerstchen-v3 (kashif)
- edbd76b default TimestepBlock without conds (kashif)
- c5326fa c_skip=0 by default (kashif)
- 228f98c fix bfloat16 to cpu (kashif)
- b1e6db3 use config (kashif)
- 0fb4bf8 undo temp change (kashif)
- 834baba fix gen_c_embeddings args (kashif)
- fc361d2 change text encoding (dome272)
- 7632707 text encoding (dome272)
- bef887a undo print (kashif)
- 0816469 Merge branch 'main' into wuerstchen-v3 (kashif)
- b1413d5 undo .gitignore change (kashif)
- ae5967b Allow WuerstchenV3PriorPipeline to use the base DDPM & DDIM schedulers (pabloppp)
- 979ea12 use WuerstchenV3Unet in both pipelines (kashif)
- dc24bb4 Merge branch 'main' into wuerstchen-v3 (kashif)
- 966cdbc fix imports (kashif)
- d9a71df initial failing tests (kashif)
- af02b68 cleanup (kashif)
- e962671 use scheduler.timesterps (kashif)
- a1ecef2 some fixes to the tests, still not fully working (pabloppp)
- 331d0d3 fix tests (kashif)
- 7452985 fix prior tests (kashif)
- c0bb4ca add dropout to the model_kwargs (kashif)
- e01bc49 more tests passing (kashif)
- 17fed8c update expected_slice (kashif)
- 733ec02 initial rename (kashif)
- 021c3e2 rename tests (kashif)
- b2c615f rename class names (kashif)
- 3d5328e make fix-copies (kashif)
- 33a1af8 initial docs (kashif)
- a7040a2 autodocs (kashif)
- 8882633 typos (kashif)
- e63a312 fix arg docs (kashif)
- cdeb5da add text_encoder info (kashif)
- 72b87e7 combined pipeline has optional image arg (kashif)
- d929cdf Merge branch 'main' into wuerstchen-v3 (sayakpaul)
- c883cb2 fix documentation (sayakpaul)
- 66a17e1 Merge branch 'main' into wuerstchen-v3 (kashif)
- a3dc213 Merge branch 'main' into wuerstchen-v3 (kashif)
- 33b70f4 Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- cc10c29 Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- 6f5ed3d Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- bf3a972 Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- 5634ef3 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py (kashif)
- 3cf4c1b Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- b5e2ca9 use self.config (kashif)
- 9b525fd Update src/diffusers/pipelines/stable_cascade/modeling_stable_cascade… (kashif)
- 60efc49 c_in -> in_channels (kashif)
- cbd0775 removed kwargs from unet's forward (kashif)
- c1f72e3 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py (kashif)
- 7cb3838 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py (kashif)
- b3a80f7 remove older callback api (kashif)
- 519805f removed kwargs and fixed decoder guidance > 1 (kashif)
- 7698bf6 decoder takes emeds (kashif)
- 88633a9 check and use image_embeds (kashif)
- d68207b fixed all but one decoder test (kashif)
- 143df09 fix decoder tests (kashif)
- 2483df2 Merge branch 'main' into wuerstchen-v3 (kashif)
- f4a788b Merge branch 'main' into wuerstchen-v3 (kashif)
- 169db20 update callback api (kashif)
- 3cb0ec1 fix some more combined tests (kashif)
- 84f4f3d push combined pipeline (kashif)
- 4f69a51 initial docs (kashif)
- 7dcbdc6 fix doc_string (kashif)
- 4f5dffb update combined api (kashif)
- 4753b99 no test_callback_inputs test for combined pipeline (kashif)
- adec75f add optional components (kashif)
- 2e877d2 fix ordering of components (kashif)
- e956f3e fix combined tests (kashif)
- f18ff23 update convert script (kashif)
- 3ff5120 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade… (kashif)
- 979fed0 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade… (kashif)
- 72cf605 Update src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade… (kashif)
- b2e0f06 fix imports (kashif)
- 4c33b8a move effnet out of deniosing loop (kashif)
- 9785210 prompt_embeds_pooled only when doing guidance (kashif)
- 25ecc81 Fix repeat shape (99991)
- 1b171b6 Merge pull request #2 from 99991/wuerstchen-v3 (kashif)
- 4914e04 move StableCascadeUnet to models/unets/ (kashif)
- 8c2e479 more descriptive names (kashif)
- 2d1f438 Merge branch 'main' into wuerstchen-v3 (kashif)
- 871387e converted when numpy() (kashif)
- 85fb15c StableCascadePriorPipelineOutput docs (kashif)
- 72249af rename StableCascadeUNet (kashif)
- 6767b29 add slow tests (kashif)
- cb7f47c fix slow tests (kashif)
- 7ff8828 Merge branch 'main' into wuerstchen-v3 (kashif)
- 748ab08 update (DN6)
- 3ad7516 update (DN6)
- e7434ff updated model_path (kashif)
- ac716ab add args for weights (kashif)
- 13e9812 set push_to_hub to false (kashif)
- b6d3b6f update (DN6)
- a07623f update (DN6)
- a487d16 Merge branch 'wuerstchen-v3' of https://github.com/kashif/diffusers i… (DN6)
- c6a5537 update (DN6)
- e505de1 update (DN6)
- a2a5060 update (DN6)
- 3326dee update (DN6)
- 11eac5f update (DN6)
- 2c226cd update (DN6)
- d3e8cef update (DN6)
- df5ed03 update (DN6)
- 8e74e09 update (DN6)
- c1cd769 update (DN6)
- ceedcc4 update (DN6)
- 8dd88f0 update (DN6)
Conversion script (new file, 126 lines):
# Run inside root directory of official source code: https://github.com/dome272/wuerstchen/
import os

import torch
from transformers import AutoTokenizer, CLIPTextModel, CLIPVisionModelWithProjection

# from vqgan import VQModel

from diffusers import (
    DDPMWuerstchenScheduler,
    WuerstchenV3CombinedPipeline,
    WuerstchenV3DecoderPipeline,
    WuerstchenV3PriorPipeline,
)
from diffusers.pipelines.wuerstchen import PaellaVQModel
from diffusers.pipelines.wuerstchen3 import WuerstchenV3DiffNeXt, WuerstchenV3Prior


model_path = "../Wuerstchen/"
device = "cpu"

# paella_vqmodel = VQModel()
# state_dict = torch.load(os.path.join(model_path, "vqgan_f4_v1_500k.pt"), map_location=device)["state_dict"]
# paella_vqmodel.load_state_dict(state_dict)

# state_dict["vquantizer.embedding.weight"] = state_dict["vquantizer.codebook.weight"]
# state_dict.pop("vquantizer.codebook.weight")
# vqmodel = PaellaVQModel(num_vq_embeddings=paella_vqmodel.codebook_size, latent_channels=paella_vqmodel.c_latent)
# vqmodel.load_state_dict(state_dict)

# # Clip Text encoder and tokenizer
# text_encoder = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
# tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")

# # Generator
# clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14").to("cpu")

# Decoder: split fused attention projections and rename keys to diffusers conventions
orig_state_dict = torch.load(os.path.join(model_path, "base_120k.pt"), map_location=device)
state_dict = {}
for key in orig_state_dict.keys():
    if key.endswith("in_proj_weight"):
        weights = orig_state_dict[key].chunk(3, 0)
        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
    elif key.endswith("in_proj_bias"):
        weights = orig_state_dict[key].chunk(3, 0)
        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
    elif key.endswith("out_proj.weight"):
        weights = orig_state_dict[key]
        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
    elif key.endswith("out_proj.bias"):
        weights = orig_state_dict[key]
        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
    # rename clip_mapper to clip_txt_pooled_mapper
    elif key.endswith("clip_mapper.weight"):
        weights = orig_state_dict[key]
        state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights
    elif key.endswith("clip_mapper.bias"):
        weights = orig_state_dict[key]
        state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights
    else:
        state_dict[key] = orig_state_dict[key]
decoder = WuerstchenV3DiffNeXt().to(device)
decoder.load_state_dict(state_dict)


# Prior: same attention-key conversion as the decoder above
orig_state_dict = torch.load(os.path.join(model_path, "v1.pt"), map_location=device)
state_dict = {}
for key in orig_state_dict.keys():
    if key.endswith("in_proj_weight"):
        weights = orig_state_dict[key].chunk(3, 0)
        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
    elif key.endswith("in_proj_bias"):
        weights = orig_state_dict[key].chunk(3, 0)
        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
    elif key.endswith("out_proj.weight"):
        weights = orig_state_dict[key]
        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
    elif key.endswith("out_proj.bias"):
        weights = orig_state_dict[key]
        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
    else:
        state_dict[key] = orig_state_dict[key]
prior_model = WuerstchenV3Prior().to(device)
prior_model.load_state_dict(state_dict)

import pdb

pdb.set_trace()

# # scheduler
# scheduler = DDPMWuerstchenScheduler()
#
# # Prior pipeline
# prior_pipeline = WuerstchenPriorPipeline(
#     prior=prior_model, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler
# )
#
# prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")
#
# decoder_pipeline = WuerstchenDecoderPipeline(
#     text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=decoder, scheduler=scheduler
# )
# decoder_pipeline.save_pretrained("warp-ai/wuerstchen")
#
# # Wuerstchen pipeline
# wuerstchen_pipeline = WuerstchenCombinedPipeline(
#     # Decoder
#     text_encoder=gen_text_encoder,
#     tokenizer=gen_tokenizer,
#     decoder=decoder,
#     scheduler=scheduler,
#     vqgan=vqmodel,
#     # Prior
#     prior_tokenizer=tokenizer,
#     prior_text_encoder=text_encoder,
#     prior=prior_model,
#     prior_scheduler=scheduler,
# )
# wuerstchen_pipeline.save_pretrained("warp-ai/WuerstchenCombinedPipeline")
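The key-renaming loops above hinge on one detail: the original checkpoints store the attention q, k, and v projections fused into a single `in_proj_weight`, which `chunk(3, 0)` splits into three equal parts along dim 0. A minimal toy sketch of that step (hypothetical key name and tiny embedding dim, chosen only for illustration):

```python
import torch

# Hypothetical toy checkpoint: a fused attention projection for embedding
# dim 4 stores q, k, v stacked along dim 0 as one (3*4, 4) matrix.
embed_dim = 4
orig_state_dict = {
    "blocks.0.attn.in_proj_weight": torch.arange(
        3 * embed_dim * embed_dim, dtype=torch.float32
    ).reshape(3 * embed_dim, embed_dim)
}

state_dict = {}
for key, value in orig_state_dict.items():
    if key.endswith("in_proj_weight"):
        # chunk(3, 0) splits the stacked (3*d, d) matrix into equal q, k, v parts
        q, k, v = value.chunk(3, 0)
        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = q
        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = k
        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = v
    else:
        state_dict[key] = value

# the fused key is replaced by three per-projection (d, d) keys
print(sorted(state_dict))
```

Each chunk is a view of rows `[i*d, (i+1)*d)` of the fused matrix, so `to_k.weight` ends up holding rows `d` through `2d-1`, matching the layout PyTorch's fused `MultiheadAttention` uses.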
New module init (54 lines), wiring up lazy imports for the wuerstchen3 pipelines:
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


_dummy_objects = {}
_import_structure = {}

try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["modeling_wuerstchen3_diffnext"] = ["WuerstchenV3DiffNeXt"]
    _import_structure["modeling_wuerstchen3_prior"] = ["WuerstchenV3Prior"]
    _import_structure["pipeline_wuerstchen3"] = ["WuerstchenV3DecoderPipeline"]
    _import_structure["pipeline_wuerstchen3_combined"] = ["WuerstchenV3CombinedPipeline"]
    _import_structure["pipeline_wuerstchen3_prior"] = ["WuerstchenV3PriorPipeline"]


if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .modeling_wuerstchen3_diffnext import WuerstchenV3DiffNeXt
        from .modeling_wuerstchen3_prior import WuerstchenV3Prior
        from .pipeline_wuerstchen3 import WuerstchenV3DecoderPipeline
        from .pipeline_wuerstchen3_combined import WuerstchenV3CombinedPipeline
        from .pipeline_wuerstchen3_prior import WuerstchenV3PriorPipeline
else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
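The `_import_structure` dict maps each submodule to the names it exports, and `_LazyModule` defers the actual imports until an attribute is first accessed. A minimal, self-contained sketch of that idea (this `LazyModule` class is a hypothetical simplification, not diffusers' actual `_LazyModule`, and the stdlib `json` module stands in for a real submodule):

```python
import importlib
import sys
import types

class LazyModule(types.ModuleType):
    """Simplified lazy module: imports a submodule only on first attribute access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # invert {module: [attrs]} into {attr: module} for direct lookup
        self._attr_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, name):
        # only called when normal attribute lookup fails, i.e. before first access
        if name not in self._attr_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
        module = importlib.import_module(self._attr_to_module[name])
        value = getattr(module, name)
        setattr(self, name, value)  # cache: later accesses bypass __getattr__
        return value

# register under a made-up name; "json" plays the role of a heavy submodule
lazy = LazyModule("lazy_demo", {"json": ["dumps", "loads"]})
sys.modules["lazy_demo"] = lazy
print(lazy.dumps({"a": 1}))
```

The payoff is that `import diffusers.pipelines.wuerstchen3` stays cheap: torch- and transformers-backed classes are only materialized when someone actually touches them, which is why the `TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT` branch exists for tools that need the real symbols eagerly.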