 import safetensors

 if is_transformers_available():
-    from transformers import CLIPTextModel, PreTrainedModel, PreTrainedTokenizer
+    from transformers import CLIPTextModel, CLIPTextModelWithProjection, PreTrainedModel, PreTrainedTokenizer

 if is_accelerate_available():
     from accelerate import init_empty_weights

@@ -128,6 +128,8 @@ def text_encoder_mlp_modules(text_encoder):
             mlp_mod = layer.mlp
             name = f"text_model.encoder.layers.{i}.mlp"
             mlp_modules.append((name, mlp_mod))
+    elif isinstance(text_encoder, CLIPTextModelWithProjection):
+        pass  # SDXL is not supported yet.
     else:
         raise ValueError(f"do not know how to get mlp modules for: {text_encoder.__class__.__name__}")

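For context, `text_encoder_mlp_modules` walks the CLIP text encoder and returns its MLP blocks together with their dotted names; the new `elif` simply acknowledges `CLIPTextModelWithProjection` (the SDXL text encoder) without collecting anything yet. A minimal sketch of the helper's behaviour, assuming a standard `transformers` CLIPTextModel (the checkpoint id below is only an example):

    from transformers import CLIPTextModel

    def collect_mlp_modules(text_encoder):
        # Mirrors the helper above: (dotted name, module) pairs for every
        # MLP block of a CLIPTextModel's transformer encoder.
        mlp_modules = []
        for i, layer in enumerate(text_encoder.text_model.encoder.layers):
            name = f"text_model.encoder.layers.{i}.mlp"
            mlp_modules.append((name, layer.mlp))
        return mlp_modules

    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
    for name, mlp in collect_mlp_modules(text_encoder):
        print(name, type(mlp).__name__)  # e.g. text_model.encoder.layers.0.mlp CLIPMLP
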
@@ -1128,21 +1130,12 @@ def load_lora_into_text_encoder(cls, state_dict, network_alpha, text_encoder, lo
                         f"{name}.out_proj.lora_linear_layer.down.weight"
                     ] = text_encoder_lora_state_dict.pop(f"{name}.to_out_lora.down.weight")

-            for name, _ in text_encoder_mlp_modules(text_encoder):
-                for direction in ["up", "down"]:
-                    for layer in ["fc1", "fc2"]:
-                        original_key = f"{name}.{layer}.lora.{direction}.weight"
-                        replacement_key = f"{name}.{layer}.lora_linear_layer.{direction}.weight"
-                        if original_key in text_encoder_lora_state_dict:
-                            text_encoder_lora_state_dict[replacement_key] = text_encoder_lora_state_dict.pop(
-                                original_key
-                            )
-
             rank = text_encoder_lora_state_dict[
                 "text_model.encoder.layers.0.self_attn.out_proj.lora_linear_layer.up.weight"
             ].shape[1]
+            patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys())

-            cls._modify_text_encoder(text_encoder, lora_scale, network_alpha, rank=rank)
+            cls._modify_text_encoder(text_encoder, lora_scale, network_alpha, rank=rank, patch_mlp=patch_mlp)

             # set correct dtype & device
             text_encoder_lora_state_dict = {

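The new `patch_mlp` flag is derived purely from the keys of the already-converted LoRA state dict: if any key touches an `.mlp.` layer, the MLPs get patched as well; otherwise only the attention projections are. A small illustration with placeholder tensors (the keys are hypothetical, the shapes just show where the rank is read from):

    import torch

    text_encoder_lora_state_dict = {
        "text_model.encoder.layers.0.self_attn.out_proj.lora_linear_layer.up.weight": torch.zeros(768, 4),
        "text_model.encoder.layers.0.mlp.fc1.lora_linear_layer.down.weight": torch.zeros(4, 768),
    }

    rank = text_encoder_lora_state_dict[
        "text_model.encoder.layers.0.self_attn.out_proj.lora_linear_layer.up.weight"
    ].shape[1]
    patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys())
    print(rank, patch_mlp)  # 4 True
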
@@ -1187,6 +1180,7 @@ def _modify_text_encoder(
         network_alpha=None,
         rank=4,
         dtype=None,
+        patch_mlp=False,
     ):
         r"""
         Monkey-patches the forward passes of attention modules of the text encoder.

@@ -1218,12 +1212,17 @@
             )
             lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters())

-        for _, mlp_module in text_encoder_mlp_modules(text_encoder):
-            mlp_module.fc1 = PatchedLoraProjection(mlp_module.fc1, lora_scale, network_alpha, rank=rank, dtype=dtype)
-            lora_parameters.extend(mlp_module.fc1.lora_linear_layer.parameters())
+        if patch_mlp:
+            for _, mlp_module in text_encoder_mlp_modules(text_encoder):
+                mlp_module.fc1 = PatchedLoraProjection(
+                    mlp_module.fc1, lora_scale, network_alpha, rank=rank, dtype=dtype
+                )
+                lora_parameters.extend(mlp_module.fc1.lora_linear_layer.parameters())

-            mlp_module.fc2 = PatchedLoraProjection(mlp_module.fc2, lora_scale, network_alpha, rank=rank, dtype=dtype)
-            lora_parameters.extend(mlp_module.fc2.lora_linear_layer.parameters())
+                mlp_module.fc2 = PatchedLoraProjection(
+                    mlp_module.fc2, lora_scale, network_alpha, rank=rank, dtype=dtype
+                )
+                lora_parameters.extend(mlp_module.fc2.lora_linear_layer.parameters())

         return lora_parameters

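`PatchedLoraProjection` itself is not part of this diff; conceptually it wraps an existing `nn.Linear` and adds a scaled low-rank update on top of the frozen projection, which is why `fc1`/`fc2` can be swapped in place. A rough, simplified sketch of that idea (not the actual diffusers class, and without the network_alpha/dtype handling):

    import torch.nn as nn

    class LoRALinearSketch(nn.Module):
        # Stand-in for the lora_linear_layer referenced above: a rank-r bottleneck
        # (down: in -> rank, up: rank -> out), with up initialised to zero so the
        # patched module initially behaves exactly like the original one.
        def __init__(self, in_features, out_features, rank=4):
            super().__init__()
            self.down = nn.Linear(in_features, rank, bias=False)
            self.up = nn.Linear(rank, out_features, bias=False)
            nn.init.normal_(self.down.weight, std=1.0 / rank)
            nn.init.zeros_(self.up.weight)

        def forward(self, hidden_states):
            return self.up(self.down(hidden_states))

    class PatchedProjectionSketch(nn.Module):
        # Rough analogue of PatchedLoraProjection: keep the original linear and
        # add lora_scale * lora_linear_layer(x) to its output.
        def __init__(self, regular_linear_layer, lora_scale=1.0, rank=4):
            super().__init__()
            self.regular_linear_layer = regular_linear_layer
            self.lora_scale = lora_scale
            self.lora_linear_layer = LoRALinearSketch(
                regular_linear_layer.in_features, regular_linear_layer.out_features, rank=rank
            )

        def forward(self, hidden_states):
            return self.regular_linear_layer(hidden_states) + self.lora_scale * self.lora_linear_layer(
                hidden_states
            )

    # Patching a toy MLP the way the hunk above does it:
    fc1 = PatchedProjectionSketch(nn.Linear(768, 3072), lora_scale=1.0, rank=4)
    fc2 = PatchedProjectionSketch(nn.Linear(3072, 768), lora_scale=1.0, rank=4)
    lora_parameters = list(fc1.lora_linear_layer.parameters()) + list(fc2.lora_linear_layer.parameters())
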
@@ -1363,6 +1362,9 @@ def _convert_kohya_lora_to_diffusers(cls, state_dict):
                     te_state_dict[diffusers_name] = value
                     te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
                 elif "mlp" in diffusers_name:
+                    # Be aware that this is the new diffusers convention and the rest of the code might
+                    # not utilize it yet.
+                    diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
                     te_state_dict[diffusers_name] = value
                     te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]

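The added rename only touches the MLP entries of the converted text-encoder state dict, moving them from the old `.lora.` naming to the `.lora_linear_layer.` convention expected by the patched modules. A quick before/after on an illustrative key:

    # Illustrative key as it looks after the kohya -> diffusers renaming earlier in the function.
    diffusers_name = "text_model.encoder.layers.0.mlp.fc1.lora.down.weight"

    if "mlp" in diffusers_name:
        # New diffusers convention for MLP LoRA layers.
        diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")

    print(diffusers_name)
    # text_model.encoder.layers.0.mlp.fc1.lora_linear_layer.down.weight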