
Commit f712f56

sayakpaul and patrickvonplaten authored and Jimmy committed
[LoRA] Fix SDXL text encoder LoRAs (huggingface#4371)
* temporarily disable text encoder loras. * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debugging * debbuging. * modify doc. * rename tests. * print slices. * fix: assertions * Apply suggestions from code review Co-authored-by: Patrick von Platen <[email protected]> --------- Co-authored-by: Patrick von Platen <[email protected]>
1 parent d3feaee commit f712f56

3 files changed, +37 -29 lines changed


docs/source/en/training/lora.md

Lines changed: 0 additions & 1 deletion
@@ -401,5 +401,4 @@ Thanks to [@isidentical](https://github.com/isidentical) for helping us on integ
 
 ### Known limitations specific to the Kohya-styled LoRAs
 
-* SDXL LoRAs that have both the text encoders are currently leading to weird results. We're actively investigating the issue.
 * When images don't looks similar to other UIs such ComfyUI, it can be beacause of multiple reasons as explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736).
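
The bullet removed above is the limitation this commit lifts: Kohya-style SDXL LoRAs that ship text encoder weights can now be loaded end to end. A minimal sketch of that flow (the LoRA filename is hypothetical; `load_lora_weights` routes UNet keys through `load_attn_procs` and text encoder keys through `load_lora_into_text_encoder`):

import torch
from diffusers import StableDiffusionXLPipeline

# Base SDXL pipeline; fp16 keeps memory manageable on a single GPU.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Hypothetical Kohya-trained checkpoint containing UNet *and* text encoder LoRA layers.
pipe.load_lora_weights("path/to/kohya_sdxl_lora.safetensors")

image = pipe("masterpiece, best quality, mountain", num_inference_steps=30).images[0]
image.save("lora_sample.png")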

src/diffusers/loaders.py

Lines changed: 35 additions & 26 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import os
 import re
 import warnings
@@ -258,6 +259,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
         # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
         network_alphas = kwargs.pop("network_alphas", None)
+        is_network_alphas_none = network_alphas is None
 
         if use_safetensors and not is_safetensors_available():
             raise ValueError(
@@ -349,13 +351,20 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
 
                 # Create another `mapped_network_alphas` dictionary so that we can properly map them.
                 if network_alphas is not None:
-                    for k in network_alphas:
+                    network_alphas_ = copy.deepcopy(network_alphas)
+                    for k in network_alphas_:
                         if k.replace(".alpha", "") in key:
-                            mapped_network_alphas.update({attn_processor_key: network_alphas[k]})
+                            mapped_network_alphas.update({attn_processor_key: network_alphas.pop(k)})
+
+            if not is_network_alphas_none:
+                if len(network_alphas) > 0:
+                    raise ValueError(
+                        f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+                    )
 
             if len(state_dict) > 0:
                 raise ValueError(
-                    f"The state_dict has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
+                    f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
                 )
 
             for key, value_dict in lora_grouped_dict.items():
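
The hunk above switches the alpha mapping from a plain read (`network_alphas[k]`) to a consuming `pop`, then asserts that nothing is left once the mapping loop finishes. A standalone sketch of that consume-and-verify pattern, with invented keys purely for illustration:

import copy

def map_alphas(network_alphas, module_keys):
    # Toy version of the mapping loop: move every alpha whose base name matches
    # a module key into a per-module dict, popping entries as they are consumed.
    mapped = {}
    remaining = copy.deepcopy(network_alphas)
    for module_key in module_keys:
        for alpha_key in list(remaining):
            if alpha_key.replace(".alpha", "") in module_key:
                mapped[module_key] = remaining.pop(alpha_key)
    # Leftovers mean an alpha was never matched to a module -- fail loudly,
    # in the spirit of the new check in `load_attn_procs`.
    if remaining:
        raise ValueError(f"Unconsumed network_alphas keys: {', '.join(remaining)}")
    return mapped

# Invented keys, only to exercise the helper.
alphas = {"down_blocks.0.attn1.to_q.alpha": 8.0}
print(map_alphas(alphas, ["down_blocks.0.attn1.to_q.lora.down.weight"]))
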
@@ -434,14 +443,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                         v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
                         out_rank=rank_mapping.get("to_out_lora.down.weight"),
                         out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
-                        # rank=rank_mapping.get("to_k_lora.down.weight", None),
-                        # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
-                        # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
-                        # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
-                        # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
-                        # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
-                        # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
-                        # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
                     )
                 else:
                     attn_processors[key] = attn_processor_class(
@@ -496,9 +497,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         # set ff layers
         for target_module, lora_layer in non_attn_lora_layers:
             target_module.set_lora_layer(lora_layer)
-            # It should raise an error if we don't have a set lora here
-            # if hasattr(target_module, "set_lora_layer"):
-            #     target_module.set_lora_layer(lora_layer)
 
     def save_attn_procs(
         self,
@@ -1251,9 +1249,10 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
         keys = list(state_dict.keys())
         prefix = cls.text_encoder_name if prefix is None else prefix
 
+        # Safe prefix to check with.
         if any(cls.text_encoder_name in key for key in keys):
             # Load the layers corresponding to text encoder and make necessary adjustments.
-            text_encoder_keys = [k for k in keys if k.startswith(prefix)]
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
             text_encoder_lora_state_dict = {
                 k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
             }
@@ -1303,6 +1302,14 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
                 ].shape[1]
                 patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys())
 
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
                 cls._modify_text_encoder(
                     text_encoder,
                     lora_scale,
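
The added filter mirrors the stricter `text_encoder_keys` selection in the earlier hunk: only alphas whose first dot-separated segment equals the prefix (e.g. `text_encoder`, presumably so `text_encoder_2` entries from SDXL's second text encoder are not swept in) survive, and the prefix is stripped before the dict reaches `_modify_text_encoder`. A small sketch of that filtering, with invented keys:

def filter_by_prefix(d, prefix):
    # Keep only keys whose first dot-separated segment is exactly `prefix`,
    # then strip the prefix so downstream code sees module-relative names.
    keys = [k for k in d if k.startswith(prefix) and k.split(".")[0] == prefix]
    return {k.replace(f"{prefix}.", ""): v for k, v in d.items() if k in keys}

# Invented mixed dict: UNet alphas must not leak into the text encoder pass.
network_alphas = {
    "text_encoder.text_model.encoder.layers.0.self_attn.to_q_lora.down.weight.alpha": 8.0,
    "unet.down_blocks.0.attentions.0.alpha": 4.0,
}
print(filter_by_prefix(network_alphas, "text_encoder"))
# -> {'text_model.encoder.layers.0.self_attn.to_q_lora.down.weight.alpha': 8.0}
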
@@ -1364,12 +1371,13 @@ def _modify_text_encoder(
 
         lora_parameters = []
         network_alphas = {} if network_alphas is None else network_alphas
+        is_network_alphas_populated = len(network_alphas) > 0
 
         for name, attn_module in text_encoder_attn_modules(text_encoder):
-            query_alpha = network_alphas.get(name + ".k.proj.alpha")
-            key_alpha = network_alphas.get(name + ".q.proj.alpha")
-            value_alpha = network_alphas.get(name + ".v.proj.alpha")
-            proj_alpha = network_alphas.get(name + ".out.proj.alpha")
+            query_alpha = network_alphas.pop(name + ".to_q_lora.down.weight.alpha", None)
+            key_alpha = network_alphas.pop(name + ".to_k_lora.down.weight.alpha", None)
+            value_alpha = network_alphas.pop(name + ".to_v_lora.down.weight.alpha", None)
+            out_alpha = network_alphas.pop(name + ".to_out_lora.down.weight.alpha", None)
 
             attn_module.q_proj = PatchedLoraProjection(
                 attn_module.q_proj, lora_scale, network_alpha=query_alpha, rank=rank, dtype=dtype
@@ -1387,14 +1395,14 @@ def _modify_text_encoder(
             lora_parameters.extend(attn_module.v_proj.lora_linear_layer.parameters())
 
             attn_module.out_proj = PatchedLoraProjection(
-                attn_module.out_proj, lora_scale, network_alpha=proj_alpha, rank=rank, dtype=dtype
+                attn_module.out_proj, lora_scale, network_alpha=out_alpha, rank=rank, dtype=dtype
             )
             lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters())
 
         if patch_mlp:
             for name, mlp_module in text_encoder_mlp_modules(text_encoder):
-                fc1_alpha = network_alphas.get(name + ".fc1.alpha")
-                fc2_alpha = network_alphas.get(name + ".fc2.alpha")
+                fc1_alpha = network_alphas.pop(name + ".fc1.lora_linear_layer.down.weight.alpha")
+                fc2_alpha = network_alphas.pop(name + ".fc2.lora_linear_layer.down.weight.alpha")
 
                 mlp_module.fc1 = PatchedLoraProjection(
                     mlp_module.fc1, lora_scale, network_alpha=fc1_alpha, rank=rank, dtype=dtype
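
With the renamed lookups, `_modify_text_encoder` now expects alphas keyed by the full diffusers-style LoRA weight name plus `.alpha`, and it pops them so the emptiness check added in the next hunk can fire on anything left behind. Purely as an illustration (the layer index is invented), the per-module keys it looks for have this shape:

# Hypothetical module names in the shape yielded by text_encoder_attn_modules /
# text_encoder_mlp_modules for a CLIP text encoder.
attn_name = "text_model.encoder.layers.0.self_attn"
mlp_name = "text_model.encoder.layers.0.mlp"

expected_alpha_keys = [
    attn_name + ".to_q_lora.down.weight.alpha",
    attn_name + ".to_k_lora.down.weight.alpha",
    attn_name + ".to_v_lora.down.weight.alpha",
    attn_name + ".to_out_lora.down.weight.alpha",
    mlp_name + ".fc1.lora_linear_layer.down.weight.alpha",
    mlp_name + ".fc2.lora_linear_layer.down.weight.alpha",
]
print("\n".join(expected_alpha_keys))
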
@@ -1406,6 +1414,11 @@ def _modify_text_encoder(
                 )
                 lora_parameters.extend(mlp_module.fc2.lora_linear_layer.parameters())
 
+        if is_network_alphas_populated and len(network_alphas) > 0:
+            raise ValueError(
+                f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+            )
+
         return lora_parameters
 
     @classmethod
@@ -1519,10 +1532,6 @@ def _convert_kohya_lora_to_diffusers(cls, state_dict):
             lora_name_up = lora_name + ".lora_up.weight"
             lora_name_alpha = lora_name + ".alpha"
 
-            # if lora_name_alpha in state_dict:
-            #     alpha = state_dict.pop(lora_name_alpha).item()
-            #     network_alphas.update({lora_name_alpha: alpha})
-
             if lora_name.startswith("lora_unet_"):
                 diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
 

tests/models/test_lora_layers.py

Lines changed: 2 additions & 2 deletions
@@ -737,7 +737,7 @@ def test_a1111(self):
         ).images
 
         images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.3725, 0.3767, 0.3761, 0.3796, 0.3827, 0.3763, 0.3831, 0.3809, 0.3392])
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
 
         self.assertTrue(np.allclose(images, expected, atol=1e-4))
 
@@ -760,7 +760,7 @@ def test_vanilla_funetuning(self):
 
         self.assertTrue(np.allclose(images, expected, atol=1e-4))
 
-    def test_unload_lora(self):
+    def test_unload_kohya_lora(self):
         generator = torch.manual_seed(0)
         prompt = "masterpiece, best quality, mountain"
         num_inference_steps = 2
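
Both test tweaks follow the suite's regression pattern: run a short generation, fingerprint the output by slicing a 3x3 corner of the last channel, and compare against hard-coded reference values with a tight tolerance. A generic sketch of that check (the arrays here are placeholders, not the real expectations):

import numpy as np

def check_image_slice(images, expected, atol=1e-4):
    # `images` is a pipeline output of shape (batch, height, width, channels);
    # the bottom-right 3x3 patch of the last channel is a cheap fingerprint.
    actual = images[0, -3:, -3:, -1].flatten()
    assert np.allclose(actual, expected, atol=atol), f"{actual} vs {expected}"

# Placeholder data only -- the real tests compare against the updated values above.
dummy = np.zeros((1, 64, 64, 3), dtype=np.float32)
check_image_slice(dummy, np.zeros(9, dtype=np.float32))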
