Commit dde1fa1

[Misc] Improve BNB loader to handle mixture of sharded and merged weights with same suffix (#11566)
Signed-off-by: Isotr0py <[email protected]>
1 parent 0240402 commit dde1fa1

vllm/model_executor/model_loader/loader.py

Lines changed: 5 additions & 2 deletions
@@ -1001,8 +1001,11 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None:
                     for sub_name in sub_modules:
                         self.target_modules.append(
                             name.replace(last_name, sub_name))
-                else:
-                    self.target_modules.append(name)
+                # Add original module name even if the module has stacked map,
+                # in case model has a mixture of disk-merged and disk-splitted
+                # weights with same last name.
+                self.target_modules.append(name)
+
         assert (self.target_modules
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"
