Commit 42415a4

auto scale
1 parent: 703573f

File tree

common/common.cpp
convert_lora_to_gguf.py
include/llama.h
src/llama.cpp

4 files changed: +36 -16 lines changed

common/common.cpp

Lines changed: 4 additions & 1 deletion
@@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapter.emplace_back(argv[i], 0.0f);
         return true;
     }
     if (arg == "--lora-scaled") {
@@ -2089,6 +2089,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        if (lora_scale == 0.0f) {
+            lora_scale = llama_lora_adapter_get_default_scale(adapter);
+        }
         llama_lora_adapter_set(lctx, adapter, lora_scale);
     }

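In effect, --lora now records a sentinel scale of 0.0f instead of 1.0f, and llama_init_from_gpt_params replaces that sentinel with the adapter's stored default via the new llama_lora_adapter_get_default_scale; a non-zero scale passed through --lora-scaled is still applied unchanged.
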
convert_lora_to_gguf.py

Lines changed: 2 additions & 0 deletions
@@ -366,9 +366,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             lparams: dict[str, Any] = json.load(f)
 
         alpha = lparams["lora_alpha"]
+        rank = lparams["r"]
 
         model_instance.gguf_writer.add_string("training.type", "finetune_lora")
         model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha))
+        model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank))
 
         model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
         logger.info("Exporting model...")

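The value written here is the standard LoRA scaling factor, scale = lora_alpha / r. As an illustration (numbers assumed, not taken from this commit), an adapter exported with lora_alpha = 16 and r = 8 would get training.lora.scale = 2.0 in its GGUF metadata.
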
include/llama.h

Lines changed: 21 additions & 14 deletions
@@ -513,12 +513,33 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                            size_t len,
+                           int32_t n_embd,
+                           int32_t il_start,
+                           int32_t il_end);
+
+    //
+    // LoRA
+    //
+
     // Load a LoRA adapter from file
     // The loaded adapter will be associated to the given model, and will be free when the model is deleted
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
+    // Get default scale of an adapter
+    LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter);
+
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
     LLAMA_API int32_t llama_lora_adapter_set(
@@ -536,20 +557,6 @@ extern "C" {
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
-    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
-    // the currently loaded vector.
-    // n_embd should be the size of a single layer's control, and data should point
-    // to an n_embd x n_layers buffer starting from layer 1.
-    // il_start and il_end are the layer range the vector should apply to (both inclusive)
-    // See llama_control_vector_load in common to load a control vector.
-    LLAMA_API int32_t llama_control_vector_apply(
-            struct llama_context * lctx,
-                     const float * data,
-                            size_t len,
-                           int32_t n_embd,
-                           int32_t il_start,
-                           int32_t il_end);
-
     //
     // KV cache
     //

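For callers that use the C API directly rather than common.cpp, a minimal sketch of how the new getter fits in; model, lctx, requested_scale, and the adapter path are assumptions for illustration, and error handling is reduced to a NULL check:

// Hypothetical usage sketch of the new default-scale API.
struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
if (adapter != NULL) {
    float scale = requested_scale;    // caller-supplied override, if any
    if (scale == 0.0f) {              // 0.0f means "no explicit scale requested"
        scale = llama_lora_adapter_get_default_scale(adapter);  // alpha / r stored by the converter
    }
    llama_lora_adapter_set(lctx, adapter, scale);
}
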
src/llama.cpp

Lines changed: 9 additions & 1 deletion
@@ -380,6 +380,7 @@ enum llm_kv {
 
     LLM_KV_TRAINING_TYPE,
     LLM_KV_TRAINING_LORA_ALPHA,
+    LLM_KV_TRAINING_LORA_SCALE,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -476,6 +477,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_TRAINING_TYPE,       "training.type"       },
     { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" },
+    { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" },
 };
 
 struct LLM_KV {
@@ -2851,6 +2853,7 @@ struct llama_lora_adapter {
     std::vector<ggml_backend_buffer_t> bufs;
 
     float alpha;
+    float scale; // default scale
 
     llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
         base_model->lora_adapters.insert(this);
@@ -18578,7 +18581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
-    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
+    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
@@ -18615,6 +18618,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         }
 
         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA));
+        adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE));
     }
 
     int n_tensors = gguf_get_n_tensors(ctx_gguf);
@@ -18749,6 +18753,10 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     ggml_free(ctx);
 }
 
+float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) {
+    return adapter->scale;
+}
+
 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,

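Taken together: the converter now writes training.lora.scale (alpha / r) next to training.type and training.lora.alpha, the loader reads it into adapter.scale, and llama_lora_adapter_get_default_scale exposes it so that a scale of 0.0f from --lora can be resolved to the adapter's own default.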