Add multiple derived adaptions hosting #8415

Closed · wants to merge 6 commits
27 changes: 27 additions & 0 deletions common/common.cpp
@@ -654,6 +654,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.hf_file = argv[i];
return true;
}
if (arg == "-mpa" || arg == "--model-path-alias") {
CHECK_ARG
std::string model_derived_alias = argv[i];
size_t equals_pos = model_derived_alias.find('=');
if (equals_pos != std::string::npos) {
std::string alias = model_derived_alias.substr(0, equals_pos);
std::string model_path = model_derived_alias.substr(equals_pos + 1);
params.derived_model_paths.emplace_back(alias, model_path);
} else {
fprintf(stderr, "error: -mpa expects ALIAS=PATH\n");
invalid_param = true;
}

return true;
}
if (arg == "--lora") {
CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
@@ -2045,6 +2057,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
}

for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
const auto & derived_model_path = params.derived_model_paths[i];
const std::string & derived_model_name = std::get<0>(derived_model_path);
const std::string & derived_model_file = std::get<1>(derived_model_path);

llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);

if (derived_model == NULL) {
fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
continue; // do not register a model that failed to load
}

llama_model_set_name(derived_model, derived_model_name.c_str());
llama_ctx_set_derived_model(lctx, derived_model);
}

for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
3 changes: 3 additions & 0 deletions common/common.h
@@ -124,6 +124,9 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;

// derived model paths as (alias, path) pairs
std::vector<std::tuple<std::string, std::string>> derived_model_paths;

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(multi-adaptation)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
89 changes: 76 additions & 13 deletions examples/gguf-split/gguf-split.cpp
@@ -32,8 +32,10 @@ struct split_params {
int n_split_tensors = 128;
std::string input;
std::string output;
std::string tensor_set_file;
bool no_tensor_first_split = false;
bool dry_run = false;
bool customized_split = false;
};

static void split_print_usage(const char * executable) {
@@ -47,6 +49,7 @@ static void split_print_usage(const char * executable) {
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --split split GGUF to multiple GGUF (enabled by default)\n");
printf(" --tensor-set customize tensor set used to split. File contains modules, e.g. 'ffn_up.weight'");
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
@@ -121,6 +124,16 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
params.operation = SPLIT_OP_SPLIT;
}

if (arg == "--tensor-set") {
arg_found = true;
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
params.tensor_set_file = argv[arg_idx];
params.customized_split = true;
}

if (is_mode_set) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
@@ -180,12 +193,27 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

static std::vector<std::string> read_customized_tensors(const std::string & tensor_set_file) {
std::vector<std::string> tensor_set;
std::ifstream f_tensor_set(tensor_set_file);
if (!f_tensor_set.is_open()) {
fprintf(stderr, "error: failed to open tensor set file %s\n", tensor_set_file.c_str());
exit(EXIT_FAILURE);
}
std::string line;
while (std::getline(f_tensor_set, line)) {
if (!line.empty() && line.back() == '\r') line.pop_back(); // tolerate CRLF tensor set files
if (line.empty()) continue; // skip blank lines: strstr("") would match every tensor
tensor_set.push_back(line);
}
return tensor_set;
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_meta = NULL;
const int n_tensors;
std::string tensor_set_file;

// one ctx_out per one output file
std::vector<struct gguf_context *> ctx_outs;
@@ -233,20 +261,45 @@ struct split_strategy {
new_ctx_out(true);
}

// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
if (!params.customized_split) {
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
}
gguf_add_tensor(ctx_out, t);
}
} else {
// custom split based on tensor set
std::vector<std::string> tensor_set = read_customized_tensors(params.tensor_set_file);
if(tensor_set.empty()) {
fprintf(stderr, "error: tensor set is empty\n");
exit(EXIT_FAILURE);
}
for (int i = 0; i < n_tensors; ++i) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
if (is_tensor_in_customized_set(t_name, tensor_set)) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
}
new_ctx_out(false);
// add left tensors to the next split
for (int i = 0; i < n_tensors; ++i) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
if (!is_tensor_in_customized_set(t_name, tensor_set)) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
}
gguf_add_tensor(ctx_out, t);
}

// push the last ctx_out
@@ -274,6 +327,16 @@
}
}

bool is_tensor_in_customized_set(const char * t_name, const std::vector<std::string> & tensor_set) {
for (auto & s : tensor_set) {
if (strstr(t_name, s.c_str()) != NULL) {
return true;
}
}

return false;
}

void print_info() {
printf("n_split: %ld\n", ctx_outs.size());
int i_split = 0;
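
For reference, the file consumed by `--tensor-set` is read line by line, and each line is matched against tensor names as a substring. A hypothetical tensor set file that routes the FFN weights into the first split could look like the listing below; the exact module list is only an illustration, not part of this change:

```
ffn_up.weight
ffn_down.weight
ffn_gate.weight
```
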
5 changes: 5 additions & 0 deletions examples/multi-adaptation/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama_multi-adaptation)
add_executable(${TARGET} multi-adaptation.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
33 changes: 33 additions & 0 deletions examples/multi-adaptation/README.md
@@ -0,0 +1,33 @@
# Server Multi Adaptations for Different Scenarios

## Goal
Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.

## Usage
Use the `-mpa` parameter to pass an alias and model path for each derived model.

### API to Switch the Derived Model
```c
llama_ctx_switch_derived_model(ctx, "summarize");
```
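
Below is a minimal sketch of how a host application might route incoming requests between adaptations, assuming the aliases were registered with `-mpa` as shown in the next section; `route_request` is a hypothetical helper, and only `llama_ctx_switch_derived_model` comes from this change:

```c
#include <string.h>
#include "llama.h"

// Hypothetical request router: select the adaptation before decoding.
// The alias strings must match those registered with -mpa at startup.
static void route_request(struct llama_context * ctx, const char * task) {
    if (strcmp(task, "summarize") == 0) {
        llama_ctx_switch_derived_model(ctx, "summarize");
    } else {
        llama_ctx_switch_derived_model(ctx, "code_writer");
    }
    // ... tokenize the prompt and run llama_decode() on ctx as usual ...
}
```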

### Pass Model Path and Alias for Derived Models
```sh
llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
-mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
-mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
```

## Foundation Model
The **foundation** GGUF contains the weights shared across models.
The **adaptor** GGUF contains the task-specific weights.

Here are the combinations for hosting three models:
- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
- `model-adaptor-taskA.gguf + model-foundation.gguf`
- `model-adaptor-taskB.gguf + model-foundation.gguf`

This setup supports hosting multiple scenarios while keeping only one copy of the shared weights in memory. Thanks to `mmap`, a task-specific GGUF is only paged in when the corresponding task is invoked.
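
The adaptor/foundation splits themselves can be produced with the `--tensor-set` option this change adds to `gguf-split`. The invocation below is only a sketch: the binary name, model file names, and `tensors.txt` (a plain list of task-specific module names, one per line) are placeholders.

```sh
# Sketch: put the tensors listed in tensors.txt into the first split (the adaptor)
# and everything else into the following split (the shared foundation).
./llama-gguf-split --split --tensor-set tensors.txt \
    Phi-3-mini-4k-instruct.gguf Phi-3-mini-4k-instruct-adaptor
```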

## Example
Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)