Skip to content

Commit 14442d8

Browse files
committed
split: support in llama_model_loader
1 parent d0d5de4 commit 14442d8

File tree

3 files changed

+136
-67
lines changed

3 files changed

+136
-67
lines changed

examples/gguf-split/gguf-split.cpp

+42-67
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,8 @@ enum split_operation : uint8_t {
2222
SPLIT_OP_MERGE,
2323
};
2424

25-
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
26-
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
27-
28-
static const int SPLIT_FILENAME_MAX = 256;
29-
30-
static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
25+
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no"; // @ggerganov: should we make this accessible from outside ?
26+
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count";
3127

3228
struct split_params {
3329
split_operation operation = SPLIT_OP_SPLIT;
@@ -136,12 +132,6 @@ static void zeros(std::ofstream & file, size_t n) {
136132
}
137133
}
138134

139-
static std::string split_file_name(const std::string & path, int i_split, int n_split) {
140-
char f_split[SPLIT_FILENAME_MAX] = {0};
141-
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
142-
return std::string(f_split);
143-
}
144-
145135
struct split_strategy {
146136
const split_params params;
147137
std::ifstream & f_input;
@@ -182,19 +172,20 @@ struct split_strategy {
182172
if (i_split == 0) {
183173
gguf_set_kv(ctx_out, ctx_gguf);
184174
}
185-
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
186-
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
175+
gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
176+
gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
187177

188178
// populate the original tensors, so we get an initial metadata
189179
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
190180
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
191181
gguf_add_tensor(ctx_out, meta);
192182
}
193183

194-
auto split_name = split_file_name(params.output, i_split, n_split);
184+
char split_path[4096] = {0};
185+
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
195186

196-
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
197-
fout = std::ofstream(split_name, std::ios::binary);
187+
fprintf(stderr, "%s: %s ...", __func__, split_path);
188+
fout = std::ofstream(split_path, std::ios::binary);
198189
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
199190

200191
auto meta_size = gguf_get_meta_size(ctx_out);
@@ -262,9 +253,13 @@ static void gguf_split(const split_params & split_params) {
262253
}
263254

264255
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
256+
257+
char first_split_path[4096] = {0};
258+
llama_split_path(first_split_path, sizeof(first_split_path),
259+
split_params.output.c_str(), strategy.i_split, strategy.n_split);
265260
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
266261
__func__, split_params.input.c_str(),
267-
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
262+
first_split_path,
268263
split_params.n_split_tensors);
269264

270265
strategy.split_start();
@@ -300,7 +295,9 @@ static void gguf_merge(const split_params & split_params) {
300295
std::vector<ggml_context *> ctx_metas;
301296
std::vector<gguf_context *> ctx_ggufs;
302297

303-
std::string split_prefix;
298+
char split_path[4096] = {0};
299+
strncpy(split_path, split_params.input.c_str(), sizeof(split_path));
300+
char split_prefix[4096] = {0};
304301

305302
// First pass to find KV and tensors metadata
306303
for (int i_split = 0; i_split < n_split; i_split++) {
@@ -311,13 +308,12 @@ static void gguf_merge(const split_params & split_params) {
311308
/*.ctx = */ &ctx_meta,
312309
};
313310

314-
auto split_name = split_params.input;
315311
if (i_split > 0) {
316-
split_name = split_file_name(split_prefix, i_split, n_split);
312+
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
317313
}
318-
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
314+
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
319315

320-
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
316+
auto * ctx_gguf = gguf_init_from_file(split_path, params);
321317
if (!ctx_gguf) {
322318
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
323319
exit(1);
@@ -333,65 +329,43 @@ static void gguf_merge(const split_params & split_params) {
333329
__func__,
334330
LLM_KV_GENERAL_SPLIT_N_SPLIT);
335331
gguf_free(ctx_gguf);
332+
ggml_free(ctx_meta);
336333
gguf_free(ctx_out);
337334
fout.close();
338335
exit(1);
339336
}
340337

341-
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
338+
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
342339
if (n_split < 1) {
343340
fprintf(stderr,
344341
"\n%s: input file does not contain a valid split count %d\n",
345342
__func__,
346343
n_split);
347344
gguf_free(ctx_gguf);
345+
ggml_free(ctx_meta);
348346
gguf_free(ctx_out);
349347
fout.close();
350348
exit(1);
351349
}
352350

353-
// Do not trigger merge if we try to merge again the output
354-
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
355-
356-
// Set metadata from the first split
357-
gguf_set_kv(ctx_out, ctx_gguf);
358-
}
359-
360-
// Verify the file naming
361-
{
362-
int i_split_file = 0;
363-
int n_split_file = 0;
364-
const char * i_split_format = "-00000-of-00000.gguf";
365-
366-
if (split_name.size() < strlen(i_split_format)) {
367-
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
368-
for (auto * _ctx_gguf : ctx_ggufs) {
369-
gguf_free(_ctx_gguf);
370-
}
371-
gguf_free(ctx_out);
372-
fout.close();
373-
exit(1);
374-
}
375-
376-
split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
377-
378-
const char * split_name_c_str = split_name.c_str();
379-
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
380-
381-
if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
351+
// Verify the file naming and extract split_prefix
352+
if (!llama_split_prefix(split_prefix, split_path, i_split, n_split)) {
382353
fprintf(stderr, "\n%s: unexpected input file name: %s"
383-
" i_split=%d i_split_file=%d"
384-
" n_split=%d n_split_file=%d\n", __func__,
385-
split_params.input.c_str(),
386-
i_split, i_split_file,
387-
n_split, n_split_file);
388-
for (auto * _ctx_gguf : ctx_ggufs) {
389-
gguf_free(_ctx_gguf);
390-
}
354+
" i_split=%d"
355+
" n_split=%d\n", __func__,
356+
split_path, i_split, n_split);
357+
gguf_free(ctx_gguf);
358+
ggml_free(ctx_meta);
391359
gguf_free(ctx_out);
392360
fout.close();
393361
exit(1);
394362
}
363+
364+
// Do not trigger merge if we try to merge again the output
365+
gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
366+
+
367+
// Set metadata from the first split
368+
gguf_set_kv(ctx_out, ctx_gguf);
395369
}
396370

397371
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
@@ -413,18 +387,19 @@ static void gguf_merge(const split_params & split_params) {
413387

414388
// Write tensors data
415389
for (int i_split = 0; i_split < n_split; i_split++) {
416-
auto split_name = split_file_name(split_prefix, i_split, n_split);
417-
std::ifstream f_input(split_name.c_str(), std::ios::binary);
390+
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
391+
std::ifstream f_input(split_path, std::ios::binary);
418392
if (!f_input.is_open()) {
419-
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
420-
for (auto * _ctx_gguf : ctx_ggufs) {
421-
gguf_free(_ctx_gguf);
393+
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
394+
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
395+
gguf_free(ctx_ggufs[i]);
396+
ggml_free(ctx_metas[i]);
422397
}
423398
gguf_free(ctx_out);
424399
fout.close();
425400
exit(1);
426401
}
427-
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
402+
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
428403

429404
auto * ctx_gguf = ctx_ggufs[i_split];
430405
auto * ctx_meta = ctx_metas[i_split];

llama.cpp

+84
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ enum llm_kv {
290290
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
291291
LLM_KV_ROPE_SCALING_FINETUNED,
292292

293+
LLM_KV_SPLIT_NO,
294+
LLM_KV_SPLIT_COUNT,
295+
293296
LLM_KV_SSM_INNER_SIZE,
294297
LLM_KV_SSM_CONV_KERNEL,
295298
LLM_KV_SSM_STATE_SIZE,
@@ -355,6 +358,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
355358
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
356359
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
357360

361+
{ LLM_KV_SPLIT_NO, "split.no" },
362+
{ LLM_KV_SPLIT_COUNT, "split.count" },
363+
358364
{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
359365
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
360366
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -2797,6 +2803,9 @@ struct llama_model_loader {
27972803
int n_tensors = 0;
27982804
int n_created = 0;
27992805

2806+
uint16_t n_split = 0;
2807+
std::vector<uint16_t> split_tensor_offsets = {0};
2808+
28002809
int64_t n_elements = 0;
28012810
size_t n_bytes = 0;
28022811

@@ -2840,6 +2849,55 @@ struct llama_model_loader {
28402849
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
28412850
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
28422851

2852+
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
2853+
if (n_split > 0) {
2854+
uint16_t i_split = 0;
2855+
get_key(llm_kv(LLM_KV_SPLIT_NO), i_split);
2856+
if (i_split != 0) {
2857+
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", i_split));
2858+
}
2859+
char split_prefix[4096] = {0};
2860+
int n_split_prefix = llama_split_prefix(split_prefix, fname.c_str(), i_split, n_split);
2861+
if (!n_split_prefix) {
2862+
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
2863+
}
2864+
2865+
if (trace > 0) {
2866+
LLAMA_LOG_INFO("%s: loading additional %d GGUFs split\n",
2867+
__func__, n_split);
2868+
}
2869+
2870+
auto split_n_tensors = gguf_get_n_tensors(ctx_gguf);
2871+
for (i_split = 1; i_split < n_split; i_split++) {
2872+
char split_path[4096] = {0};
2873+
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
2874+
2875+
struct ggml_context * split_ctx_meta = NULL;
2876+
struct gguf_init_params split_params = {
2877+
/*.no_alloc = */ true,
2878+
/*.ctx = */ &split_ctx_meta,
2879+
};
2880+
auto * split_ctx_gguf = gguf_init_from_file(split_path, split_params);
2881+
if (!split_ctx_gguf) {
2882+
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str()));
2883+
}
2884+
2885+
split_tensor_offsets.push_back(split_n_tensors);
2886+
split_n_tensors = gguf_get_n_tensors(split_ctx_gguf);
2887+
for (int i_tensor = 0; i_tensor < split_n_tensors; i_tensor++) {
2888+
const char * t_name = gguf_get_tensor_name(split_ctx_gguf, i_tensor);
2889+
struct ggml_tensor * t = ggml_get_tensor(split_ctx_meta, t_name);
2890+
gguf_add_tensor(ctx_gguf, t);
2891+
}
2892+
2893+
gguf_free(split_ctx_gguf);
2894+
ggml_free(split_ctx_meta);
2895+
}
2896+
2897+
LLAMA_LOG_INFO("%s: additional %d GGUFs split metadata loaded.\n",
2898+
__func__, n_split);
2899+
}
2900+
28432901
n_kv = gguf_get_n_kv(ctx_gguf);
28442902
n_tensors = gguf_get_n_tensors(ctx_gguf);
28452903

@@ -14648,6 +14706,32 @@ LLAMA_API int32_t llama_chat_apply_template(
1464814706
return res;
1464914707
}
1465014708

14709+
// Build the on-disk path of one chunk of a split GGUF model.
// Writes "<path_prefix>-%05d-of-%05d.gguf" into split_path (at most maxlen
// bytes, NUL-terminated). split_no is 0-based; the index embedded in the
// file name is 1-based (split_no + 1), matching the "-00001-of-000NN" scheme.
// Returns the length of the generated path, or 0 on failure.
LLAMA_API int llama_split_path(char * split_path, int maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    // snprintf returns < 0 on encoding error and >= maxlen on truncation;
    // the original `if (snprintf(...))` accepted both as success. Only a
    // fully written path is a valid result.
    const int n = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n > 0 && n < maxlen) {
        return n;
    }
    return 0;
}
14716+
14717+
// Extract the path prefix from split_path if and only if the embedded
// "-%05d-of-%05d.gguf" suffix matches the expected (0-based) split_no and
// split_count. On success copies the prefix into dest and returns its
// length; returns 0 when the name does not match.
// NOTE: dest must be at least as large as split_path.
LLAMA_API int llama_split_prefix(char * dest, const char * split_path, int split_no, int split_count) {
    static const char * const SPLIT_FORMAT = "-00000-of-00000.gguf";

    const size_t path_len   = strlen(split_path);
    const size_t suffix_len = strlen(SPLIT_FORMAT);

    if (path_len <= suffix_len + 1) {
        return 0; // too short to contain both a prefix and the split suffix
    }

    const size_t prefix_len = path_len - suffix_len;

    char split_prefix[PATH_MAX] = {0};
    if (prefix_len >= sizeof(split_prefix)) {
        // the original strncpy copied prefix_len bytes unconditionally,
        // overflowing this buffer for paths longer than PATH_MAX + suffix
        return 0;
    }
    memcpy(split_prefix, split_path, prefix_len);

    int split_no_file    = 0;
    int split_count_file = 0;
    const int n = sscanf(split_path + prefix_len, "-%d-of-%d", &split_no_file, &split_count_file);
    // the file index is 1-based while split_no is 0-based, hence the -1
    if (n == 2 && split_no_file - 1 == split_no && split_count_file == split_count) {
        strcpy(dest, split_prefix);
        return (int) prefix_len;
    }

    return 0;
}
14734+
1465114735
struct llama_timings llama_get_timings(struct llama_context * ctx) {
1465214736
struct llama_timings result = {
1465314737
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,

llama.h

+10
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,16 @@ extern "C" {
960960
int32_t n_past,
961961
int32_t n_predict);
962962

963+
/// @details Build a split GGUF final path for this chunk.
964+
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
965+
// Returns the split_path length.
966+
LLAMA_API int llama_split_path(char * split_path, int maxlen, const char * path_prefix, int split_no, int split_count);
967+
968+
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
969+
/// llama_split_prefix(split_prefix, "/models/ggml-model-q4_0-00002-of-00004.gguf", 1, 4) => split_prefix = "/models/ggml-model-q4_0"
970+
// Returns the split_prefix length.
971+
LLAMA_API int llama_split_prefix(char * split_prefix, const char * split_path, int split_no, int split_count);
972+
963973
// Performance information
964974
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
965975

0 commit comments

Comments
 (0)