
Commit ae32a25

Rewrite loading code to try to satisfy everyone
Features:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.)
- Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). A hedged sketch of this mmap-with-fallback pattern appears below, after the commit metadata.
- Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on Unix).
- Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...)
  - To help implement this, move mlock support from ggml to the loading code.
- madvise/PrefetchVirtualMemory support (based on ggml-org#740).
- Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way').

Todo:
- **VirtualLock does not work at all** on the one Windows machine I tested it on (it complains about quota). Figure out why.
- Verify that using the `fopen` family of functions actually does what I think it does, performance-wise.
- More testing.

Implementation notes: I tried to factor the code into more discrete pieces than before.

Regarding code style: I tried to follow the existing style, but I'm naughty and used a few advanced C++ features repeatedly:
- Destructors, to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure.

Co-authored-by: Pavol Rusnak <[email protected]> (for the bit I copied from ggml-org#740)
1 parent eeaa7b0 commit ae32a25
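
The following is a hedged, POSIX-only sketch of the mmap-with-fallback pattern described in the feature list; it is not the commit's actual loader. The names `model_buffer` and `load` are hypothetical, and the error handling is deliberately minimal. The idea is: reuse the `FILE *` opened for metadata reads when mapping, fall back to plain `fread` when mmap is disabled (`--no-mmap`) or unusable, and let a destructor clean up the mapping.

// Hypothetical sketch of the mmap-with-fallback idea (POSIX-only, not the
// commit's actual loader): map the file read-only when mmap is enabled,
// otherwise read the whole file into a heap buffer.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

#include <sys/mman.h>
#include <sys/stat.h>

struct model_buffer {
    const void * addr   = nullptr;      // start of the model bytes
    size_t       size   = 0;
    bool         mapped = false;        // true if addr came from mmap
    std::vector<unsigned char> owned;   // backing store for the read path

    // use_mmap mirrors the new flag: true by default, false for --no-mmap
    // (or for pre-ggjt files, which are not laid out for mmap).
    void load(const std::string & fname, bool use_mmap) {
        FILE * fp = std::fopen(fname.c_str(), "rb");
        if (!fp) {
            throw std::runtime_error("failed to open " + fname);
        }

        struct stat st;
        if (fstat(fileno(fp), &st) != 0) {
            std::fclose(fp);
            throw std::runtime_error("fstat failed on " + fname);
        }
        size = (size_t) st.st_size;

        if (use_mmap) {
            // Reuse the descriptor already opened for metadata reads,
            // instead of opening the file a second time just to mmap it.
            void * p = mmap(NULL, size, PROT_READ, MAP_SHARED, fileno(fp), 0);
            if (p != MAP_FAILED) {
                // Hint that the whole file will be needed soon
                // (cf. the madvise/PrefetchVirtualMemory feature above).
                madvise(p, size, MADV_WILLNEED);
                addr   = p;
                mapped = true;
                std::fclose(fp);
                return;
            }
            // mmap not usable here; fall through to plain reads
        }

        owned.resize(size);
        if (std::fread(owned.data(), 1, size, fp) != size) {
            std::fclose(fp);
            throw std::runtime_error("failed to read " + fname);
        }
        std::fclose(fp);
        addr = owned.data();
    }

    // Destructor cleans up the mapping on every exit path.
    ~model_buffer() {
        if (mapped && addr) {
            munmap((void *) addr, size);
        }
    }
};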

File tree

10 files changed: +1167 −814 lines


examples/common.cpp (+6 −3)

@@ -1,7 +1,5 @@
 #include "common.h"

-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -154,6 +152,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
         } else if (arg == "--verbose-prompt") {
@@ -233,9 +233,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");

examples/common.h (+1 −0)

@@ -47,6 +47,7 @@ struct gpt_params {
     bool instruct       = false; // instruction mode (used for Alpaca models)
     bool ignore_eos     = false; // do not stop generating after eos
     bool perplexity     = false; // compute perplexity over the prompt
+    bool use_mmap       = true;  // use mmap for faster loads
     bool use_mlock      = false; // use mlock to keep model in memory
     bool mem_test       = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation

examples/embedding/embedding.cpp (+1 −0)

@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;

examples/main/main.cpp (+1 −0)

@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
     lparams.n_parts   = params.n_parts;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;

     ctx = llama_init_from_file(params.model.c_str(), lparams);
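
For orientation, here is a minimal standalone caller showing how the new `use_mmap` field flows into `llama_context_params` before `llama_init_from_file`, mirroring the main.cpp change above. This is an illustrative sketch, not code from the commit, and the model path is a placeholder.

// Illustrative standalone caller (not part of the commit): route the new
// use_mmap field into llama_context_params before loading a model.
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.use_mmap  = true;   // memory-map the model (the new default)
    lparams.use_mlock = false;  // set true to pin the weights in RAM (--mlock)

    // "path/to/model.bin" is a placeholder path.
    llama_context * ctx = llama_init_from_file("path/to/model.bin", lparams);
    if (ctx == NULL) {
        return 1;   // loading failed
    }

    // ... run inference here ...

    llama_free(ctx);
    return 0;
}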

examples/perplexity/perplexity.cpp (+1 −0)

@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;

ggml.c (−78)

@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif

-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-#if __has_include(<sys/mman.h>)
-#undef GGML_MLOCK_SUPPORT
-#define GGML_MLOCK_SUPPORT 1
-#include <sys/mman.h>
-#endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {

 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");

-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

@@ -2716,7 +2690,6 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
-    bool   mem_buffer_mlocked;
     bool   no_alloc;

     int    n_objects;
@@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_size           =*/ params.mem_size,
         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
@@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) {
             GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

-#if GGML_MLOCK_SUPPORT
-            if (ctx->mem_buffer_mlocked) {
-                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-                }
-            }
-#endif
-
             if (ctx->mem_buffer_owned) {
                 free(ctx->mem_buffer);
             }
@@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }

-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-            (opt_extra_len &&
-             mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_tensor * ggml_new_tensor_impl(
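
With `ggml_mlock` removed here, the commit message says locking moves into the loading code, with VirtualLock playing the same role on Windows and destructors guaranteeing cleanup. The wrapper below is a hedged sketch of that shape, not the commit's actual class; the name `locked_region` and its error handling are illustrative. (Note the commit's Todo: on the one Windows machine tested, VirtualLock failed with a quota error; the removed TODO comment above suggests SetProcessWorkingSetSize() may also be needed there.)

// Illustrative sketch only: an RAII wrapper for pinning a loaded model's
// memory, roughly the shape the commit describes when it moves mlock
// support out of ggml and into the loading code. Names are hypothetical.
#include <cerrno>
#include <cstdio>
#include <cstring>

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#endif

struct locked_region {
    void * addr = nullptr;
    size_t size = 0;

    // Attempt to pin [a, a+s) in RAM; report failure but keep going,
    // mirroring how --mlock is a best-effort option.
    bool lock(void * a, size_t s) {
#ifdef _WIN32
        if (!VirtualLock(a, s)) {
            fprintf(stderr, "warning: VirtualLock failed (error %lu)\n", GetLastError());
            return false;
        }
#else
        if (mlock(a, s) != 0) {
            fprintf(stderr, "warning: mlock failed: %s\n", strerror(errno));
            return false;
        }
#endif
        addr = a;
        size = s;
        return true;
    }

    // Destructor undoes the lock, so cleanup happens on every exit path --
    // one of the reasons the commit message gives for using destructors.
    ~locked_region() {
        if (!addr) return;
#ifdef _WIN32
        VirtualUnlock(addr, size);
#else
        munlock(addr, size);
#endif
    }
};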

ggml.h (+13 −7)

@@ -253,6 +253,19 @@ enum ggml_op {
     GGML_OP_COUNT,
 };

+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
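
This hunk makes `ggml_object` and `GGML_OBJECT_SIZE` visible to code outside ggml.c. One plausible use for an external loader, shown below as an assumption rather than as the commit's actual code, is sizing a ggml context up front: each tensor placed in a context costs its data bytes plus a `ggml_object` header and a `ggml_tensor` struct. The helper name and its inputs are hypothetical, and the result is only an estimate (it ignores alignment padding).

// Hypothetical sizing helper (not taken from the commit): estimate how much
// context memory a set of tensors will need, now that GGML_OBJECT_SIZE is
// exposed by ggml.h.
#include "ggml.h"

static size_t estimate_ctx_size(const size_t * tensor_data_sizes, int n_tensors) {
    size_t total = 0;
    for (int i = 0; i < n_tensors; i++) {
        // per-tensor bookkeeping: object header + tensor struct + data bytes
        total += GGML_OBJECT_SIZE + sizeof(struct ggml_tensor) + tensor_data_sizes[i];
    }
    return total;
}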
