 #include <cassert>
 #include <cstring>
 
+// headers for POSIX mmap
+#if defined (__unix__) || defined (__APPLE__)
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
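The new headers are guarded so the file still builds on platforms without POSIX mmap. Note that the loader code later in this diff does not repeat the platform check; it keys off `MAP_FAILED`, which `<sys/mman.h>` defines wherever mmap is available. A minimal sketch of that feature-test pattern (illustrative, not part of the diff):

```cpp
// MAP_FAILED only exists where <sys/mman.h> was actually usable, so testing
// it avoids duplicating the __unix__/__APPLE__ platform check above.
#if defined(MAP_FAILED)
    // POSIX mmap path is available
#else
    // fall back to ordinary buffered reads
#endif
```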
@@ -246,6 +253,7 @@ static bool kv_cache_init(
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size();
     params.mem_buffer = cache.buf.data();
+    params.no_alloc   = false;
 
     cache.ctx = ggml_init(params);
 
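This hunk, and the two `ggml_init` call sites further down, reflect a new `no_alloc` field in `ggml_init_params`. The KV cache always needs real backing memory, so it passes `false`. A minimal sketch of how the flag is meant to be used, under the assumption this diff relies on (with `no_alloc` set, tensor metadata is still carved out of `mem_buffer`, but `tensor->data` is left for the caller to fill in); `mm_addr` and `offset` are placeholders standing in for a mapping and a tensor's file offset:

```cpp
// Sketch under the assumptions above; requires ggml.h. The context is never
// freed here, since this only illustrates the no_alloc + data-patching idiom.
static struct ggml_tensor * make_mapped_tensor(std::vector<char> & buf,
                                               void * mm_addr, size_t offset,
                                               int n_embd, int n_vocab) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,     // metadata only; caller provides data
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
    t->data = (char *) mm_addr + offset; // point at the mapped file
    return t;
}
```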
@@ -288,6 +296,26 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
 
+void * mmap_file(const char * fname) {
+#if defined(MAP_FAILED)
+    // POSIX mmap
+    int fd = open(fname, O_RDONLY);
+    size_t len = lseek(fd, 0, SEEK_END);
+    void * mm_addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+    if (mm_addr == MAP_FAILED) {
+        perror("mmap failed");
+        mm_addr = NULL;
+    }
+    close(fd);
+    return mm_addr;
+#else
+    // TODO: windows support
+    (void)(fname); // suppress warnings
+    return NULL;
+#endif
+}
+
 static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
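One sharp edge in `mmap_file()` as committed: the return value of `open()` is never checked, so `fd` may be `-1` when handed to `lseek()` and `mmap()`. Both then fail (`mmap()` reports `MAP_FAILED`, which is handled), so a missing file still yields `NULL`, but the `errno` from the original `open()` failure is clobbered and `perror` prints a misleading message. A hardened sketch for comparison, not part of this diff:

```cpp
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

// Hardened variant (illustrative only): checks each step, reports the first
// failure, and optionally returns the mapping length to the caller.
static void * mmap_file_checked(const char * fname, size_t * out_len) {
    int fd = open(fname, O_RDONLY);
    if (fd == -1) {
        perror("open failed");
        return NULL;
    }
    off_t len = lseek(fd, 0, SEEK_END);
    if (len == (off_t) -1) {
        perror("lseek failed");
        close(fd);
        return NULL;
    }
    void * mm_addr = mmap(NULL, (size_t) len, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping remains valid after the descriptor is closed
    if (mm_addr == MAP_FAILED) {
        perror("mmap failed");
        return NULL;
    }
    if (out_len) {
        *out_len = (size_t) len;
    }
    return mm_addr;
}
```

Also worth noting: at least within the hunks shown here, nothing ever `munmap()`s the file, so the mapping lives until process exit.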
@@ -303,6 +331,7 @@ static bool llama_model_load(
 
     lctx.t_start_us = t_start_us;
 
+    // TODO: this could probably be smaller when using mmap
     std::vector<char> f_buf(1024*1024);
 
     auto & model = lctx.model;
@@ -449,39 +478,49 @@ static bool llama_model_load(
         }
     }
 
+    bool use_mmap = (n_parts == 1);
+
+    // try to memory map the model file
+    void * mm_addr = NULL;
+    if (use_mmap) {
+        mm_addr = mmap_file(fname.c_str());
+        if (mm_addr == NULL) {
+            use_mmap = false;
+        }
+    }
+
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
-
     {
         const auto & hparams = model.hparams;
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (!use_mmap) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
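The effect of the new `if (!use_mmap)` guard is that a memory-mapped load reserves essentially no weight storage: every per-tensor term is skipped and only the object-overhead term survives. The `memory_k`/`memory_v` terms are deleted outright rather than guarded, consistent with the `kv_cache_init` hunk above, which gives the KV cache its own buffer. An illustrative back-of-the-envelope for the surviving term (n_layer = 32 assumed, as in the 7B LLaMA configuration):

```cpp
// Illustrative only: with use_mmap, the ggml context needs roughly this much
// memory for tensor/object bookkeeping instead of several GiB of weights.
constexpr size_t n_layer  = 32;
constexpr size_t overhead = (5 + 10*n_layer)*256; // 325*256 = 83,200 bytes (~81 KiB)
```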
@@ -514,6 +553,7 @@ static bool llama_model_load(
         struct ggml_init_params params = {
             /*.mem_size   =*/ lctx.model.buf.size(),
             /*.mem_buffer =*/ lctx.model.buf.data(),
+            /*.no_alloc   =*/ use_mmap,
         };
 
         model.ctx = ggml_init(params);
@@ -595,7 +635,7 @@ static bool llama_model_load(
             fname_part += "." + std::to_string(i);
         }
 
-        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'%s\n", __func__, i+1, n_parts, fname_part.c_str(), use_mmap ? " (memory mapped)" : "");
 
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
@@ -736,7 +776,14 @@ static bool llama_model_load(
             }
 
             if (part_id == 0) {
-                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                if (mm_addr) {
+                    off_t offset = fin.tellg();
+                    tensor->data = (char *) mm_addr + offset;
+                    fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                }
+                else {
+                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                }
             } else {
                 fin.seekg(ggml_nbytes(tensor), std::ios::cur);
             }
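This is the heart of the change: instead of copying `ggml_nbytes(tensor)` bytes into an allocated buffer, the loader records the tensor's current file offset, aims `tensor->data` straight into the mapping, and seeks past the payload. One subtlety worth making explicit: `mmap()` returns a page-aligned base, so a mapped tensor is exactly as aligned in memory as its byte offset is within the file, which means the scheme implicitly relies on the model format laying tensor data out at suitably aligned offsets. A small illustrative check, not in the diff:

```cpp
#include <cstdint>
#include <cstddef>

// Returns true if a tensor at `offset` within a page-aligned mapping at
// `mm_addr` satisfies `alignment` (e.g. 32 bytes for AVX loads).
static bool mapped_tensor_is_aligned(const void * mm_addr, std::size_t offset,
                                     std::size_t alignment) {
    const std::uintptr_t addr = (std::uintptr_t) mm_addr + offset;
    return addr % alignment == 0;
}
```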
@@ -849,6 +896,7 @@ static bool llama_eval_internal(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size(),
         /*.mem_buffer =*/ buf_compute.data(),
+        /*.no_alloc   =*/ false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);