@@ -668,13 +668,21 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
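As a reading aid, here is a minimal sketch (not part of the patch) of what the `ggml_set_no_alloc` toggling in this hunk achieves: with no_alloc enabled, `ggml_new_tensor_*` only records tensor metadata in the context and does not carve a data buffer out of it, so non-CPU tensors do not consume host memory; afterwards no_alloc is restored to `use_mmap`, since mmap-backed loading never wants ggml to allocate data buffers either. The exact state of the data pointer is an assumption about this ggml revision, consistent with the NULL check added in the next hunk.

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Normal CPU tensor: metadata and data are both allocated inside the context buffer.
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // "Offload-style" tensor: with no_alloc only the metadata is created; the data
    // pointer is left unset (assumed NULL in this ggml revision) and is filled in by
    // the backend later.
    ggml_set_no_alloc(ctx, true);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_set_no_alloc(ctx, false); // restore, as the patch does with use_mmap

    printf("a->data = %p\nb->data = %p\n", a->data, b->data);

    ggml_free(ctx);
    return 0;
}
```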
@@ -713,6 +721,11 @@ struct llama_model_loader {
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *) malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
             switch (lt.ggml_tensor->backend) {
                 case GGML_BACKEND_CPU:
@@ -726,11 +739,17 @@ struct llama_model_loader {
 #ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_CUDA:
                     ggml_cuda_load_data(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
                     break;
 #endif
 #ifdef GGML_USE_CLBLAST
                 case GGML_BACKEND_CL:
                     ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
                     break;
 #endif
                 default:
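The second and third hunks together implement a simple staging pattern for offloaded tensors when mmap is disabled: allocate a temporary host buffer sized with `ggml_nbytes`, let `load_data_for` fill it from the model file, hand it to the backend upload call (`ggml_cuda_load_data` / `ggml_cl_transform_tensor`), then free it. The sketch below isolates that shape; `read_tensor_from_file` and `upload_to_device` are hypothetical stand-ins for the file read and the backend copy, and only the allocate/load/upload/free structure mirrors the patch.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for load_data_for(): read nbytes at `offset` into `dst`.
void read_tensor_from_file(FILE * f, long offset, void * dst, size_t nbytes) {
    fseek(f, offset, SEEK_SET);
    size_t n = fread(dst, 1, nbytes, f);
    (void) n;
}

// Hypothetical stand-in for ggml_cuda_load_data / ggml_cl_transform_tensor:
// in the real code this copies the host buffer into device memory.
void upload_to_device(const void * src, size_t nbytes) {
    (void) src;
    (void) nbytes;
}

// Mirrors the patched loop: with mmap the data pointer already references the mapped
// file, so nothing is allocated or freed; without mmap a temporary buffer exists only
// for the duration of the upload.
void load_offloaded_tensor(FILE * f, long offset, size_t nbytes, uint8_t * mmap_ptr) {
    const bool use_mmap = (mmap_ptr != nullptr);

    uint8_t * data = mmap_ptr;
    if (!use_mmap) {
        data = (uint8_t *) malloc(nbytes);           // temp buffer (second hunk)
        read_tensor_from_file(f, offset, data, nbytes);
    }

    upload_to_device(data, nbytes);                  // tensor now lives on the device

    if (!use_mmap) {
        free(data);                                  // host copy no longer needed (third hunk)
    }
}
```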