
Commit a6f4178

slaren, ggerganov, and JohannesGaessler authored and committed
llama : ggml-backend integration (ggml-org#4766)
* llama : ggml-backend integration
* ggml-backend : add names to buffers
* fix unmap after loading
* batched-bench : add tensor_split param
* llama : check for null tensor_split
* ggml-backend : increase GGML_MAX_BACKENDS
* improve graph splitting, partial fix for --no-kv-offload
* cuda : add ggml-backend split buffer support
* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)
* ggml : fix null backend dereference (ggml-org#4807)
* ggml : fix null backend dereference
* ggml : also check ggml_backend_is_cpu
* test-backend-ops : check buffer allocation failures
* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)
* ggml : fix mul_mat_id work size
* llama : rewrite session kv load/set without graphs
* minor
* llama : only initialize used backends, free backends on context free
* llama : abort ctx if cuda backend init fails
* llama : rewrite lora with ggml-backend and compute on CPU (ggml-ci)
* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer
* opencl : add ggml-backend buffer type
* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)
* llama : on Metal, by default offload the full model (ggml-ci)
* metal : page align the data ptr (ggml-org#4854)
* Apply suggestions from code review
  Co-authored-by: Johannes Gäßler <[email protected]>
* cuda : fix split buffer free
* address review comments
* llama-bench : add split-mode parameter
* fix whitespace
* opencl : fix double initialization
* server : add --split-mode parameter
* use async copy and compute to improve multi-gpu performance (ggml-ci)
* use async memcpys to copy the graph outputs to the CPU
* fix opencl
* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
1 parent 425b864 commit a6f4178

21 files changed: +2523 -2285 lines
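
Much of this commit revolves around the new split_mode model parameter and the --split-mode (-sm), --tensor-split (-ts), and --main-gpu (-mg) flags. The following is a minimal sketch, not code from the commit, of how a caller might set these fields through llama_model_params; the model path and the 3:1 split across two GPUs are placeholder assumptions.

// Sketch only: configures the new split mode via llama_model_params.
// The model path and split proportions are illustrative assumptions.
#include <vector>

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;              // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_ROW; // LLAMA_SPLIT_NONE | LLAMA_SPLIT_LAYER (default) | LLAMA_SPLIT_ROW
    mparams.main_gpu     = 0;               // with row split: GPU for intermediate results and KV

    // tensor_split points at LLAMA_MAX_DEVICES proportions, zero-padded past the GPUs in use
    std::vector<float> t_split(LLAMA_MAX_DEVICES, 0.0f);
    t_split[0] = 3.0f;
    t_split[1] = 1.0f;
    mparams.tensor_split = t_split.data();

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_free_model(model);
    return 0;
}

The diffs below show the command-line side of the same fields: gpt_params gains a split_mode member, gpt_params_parse_ex parses -sm/-ts/-mg, and llama_model_params_from_gpt_params forwards them to the model params.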

common/common.cpp (+39 -26)
@@ -543,9 +543,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -554,9 +553,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers_draft = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -565,40 +563,53 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
             params.main_gpu = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
-#endif
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--split-mode" || arg == "-sm") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string arg_next = argv[i];
+            if (arg_next == "none") {
+                params.split_mode = LLAMA_SPLIT_NONE;
+            } else if (arg_next == "layer") {
+                params.split_mode = LLAMA_SPLIT_LAYER;
+            } else if (arg_next == "row") {
+                params.split_mode = LLAMA_SPLIT_ROW;
+            } else {
+                invalid_param = true;
+                break;
+            }
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
             const std::regex regex{R"([,/]+)"};
             std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
             std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
+            if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+                invalid_param = true;
+                break;
+            }
             for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
                 if (i < split_arg.size()) {
                     params.tensor_split[i] = std::stof(split_arg[i]);
                 } else {
                     params.tensor_split[i] = 0.0f;
                 }
             }
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = false;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -915,14 +926,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        number of layers to store in VRAM\n");
     printf("  -ngld N, --n-gpu-layers-draft N\n");
     printf("                        number of layers to store in VRAM for the draft model\n");
+    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+    printf("                        how to split the model across multiple GPUs, one of:\n");
+    printf("                          - none: use one GPU only\n");
+    printf("                          - layer (default): split layers and KV across GPUs\n");
+    printf("                          - row: split rows across GPUs\n");
     printf("  -ts SPLIT, --tensor-split SPLIT\n");
-    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-#ifdef GGML_USE_CUBLAS
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
-#endif // GGML_USE_CUBLAS
+    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
+    printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
 #endif
     printf("  -gan N, --grp-attn-n N\n");
     printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
@@ -1041,6 +1053,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
     mparams.main_gpu        = params.main_gpu;
+    mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
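
For reference, here is a standalone sketch of the --tensor-split parsing added above: it splits the proportions string on commas or slashes and zero-pads the unused device slots. This is not code from the commit; max_devices stands in for LLAMA_MAX_DEVICES and the "3,1" input is an illustrative assumption.

// Standalone sketch of the --tensor-split parsing shown in the diff above.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    const size_t max_devices = 16;      // stand-in for LLAMA_MAX_DEVICES
    const std::string arg_next = "3,1"; // e.g. a 3:1 split across two GPUs

    // split string by , and / (same regex as in gpt_params_parse_ex)
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    if (split_arg.size() >= max_devices) {
        fprintf(stderr, "error: too many proportions for %zu devices\n", max_devices);
        return 1;
    }

    // fill the fixed-size split, zero-padding the remaining device slots
    std::vector<float> tensor_split(max_devices, 0.0f);
    for (size_t i = 0; i < split_arg.size(); ++i) {
        tensor_split[i] = std::stof(split_arg[i]);
    }

    for (size_t i = 0; i < max_devices; ++i) {
        printf("device %zu: %.1f\n", i, tensor_split[i]);
    }
    return 0;
}

The size check mirrors the diff's replacement of the old GGML_ASSERT with a recoverable invalid_param error; note that the warnings about cuBLAS are only printed when llama.cpp is built without it.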

common/common.h (+1 -0)
@@ -59,6 +59,7 @@ struct gpt_params {
     float   p_split            = 0.1f;  // speculative decoding split probability
     int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
     int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams            = 0;     // if non-zero then use beam search of given width.

examples/batched-bench/batched-bench.cpp (+3 -0)
@@ -88,7 +88,10 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = llama_model_default_params();
 
+    const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
+
     model_params.n_gpu_layers = n_gpu_layers;
+    model_params.tensor_split = t_split.data();
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);