Commit c1664a0

Merge 'origin/master' into hipblas

2 parents: 4336231 + 0728c5a

14 files changed: +2626 -522 lines

CMakeLists.txt (+15, -3)

@@ -67,7 +67,9 @@ endif()
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS       "llama: use BLAS"                    OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS     "llama: use cuBLAS"                  OFF)
+option(LLAMA_CUBLAS     "llama: use CUDA"                    OFF)
+#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+set(LLAMA_CUDA_MMQ_Y  "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y   "1" CACHE STRING "llama: y block size for mmv CUDA kernels")

@@ -252,6 +254,10 @@ if (LLAMA_CUBLAS)
     set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

     add_compile_definitions(GGML_USE_CUBLAS)
+    # if (LLAMA_CUDA_CUBLAS)
+    #     add_compile_definitions(GGML_CUDA_CUBLAS)
+    # endif()
+    add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()

@@ -272,10 +278,14 @@ if (LLAMA_CUBLAS)
     endif()

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
         if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61")    # needed for f16 CUDA intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61")    # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

@@ -532,6 +542,8 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
+            ggml-alloc.c
+            ggml-alloc.h
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
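The new LLAMA_CUDA_MMQ_Y cache variable is forwarded to the compiler as a GGML_CUDA_MMQ_Y preprocessor definition. A minimal sketch of how such a definition with a build-time default could be consumed on the C++ side is shown below; it is illustrative only and not the actual ggml-cuda.cu code.

```cpp
// Illustrative sketch only, not the real ggml-cuda.cu implementation:
// how a -DGGML_CUDA_MMQ_Y=<N> definition with a fallback default might be used.
#include <cstdio>

#ifndef GGML_CUDA_MMQ_Y
#define GGML_CUDA_MMQ_Y 64 // fallback mirrors the CMake/Makefile default
#endif

int main() {
    // A real mul_mat_q kernel would use this value as its y tile size;
    // here we only report what the build selected.
    std::printf("mul_mat_q y tile size: %d\n", GGML_CUDA_MMQ_Y);
    return 0;
}
```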

Makefile (+19, -3)

@@ -194,7 +194,7 @@ ifdef LLAMA_CUBLAS
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCCFLAGS = --forward-unknown-to-host-compiler
+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else

@@ -220,14 +220,25 @@ else ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
+endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_MMQ_Y
+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
+else
+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
+endif # LLAMA_CUDA_MMQ_Y
+#ifdef LLAMA_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif

@@ -339,7 +350,12 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
+OBJS += ggml-alloc.o
+
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
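Note that both the existing LLAMA_CUDA_DMMV_F16 switch and the new LLAMA_CUDA_F16 switch now define the same GGML_CUDA_F16 macro, so older build scripts keep working. A minimal sketch, assuming a hypothetical consumer translation unit (not the real ggml-cuda.cu), of how code can branch on that single macro:

```cpp
// Minimal sketch (hypothetical consumer, not ggml-cuda.cu): both Makefile
// switches above funnel into the single GGML_CUDA_F16 macro checked here.
#include <cstdio>

#ifdef GGML_CUDA_F16
static const char * const k_dequant_precision = "fp16"; // half-precision path
#else
static const char * const k_dequant_precision = "fp32"; // default path
#endif

int main() {
    std::printf("CUDA dequantize/mul-mat precision: %s\n", k_dequant_precision);
    return 0;
}
```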

README.md (+6, -2)

@@ -400,12 +400,16 @@ Building the program with BLAS support may lead to some performance improvements

 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

+<!---
+| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
+--->
 | Option | Legal values | Default | Description |
 |-------------------------|------------------------|---------|-------------|
+| LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
+| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### hipBLAS
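The README text above relies on CUDA_VISIBLE_DEVICES, which is an ordinary environment variable read by the CUDA runtime at startup. As a quick standalone sanity check (not part of llama.cpp), a process can print the GPU mask it was launched with:

```cpp
// Standalone sanity check (not part of llama.cpp): print the GPU mask the
// process was launched with. The CUDA runtime reads this same variable.
#include <cstdio>
#include <cstdlib>

int main() {
    const char * visible = std::getenv("CUDA_VISIBLE_DEVICES");
    std::printf("CUDA_VISIBLE_DEVICES = %s\n",
                visible ? visible : "(unset: all GPUs visible)");
    return 0;
}
```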

examples/common.cpp (+13, -3)

@@ -352,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #ifdef GGML_USE_CUBLAS
             params.main_gpu = std::stoi(argv[i]);
 #else
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {

@@ -376,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 }
             }
 #else
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
             params.low_vram = true;
 #else
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;

@@ -585,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
+    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
     fprintf(stdout, "  --mtest               compute maximum memory usage\n");
     fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");

@@ -637,6 +646,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
+    lparams.mul_mat_q    = params.mul_mat_q;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
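Taken together, the common.cpp and common.h changes plumb one boolean from the command line into the context parameters. A condensed, self-contained sketch of that flow follows; it is deliberately simplified (the real parser guards each branch with #ifdef GGML_USE_CUBLAS, prints warnings, and handles many more flags), and the struct definitions here are stand-ins, not the real llama.cpp types.

```cpp
// Condensed sketch of the -mmq plumbing added in this commit (simplified;
// the structs below are stand-ins for the real gpt_params/llama_context_params).
#include <cstdio>
#include <cstring>

struct gpt_params           { bool mul_mat_q = false; };
struct llama_context_params { bool mul_mat_q = false; };

static void parse_arg(const char * arg, gpt_params & params) {
    if (std::strcmp(arg, "--mul-mat-q") == 0 || std::strcmp(arg, "-mmq") == 0) {
        params.mul_mat_q = true; // opt in to the experimental mul_mat_q kernels
    }
}

static llama_context_params to_context_params(const gpt_params & params) {
    llama_context_params lparams;
    lparams.mul_mat_q = params.mul_mat_q; // forwarded verbatim, as in the diff
    return lparams;
}

int main(int argc, char ** argv) {
    gpt_params params;
    for (int i = 1; i < argc; ++i) {
        parse_arg(argv[i], params);
    }
    const llama_context_params lparams = to_context_params(params);
    std::printf("mul_mat_q: %s\n", lparams.mul_mat_q ? "enabled" : "disabled");
    return 0;
}
```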

examples/common.h (+1)

@@ -74,6 +74,7 @@ struct gpt_params {
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

     bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
+    bool mul_mat_q         = false; // if true, use experimental mul_mat_q kernels
     bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs

examples/server/server.cpp (+13, -2)

@@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
+    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());

@@ -827,15 +830,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
         }
 #else
-        LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+        LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--low-vram" || arg == "-lv")
     {
 #ifdef GGML_USE_CUBLAS
         params.low_vram = true;
 #else
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--mul-mat-q" || arg == "-mmq")
+    {
+#ifdef GGML_USE_CUBLAS
+        params.mul_mat_q = true;
+#else
+        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--main-gpu" || arg == "-mg")
