Commit c1cb70d

new build arg LLAMA_CUDA_MMQ_Y
1 parent c1664a0 commit c1cb70d

File tree

3 files changed (+7 -3 lines changed)


CMakeLists.txt (+1)

```diff
@@ -375,6 +375,7 @@ if (LLAMA_HIPBLAS)
     message(STATUS "HIP and hipBLAS found")
     add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
     add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+    target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
```
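A minimal usage sketch for the CMake path, assuming `LLAMA_CUDA_MMQ_Y` and its default are declared as a cache variable elsewhere in CMakeLists.txt (not shown in this hunk); the value 128 is purely illustrative:

```sh
# Sketch: override the MMQ tile size at configure time (value is illustrative).
# Assumes LLAMA_CUDA_MMQ_Y is declared as a CMake cache variable elsewhere in CMakeLists.txt.
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DLLAMA_CUDA_MMQ_Y=128
cmake --build build
```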

Makefile (+2)

```diff
@@ -270,6 +270,7 @@ ifdef LLAMA_HIPBLAS
 GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
 LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 1
+LLAMA_CUDA_MMQ_Y ?= 64
 LLAMA_CUDA_KQUANTS_ITER ?= 2
 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
@@ -278,6 +279,7 @@ ifdef LLAMA_HIPBLAS
 ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
 ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
```
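As a usage sketch, the `?=` assignment means the default of 64 only applies when the variable is not already set, so it can be overridden on the `make` command line; the value 128 here is illustrative, not a recommendation:

```sh
# Sketch: build with hipBLAS and override the LLAMA_CUDA_MMQ_Y ?= 64 default above.
make LLAMA_HIPBLAS=1 LLAMA_CUDA_MMQ_Y=128
```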

README.md (+4 -3)

```diff
@@ -437,9 +437,10 @@ Building the program with BLAS support may lead to some performance improvements
 
 | Option                  | Legal values           | Default | Description |
 |-------------------------|------------------------|---------|-------------|
-| LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_MMQ_Y        | Positive integer >= 32 | 64      | Tile size in y direction when using the custom HIP kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
+| LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
 
 - #### CLBlast
```
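As a sketch of how the options documented in the README table above can be combined in a single HIP build, using only values within the listed legal ranges (all values illustrative, not recommendations):

```sh
# Sketch: combine several of the tuning options from the table above (values are illustrative).
make LLAMA_HIPBLAS=1 LLAMA_CUDA_MMQ_Y=128 LLAMA_CUDA_MMV_Y=2 LLAMA_CUDA_KQUANTS_ITER=1
```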
