
Commit 2803459

cuda : rename build flag to LLAMA_CUDA (ggml-org#6299)
1 parent b06c16e commit 2803459

28 files changed: +129 −115 lines
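In short: every build path that previously keyed off `LLAMA_CUBLAS` now uses `LLAMA_CUDA`, and the old spelling is kept as a deprecated alias. Before/after, as reflected in the diffs below:

```bash
# Old (now deprecated; still works, but warns)
make LLAMA_CUBLAS=1
cmake .. -DLLAMA_CUBLAS=ON

# New
make LLAMA_CUDA=1
cmake .. -DLLAMA_CUDA=ON
```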

.devops/full-cuda.Dockerfile (+2 −2)

@@ -26,8 +26,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make
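For orientation, a hedged example of building this image: the tag is illustrative, and `CUDA_DOCKER_ARCH=all` is an assumed value for the build argument that the `ENV` line above forwards. The same rename applies to the `main-cuda` and `server-cuda` images further down.

```bash
# Illustrative only: build the full CUDA image from the repository root
docker build -t local/llama.cpp:full-cuda \
  --build-arg CUDA_DOCKER_ARCH=all \
  -f .devops/full-cuda.Dockerfile .
```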

.devops/llama-cpp-cublas.srpm.spec → .devops/llama-cpp-cuda.srpm.spec (renamed) (+11 −11)

@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
 
-Name:           llama.cpp-cublas
+Name:           llama.cpp-cuda
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,24 +32,24 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
 %pre
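A sketch of how the renamed package might be built and its service enabled; the spec path and unit name come from the diff above, while the `rpmbuild` invocation itself is illustrative and assumes a standard rpmbuild setup:

```bash
# Build binary and source RPMs from the renamed spec
rpmbuild -ba .devops/llama-cpp-cuda.srpm.spec

# After installing the resulting package, start the renamed unit
sudo systemctl enable --now llamacuda.service
```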

.devops/main-cuda.Dockerfile (+2 −2)

@@ -20,8 +20,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make

.devops/nix/package.nix (+1 −1)

@@ -192,7 +192,7 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
       (cmakeBool "LLAMA_BLAS" useBlas)
       (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-      (cmakeBool "LLAMA_CUBLAS" useCuda)
+      (cmakeBool "LLAMA_CUDA" useCuda)
       (cmakeBool "LLAMA_HIPBLAS" useRocm)
       (cmakeBool "LLAMA_METAL" useMetalKit)
       (cmakeBool "LLAMA_MPI" useMpi)

.devops/server-cuda.Dockerfile (+2 −2)

@@ -20,8 +20,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make

.github/workflows/build.yml (+4 −4)

@@ -728,13 +728,13 @@ jobs:
           path: |
             llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
 
-  windows-latest-cmake-cublas:
+  windows-latest-cmake-cuda:
     runs-on: windows-latest
 
     strategy:
       matrix:
         cuda: ['12.2.0', '11.7.1']
-        build: ['cublas']
+        build: ['cuda']
 
     steps:
       - name: Clone
@@ -755,7 +755,7 @@ jobs:
        run: |
          mkdir build
          cd build
-         cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+         cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
       - name: Determine tag name
@@ -911,7 +911,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
      - windows-latest-cmake
-      - windows-latest-cmake-cublas
+      - windows-latest-cmake-cuda
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64
CMakeLists.txt (+13 −8)

@@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
 endif()
 
 if (LLAMA_CUBLAS)
+    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+    set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
     cmake_minimum_required(VERSION 3.17)
 
     find_package(CUDAToolkit)
     if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
+        message(STATUS "CUDA found")
 
         enable_language(CUDA)
 
@@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
         file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
-        add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_USE_CUDA)
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
         message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
     else()
-        message(WARNING "cuBLAS not found")
+        message(WARNING "CUDA not found")
     endif()
 endif()
 
@@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
     file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
     list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
 
-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
 
     if (LLAMA_HIP_UMA)
         add_compile_definitions(GGML_HIP_UMA)
@@ -830,7 +835,7 @@ endif()
 
 set(CUDA_CXX_FLAGS "")
 
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
     set(CUDA_FLAGS -use_fast_math)
 
     if (LLAMA_FATAL_WARNINGS)
@@ -1055,7 +1060,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
     list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
     list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
     if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
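Net effect of the CMake changes: both spellings configure the same backend, with the old one routed through the new deprecation shim. A quick sketch from a build directory:

```bash
# Preferred: the renamed option
cmake .. -DLLAMA_CUDA=ON

# Deprecated: still accepted; emits the WARNING added above and
# internally sets LLAMA_CUDA=ON
cmake .. -DLLAMA_CUBLAS=ON
```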

Makefile (+19 −7)

@@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -462,7 +467,7 @@ endif
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
 define NVCC_COMPILE
@@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
 
@@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 	ifdef LLAMA_HIP_UMA
 		MK_CPPFLAGS += -DGGML_HIP_UMA
 	endif # LLAMA_HIP_UMA
@@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )
 
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
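The Makefile mirrors the CMake shim: `LLAMA_CUBLAS` now only sets `LLAMA_CUDA := 1` and prints the `$(info ...)` deprecation banner, so existing invocations keep building the same CUDA objects:

```bash
# New spelling
make LLAMA_CUDA=1

# Old spelling: same build, plus the deprecation notice added above
make LLAMA_CUBLAS=1
```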

README.md (+4 −7)

@@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements
 
   Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
 
-- #### cuBLAS
+- #### CUDA
 
-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
 
   For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
 
   - Using `make`:
     ```bash
-    make LLAMA_CUBLAS=1
+    make LLAMA_CUDA=1
     ```
   - Using `CMake`:
 
     ```bash
     mkdir build
     cd build
-    cmake .. -DLLAMA_CUBLAS=ON
+    cmake .. -DLLAMA_CUDA=ON
     cmake --build . --config Release
     ```
 
  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
 
-<!---
-| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
 | Option | Legal values | Default | Description |
 |--------------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
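As a usage sketch of the `CUDA_VISIBLE_DEVICES` note above (the model path, prompt, and layer count are placeholders, not from this commit):

```bash
# Build with CUDA, then pin inference to GPU 0 and offload layers to it
make LLAMA_CUDA=1
CUDA_VISIBLE_DEVICES=0 ./main -m ./models/7B/ggml-model-q4_0.gguf -p "Hello" -ngl 35
```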

ci/run.sh (+3 −3)

@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert.py ${path_models}
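With `GG_BUILD_CUDA` set, the CI script now injects `-DLLAMA_CUDA=1` into `CMAKE_EXTRA` instead of the old flag. A hedged invocation, following the script's usual two-argument calling convention (the paths are illustrative):

```bash
# Run the CI suite with the CUDA build enabled
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```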