
Commit 80e4e54

Merge 'origin/master' into hipblas
2 parents: 7735c5a + 1d16309

(Large commit — only a subset of the changed files is shown below.)

42 files changed: +4096, −3502 lines

.devops/full-cuda.Dockerfile (new file, +33)
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/main-cuda.Dockerfile (new file, +32)
@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]

.github/workflows/build.yml (+11, −6)
@@ -16,7 +16,10 @@ on:
     paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  GGML_NLOOP: 3
+  GGML_NITER: 1
+  GGML_N_THREADS: 1

 jobs:
   ubuntu-focal-make:
@@ -64,7 +67,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -99,7 +102,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   macOS-latest-make:
     runs-on: macos-latest
@@ -137,19 +140,21 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
+          sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_AVX2=OFF ..
+          cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
           cmake --build . --config Release

      - name: Test
        id: cmake_test
        run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   windows-latest-cmake:
     runs-on: windows-latest
+
     env:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
@@ -248,7 +253,7 @@ jobs:
        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
        run: |
          cd build
-         ctest -C Release --verbose
+         ctest -C Release --verbose --timeout 900

      - name: Get commit hash
        id: commit
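
The workflow changes above introduce the `GGML_NLOOP`, `GGML_NITER` and `GGML_N_THREADS` environment variables and add a 900-second per-test `ctest` timeout. A minimal sketch of reproducing the CI test step locally, assuming an existing CMake build tree in `./build` (the values simply mirror the workflow):

```bash
# Sketch of the CI test step run locally (assumes ./build was already configured with CMake).
# The GGML_* variables bound the work done by the ggml test programs; --timeout 900
# mirrors the new per-test limit added in this commit.
cd build
GGML_NLOOP=3 GGML_NITER=1 GGML_N_THREADS=1 ctest --verbose --timeout 900
```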

CMakeLists.txt (+13, −3)
@@ -68,8 +68,9 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -217,6 +218,9 @@ if (LLAMA_BLAS)
        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
        add_compile_options(${BLAS_LINKER_FLAGS})
        add_compile_definitions(GGML_USE_OPENBLAS)
+        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+        endif()
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

@@ -247,8 +251,14 @@ if (LLAMA_CUBLAS)
        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

        add_compile_definitions(GGML_USE_CUBLAS)
+        if (LLAMA_CUDA_FORCE_DMMV)
+            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+        endif()
        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        if (DEFINED LLAMA_CUDA_DMMV_Y)
+            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
+        endif()
        if (LLAMA_CUDA_DMMV_F16)
            add_compile_definitions(GGML_CUDA_DMMV_F16)
        endif()
@@ -264,7 +274,7 @@ if (LLAMA_CUBLAS)
        if (LLAMA_CUDA_DMMV_F16)
            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
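
For reference, a hypothetical configure invocation exercising the options touched above (the values are illustrative only; a `LLAMA_CUDA_DMMV_Y` setting is still honored through the backwards-compatibility branch):

```bash
# Example CMake configuration using the new/renamed cuBLAS options (a sketch, not a recommendation).
mkdir -p build && cd build
cmake .. -DLLAMA_CUBLAS=ON \
         -DLLAMA_CUDA_FORCE_DMMV=OFF \
         -DLLAMA_CUDA_MMV_Y=2   # -DLLAMA_CUDA_DMMV_Y=2 would still be picked up for backwards compatibility
cmake --build . --config Release
```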

Makefile (+16, −5)
@@ -163,17 +163,27 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS += ggml-cuda.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef CUDA_DOCKER_ARCH
+	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+	NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
+ifdef LLAMA_CUDA_FORCE_DMMV
+	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+ifdef LLAMA_CUDA_MMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+else ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
-endif # LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
 	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
 endif # LLAMA_CUDA_DMMV_F16
@@ -182,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
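
With these changes the Makefile takes the nvcc architecture from `CUDA_DOCKER_ARCH` when it is set (as the CUDA Dockerfiles above do) and falls back to `-arch=native` otherwise. Two example invocations as a sketch, with placeholder values:

```bash
# Native build with cuBLAS; nvcc targets the local GPU via -arch=native.
make LLAMA_CUBLAS=1 LLAMA_CUDA_MMV_Y=2

# Build for an explicit architecture, as the Docker images do. "sm_86" is only an
# example value; "all" (the Dockerfile default) also works with recent CUDA toolkits.
CUDA_DOCKER_ARCH=sm_86 make LLAMA_CUBLAS=1
```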

README.md (+43, −4)
@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
 - k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
 - New roadmap: https://github.com/users/ggerganov/projects/7
 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
@@ -85,7 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
-- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
+- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))

 **Bindings:**

@@ -344,8 +345,9 @@ Building the program with BLAS support may lead to some performance improvements

 | Option | Legal values | Default | Description |
 |-------------------------|------------------------|---------|-------------|
+| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

@@ -693,7 +695,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH

 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

-Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.

 ### Docker

@@ -729,6 +731,38 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

+### Docker With CUDA
+
+Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
+
+#### Building Locally
+
+```bash
+docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `CUDA_VERSION` set to `11.7.1`
+- `CUDA_DOCKER_ARCH` set to `all`
+
+The resulting images are essentially the same as the non-CUDA images:
+
+1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+
+#### Usage
+
+After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+```
+
 ### Contributing

 - Contributors can open PRs
@@ -749,5 +783,10 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode

 ### Docs

-- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [main](./examples/main/README.md)
+- [server](./examples/server/README.md)
+- [embd-input](./examples/embd-input/README.md)
+- [jeopardy](./examples/jeopardy/README.md)
+- [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
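
The new README section mentions passing different `ARGS` to the Docker build; a sketch of what that can look like, with placeholder values chosen for a hypothetical host:

```bash
# Build the light CUDA image with explicit ARG overrides (example values only;
# match CUDA_VERSION to the host driver and CUDA_DOCKER_ARCH to the target GPU).
docker build -t local/llama.cpp:light-cuda \
  --build-arg CUDA_VERSION=11.7.1 \
  --build-arg CUDA_DOCKER_ARCH=sm_86 \
  -f .devops/main-cuda.Dockerfile .
```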

convert.py (+7)
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed

         return Params(
@@ -822,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:


 SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'BF16': DT_BF16,
     'F16': DT_F16,
     'F32': DT_F32,
     'I32': DT_I32,
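
With the `W_pack` tensor naming recognized, the layer count of Baichuan-style checkpoints can be guessed even when no `config.json` is present. A hypothetical invocation (the model path and options here are placeholders, not taken from this commit):

```bash
# Convert a Baichuan-style checkpoint to ggml (sketch; adjust the path to your model directory).
python3 convert.py models/baichuan-7b/ --outtype f16
```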

examples/alpaca.sh (+1, −1)
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
     --color \
     -f ./prompts/alpaca.txt \
     --ctx_size 2048 \
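
The example script now points at a different default model file; a quick sanity check before running it (the file name is taken from the diff above, the rest is a sketch):

```bash
# Verify the model the script expects exists, then run the example.
ls ./models/alpaca.13b.ggmlv3.q8_0.bin && ./examples/alpaca.sh
```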
