Commit c66115b

Merge 'origin/master' into hipblas
2 parents: a0b2d5f + b8ee340

25 files changed: +1277 -518 lines

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
 
     steps:
       - name: Clone

BLIS.md

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@

BLIS Installation Manual
------------------------

BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as an object-based API, a typed API, and BLAS and CBLAS compatibility layers.

Project URL: https://github.com/flame/blis

### Prepare:

Compile BLIS:

```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```

Install BLIS:

```bash
sudo make install
```

We recommend using openmp, since it makes it easier to control the number of cores used.

### llama.cpp compilation

Makefile:

```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```

CMake:

```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```

### llama.cpp execution

According to the BLIS documentation, the following environment variables can be set to control the openmp threading behavior:

```
export GOMP_CPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```

And then run the binaries as normal.
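
For example, a sketch of a run on a 20-core machine (the model path, prompt, and thread counts are placeholders; adjust them to your hardware):

```bash
# pin openmp threads to cores 0-19 and let BLIS use 14 threads for its BLAS kernels
export GOMP_CPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 14
```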

### Intel specific issue

Some users may see an error message saying that `libimf.so` cannot be found.
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
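
In short, the fix described there amounts to making the Intel compiler runtime (which provides `libimf.so`) visible to the dynamic loader; a minimal sketch, assuming a default oneAPI install under `/opt/intel/oneapi`:

```bash
# assumption: Intel oneAPI is installed under /opt/intel/oneapi (the default prefix)
source /opt/intel/oneapi/setvars.sh
# alternatively, add the compiler runtime directory (which contains libimf.so) to the loader path:
export LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64:$LD_LIBRARY_PATH
```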

### Reference:

1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md

CMakeLists.txt

Lines changed: 16 additions & 23 deletions
@@ -65,7 +65,8 @@ endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -146,36 +147,28 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()
 
-if (LLAMA_OPENBLAS)
+if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-
-    set(BLA_VENDOR OpenBLAS)
+    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.22)
+        set(BLA_SIZEOF_INTEGER 8)
+    endif()
+    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
     if (BLAS_FOUND)
-        message(STATUS "OpenBLAS found")
+        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
+        add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
-        add_link_options(${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
-
-        # find header file
-        set(OPENBLAS_INCLUDE_SEARCH_PATHS
-            /usr/include
-            /usr/include/openblas
-            /usr/include/openblas-base
-            /usr/local/include
-            /usr/local/include/openblas
-            /usr/local/include/openblas-base
-            /opt/OpenBLAS/include
-            $ENV{OpenBLAS_HOME}
-            $ENV{OpenBLAS_HOME}/include
-            )
-        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-        add_compile_options(-I${OPENBLAS_INC})
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+
+        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
+        include_directories(${BLAS_INCLUDE_DIRS})
     else()
-        message(WARNING "OpenBLAS not found")
+        message(WARNING "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct LLAMA_BLAS_VENDOR")
     endif()
 endif()
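
A quick way to confirm which library the new detection logic picked up is to watch for the `BLAS found, Libraries: ...` status message added above; a sketch, assuming an out-of-tree build directory and OpenBLAS as the vendor:

```bash
# from a fresh build directory; the "BLAS found" line comes from the CMakeLists change above
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS 2>&1 | grep "BLAS found"
```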

Makefile

Lines changed: 4 additions & 0 deletions
@@ -122,6 +122,10 @@ ifdef LLAMA_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 endif
+ifdef LLAMA_BLIS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	LDFLAGS += -lblis -L/usr/local/lib
+endif
 ifdef LLAMA_CUBLAS
 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
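
The include and library paths added above assume BLIS's default install prefix of `/usr/local` (see BLIS.md); a quick sanity check before building:

```bash
# verify the BLIS header and library are where the Makefile expects them
ls /usr/local/include/blis/blis.h /usr/local/lib/libblis.*
make LLAMA_BLIS=1 -j
```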

README.md

Lines changed: 29 additions & 12 deletions
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q8` have changed again (19 May) - [(info)](https://github.com/ggerganov/llama.cpp/pull/1508)
 - Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
 
@@ -55,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Mixed F16 / F32 precision
 - 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
-- OpenBLAS support
+- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
 - cuBLAS and CLBlast support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
@@ -80,6 +81,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
+- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 
 **Bindings:**
 
@@ -272,10 +274,25 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   mkdir build
   cd build
-  cmake .. -DLLAMA_OPENBLAS=ON
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
   cmake --build . --config Release
   ```
 
+- BLIS
+
+  Check [BLIS.md](BLIS.md) for more information.
+
+- Intel MKL
+
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:
+
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```
+
 - cuBLAS
 
   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
@@ -333,16 +350,16 @@ Several quantization methods are supported. They differ in the resulting model d
 
 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-| 7B    | perplexity   | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
-| 7B    | file size    | 13.0G  | 4.0G   | 4.8G   | 4.4G   | 4.8G   | 7.1G   |
-| 7B    | ms/tok @ 4th | 128    | 50     | 54     | 75     | 83     | 75     |
-| 7B    | ms/tok @ 8th | 123    | 44     | 52     | 53     | 58     | 72     |
-| 7B    | bits/weight  | 16.0   | 5.0    | 6.0    | 5.5    | 6.0    | 9.0    |
-| 13B   | perplexity   | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
-| 13B   | file size    | 25.0G  | 7.6G   | 9.1G   | 8.4G   | 9.1G   | 14G    |
-| 13B   | ms/tok @ 4th | 239    | 93     | 101    | 150    | 164    | 141    |
-| 13B   | ms/tok @ 8th | 240    | 81     | 96     | 96     | 104    | 136    |
-| 13B   | bits/weight  | 16.0   | 5.0    | 6.0    | 5.5    | 6.0    | 9.0    |
+| 7B    | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
+| 7B    | file size    | 13.0G  | 3.5G   | 3.9G   | 4.3G   | 4.7G   | 6.7G   |
+| 7B    | ms/tok @ 4th | 127    | 55     | 54     | 76     | 83     | 72     |
+| 7B    | ms/tok @ 8th | 122    | 43     | 45     | 52     | 56     | 67     |
+| 7B    | bits/weight  | 16.0   | 4.5    | 5.0    | 5.5    | 6.0    | 8.5    |
+| 13B   | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
+| 13B   | file size    | 25.0G  | 6.8G   | 7.6G   | 8.3G   | 9.1G   | 13G    |
+| 13B   | ms/tok @ 4th | -      | 103    | 105    | 148    | 160    | 131    |
+| 13B   | ms/tok @ 8th | -      | 73     | 82     | 98     | 105    | 128    |
+| 13B   | bits/weight  | 16.0   | 4.5    | 5.0    | 5.5    | 6.0    | 8.5    |
 
 ### Perplexity (measuring model quality)
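
As a rough cross-check on the updated file-size column: size ≈ parameter count × (bits/weight) / 8. For the 7B model (about 6.7B parameters) at Q4_0's 4.5 bits/weight, that gives 6.7e9 × 4.5 / 8 ≈ 3.8 GB, consistent with the 3.5G (GiB) entry in the new table.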

convert.py

Lines changed: 1 addition & 2 deletions
@@ -121,7 +121,6 @@ def make_tensors_list() -> List[str]:
         f'layers.{i}.feed_forward.w1.weight',
         f'layers.{i}.feed_forward.w2.weight',
         f'layers.{i}.feed_forward.w3.weight',
-        f'layers.{i}.atttention_norm.weight',
         f'layers.{i}.ffn_norm.weight',
     ]
     return ret
@@ -1055,7 +1054,7 @@ def load_some_model(path: Path) -> ModelPlus:
     files = list(path.glob("model-00001-of-*.safetensors"))
     if not files:
         # Try the PyTorch patterns too, with lower priority
-        globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+        globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
         files = [file for glob in globs for file in path.glob(glob)]
     if not files:
         # Try GGML too, but with lower priority, since if both a non-GGML

examples/benchmark/benchmark-matmult.cpp

Lines changed: 7 additions & 1 deletion
@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>
@@ -211,6 +212,7 @@ int main(int argc, char ** argv) {
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");
 
+    double gflops_sum = 0;
     for (int i=0;i<benchmark_params.n_iterations ;i++) {
 
         long long int start = ggml_time_us();
@@ -219,6 +221,7 @@ int main(int argc, char ** argv) {
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
         double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        gflops_sum += gflops;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
             gf31.n_threads,
@@ -248,4 +251,7 @@ int main(int argc, char ** argv) {
         // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
     }
+    printf("\n");
+    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
+    printf("=====================================================================================\n");
 }
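
A note on the added `Average` line: the `%78.2f` field width appears chosen so that the 7 characters of `Average` plus a 78-character right-aligned number total 85 characters, lining the averaged gigaFLOPS up under the table's last column and the 85-character `=` ruler printed around it.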
