
Commit 6f7c156

Merge 'origin/master' into hipblas

2 parents: 61df8e9 + fc45a81


42 files changed (+3028, -1752 lines)

.flake8 (+2)

@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125
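As a rough usage sketch (assuming flake8 is installed from PyPI; not part of this commit), the linter picks up this config automatically when run from the repository root:

```
# lint the repo with the 125-column limit from .flake8
pip install flake8
flake8 .
```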

.gitignore (+4)

@@ -22,6 +22,7 @@ build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
+out/

 models/*
 *.bin
@@ -32,14 +33,17 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
+/server
 /Pipfile
 /libllama.so

 build-info.h
 arm_neon.h
 compile_commands.json
+CMakeSettings.json

 __pycache__

.pre-commit-config.yaml (+15)

@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
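A minimal sketch of how these hooks would typically be activated locally (assuming pre-commit is installed from PyPI; these commands are not part of the commit):

```
# install the git hook defined in .pre-commit-config.yaml and run it once over the whole tree
pip install pre-commit
pre-commit install
pre-commit run --all-files
```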

CMakeLists.txt (+54 -3)

@@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
@@ -159,17 +160,64 @@ if (LLAMA_BLAS)
     if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
         set(BLA_SIZEOF_INTEGER 8)
     endif()
+
     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
+
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

+        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+            find_package(PkgConfig REQUIRED)
+            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
+                pkg_check_modules(DepBLAS REQUIRED blas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
+                pkg_check_modules(DepBLAS REQUIRED openblas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
+                pkg_check_modules(DepBLAS REQUIRED blis)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
+                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
+                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
+                # all Intel* libraries share the same include path
+                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
+                # this doesn't provide pkg-config
+                # suggest to assign BLAS_INCLUDE_DIRS on your own
+                if ("${NVHPC_VERSION}" STREQUAL "")
+                    message(WARNING "Better to set NVHPC_VERSION")
+                else()
+                    set(DepBLAS_FOUND ON)
+                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+                endif()
+            endif()
+            if (DepBLAS_FOUND)
+                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+            else()
+                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+                " detected by pkgconfig, trying to find cblas.h from possible paths...")
+                find_path(BLAS_INCLUDE_DIRS
+                    NAMES cblas.h
+                    HINTS
+                        /usr/include
+                        /usr/local/include
+                        /usr/include/openblas
+                        /opt/homebrew/opt/openblas/include
+                        /usr/local/opt/openblas/include
+                        /usr/include/x86_64-linux-gnu/openblas/include
+                    )
+            endif()
+        endif()
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

-        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
-        include_directories(${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -191,6 +239,7 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

     if (LLAMA_STATIC)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -440,12 +489,14 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}
             )

-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

+add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
 endif()

 add_library(llama
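For context, a hedged sketch of how the cache variables touched in this diff might be passed on the CMake command line (option names come from the hunks above; the values and the `build` directory are illustrative):

```
# OpenBLAS build, letting the new pkg-config fallback locate cblas.h
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# cuBLAS build, overriding the new K-quants iterations-per-thread setting
cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_KQUANTS_ITER=1
cmake --build build --config Release
```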

Makefile (+18 -2)

@@ -1,8 +1,10 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple

 ifdef LLAMA_BUILD_SERVER
     BUILD_TARGETS += server
+    LLAMA_SERVER_VERBOSE ?= 1
+    server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

 default: $(BUILD_TARGETS)
@@ -171,6 +173,11 @@ ifdef LLAMA_CUDA_DMMV_Y
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_KQUANTS_ITER
+    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+else
+    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
     $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -277,7 +284,7 @@ libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-    rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+    rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h

 #
 # Examples
@@ -289,6 +296,12 @@ main: examples/main/main.cpp build-info.h ggml.
     @echo '==== Run ./main -h for help. ===='
     @echo

+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+    @echo
+    @echo '==== Run ./simple -h for help. ===='
+    @echo
+
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -307,6 +320,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
     @sh scripts/build-info.sh > $@.tmp
     @if ! cmp -s $@.tmp $@; then \
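A sketch of how the Makefile knobs and targets introduced above could be exercised (variable names come from the diff; the values are illustrative):

```
# build the new example targets
make simple train-text-from-scratch
# CUDA build with one K-quant iteration per thread instead of the default 2
make LLAMA_CUBLAS=1 LLAMA_CUDA_KQUANTS_ITER=1
# opt-in server build with verbose logging disabled
LLAMA_BUILD_SERVER=1 LLAMA_SERVER_VERBOSE=0 make server
```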

Package.swift (+1)

@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
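For reference, the Swift package target would typically be built with SwiftPM; the command below is a general usage sketch, not part of this diff:

```
swift build -c release
```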

README.md (+41)

@@ -616,6 +616,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

+#### Building the Project using Android NDK
 You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
@@ -630,6 +631,46 @@ Finally, copy the `llama` binary and the model files to your device storage. Her

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

+#### Building the Project using Termux (F-Droid)
+Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
+
+Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
+
+If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
+```
+apt install libopenblas
+```
+
+Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
+```
+apt install ocl-icd opencl-headers opencl-clhpp clinfo
+```
+
+In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
+```
+cmake .
+make
+cp libclblast.so* $PREFIX/lib
+cp ./include/clblast.h ../llama.cpp
+```
+
+Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
+```
+cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
+cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
+make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
+```
+
+Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
+```
+GGML_OPENCL_PLATFORM=0
+GGML_OPENCL_DEVICE=0
+export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
+./main (...)
+```
+
+For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
+
 ### Docker

 #### Prerequisites

convert.py (+18 -8)

@@ -512,7 +512,11 @@ def validate_conversion_to(self, data_type: DataType) -> None:
         if not isinstance(self.data_type, QuantizedDataType):
             raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
         if self.data_type.have_g_idx:
-            sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+            sys.stderr.write(
+                "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                "which is not yet natively supported by GGML. "
+                "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                "but that will result in a much larger output file for no quality benefit.\n")
             sys.exit(1)
         assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends

@@ -694,8 +698,9 @@ def load(offset: int, elm_count: int) -> NDArray:
             description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
             return LazyStorage(load=load, kind=pid[1], description=description)

-    # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)

@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
     # Use mmap for the actual data to avoid race conditions with the file offset.
     off = fp.raw.tell()
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off) # needed on Windows
+    fp.raw.seek(off)  # needed on Windows

     def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
         shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
         files = list(path.glob("model-00001-of-*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
             files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try GGML too, but with lower priority, since if both a non-GGML
@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     elif path3.exists():
         path = path3
     else:
-        raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+        raise FileNotFoundError(
+            f"Could not find tokenizer.model in {path} or its parent; "
+            "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
     }[params.file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
-        sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
         sys.exit(1)
     return ret

@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     args = parser.parse_args(args_in)

     vocab: Vocab
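A hedged usage sketch of the arguments touched above (the model and output paths are placeholders, not taken from the commit):

```
# convert a checkpoint to f16 GGML with an explicit output path
python3 convert.py models/7B/ --outtype f16 --outfile models/7B/ggml-model-f16.bin
# point --vocab-dir at the directory holding tokenizer.model if it lives elsewhere
python3 convert.py models/7B/ --outtype f16 --vocab-dir models/
```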

examples/baby-llama/baby-llama.cpp (+5 -1)

@@ -4,6 +4,10 @@
 #include <random>
 #include <cstring>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
@@ -1470,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te
 }

 struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-    const float eps = 1e-3;
+    const float eps = 1e-3f;
     return
         ggml_sum(ctx,
             ggml_neg(ctx,

examples/benchmark/benchmark-matmult.cpp (+7 -3)

@@ -16,6 +16,10 @@
 #include <iterator>
 #include <algorithm>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
 }

 void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
-        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
     float sum = tensor_sum_elements(tensor);
     printf("Sum of tensor %s is %6.2f\n", name, sum);
 }
@@ -120,7 +124,7 @@ int main(int argc, char ** argv) {
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;

-    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));

     struct ggml_init_params params = {
         /*.mem_size =*/ ctx_size,
