
Commit 69a0c25

Merge remote-tracking branch 'upstream/concedo'
2 parents: 096f0b0 + 1347d3a

54 files changed: +5154 -860 lines. (Large commit: some file diffs are hidden by default and not shown below.)

.gitignore (+4 -1)

@@ -1,5 +1,6 @@
 *.o
 *.a
+*.so
 .DS_Store
 .build/
 .cache/
@@ -36,6 +37,7 @@ out/
 /vdot
 /server
 /Pipfile
+/embd-input-test
 /libllama.so
 
 arm_neon.h
@@ -64,4 +66,5 @@ koboldcpp.dll
 koboldcpp_failsafe.dll
 koboldcpp_openblas.dll
 koboldcpp_openblas_noavx2.dll
-koboldcpp_clblast.dll
+koboldcpp_clblast.dll
+koboldcpp_cublas.dll

CMakeLists.txt (+8 -6)

@@ -1,5 +1,5 @@
-# DO NOT USE THIS FILE.
-# IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
+# DO NOT USE THIS FILE.
+# IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
 # IT WILL NOT BE UPDATED OR MAINTAINED !!!
 
 message(STATUS "============== ============== ==============")
@@ -69,6 +69,7 @@ if (LLAMA_CUBLAS)
 
     set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
     set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
+    set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
 
@@ -259,7 +260,8 @@ set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(ggml_v2 OBJECT
             otherarch/ggml_v2.c
             otherarch/ggml_v2.h
-            ${GGML_V2_CUDA_SOURCES})
+            ${GGML_V2_CUDA_SOURCES}
+            ${GGML_V2_LEGACY_CUDA_SOURCES})
 target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
 target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -273,7 +275,7 @@ target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-add_library(gpttype_adapter
+add_library(gpttype_adapter
             gpttype_adapter.cpp)
 target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
@@ -287,12 +289,12 @@ if (GGML_CUDA_SOURCES)
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()
 
-set(TARGET koboldcpp)
+set(TARGET koboldcpp_cublas)
 add_library(${TARGET} SHARED expose.cpp expose.h)
 target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
 target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
 set_target_properties(${TARGET} PROPERTIES PREFIX "")
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp")
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

Makefile (+34 -18)

@@ -1,4 +1,4 @@
-default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
+default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
 tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
 dev: koboldcpp_openblas
 dev2: koboldcpp_clblast
@@ -53,6 +53,9 @@ NONECFLAGS =
 OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
+CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+CUBLASLD_FLAGS =
+CUBLAS_OBJS =
 
 #lets try enabling everything
 CFLAGS += -pthread -s
@@ -133,10 +136,9 @@ endif
 
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 ifdef LLAMA_CUBLAS
-	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
-	OBJS += ggml-cuda.o ggml_v2-cuda.o
+	CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	NVCC = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ifdef LLAMA_CUDA_DMMV_X
@@ -158,9 +160,11 @@ else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_HIPBLAS
@@ -225,15 +229,19 @@ FAILSAFE_BUILD =
 OPENBLAS_BUILD =
 OPENBLAS_NOAVX2_BUILD =
 CLBLAST_BUILD =
-CLBLAST_NOAVX2_BUILD =
+CUBLAS_BUILD =
 
 ifeq ($(OS),Windows_NT)
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
 	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
-	CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
+
+ifdef LLAMA_CUBLAS
+	CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif
+
 else
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
@@ -244,20 +252,26 @@ else
 ifdef LLAMA_CLBLAST
 ifeq ($(UNAME_S),Darwin)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-	CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 else
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-	CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 endif
 endif
 
+ifdef LLAMA_CUBLAS
+	CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif
+
 ifndef LLAMA_OPENBLAS
 ifndef LLAMA_CLBLAST
+ifndef LLAMA_CUBLAS
 	OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
 endif
 endif
+endif
 endif
 
+
+
 #
 # Print build information
 #
@@ -287,8 +301,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_clblast.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_clblast_noavx2.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_cublas.o: ggml.c ggml.h
+	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
 
 #quants K
 k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
@@ -309,8 +323,8 @@ ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
-	$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
 
 #extreme old version compat
 ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
@@ -339,9 +353,11 @@ gpttype_adapter.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gpttype_adapter_clblast.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+gpttype_adapter_cublas.o: gpttype_adapter.cpp
+	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
 
 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
 
 main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -360,8 +376,8 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml
 	$(OPENBLAS_NOAVX2_BUILD)
 koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
 	$(CLBLAST_BUILD)
-koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants_noavx2.o $(OBJS)
-	$(CLBLAST_NOAVX2_BUILD)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS)
+	$(CUBLAS_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

README.md (+3 -3)

@@ -24,7 +24,7 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
 ![Preview](media/preview.png)
 
 ## Usage
-- [Download the latest release here](https://github.com/LostRuins/koboldcpp/releases/latest) or clone the repo.
+- **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
 - Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
 - Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places).
 - To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
@@ -40,9 +40,9 @@ For more information, be sure to run the program with the `--help` flag.
 - You will have to compile your binaries from source. A makefile is provided, simply run `make`
 - If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1`
 - Alternatively, if you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries.
-- For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1`
 - For Arch Linux: Install `cblas` `openblas` and `clblast`.
 - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
+- For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1`
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
 - Note: Many OSX users have found that the using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
 
@@ -65,7 +65,7 @@ For more information, be sure to run the program with the `--help` flag.
 - See https://github.com/ggerganov/llama.cpp/pull/1828/files
 
 ## CuBLAS?
-- You can attempt a CuBLAS build with LLAMA_CUBLAS=1 or using the provided CMake file (best for visual studio users). Note that support for CuBLAS is limited.
+- You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. Note that support for CuBLAS is limited.
 
 ## Considerations
 - For Windows: No installation, single file executable, (It Just Works)
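
Context for the README change above: `koboldcpp.py` loads the compiled backend as a shared library at runtime, which is why the new `koboldcpp_cublas.dll` must sit next to the script. The sketch below is a minimal, hypothetical illustration of that idea using `ctypes`; the helper name `load_backend` is invented here and this is not the actual loader code in koboldcpp.py.

```python
# Illustrative only: pick a backend-specific shared library that lives next to
# this script and load it with ctypes. Assumes the file naming used by the
# build (koboldcpp_cublas.dll / .so), but the function itself is hypothetical.
import ctypes
import os
import platform

def load_backend(preferred: str = "koboldcpp_cublas"):
    ext = ".dll" if platform.system() == "Windows" else ".so"
    here = os.path.dirname(os.path.abspath(__file__))
    # Try the CuBLAS library first, then fall back to the default build.
    for name in (preferred, "koboldcpp"):
        path = os.path.join(here, name + ext)
        if os.path.exists(path):
            # Loading also requires the CUDA runtime DLLs (e.g. cublas64_11.dll)
            # to be resolvable, which is why they may need to be bundled.
            return ctypes.CDLL(path)
    raise FileNotFoundError("no koboldcpp shared library found next to this script")
```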

convert-lora-to-ggml.py (+5 -1)

@@ -113,14 +113,18 @@ def write_tensor_header(
 
 write_file_header(fout, params)
 for k, v in model.items():
+    if k.endswith(".default.weight"):
+        k = k.replace(".default.weight", ".weight")
+    if k in ["llama_proj.weight", "llama_proj.bias"]:
+        continue
     if k.endswith("lora_A.weight"):
         if v.dtype != torch.float16 and v.dtype != torch.float32:
             v = v.float()
         v = v.T
     else:
         v = v.float()
 
-    t = v.numpy()
+    t = v.detach().numpy()
     tname = translate_tensor_name(k)
     print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
     write_tensor_header(fout, tname, t.shape, t.dtype)
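
A note on the `v.detach().numpy()` change above: PyTorch refuses to convert a tensor that still requires grad into a NumPy array, which can be the case for LoRA weights saved with autograd metadata. A small standalone demonstration (illustrative only, not part of the converter):

```python
# Tensors attached to the autograd graph cannot be converted to NumPy directly;
# detaching first avoids the RuntimeError the old code could hit.
import torch

v = torch.randn(4, 4, requires_grad=True)  # e.g. a LoRA weight saved with grad tracking
try:
    v.numpy()
except RuntimeError as e:
    print("direct .numpy() fails:", e)

t = v.detach().numpy()  # detach from the graph first, then convert
print(t.shape, t.dtype)
```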

cudart64_110.dll (506 KB, binary file not shown)

examples/CMakeLists.txt (+1)

@@ -39,6 +39,7 @@ else()
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(simple)
+    add_subdirectory(embd-input)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/baby-llama/baby-llama.cpp (+6 -6)

@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, 1]
         // Kcur shape [n_embd/n_head, n_head, N, 1]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
 
         // store key and value to memory
         {
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, n_batch]
         // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
 
@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
                         model->layers[il].wqb,
                         cur)),
                 n_embd/n_head, n_head, N),
-            n_past, n_rot, 0);
+            n_past, n_rot, 0, 0);
         struct ggml_tensor * Kcur = ggml_rope(ctx0,
                 ggml_reshape_3d(ctx0,
                     ggml_mul_mat(ctx0,
@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
                         model->layers[il].wkb,
                         cur)),
                 n_embd/n_head, n_head, N),
-            n_past, n_rot, 0);
+            n_past, n_rot, 0, 0);
 
         // store key and value to memory
         {

examples/common.cpp (+5 -7)

@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
         } else if (arg == "--verbose-prompt") {
@@ -414,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         exit(1);
     }
 
-#ifdef GGML_USE_CUBLAS
-    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
-        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
-        exit(1);
-    }
-#endif // GGML_USE_CUBLAS
-
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -488,6 +483,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");

examples/common.h (+1)

@@ -76,6 +76,7 @@ struct gpt_params {
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
+    bool numa = false; // attempt optimizations that help on some NUMA systems
     bool export_cgraph = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };

examples/embd-input/.gitignore (+4)

@@ -0,0 +1,4 @@
+PandaGPT
+MiniGPT-4
+*.pth
+