Skip to content

Commit 518eb2a

Browse files
committed
Merge remote-tracking branch 'upstream/concedo' into develop2
2 parents bda0215 + cae6a84 commit 518eb2a

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

54 files changed

+6049
-2792
lines changed

.gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,12 @@ tests/test-tokenizer-0
8181
koboldcpp.so
8282
koboldcpp_failsafe.so
8383
koboldcpp_openblas.so
84-
koboldcpp_openblas_noavx2.so
84+
koboldcpp_noavx2.so
8585
koboldcpp_clblast.so
8686
koboldcpp.dll
8787
koboldcpp_failsafe.dll
8888
koboldcpp_openblas.dll
89-
koboldcpp_openblas_noavx2.dll
89+
koboldcpp_noavx2.dll
9090
koboldcpp_clblast.dll
9191
koboldcpp_cublas.dll
9292
cublas64_11.dll

CMakeLists.txt

+17-7
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,12 @@ if (NOT MSVC)
4343
endif()
4444

4545
# 3rd party libs
46-
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
46+
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
47+
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
4748
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
4849
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
4950
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
50-
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
51+
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
5152
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
5253
option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
5354
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -80,13 +81,15 @@ if (LLAMA_CUBLAS)
8081
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
8182

8283
add_compile_definitions(GGML_USE_CUBLAS)
84+
#add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
85+
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
8386
#add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
8487

8588
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
8689
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
8790
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
88-
if (LLAMA_CUDA_DMMV_F16)
89-
add_compile_definitions(GGML_CUDA_DMMV_F16)
91+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
92+
add_compile_definitions(GGML_CUDA_F16)
9093
endif()
9194
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
9295

@@ -97,10 +100,14 @@ if (LLAMA_CUBLAS)
97100
endif()
98101

99102
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
100-
if (LLAMA_CUDA_DMMV_F16)
101-
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
103+
# 52 == lowest CUDA 12 standard
104+
# 60 == f16 CUDA intrinsics
105+
# 61 == integer CUDA intrinsics
106+
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
107+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
108+
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
102109
else()
103-
set(CMAKE_CUDA_ARCHITECTURES "37;52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
110+
set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
104111
endif()
105112
endif()
106113
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -153,6 +160,7 @@ if (LLAMA_ALL_WARNINGS)
153160
-Wshadow
154161
-Wstrict-prototypes
155162
-Wpointer-arith
163+
-Wmissing-prototypes
156164
)
157165
set(cxx_flags
158166
-Wall
@@ -292,6 +300,8 @@ endif()
292300
add_library(ggml OBJECT
293301
ggml.c
294302
ggml.h
303+
ggml-alloc.c
304+
ggml-alloc.h
295305
k_quants.h
296306
k_quants.c
297307
${GGML_SOURCES_CUDA})

Makefile

+48-28
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
1+
default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas
22
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
33
dev: koboldcpp_openblas
44
dev2: koboldcpp_clblast
@@ -40,7 +40,7 @@ endif
4040

4141
# keep standard at C11 and C++11
4242
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
43-
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
43+
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
4444
LDFLAGS =
4545

4646
# these are used on windows, to build some libraries with extra old device compatibility
@@ -163,20 +163,34 @@ else ifdef LLAMA_CUDA_DMMV_Y
163163
else
164164
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
165165
endif # LLAMA_CUDA_MMV_Y
166+
ifdef LLAMA_CUDA_F16
167+
NVCCFLAGS += -DGGML_CUDA_F16
168+
endif # LLAMA_CUDA_F16
166169
ifdef LLAMA_CUDA_DMMV_F16
167-
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
170+
NVCCFLAGS += -DGGML_CUDA_F16
168171
endif # LLAMA_CUDA_DMMV_F16
169172
ifdef LLAMA_CUDA_KQUANTS_ITER
170173
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
171174
else
172175
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
173176
endif
177+
ifdef LLAMA_CUDA_MMQ_Y
178+
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
179+
else
180+
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
181+
endif # LLAMA_CUDA_MMQ_Y
182+
#ifdef LLAMA_CUDA_CUBLAS
183+
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
184+
#endif # LLAMA_CUDA_CUBLAS
185+
ifdef LLAMA_CUDA_CCBIN
186+
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
187+
endif
174188
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
175-
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
189+
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
176190
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
177-
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
191+
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
178192
ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
179-
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
193+
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
180194
endif # LLAMA_CUBLAS
181195

182196
ifdef LLAMA_HIPBLAS
@@ -249,7 +263,7 @@ CXXV := $(shell $(CXX) --version | head -n 1)
249263
DEFAULT_BUILD =
250264
FAILSAFE_BUILD =
251265
OPENBLAS_BUILD =
252-
OPENBLAS_NOAVX2_BUILD =
266+
NOAVX2_BUILD =
253267
CLBLAST_BUILD =
254268
CUBLAS_BUILD =
255269
HIPBLAS_BUILD =
@@ -258,7 +272,7 @@ ifeq ($(OS),Windows_NT)
258272
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
259273
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
260274
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
261-
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
275+
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
262276
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
263277

264278
ifdef LLAMA_CUBLAS
@@ -272,7 +286,7 @@ else
272286
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
273287
ifdef LLAMA_OPENBLAS
274288
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
275-
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
289+
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
276290
endif
277291
ifdef LLAMA_CLBLAST
278292
ifeq ($(UNAME_S),Darwin)
@@ -327,8 +341,8 @@ ggml_openblas.o: ggml.c ggml.h
327341
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
328342
ggml_failsafe.o: ggml.c ggml.h
329343
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
330-
ggml_openblas_noavx2.o: ggml.c ggml.h
331-
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
344+
ggml_noavx2.o: ggml.c ggml.h
345+
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
332346
ggml_clblast.o: ggml.c ggml.h
333347
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
334348
ggml_cublas.o: ggml.c ggml.h
@@ -342,15 +356,19 @@ k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
342356
k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
343357
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
344358

359+
#there's no intrinsics or special gpu ops used here, so we can have a universal object
360+
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
361+
$(CC) $(CFLAGS) -c $< -o $@
362+
345363
#version 2 libs
346364
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
347365
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
348366
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
349367
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
350368
ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
351369
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
352-
ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
353-
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
370+
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
371+
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
354372
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
355373
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
356374
ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -371,10 +389,12 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
371389
$(CC) $(CFLAGS) -c $< -o $@
372390

373391
# intermediate objects
374-
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
392+
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
375393
$(CXX) $(CXXFLAGS) -c $< -o $@
376394
common.o: examples/common.cpp examples/common.h
377395
$(CXX) $(CXXFLAGS) -c $< -o $@
396+
console.o: examples/console.cpp examples/console.h
397+
$(CXX) $(CXXFLAGS) -c $< -o $@
378398
grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
379399
$(CXX) $(CXXFLAGS) -c $< -o $@
380400
expose.o: expose.cpp expose.h
@@ -392,37 +412,37 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
392412
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
393413

394414
clean:
395-
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
415+
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
396416

397-
main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o grammar-parser.o $(OBJS)
417+
main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
398418
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
399419
@echo
400420
@echo '==== Run ./main -h for help. ===='
401421
@echo
402422

403423
#generated libraries
404-
koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o $(OBJS)
424+
koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
405425
$(DEFAULT_BUILD)
406-
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o $(OBJS)
426+
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
407427
$(OPENBLAS_BUILD)
408-
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o $(OBJS)
428+
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o $(OBJS)
409429
$(FAILSAFE_BUILD)
410-
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o $(OBJS)
411-
$(OPENBLAS_NOAVX2_BUILD)
412-
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
430+
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o $(OBJS)
431+
$(NOAVX2_BUILD)
432+
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS)
413433
$(CLBLAST_BUILD)
414-
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS)
434+
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS)
415435
$(CUBLAS_BUILD) $(HIPBLAS_BUILD)
416436

417-
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
437+
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
418438
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
419-
quantize_gptj: ggml.o llama.o k_quants.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
439+
quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
420440
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
421-
quantize_gpt2: ggml.o llama.o k_quants.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
441+
quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
422442
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
423-
quantize_neox: ggml.o llama.o k_quants.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
443+
quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
424444
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
425-
quantize_mpt: ggml.o llama.o k_quants.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
445+
quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
426446
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
427447

428448

0 commit comments

Comments (0)