
Commit adea837

Merge branch 'main' into pr/koboldcpp_exp/rocmpatch
2 parents: e3d3de6 + 8432e9d


46 files changed: +7025, -2232 lines

CMakeLists.txt (+14, -6)

@@ -41,9 +41,10 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)

@@ -77,8 +78,11 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

     add_compile_definitions(GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()

@@ -90,6 +94,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()

@@ -200,11 +213,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
     if (MSVC)
         # TODO: arm msvc?
     else()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-            # Apple M1, M2, etc.
-            # Raspberry Pi 3, 4, Zero 2 (64-bit)
-            add_compile_options(-mcpu=native)
-        endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
             add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

Makefile (+30, -11)

@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
	CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
	CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
	NVCCFLAGS += -DGGML_CUDA_DMMV_F16

@@ -175,23 +177,40 @@ ifdef LLAMA_HIPBLAS
	ROCM_PATH ?= /opt/rocm
	CC := $(ROCM_PATH)/llvm/bin/clang
	CXX := $(ROCM_PATH)/llvm/bin/clang++
-	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030
+	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
	LLAMA_CUDA_DMMV_X ?= 64
-	LLAMA_CUDA_DMMV_Y ?= 2
+	LLAMA_CUDA_MMV_Y ?= 2
+	LLAMA_CUDA_FORCE_DMMV = true
	CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
	CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
	LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
	OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o

+ifdef LLAMA_CUDA_DMMV_X
+	CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+	CXXFLAGS += -DGGML_CUDA_DMMV_X=32
+endif
+ifeq ($(LLAMA_CUDA_FORCE_DMMV), true)
+	CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif
+ifdef LLAMA_CUDA_MMV_Y
+	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+else ifdef LLAMA_CUDA_DMMV_Y
+	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+else
+	CXXFLAGS += -DGGML_CUDA_MMV_Y=1
+endif
+
 ifdef LLAMA_CUDA_KQUANTS_ITER
	CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
	CXXFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif

-ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
-	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
-	-DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+
+
 # DGGML_CUDA_DMMV_F16 does not currently work with AMD.
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<

@@ -259,11 +278,11 @@ else
	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 endif
 ifdef LLAMA_CLBLAST
-	ifeq ($(UNAME_S),Darwin)
-		CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-	else
-		CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-	endif
+	ifeq ($(UNAME_S),Darwin)
+		CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+	else
+		CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+	endif
 endif

 ifdef LLAMA_CUBLAS

convert.py (+42, -5)

@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
-    return 1
+    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

 @dataclass
 class Params:
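
For orientation, this is roughly how find_n_mult reads after the change. The loop bounds are not part of this hunk, so they are assumed here; treat this as an illustrative sketch rather than the exact upstream code:

def find_n_mult(n_ff: int, n_embd: int) -> int:
    # Search candidate multipliers from large to small (range assumed for this sketch).
    for n_mult in range(256, 1, -1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    # New behaviour: fail loudly instead of silently falling back to 1,
    # so a model with unexpected feed-forward dimensions is caught early.
    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

# With LLaMA-7B-like shapes this finds a multiplier immediately:
print(find_n_mult(11008, 4096))  # -> 256 under the assumed search range
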
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed

         return Params(
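
A quick sketch of the layer-counting idiom used in guessed(), with a toy dictionary of tensor names standing in for a real checkpoint (the names below are illustrative only):

import itertools

# Pretend checkpoint exposing three Baichuan-style packed attention tensors.
model = {f"model.layers.{i}.self_attn.W_pack.weight": None for i in range(3)}

# Count consecutive layer indices until the first missing one, as guessed() does.
n_layer = next(i for i in itertools.count()
               if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
print(n_layer)  # -> 3
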
@@ -321,6 +327,10 @@ def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
     def permute(self, n_head: int) -> 'Tensor': ...
     @abstractmethod
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def part(self, n_part: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...


@@ -345,6 +355,14 @@ def astype(self, data_type: DataType) -> Tensor:
     def to_ggml(self) -> 'UnquantizedTensor':
         return self

+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
     def permute(self, n_head: int) -> 'UnquantizedTensor':
         return UnquantizedTensor(permute(self.ndarray, n_head))

@@ -642,6 +660,19 @@ def load() -> Tensor:
         return lazy_tensor.load().permute(n_head)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)

+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute_part(n_part, n_head)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+
+def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().part(n_part)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)

 def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}

@@ -650,11 +681,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out["output.weight"] = model["lm_head.weight"]

     for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
+        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
+            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+        else:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
-        out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+
     out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

     out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

examples/alpaca.sh (+1, -1)

@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \

examples/common.h (+2, -1)

@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

@@ -59,6 +59,7 @@ struct gpt_params {
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base = ""; // base model path for the lora adapter

+    bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp (+7, -4)

@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

@@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
 const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
-    std::string ret;
-    if (id == llama_token_eos()) ret = "</s>";
-    else ret = llama_token_to_str(ctx, id);
+    static std::string ret;
+    if (id == llama_token_eos()) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_str(ctx, id);
+    }
     eval_id(mymodel, id);
     return ret.c_str();
 }

examples/embd-input/embd-input.h (+1, -3)

@@ -5,7 +5,6 @@
 #include "llama.h"
 #include "build-info.h"

-
 extern "C" {

 typedef struct MyModel {

@@ -14,14 +13,13 @@ typedef struct MyModel {
     int n_past = 0;
 } MyModel;

-
 struct MyModel* create_mymodel(int argc, char ** argv);

 bool eval_float(void* model, float* input, int N);
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 bool eval_id(struct MyModel* mymodel, int id);
 bool eval_string(struct MyModel* mymodel, const char* str);
-const char* sampling(struct MyModel* mymodel);
+const char * sampling(struct MyModel* mymodel);
 llama_token sampling_id(struct MyModel* mymodel);
 void free_mymodel(struct MyModel* mymodel);

examples/embedding/embedding.cpp (+1, -1)

@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/main/main.cpp (+1, -1)

@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

examples/perplexity/perplexity.cpp (+1, -1)

@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }


examples/quantize-stats/quantize-stats.cpp (+7, -7)

@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
     const ggml_tensor * layer,
     int64_t offset,
     int64_t chunk_size,
-    const quantize_fns_t & qfns,
+    const ggml_type_traits_t & qfns,
     bool use_reference,
     float * input_scratch,
     char * quantized_scratch,

@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
     }

     if (use_reference) {
-        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
     }
-    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+    qfns.to_float(quantized_scratch, output_scratch, chunk_size);

     update_error_stats(chunk_size, input_scratch, output_scratch, stats);
 }

@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
 void test_roundtrip_on_layer(
     std::string & name,
     bool print_layer_stats,
-    const quantize_fns_t & qfns,
+    const ggml_type_traits_t & qfns,
     bool use_reference,
     const ggml_tensor * layer,
     std::vector<float> & input_scratch,

@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (qfns.from_float && qfns.to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }
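
The renamed ggml_type_traits_t callbacks (from_float, from_float_reference, to_float) are what the round-trip test exercises: quantize a chunk, dequantize it, and accumulate the error. A toy Python sketch of that round-trip idea, using a simple symmetric 8-bit quantizer as a stand-in for ggml's real kernels (illustrative only, not the actual implementation):

import numpy as np

def from_float(x: np.ndarray):
    # Toy symmetric 8-bit quantization with a single scale per chunk.
    scale = float(np.max(np.abs(x))) / 127.0
    scale = scale if scale > 0.0 else 1.0
    return np.round(x / scale).astype(np.int8), scale

def to_float(q: np.ndarray, scale: float) -> np.ndarray:
    # Dequantize back to float32.
    return q.astype(np.float32) * scale

x = np.random.randn(1024).astype(np.float32)
q, scale = from_float(x)
x_hat = to_float(q, scale)

# Same spirit as update_error_stats(): measure the round-trip error.
rmse = float(np.sqrt(np.mean((x - x_hat) ** 2)))
print(f"round-trip RMSE: {rmse:.6f}")
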
