
Commit d43f66a

update to latest llama.cpp breaking API changes
Signed-off-by: mudler <[email protected]>
1 parent 79f9587 commit d43f66a

File tree

8 files changed: +124, -96 lines changed

Makefile (+3, -2)

@@ -232,7 +232,8 @@ binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.c
 
 ## https://github.com/ggerganov/llama.cpp/pull/1902
 prepare:
-	cd llama.cpp && patch -p1 < ../patches/1902-cuda.patch
+	cd llama.cpp && \
+		patch -p1 < ../patches/1902-cuda.patch
 	touch $@
 
 libbinding.a: prepare binding.o llama.cpp/k_quants.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o $(EXTRA_TARGETS)
@@ -248,4 +249,4 @@ ggllm-test-model.bin:
 	wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O ggllm-test-model.bin
 
 test: ggllm-test-model.bin libbinding.a
-	C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" --flake-attempts 5 -v -r ./...
+	C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=$(abspath ./)/ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" -v -r ./...

binding.cpp (+69, -57)

(large diff not rendered)

binding.h (+2, -3)

@@ -21,22 +21,21 @@ void* load_model(const char *fname,
     bool mlock,
     bool embeddings,
     bool mmap,
-    bool low_vram,
     int n_gpu,
     int n_batch,
     const char *maingpu,
     const char *tensorsplit,
     bool numa,
     float rope_freq_base,
     float rope_freq_scale,
-    bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity
+    bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool perplexity
     );
 
 int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);
 
 int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings);
 
-void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
+void* llama_allocate_params(const char *prompt, int seed, int threads, int batch_threads, int tokens,
     int top_k, float top_p, float temp, float repeat_penalty,
     int repeat_last_n, bool ignore_eos, bool memory_f16,
     int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,

llama.cpp

llama.go (+8, -8)

@@ -38,10 +38,10 @@ func New(model string, opts ...ModelOption) (*LLama, error) {
 
 	result := C.load_model(modelPath,
 		C.int(mo.ContextSize), C.int(mo.Seed),
-		C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
+		C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap),
 		C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
 		C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
-		C.bool(MulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity),
+		C.bool(MulMatQ), loraAdapter, loraBase, C.float(mo.LoraScale), C.bool(mo.Perplexity),
 	)
 
 	if result == nil {
@@ -112,7 +112,7 @@ func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32,
 	// float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit , bool prompt_cache_ro,
 	// float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt
 	// );
-	params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
@@ -154,7 +154,7 @@ func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error
 		pass = &reversePrompt[0]
 	}
 
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -193,7 +193,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		pass = &reversePrompt[0]
 	}
 
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -238,7 +238,7 @@ func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOptio
 		pass = &reversePrompt[0]
 	}
 
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -296,7 +296,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 		pass = &reversePrompt[0]
 	}
 
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -346,7 +346,7 @@ func (l *LLama) TokenizeString(text string, opts ...PredictOption) (int32, []int
 	var fakeDblPtr **C.char
 
 	// copy pasted and modified minimally. Should I simplify down / do we need an "allocate defaults"
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0),
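The caller-facing effect of these llama.go changes: ModelOptions loses LowVRAM and gains LoraScale, and every predict-style call now forwards a new BatchThreads value. A minimal sketch of how downstream code might use the binding after this commit — the import path, model path, and LoRA adapter path are illustrative assumptions, not taken from this diff:

package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp" // assumed module path
)

func main() {
	// Hypothetical model and adapter paths; any GGUF files would do.
	l, err := llama.New(
		"./ggllm-test-model.bin",
		llama.SetContext(128),
		llama.SetGPULayers(10),
		llama.SetLoraAdapter("./adapter.bin"), // placeholder adapter
		llama.SetLoraScale(1.0),               // new in this commit
	)
	if err != nil {
		panic(err)
	}

	// BatchThreads is new as well; the default of -1 leaves the choice to llama.cpp.
	out, err := l.Predict("Hello", llama.SetBatchThreads(4))
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
}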

llama_test.go (+5, -2)

@@ -71,7 +71,7 @@ how much is 2+2?
 			Expect(err).ToNot(HaveOccurred())
 			Expect(model).ToNot(BeNil())
 			text, err := model.SpeculativeSampling(model2, `[INST] Answer to the following question:
-			how much is 2+2?
+			Do a simple math calculation: How much is 2+2?
 			[/INST]`, llama.SetNDraft(16),
 			)
 			Expect(err).ToNot(HaveOccurred(), text)
@@ -97,7 +97,10 @@ how much is 2+2?
 		getModel := func() (*LLama, error) {
 			model, err := New(
 				testModelPath,
-				llama.EnableF16Memory, llama.SetContext(128), llama.EnableEmbeddings, llama.SetGPULayers(10),
+				llama.EnableF16Memory,
+				llama.SetContext(128),
+				llama.EnableEmbeddings,
+				llama.SetGPULayers(10),
 			)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(model).ToNot(BeNil())
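The SpeculativeSampling test above exercises the two-model API. For reference, a rough standalone sketch of the same call outside the test suite, assuming the same import as the earlier example; both model paths are placeholders:

func speculativeExample() error {
	// Two separately loaded models: the draft is typically a smaller model
	// of the same family as the target.
	target, err := llama.New("./codellama-7b-instruct.Q2_K.gguf", llama.SetContext(128))
	if err != nil {
		return err
	}
	draft, err := llama.New("./draft-model.bin", llama.SetContext(128))
	if err != nil {
		return err
	}

	// SetNDraft bounds how many tokens the draft model proposes per step.
	text, err := target.SpeculativeSampling(draft, "[INST] How much is 2+2? [/INST]",
		llama.SetNDraft(16))
	if err != nil {
		return err
	}
	fmt.Println(text)
	return nil
}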

options.go (+15, -6)

@@ -7,7 +7,6 @@ type ModelOptions struct {
 	F16Memory  bool
 	MLock      bool
 	MMap       bool
-	LowVRAM    bool
 	Embeddings bool
 	NUMA       bool
 	NGPULayers int
@@ -16,6 +15,7 @@ type ModelOptions struct {
 	FreqRopeBase  float32
 	FreqRopeScale float32
 	MulMatQ       *bool
+	LoraScale     float32
 	LoraBase      string
 	LoraAdapter   string
 	Perplexity    bool
@@ -29,6 +29,7 @@ type PredictOptions struct {
 	DebugMode    bool
 	StopPrompts  []string
 	IgnoreEOS    bool
+	BatchThreads int
 
 	TailFreeSamplingZ float32
 	TypicalP          float32
@@ -68,7 +69,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
 	MLock:         false,
 	Embeddings:    false,
 	MMap:          true,
-	LowVRAM:       false,
 	NBatch:        512,
 	FreqRopeBase:  10000,
 	FreqRopeScale: 1.0,
@@ -79,6 +79,7 @@ var DefaultOptions PredictOptions = PredictOptions{
 	Threads:      4,
 	Tokens:       128,
 	Penalty:      1.1,
+	BatchThreads: -1,
 	Repeat:       64,
 	Batch:        512,
 	NKeep:        64,
@@ -109,6 +110,18 @@ func SetLoraBase(s string) ModelOption {
 	}
 }
 
+func SetBatchThreads(b int) PredictOption {
+	return func(p *PredictOptions) {
+		p.BatchThreads = b
+	}
+}
+
+func SetLoraScale(f float32) ModelOption {
+	return func(p *ModelOptions) {
+		p.LoraScale = f
+	}
+}
+
 func SetLoraAdapter(s string) ModelOption {
 	return func(p *ModelOptions) {
 		p.LoraAdapter = s
@@ -219,10 +232,6 @@ func SetNegativePrompt(np string) PredictOption {
 	}
 }
 
-var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
-	p.LowVRAM = true
-}
-
 var EnableNUMA ModelOption = func(p *ModelOptions) {
 	p.NUMA = true
 }
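The new SetBatchThreads and SetLoraScale setters follow the package's existing functional-options pattern: an option is just a closure that mutates ModelOptions or PredictOptions before they are handed to the C binding. A small sketch of how a downstream project could compose its own option in the same style — ExampleLoraOption is an invented name, not part of this commit:

// ExampleLoraOption bundles a LoRA adapter path with its scale so callers
// can set both through a single argument to llama.New. Name and paths are
// illustrative only.
func ExampleLoraOption(adapter string, scale float32) llama.ModelOption {
	return func(p *llama.ModelOptions) {
		p.LoraAdapter = adapter
		p.LoraScale = scale
	}
}

// Usage:
//   model, err := llama.New("./ggllm-test-model.bin",
//       ExampleLoraOption("./my-adapter.bin", 0.8))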

patches/1902-cuda.patch (+21, -17)

@@ -1,20 +1,25 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index 2597ba0..e42ae73 100644
+index ec181c6..9ba699b 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1268,3 +1268,218 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
+@@ -1345,3 +1345,222 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
      fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
      fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  }
 +
-+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base) {
++gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base, float lora_scale) {
 +    gpt_params* lparams = new gpt_params;
 +    fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
 +
 +    // Initialize the 'model' member with the 'fname' parameter
 +    lparams->model = fname;
 +    lparams->lora_base = lora_base;
-+    lparams->lora_adapter = lora;
++    if (lora_scale == 0 && !lora_base.empty()) {
++        lora_scale = 1.0f;
++    }
++    if (!lora.empty()) {
++        lparams->lora_adapter.push_back(std::make_tuple(lora, lora_scale));
++    }
 +    if (lparams->lora_adapter.empty()) {
 +        lparams->use_mmap = false;
 +    }
@@ -30,14 +35,14 @@ index 2597ba0..e42ae73 100644
 +    return lparams;
 +}
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity) {
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all) {
 +    // load the model
 +    gpt_params * lparams;
 +// Temporary workaround for https://github.com/go-skynet/go-llama.cpp/issues/218
 +#ifdef GGML_USE_CUBLAS
 +    lparams = create_gpt_params_cuda(fname);
 +#else
-+    lparams = create_gpt_params(fname, lora, lora_base);
++    lparams = create_gpt_params(fname, lora, lora_base, lora_scale);
 +#endif
 +    llama_model * model;
 +    llama_binding_state * state;
@@ -49,10 +54,8 @@ index 2597ba0..e42ae73 100644
 +    lparams->embedding = embeddings;
 +    lparams->use_mlock = mlock;
 +    lparams->n_gpu_layers = n_gpu_layers;
-+    lparams->perplexity = perplexity;
++    lparams->logits_all = logits_all;
 +    lparams->use_mmap = mmap;
-+
-+    lparams->low_vram = low_vram;
 +    if (rope_freq_base != 0.0f) {
 +        lparams->rope_freq_base = rope_freq_base;
 +    } else {
@@ -114,8 +117,9 @@ index 2597ba0..e42ae73 100644
 +    int idx) {
 +
 +    struct gpt_params params = *g_params;
++
 +    const int n_ctx = llama_n_ctx(ctx);
-+    const int n_vocab = llama_n_vocab(ctx);
++    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 +
 +    const float temp = params.temp;
 +    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -133,7 +137,7 @@ index 2597ba0..e42ae73 100644
 +
 +    llama_token id = 0;
 +
-+    float * logits = llama_get_logits(ctx) + idx * n_vocab;
++    float * logits = llama_get_logits_ith(ctx, idx);
 +
 +    // Apply params.logit_bias map
 +    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -184,19 +188,19 @@ index 2597ba0..e42ae73 100644
 +    if (mirostat == 1) {
 +        static float mirostat_mu = 2.0f * mirostat_tau;
 +        const int mirostat_m = 100;
-+        llama_sample_temperature(ctx, &cur_p, temp);
++        llama_sample_temp(ctx, &cur_p, temp);
 +        id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
 +    } else if (mirostat == 2) {
 +        static float mirostat_mu = 2.0f * mirostat_tau;
-+        llama_sample_temperature(ctx, &cur_p, temp);
++        llama_sample_temp(ctx, &cur_p, temp);
 +        id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
 +    } else {
 +        // Temperature sampling
 +        llama_sample_top_k      (ctx, &cur_p, top_k, 1);
 +        llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
 +        llama_sample_typical    (ctx, &cur_p, typical_p, 1);
 +        llama_sample_top_p      (ctx, &cur_p, top_p, 1);
-+        llama_sample_temperature(ctx, &cur_p, temp);
++        llama_sample_temp(ctx, &cur_p, temp);
 +
 +        {
 +            const int n_top = 10;
@@ -223,10 +227,10 @@ index 2597ba0..e42ae73 100644
 +}
 \ No newline at end of file
 diff --git a/common/common.h b/common/common.h
-index 18aea38..ca7a168 100644
+index 0e2d3fa..9992d2b 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -209,3 +209,19 @@ std::string get_sortable_timestamp();
+@@ -221,3 +221,19 @@ std::string get_sortable_timestamp();
  void dump_non_result_info_yaml(
      FILE * stream, const gpt_params & params, const llama_context * lctx,
      const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -236,7 +240,7 @@ index 18aea38..ca7a168 100644
 +    llama_model * model;
 +};
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity);
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all);
 +
 +llama_token llama_sample_token_binding(
 +    struct llama_context * ctx,
