diff --git a/common/common.cpp b/common/common.cpp
- index 2597ba0..e42ae73 100644
+ index ec181c6..9ba699b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
- @@ -1268,3 +1268,218 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
+ @@ -1345,3 +1345,222 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}
+
- + gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base) {
+ + gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base, float lora_scale) {
+ gpt_params* lparams = new gpt_params;
+ fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
+
+ // Initialize the 'model' member with the 'fname' parameter
+ lparams->model = fname;
+ lparams->lora_base = lora_base;
- + lparams->lora_adapter = lora;
+ + if (lora_scale == 0 && !lora_base.empty()) {
+ + lora_scale = 1.0f;
+ + }
+ + if (!lora.empty()) {
+ + lparams->lora_adapter.push_back(std::make_tuple(lora, lora_scale));
+ + }
+ if (lparams->lora_adapter.empty()) {
+ lparams->use_mmap = false;
+ }
@@ -30,14 +35,14 @@ index 2597ba0..e42ae73 100644
+ return lparams;
+ }
+
- + void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity) {
+ + void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all) {
+ // load the model
+ gpt_params * lparams;
+ // Temporary workaround for https://github.com/go-skynet/go-llama.cpp/issues/218
+ #ifdef GGML_USE_CUBLAS
+ lparams = create_gpt_params_cuda(fname);
+ #else
- + lparams = create_gpt_params(fname, lora, lora_base);
+ + lparams = create_gpt_params(fname, lora, lora_base, lora_scale);
+ #endif
+ llama_model * model;
+ llama_binding_state * state;
@@ -49,10 +54,8 @@ index 2597ba0..e42ae73 100644
+ lparams->embedding = embeddings;
+ lparams->use_mlock = mlock;
+ lparams->n_gpu_layers = n_gpu_layers;
- + lparams->perplexity = perplexity;
+ + lparams->logits_all = logits_all;
+ lparams->use_mmap = mmap;
- +
- + lparams->low_vram = low_vram;
+ if (rope_freq_base != 0.0f) {
+ lparams->rope_freq_base = rope_freq_base;
+ } else {
@@ -114,8 +117,9 @@ index 2597ba0..e42ae73 100644
+ int idx) {
+
+ struct gpt_params params = *g_params;
+ +
+ const int n_ctx = llama_n_ctx(ctx);
- + const int n_vocab = llama_n_vocab(ctx);
+ + const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ const float temp = params.temp;
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -133,7 +137,7 @@ index 2597ba0..e42ae73 100644
+
+ llama_token id = 0;
+
- + float * logits = llama_get_logits(ctx) + idx * n_vocab;
+ + float * logits = llama_get_logits_ith(ctx, idx);
+
+ // Apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -184,19 +188,19 @@ index 2597ba0..e42ae73 100644
+ if (mirostat == 1) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ const int mirostat_m = 100;
- + llama_sample_temperature (ctx, &cur_p, temp);
+ + llama_sample_temp (ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+ } else if (mirostat == 2) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
- + llama_sample_temperature (ctx, &cur_p, temp);
+ + llama_sample_temp (ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+ } else {
+ // Temperature sampling
+ llama_sample_top_k (ctx, &cur_p, top_k, 1);
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
+ llama_sample_typical (ctx, &cur_p, typical_p, 1);
+ llama_sample_top_p (ctx, &cur_p, top_p, 1);
- + llama_sample_temperature (ctx, &cur_p, temp);
+ + llama_sample_temp (ctx, &cur_p, temp);
+
+ {
+ const int n_top = 10;
@@ -223,10 +227,10 @@ index 2597ba0..e42ae73 100644
+ }
\ No newline at end of file
diff --git a/common/common.h b/common/common.h
- index 18aea38..ca7a168 100644
+ index 0e2d3fa..9992d2b 100644
--- a/common/common.h
+++ b/common/common.h
- @@ -209,3 +209,19 @@ std::string get_sortable_timestamp();
+ @@ -221,3 +221,19 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -236,7 +240,7 @@ index 18aea38..ca7a168 100644
+ llama_model * model;
+ };
+
- + void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity);
+ + void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all);
+
+ llama_token llama_sample_token_binding(
+ struct llama_context * ctx,