Commit 76e5d87

llama : clean-up
ggml-ci
1 parent 6384002 commit 76e5d87

File tree

3 files changed, +84 -32 lines changed


examples/llama.vim

+64 -24
@@ -1,31 +1,55 @@
+" LLM-based code completion using llama.cpp
+"
+" requires:
+"   - neovim
+"   - llama.cpp server instance
+"
 " sample config:
 "
-"   - Ctrl+F - trigger FIM completion manually
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line
+"   - Ctrl+F    - trigger FIM completion manually
+"
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
+"
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -ub 1024 -b 2048
+"
+" adjust the batch size to control how much of the provided context will be used during the inference
+" lower values will use smaller part of the context, which will result in faster processing
 "
-" run this once to initialise the plugin:
+" run this once to initialise llama.vim:
 "
-"   :call llama#init()
+"   :call llama#init()
 "

 " color of the suggested text
 highlight llama_hl_hint guifg=#ff772f
 highlight llama_hl_info guifg=#77ff2f

+" endpoint:        llama.cpp server endpoint
+" n_prefix:        number of lines to include in the prefix
+" n_suffix:        number of lines to include in the suffix
+" n_predict:       max number of tokens to predict
+" t_max_prompt_ms: max alloted time for the text generation
+" show_info:       show extra info about the inference
+" auto_fim:        trigger FIM completion automatically on cursor movement
 let s:default_config = {
     \ 'endpoint':         'http://127.0.0.1:8012/infill',
     \ 'n_prefix':         128,
     \ 'n_suffix':         128,
     \ 'n_predict':        64,
     \ 't_max_prompt_ms':  300,
     \ 't_max_predict_ms': 200,
+    \ 'show_info':        v:true,
     \ 'auto_fim':         v:true,
-    \ 'stop':             ["\n"]
     \ }

 let g:llama_config = get(g:, 'llama_config', s:default_config)

 function! llama#init()
-    let s:pos_x = 0
+    let s:pos_x = 0 " cursor position upon start of completion
     let s:pos_y = 0
     let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case

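The n_prefix/n_suffix options documented above count context in lines around the cursor rather than in tokens. Below is a small illustrative sketch, written in C++ only for the sake of this page (the plugin does this in Vim script, and every name here is made up), of what "include up to n_prefix lines above and n_suffix lines below the cursor" means:

// Illustration of "n_prefix/n_suffix = number of lines to include" around the
// cursor. Not the plugin's code: the real gathering happens in Vim script.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// returns {prefix, suffix} built from up to n_prefix lines above the cursor
// line and up to n_suffix lines below it (the cursor line itself is handled
// separately by the plugin, so it is skipped here)
static std::pair<std::string, std::string> gather_context(
        const std::vector<std::string> & lines, int cur_line, int n_prefix, int n_suffix) {
    std::string prefix;
    std::string suffix;

    const int beg = std::max(0, cur_line - n_prefix);
    for (int i = beg; i < cur_line; i++) {
        prefix += lines[i] + "\n";
    }

    const int end = std::min<int>((int) lines.size(), cur_line + 1 + n_suffix);
    for (int i = cur_line + 1; i < end; i++) {
        suffix += lines[i] + "\n";
    }

    return {prefix, suffix};
}

int main() {
    const std::vector<std::string> buf = {"#include <cstdio>", "", "int main() {", "", "}"};
    const auto ctx = gather_context(buf, /*cur_line =*/ 3, /*n_prefix =*/ 128, /*n_suffix =*/ 128);
    return ctx.first.empty() && ctx.second.empty() ? 1 : 0;
}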
@@ -46,8 +70,8 @@ function! llama#init()

     augroup llama
         autocmd!
-        autocmd InsertEnter * inoremap <buffer> <silent> <C-F> <C-O>:call llama#fim(v:false)<CR>
-        autocmd InsertLeave * call llama#fim_cancel()
+        autocmd InsertEnter    * inoremap <buffer> <silent> <C-F> <C-O>:call llama#fim(v:false)<CR>
+        autocmd InsertLeavePre * call llama#fim_cancel()

         autocmd CursorMoved * call llama#fim_cancel()
     augroup END
@@ -90,7 +114,6 @@ function! llama#fim(is_auto) abort
         \ 'prompt':         "",
         \ 'input_prefix':   l:prefix,
         \ 'input_suffix':   l:suffix,
-        "\ 'stop':          g:llama_config.stop,
         \ 'n_predict':      g:llama_config.n_predict,
         \ 'penalty_last_n': 0,
         \ 'top_k':          100,
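The fields touched in this hunk are part of the JSON body that the plugin POSTs to the configured /infill endpoint. A minimal sketch of an equivalent payload built in C++ with nlohmann::json (the JSON library used elsewhere in llama.cpp); the prefix/suffix values are placeholders and only the fields visible above are included:

// Sketch of an equivalent /infill request body, using nlohmann::json.
// Prefix/suffix strings are placeholders; field names come from the diff above.
#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

int main() {
    const std::string prefix = "int add(int a, int b) {\n    return ";
    const std::string suffix = ";\n}\n";

    const nlohmann::json request = {
        {"prompt",         ""},
        {"input_prefix",   prefix}, // text before the cursor
        {"input_suffix",   suffix}, // text after the cursor
        {"n_predict",      64},     // the plugin's default n_predict
        {"penalty_last_n", 0},
        {"top_k",          100}
    };

    // this is the payload that would be POSTed to http://127.0.0.1:8012/infill
    std::cout << request.dump(2) << std::endl;
    return 0;
}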
@@ -126,16 +149,23 @@ function! llama#fim(is_auto) abort
     endif
 endfunction

-function! llama#fim_accept()
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
     " insert the suggestion at the cursor location
     if s:can_accept && len(s:content) > 0
         call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0])
         if len(s:content) > 1
-            call append(s:pos_y, s:content[1:-1])
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
         endif

         " move the cursor to the end of the accepted text
-        call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        if !a:first_line
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1)
+        endif
     endif

     call llama#fim_cancel()
@@ -146,6 +176,11 @@ function! llama#fim_cancel()
         call jobstop(s:current_job)
     endif

+    if s:timer_fim != -1
+        call timer_stop(s:timer_fim)
+        let s:timer_fim = -1
+    endif
+
     " clear the virtual text
     let l:bufnr = bufnr('%')

@@ -155,7 +190,9 @@ function! llama#fim_cancel()
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)

+    " remove the mappings
     silent! iunmap <buffer> <Tab>
+    silent! iunmap <buffer> <S-Tab>
     silent! iunmap <buffer> <Esc>

     augroup llama_insert
@@ -173,6 +210,8 @@ function! s:fim_auto_enable()
     augroup END
 endfunction

+" auto-start a fim job a short time after the cursor has moved
+" if there is already a job queued - cancel it
 function! s:fim_auto()
     if s:current_job != v:null
         call jobstop(s:current_job)
@@ -189,7 +228,7 @@ function! s:fim_auto()
     let s:timer_fim = timer_start(500, {-> llama#fim(v:true)})
 endfunction

-
+" callback that processes the result from the server
 function! s:fim_on_stdout(job_id, data, event) dict
     let l:raw = join(a:data, "\n")
     if len(l:raw) == 0
@@ -199,6 +238,13 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let s:can_accept = v:true
     let l:has_info = v:false

+    if s:can_accept && v:shell_error
+        if !self.is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
     let l:n_prompt = 0
     let l:t_prompt_ms = 1.0
     let l:s_prompt = 0
@@ -207,13 +253,6 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:t_predict_ms = 1.0
     let l:s_predict = 0

-    if s:can_accept && v:shell_error
-        if !self.is_auto
-            call add(s:content, "<| curl error: is the server on? |>")
-        endif
-        let s:can_accept = v:false
-    endif
-
     " get the generated suggestion
     if s:can_accept
         let l:response = json_decode(l:raw)
@@ -227,7 +266,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             call remove(s:content, -1)
         endwhile

-        " if response.timings
+        " if response.timings is available
         if len(get(l:response, 'timings', {})) > 0
             let l:has_info = v:true
             let l:timings = get(l:response, 'timings', {})
@@ -264,8 +303,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')

-    " construct the info message:
-    if l:has_info
+    " construct the info message and display it to the right of the current line
+    if g:llama_config.show_info && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)

@@ -282,6 +321,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
         \ })
     endif

+    " display the suggestion
     call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
         \ 'virt_text_win_col': virtcol('.') - 1
@@ -293,8 +333,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
         \ })

     " setup accept/cancel events
-    inoremap <buffer> <Tab> <C-O>:call llama#fim_accept()<CR>
-    inoremap <buffer> <Esc> <C-O>:call llama#fim_cancel()<CR><Esc>
+    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
+    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>

     augroup llama_insert
         autocmd!

examples/server/server.cpp

+15 -7
@@ -132,8 +132,8 @@ struct slot_params {
     int32_t n_discard  = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
     int32_t n_predict  = -1; // new tokens to predict

-    int64_t t_max_prompt_ms  = -1;
-    int64_t t_max_predict_ms = -1;
+    int64_t t_max_prompt_ms  = -1; // TODO: not implemented
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

     std::vector<std::string> antiprompt;

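The new comment states that a positive t_max_predict_ms caps the generation phase; the enforcement itself is outside this hunk. A rough, assumption-laden sketch of how such a per-slot time budget could be checked in a decode loop (the struct and field names are invented for illustration and are not the server's actual code):

// Illustrative only: one way a per-slot t_max_predict_ms budget could be
// enforced inside a decode loop. Struct/field names are invented.
#include <chrono>
#include <cstdint>
#include <cstdio>

struct slot_state {
    int64_t t_max_predict_ms = -1;                             // <= 0 means "no limit"
    std::chrono::steady_clock::time_point t_start_generation;  // set when generation begins
    int32_t n_decoded = 0;                                     // tokens emitted so far
};

// returns true while the slot is still allowed to emit tokens
static bool has_time_budget(const slot_state & slot) {
    if (slot.t_max_predict_ms <= 0) {
        return true; // unlimited
    }
    const auto t_gen_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - slot.t_start_generation).count();
    return t_gen_ms < slot.t_max_predict_ms;
}

int main() {
    slot_state slot;
    slot.t_max_predict_ms   = 200; // same default the vim plugin sends
    slot.t_start_generation = std::chrono::steady_clock::now();

    // stand-in for the decode loop: stop as soon as the budget runs out
    while (has_time_budget(slot)) {
        slot.n_decoded++; // a real server would decode one token here
    }

    std::printf("stopped after %d dummy iterations (~200 ms budget)\n", (int) slot.n_decoded);
    return 0;
}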
@@ -2028,8 +2028,8 @@ struct server_context {
             auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
             auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);

-            // for now pick context to fit in a single batch
-            const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/2);
+            // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+            const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
             const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);

             prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
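The change above shrinks the suffix share of the batch from n_batch/2 to n_batch/4, i.e. roughly a 3:1 prefix:suffix split, with 3 tokens reserved for the FIM special tokens. A self-contained sketch of the same arithmetic on dummy token vectors follows; trimming the suffix with resize() is an assumption, since that side of the truncation is not shown in this hunk:

// Sketch of the truncation above: the batch budget n_batch is split roughly
// 3:1 between prefix and suffix, with 3 tokens reserved for FIM special tokens.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int n_batch = 16; // tiny budget so the truncation is visible

    std::vector<int> prefix_tokens(40, 0); // pretend-tokenized prefix: 40 tokens
    std::vector<int> suffix_tokens(20, 0); // pretend-tokenized suffix: 20 tokens

    // suffix gets at most n_batch/4 tokens, prefix gets what remains minus 3 reserved tokens
    const int n_suffix_take = std::min<int>((int) suffix_tokens.size(), n_batch/4);
    const int n_prefix_take = std::min<int>((int) prefix_tokens.size(), (n_batch - 3) - n_suffix_take);

    // keep the last n_prefix_take prefix tokens (closest to the cursor) ...
    prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + (prefix_tokens.size() - n_prefix_take));
    // ... and the first n_suffix_take suffix tokens (assumed here)
    suffix_tokens.resize(n_suffix_take);

    std::printf("kept %zu prefix + %zu suffix tokens for a budget of %d\n",
                prefix_tokens.size(), suffix_tokens.size(), n_batch);
    return 0;
}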
@@ -2057,9 +2057,17 @@ struct server_context {

             SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

-            // print prompt tokens:
-            for (int i = 0; i < (int) prompt_tokens.size(); i++) {
-                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+            // print prompt tokens (for debugging)
+            if (1) {
+                // first 16 tokens (avoid flooding logs)
+                for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                    SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                }
+            } else {
+                // all
+                for (int i = 0; i < prompt_tokens.size(); i++) {
+                    SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                }
             }

             // empty prompt passed -> release the slot and send empty response

src/llama-sampling.cpp

+5 -1
@@ -1646,6 +1646,8 @@ struct llama_sampler * llama_sampler_init_logit_bias(

 // infill

+//#define GGML_DEBUG_SAMPLER_INFILL
+
 struct llama_sampler_infill {
     const struct llama_vocab * vocab;
 };
@@ -1659,10 +1661,11 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_

     llama_sampler_softmax_impl(cur_p);

-    // print cur_p:
+#if defined(GGML_DEBUG_SAMPLER_INFILL)
     for (size_t i = 0; i < cur_p->size; ++i) {
         LLAMA_LOG_DEBUG("infill: cur_p[%zu] = { id: %d, p: %f, logit: %f }\n", i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+#endif

     float p_max     = 0.0f;
     float p_txt_sum = 0.0f;
@@ -1746,6 +1749,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         return;
     }

+    // pick the best token
     cur_p->size    = 1;
     cur_p->data[0] = cur_p->data[i_max];

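The step annotated above collapses the candidate list to its single most probable entry. A minimal stand-alone sketch of that greedy pick, using a simplified stand-in for llama.cpp's token data array:

// Sketch of the "pick the best token" step: keep only the highest-probability
// candidate. token_data here is a simplified stand-in, not the library type.
#include <cstdio>
#include <vector>

struct token_data {
    int   id;
    float p; // probability after softmax
};

int main() {
    std::vector<token_data> cur_p = { {10, 0.2f}, {42, 0.5f}, {7, 0.3f} };

    // find the most probable candidate
    size_t i_max = 0;
    for (size_t i = 1; i < cur_p.size(); ++i) {
        if (cur_p[i].p > cur_p[i_max].p) {
            i_max = i;
        }
    }

    // keep only that candidate, mirroring: cur_p->size = 1; cur_p->data[0] = cur_p->data[i_max];
    cur_p[0] = cur_p[i_max];
    cur_p.resize(1);

    std::printf("picked token id %d with p = %.2f\n", cur_p[0].id, cur_p[0].p);
    return 0;
}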