17
17
"
18
18
" start the llama.cpp server with a FIM-compatible model. for example:
19
19
"
20
- " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 64
20
+ " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
21
21
"
22
22
" --batch-size [512, model max context]
23
23
"
29
29
" chunks the batch into smaller chunks for faster processing
30
30
" depends on the specific hardware. use llama-bench to profile and determine the best size
31
31
"
32
+ " --cache-reuse (g:llama_config.n_predict, 1024]
33
+ "
34
+ " this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
35
+ " using a non-zero value enables context reuse on the server side, which dramatically improves the performance at
36
+ " large contexts. a value of 256 should be good for all cases
37
+ "
32
38
" run this once to initialise llama.vim:
33
39
"
34
40
" :call llama#init()
@@ -43,8 +49,8 @@ highlight llama_hl_info guifg=#77ff2f
43
49
" general parameters:
44
50
"
45
51
" endpoint: llama.cpp server endpoint
46
- " n_prefix: number of lines before the cursor location to include in the prefix
47
- " n_suffix: number of lines after the cursor location to include in the suffix
52
+ " n_prefix: number of lines before the cursor location to include in the local prefix
53
+ " n_suffix: number of lines after the cursor location to include in the local suffix
48
54
" n_predict: max number of tokens to predict
49
55
" t_max_prompt_ms: max allotted time for the prompt processing (TODO: not yet supported)
50
56
" t_max_predict_ms: max allotted time for the prediction
@@ -72,7 +78,7 @@ highlight llama_hl_info guifg=#77ff2f
72
78
let s: default_config = {
73
79
\ ' endpoint' : ' http://127.0.0.1:8012/infill' ,
74
80
\ ' n_prefix' : 256 ,
75
- \ ' n_suffix' : 8 ,
81
+ \ ' n_suffix' : 64 ,
76
82
\ ' n_predict' : 128 ,
77
83
\ ' t_max_prompt_ms' : 500 ,
78
84
\ ' t_max_predict_ms' : 1000 ,
@@ -463,7 +469,7 @@ function! llama#fim_accept(first_line)
463
469
464
470
" move the cursor to the end of the accepted text
465
471
if ! a: first_line && len (s: content ) > 1
466
- call cursor (s: pos_y + len (s: content ) - 1 , s: pos_x + s: pos_dx )
472
+ call cursor (s: pos_y + len (s: content ) - 1 , s: pos_x + s: pos_dx + 1 )
467
473
else
468
474
call cursor (s: pos_y , s: pos_x + len (s: content [0 ]))
469
475
endif
0 commit comments