-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
| 1 | +" sample config: |
8 | 2 | "
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
| 3 | +" - Ctrl+F - trigger FIM completion |
| 4 | +" |
| 5 | +" copy paste this in your .vimrc: |
| 6 | +" |
| 7 | +"augroup llama_cpp |
| 8 | +" autocmd! |
| 9 | +" autocmd InsertEnter * inoremap <buffer> <silent> <C-F> <Esc>:call llama#fim()<CR> |
| 10 | +"augroup END |
21 | 11 | "
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-  let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-  let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}

-func s:callbackHandler(bufn, channel, msg)
-  if len(a:msg) < 3
-    return
-  elseif a:msg[0] == "d"
-    let l:msg = a:msg[6:-1]
-  else
-    let l:msg = a:msg
-  endif
-  let l:decoded_msg = json_decode(l:msg)
-  let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-  if len(l:newtext) > 0
-    call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-  else
-    echo "nothing genned"
-  endif
-  if len(newtext) > 1
-    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-  endif
-  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-    echo "Finished generation"
-  endif
-endfunction
+let s:default_config = {
+    \ 'prefix_lines': 32,
+    \ 'suffix_lines': 32,
+    \ 'endpoint': 'http://127.0.0.1:8012/infill',
+    \ 'stop': ["\n"],
+    \ 'n_predict': 64,
+    \ 'n_probs': 3,
+    \ 'temperature': 0.1
+    \}

-func llama#doLlamaGen()
-  if exists("b:job")
-    if job_status(b:job) == "run"
-      call job_stop(b:job)
-      return
-    endif
-  endif
+let g:llama_config = get(g:, 'llama_config', s:default_config)

-  let l:cbuffer = bufnr("%")
-  let s:linedict[l:cbuffer] = line('$')
-  let l:buflines = getbufline(l:cbuffer, 1, 1000)
-  let l:querydata = copy(s:querydata)
-  call extend(l:querydata, g:llama_overrides)
-  if exists("w:llama_overrides")
-    call extend(l:querydata, w:llama_overrides)
-  endif
-  if exists("b:llama_overrides")
-    call extend(l:querydata, b:llama_overrides)
-  endif
-  if l:buflines[0][0:1] == '!*'
-    let l:userdata = json_decode(l:buflines[0][2:-1])
-    call extend(l:querydata, l:userdata)
-    let l:buflines = l:buflines[1:-1]
-  endif
-  let l:querydata.prompt = join(l:buflines, "\n")
-  let l:curlcommand = copy(s:curlcommand)
-  if exists("g:llama_api_key")
-    call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-  endif
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
+function! llama#fim() abort
+    let l:lines_prefix = getline(max([1, line('.') - g:llama_config.prefix_lines]), line('.') - 1)
+    let l:lines_suffix = getline(line('.') + 1, min([line('$'), line('.') + g:llama_config.suffix_lines]))

-" Echoes the tokenization of the provided string, or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-  if (a:0 > 0)
-    let l:input = a:1
-  else
-    exe "normal \"*ye"
-    let l:input = @*
-  endif
-  let l:querydata = {"content": l:input}
-  let l:curlcommand = copy(s:curlcommand)
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
+    let l:cursor_col = col('.')

-func s:tokenizeWordCallback(plaintext, channel, msg)
-  echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
+    let l:line_cur = getline('.')
+    let l:line_cur_prefix = strpart(l:line_cur, 0, l:cursor_col)
+    let l:line_cur_suffix = strpart(l:line_cur, l:cursor_col)

+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+        \ . l:line_cur_prefix

-" Echoes the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-  if (a:0 > 0)
-    let l:buflines = a:1
-  else
-    let l:buflines = getline(1,1000)
-    if l:buflines[0][0:1] == '!*'
-      let l:buflines = l:buflines[1:-1]
-    endif
-    let l:buflines = join(l:buflines, "\n")
-  endif
-  let l:querydata = {"content": l:buflines}
-  let l:curlcommand = copy(s:curlcommand)
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
-endfunction
+    let l:suffix = ""
+        \ . l:line_cur_suffix
+        \ . join(l:lines_suffix, "\n")
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': l:prefix,
+        \ 'input_suffix': l:suffix,
+        "\ 'stop': g:llama_config.stop,
+        \ 'n_predict': g:llama_config.n_predict,
+        "\ 'n_probs': g:llama_config.n_probs,
+        \ 'penalty_last_n': 0,
+        \ 'temperature': g:llama_config.temperature,
+        \ 'top_k': 10,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k"]
+        \ })
+
+    " request completion from the server
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    let l:response = json_decode(system(l:curl_command))
+
+    echom string(l:response)
+
+    let l:content = []
+    for l:part in split(get(l:response, 'content', ''), "\n", 1)
+        call add(l:content, l:part)
+    endfor
+
+    echom string(l:content)
+
+    " insert the 'content' at the current cursor location
+    let l:content[0] = l:line_cur_prefix . l:content[0]
+    let l:content[-1] .= l:line_cur_suffix

-func s:tokenCountCallback(channel, msg)
-  let resp = json_decode(a:msg)
-  echo len(resp.tokens)
+    call setline('.', l:content[0])
+    call append (line('.'), l:content[1:-1])
 endfunction
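
A note on the new configuration handling: since the plugin reads g:llama_config with get(g:, 'llama_config', s:default_config), a user-defined dict replaces the defaults entirely rather than being merged with them, so an override should supply every key the plugin uses. A minimal sketch of such an override in .vimrc (the values 64 and 128 are illustrative choices, not the plugin's defaults):

    " ~/.vimrc, before the first call to llama#fim(); this dict replaces s:default_config entirely
    let g:llama_config = {
        \ 'prefix_lines': 64,
        \ 'suffix_lines': 64,
        \ 'endpoint': 'http://127.0.0.1:8012/infill',
        \ 'stop': ["\n"],
        \ 'n_predict': 128,
        \ 'n_probs': 3,
        \ 'temperature': 0.1
        \}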