
Commit 0e70ba6

Authored by ggerganov and ngxson

server : add "tokens" output (#10853)

* server : add "tokens" output
* server : update readme
* server : return tokens ids only if requested
* tests : improve "tokens" type check
* server : remove "tokens" from the OAI endpoint

Co-authored-by: Xuan Son Nguyen <[email protected]>

1 parent 4682887 · commit 0e70ba6

File tree: 3 files changed, +46 -16 lines

- examples/server/README.md
- examples/server/server.cpp
- examples/server/tests/unit/test_completion.py

examples/server/README.md (+6, -2)

@@ -438,19 +438,22 @@ These words will not be included in the completion, so make sure to add them to
 
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
+`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
 `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
 
 **Response format**
 
-- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
+- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
 
 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
 
 ```json
 {
-  "content": "<the token selected by the model>",
+  "content": "<the token generated by the model>",
+  "tokens": [ generated token ids if requested ],
   "probs": [
     {
       "prob": float,
@@ -468,6 +471,7 @@ These words will not be included in the completion, so make sure to add them to
 Notice that each `probs` is an array of length `n_probs`.
 
 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
 - `model`: The path to the model loaded with `-m`
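For reference, a minimal client sketch (not part of this commit) showing how the documented `return_tokens` / `tokens` fields could be consumed. It assumes a `llama-server` instance listening on `http://localhost:8080` and uses the Python `requests` package; the prompt and `n_predict` values are purely illustrative.

```python
# Hypothetical usage sketch for the "return_tokens" / "tokens" fields.
# Assumes llama-server is running locally on port 8080.
import requests

res = requests.post(
    "http://localhost:8080/completion",
    json={
        "prompt": "I believe the meaning of life is",
        "n_predict": 8,
        "return_tokens": True,  # without this, "tokens" comes back empty
    },
)
res.raise_for_status()
body = res.json()

print(body["content"])  # the completion as a string
print(body["tokens"])   # the same completion as a list of raw token ids
```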

examples/server/server.cpp (+28, -10)

@@ -79,8 +79,9 @@ enum error_type {
 };
 
 struct slot_params {
-    bool stream = true;
-    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+    bool stream        = true;
+    bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens = false;
 
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -199,6 +200,7 @@ struct server_task {
 
         params.stream = json_value(data, "stream", false);
         params.cache_prompt = json_value(data, "cache_prompt", true);
+        params.return_tokens = json_value(data, "return_tokens", false);
         params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
         params.n_indent = json_value(data, "n_indent", defaults.n_indent);
         params.n_keep = json_value(data, "n_keep", defaults.n_keep);
@@ -468,7 +470,10 @@ struct completion_token_output {
 
 struct server_task_result_cmpl_final : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string  content;
+    llama_tokens tokens;
+
     bool stream;
     result_timings timings;
     std::string prompt;
@@ -510,6 +515,7 @@ struct server_task_result_cmpl_final : server_task_result {
         json res = json {
             {"index", index},
             {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+            {"tokens", stream ? llama_tokens {} : tokens},
             {"id_slot", id_slot},
             {"stop", true},
             {"model", oaicompat_model},
@@ -539,9 +545,9 @@ struct server_task_result_cmpl_final : server_task_result {
         json choices = json::array({json{
             {"finish_reason", finish_reason},
             {"index", 0},
-            {"message", json{
+            {"message", json {
                 {"content", content},
-                {"role", "assistant"}
+                {"role",    "assistant"}
             }
         }}});
 
@@ -605,7 +611,9 @@ struct server_task_result_cmpl_final : server_task_result {
 
 struct server_task_result_cmpl_partial : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string  content;
+    llama_tokens tokens;
 
     int32_t n_decoded;
     int32_t n_prompt_tokens;
@@ -637,6 +645,7 @@ struct server_task_result_cmpl_partial : server_task_result {
         json res = json {
            {"index", index},
            {"content", content},
+           {"tokens", tokens},
            {"stop", false},
            {"id_slot", id_slot},
            {"tokens_predicted", n_decoded},
@@ -678,7 +687,7 @@ struct server_task_result_cmpl_partial : server_task_result {
             json second_ret = json{
                 {"choices", json::array({json{{"finish_reason", nullptr},
                     {"index", 0},
-                    {"delta", json{
+                    {"delta", json {
                         {"content", content}}}
                 }})},
                 {"created", t},
@@ -693,7 +702,7 @@ struct server_task_result_cmpl_partial : server_task_result {
             {"finish_reason", nullptr},
             {"index", 0},
             {"delta",
-                json{
+                json {
                     {"content", content},
                 }},
         }});
@@ -955,8 +964,11 @@ struct server_slot {
 
     size_t last_nl_pos = 0;
 
-    std::string generated_text;
+    std::string  generated_text;
+    llama_tokens generated_tokens;
+
     llama_tokens cache_tokens;
+
     std::vector<completion_token_output> generated_token_probs;
 
     bool has_next_token = true;
@@ -1000,6 +1012,7 @@ struct server_slot {
         n_sent_token_probs = 0;
         task_type = SERVER_TASK_TYPE_COMPLETION;
 
+        generated_tokens.clear();
         generated_token_probs.clear();
     }
 
@@ -1740,8 +1753,10 @@ struct server_context {
         const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;
 
-        // search stop word and delete it
         slot.generated_text += token_str;
+        if (slot.params.return_tokens) {
+            slot.generated_tokens.push_back(result.tok);
+        }
         slot.has_next_token = true;
 
         // check if there is incomplete UTF-8 character at the end
@@ -1766,6 +1781,7 @@ struct server_context {
             break;
         }
 
+        // search stop word and delete it
         if (!incomplete) {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
 
@@ -1918,6 +1934,7 @@ struct server_context {
        res->id      = slot.id_task;
        res->index   = slot.index;
        res->content = tkn.text_to_send;
+       res->tokens  = { tkn.tok };
 
        res->n_decoded       = slot.n_decoded;
        res->n_prompt_tokens = slot.n_prompt_tokens;
@@ -1958,6 +1975,7 @@ struct server_context {
 
        res->index   = slot.index;
        res->content = slot.generated_text;
+       res->tokens  = slot.generated_tokens;
       res->timings = slot.get_timings();
       res->prompt  = common_detokenize(ctx, slot.prompt_tokens, true);

examples/server/tests/unit/test_completion.py (+12, -4)

@@ -10,23 +10,29 @@ def create_server():
     global server
     server = ServerPreset.tinyllama2()
 
-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
-    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
-    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
 ])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
     global server
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": n_predict,
         "prompt": prompt,
+        "return_tokens": return_tokens,
     })
     assert res.status_code == 200
     assert res.body["timings"]["prompt_n"] == n_prompt
     assert res.body["timings"]["predicted_n"] == n_predicted
     assert res.body["truncated"] == truncated
     assert type(res.body["has_new_line"]) == bool
     assert match_regex(re_content, res.body["content"])
+    if return_tokens:
+        assert len(res.body["tokens"]) > 0
+        assert all(type(tok) == int for tok in res.body["tokens"])
+    else:
+        assert res.body["tokens"] == []
 
 
 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@@ -56,6 +62,8 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
             assert data["generation_settings"]["seed"] == server.seed
             assert match_regex(re_content, content)
         else:
+            assert len(data["tokens"]) > 0
+            assert all(type(tok) == int for tok in data["tokens"])
             content += data["content"]
