From 5cad62bce41546ddae8908eeb5bb06476f4c5bd8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 15:55:23 +0300 Subject: [PATCH 01/12] tests : write a Python tokenizer test (wip) --- tests/test-tokenizer-0.cpp | 40 ++++++++++++++++++++------------------ tests/test-tokenizer-0.py | 18 +++++++++++++++++ 2 files changed, 39 insertions(+), 19 deletions(-) create mode 100644 tests/test-tokenizer-0.py diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 7e9ac9188d5c5..4bed054d6cae0 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -16,36 +16,38 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector> & k_tests() { static std::map> _k_tests = { - { " ", {1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, + { " ", { 1, 259, }, }, + { " ", { 1, 1678, }, }, + { " ", { 1, 268, }, }, + { "\t", { 1, 29871, 12, }, }, + { "\n", { 1, 29871, 13, }, }, + { "\t\n", { 1, 29871, 12, 13, }, }, { "Hello world", { 1, 15043, 3186, }, }, { " Hello world", { 1, 29871, 15043, 3186, }, }, { "Hello World", { 1, 15043, 2787, }, }, { " Hello World", { 1, 29871, 15043, 2787, }, }, { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, + { "Hello, world!", { 1, 15043, 29892, 3186, 29991, }, }, + { " Hello, world!", { 1, 29871, 15043, 29892, 3186, 29991, }, }, { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, + 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, + 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, + 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, + 136, 228, 162, 132, 228, 161, 140, }, }, { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 1, 15043 }, }, - { " Hello", { 1, 29871, 15043 }, }, - { " Hello", { 1, 259, 15043 }, }, - { " Hello", { 1, 1678, 15043 }, }, - { " Hello", { 1, 268, 15043 }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, }, + 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, + 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, + 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello", { 1, 15043, }, }, + { " Hello", { 1, 29871, 15043, }, }, + { " Hello", { 1, 259, 15043, }, }, + { " Hello", { 1, 1678, 15043, }, }, + { " Hello", { 1, 268, 15043, }, }, + { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py new file mode 100644 index 0000000000000..d21f8b5a13ae5 --- /dev/null +++ b/tests/test-tokenizer-0.py @@ -0,0 +1,18 @@ +import os +import sys +import argparse + +from 
sentencepiece import SentencePieceProcessor + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') + +text = 'Hello, world!' +print(text) +print(tokenizer.encode(text, add_bos=True)) +print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) From 5d0ffb69f5c6b8aa10ee2bb88c6a601a46df33d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:08:59 +0300 Subject: [PATCH 02/12] llama : prefix input text for tokenization with whitespace --- examples/embedding/embedding.cpp | 3 -- examples/main/main.cpp | 5 -- llama.cpp | 21 ++++---- tests/test-tokenizer-0.cpp | 88 +++++++++++++++++--------------- tests/test-tokenizer-0.py | 41 +++++++++++++-- 5 files changed, 95 insertions(+), 63 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38395c75b0b5b..abe5c87818fd0 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -56,9 +56,6 @@ int main(int argc, char ** argv) { int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - // tokenize the prompt auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4665b82fe7f97..0d3783d6758eb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -195,11 +195,6 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector embd_inp; - if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) { - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - } - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { diff --git a/llama.cpp b/llama.cpp index b0a3b5768f3dd..0453dd9cf6f7b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1635,7 +1635,7 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); static void llm_load_vocab( llama_model_loader & ml, @@ -3026,10 +3026,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(buf); } -static std::string llama_escape_whitespace(const std::string& text) { - std::string result = text; - replace_all(result, " ", "\xe2\x96\x81"); - return result; +static void llama_escape_whitespace(std::string & text) { + replace_all(text, " ", "\xe2\x96\x81"); } static void llama_unescape_whitespace(std::string & word) { @@ -3373,22 +3371,25 @@ struct llm_tokenizer_bpe { llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + if (raw_text.empty()) { return output; } - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } + raw_text = " " + raw_text; switch 
(vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { llm_tokenizer_spm tokenizer(vocab); - tokenizer.tokenize(llama_escape_whitespace(raw_text), output); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); } break; case LLAMA_VOCAB_TYPE_BPE: { diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 4bed054d6cae0..8a86ad4e66f06 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,7 +6,7 @@ #include #include -static std::string unescape_whitespace(llama_context* ctx, const std::vector& tokens) { +static std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { std::string result; for (size_t i = 0; i < tokens.size(); ++i) { result += llama_token_to_str(ctx, tokens[i]); @@ -16,38 +16,40 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector> & k_tests() { static std::map> _k_tests = { - { " ", { 1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, - { "Hello world", { 1, 15043, 3186, }, }, - { " Hello world", { 1, 29871, 15043, 3186, }, }, - { "Hello World", { 1, 15043, 2787, }, }, - { " Hello World", { 1, 29871, 15043, 2787, }, }, - { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, - { "Hello, world!", { 1, 15043, 29892, 3186, 29991, }, }, - { " Hello, world!", { 1, 29871, 15043, 29892, 3186, 29991, }, }, - { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, + { "" , { }, }, + { " ", { 259, }, }, + { " ", { 1678, }, }, + { " ", { 268, }, }, + { "\t", { 29871, 12, }, }, + { "\n", { 29871, 13, }, }, + { "\t\n", { 29871, 12, 13, }, }, + { "Hello world", { 15043, 3186, }, }, + { " Hello world", { 29871, 15043, 3186, }, }, + { "Hello World", { 15043, 2787, }, }, + { " Hello World", { 29871, 15043, 2787, }, }, + { " Hello World!", { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!", { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, + 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, + 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, + 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, + 136, 228, 162, 132, 228, 161, 140, }, }, { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, 
}, }, - { "Hello", { 1, 15043, }, }, - { " Hello", { 1, 29871, 15043, }, }, - { " Hello", { 1, 259, 15043, }, }, - { " Hello", { 1, 1678, 15043, }, }, - { " Hello", { 1, 268, 15043, }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043, }, }, + { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, + 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, + 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, + 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello", { 15043, }, }, + { " Hello", { 29871, 15043, }, }, + { " Hello", { 259, 15043, }, }, + { " Hello", { 1678, 15043, }, }, + { " Hello", { 268, 15043, }, }, + { " Hello\n Hello", { 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; @@ -102,15 +104,18 @@ int main(int argc, char **argv) { bool success = true; for (const auto & test_kv : k_tests()) { - // Add a space in front of the first character to match OG llama tokenizer behavior - std::vector res = llama_tokenize(ctx, " " + test_kv.first, true); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", - __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); + const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); + const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); - bool correct = res.size() == test_kv.second.size(); + fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str()); - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (res[i] != test_kv.second[i]) { + bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; + + for (int i = 0; i < (int) res_nobos.size() && correct; ++i) { + if (test_kv.second[i] != res_bos[i + 1]) { + correct = false; + } + if (test_kv.second[i] != res_nobos[i]) { correct = false; } } @@ -118,14 +123,15 @@ int main(int argc, char **argv) { if (!correct) { fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str()); + llama_detokenize(ctx, res_nobos).c_str(), + llama_detokenize(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : expected tokens: ", __func__); for (const auto & t : test_kv.second) { fprintf(stderr, "%6d, ", t); } fprintf(stderr, "\n"); fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { + for (const auto & t : res_nobos) { fprintf(stderr, "%6d, ", t); } fprintf(stderr, "\n"); diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index d21f8b5a13ae5..982615e6096c5 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -12,7 +12,40 @@ tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') -text = 'Hello, world!' 
-print(text) -print(tokenizer.encode(text, add_bos=True)) -print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) +tests = [ + "" + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + + +for text in tests: + print('text: ', text) + print('\nwith bos:') + print(tokenizer.encode(text, add_bos=True)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) + print('\nwithout bos:') + print(tokenizer.encode(text, add_bos=False)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) From 9668aa115c9d3204a8a04de0129488a91cb48440 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:35:45 +0300 Subject: [PATCH 03/12] llama : distinguish pieces from decoded text + fix detokenization --- common/common.cpp | 27 ++++++++++++-- common/common.h | 7 +++- examples/beam_search/beam_search.cpp | 6 ++-- examples/embd-input/embd-input-lib.cpp | 2 +- examples/embedding/embedding.cpp | 2 +- examples/main/main.cpp | 14 ++++---- examples/save-load-state/save-load-state.cpp | 4 +-- examples/server/server.cpp | 14 ++++---- examples/simple/simple.cpp | 4 +-- .../train-text-from-scratch.cpp | 4 +-- llama.cpp | 36 +++++++++++-------- llama.h | 10 +++--- tests/test-tokenizer-0.cpp | 8 ----- tests/test-tokenizer-0.py | 9 ++++- tests/test-tokenizer-1.cpp | 14 ++------ 15 files changed, 93 insertions(+), 68 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e50f60..4e40a52501446 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,12 +733,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +746,24 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + diff --git a/common/common.h b/common/common.h index ce61265f8c124..cb1627fc62341 100644 --- a/common/common.h +++ b/common/common.h @@ -121,6 +121,11 @@ std::vector llama_tokenize( const std::string & text, bool add_bos); 
-std::string llama_token_to_str( +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); + +// removes the leading space from the first non-BOS token +std::string llama_detokenize( + llama_context * ctx, + const std::vector & tokens); diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp index 1c04fabc21b3d..42c7c72542321 100644 --- a/examples/beam_search/beam_search.cpp +++ b/examples/beam_search/beam_search.cpp @@ -35,7 +35,7 @@ struct ostream_beam_view { std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]); + os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); } return os << ')'; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) for( auto id : tokens_list ) { - std::cout << llama_token_to_str(ctx, id); + std::cout << llama_token_to_piece(ctx, id); } std::cout << std::flush; @@ -175,7 +175,7 @@ int main(int argc, char ** argv) std::cout << "\n\n"; for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_str(ctx,token_id); + std::cout << llama_token_to_piece(ctx,token_id); } std::cout << std::endl; diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 8a6ad882e8fa8..036bdb3987f34 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) { if (id == llama_token_eos(ctx)) { ret = ""; } else { - ret = llama_token_to_str(ctx, id); + ret = llama_token_to_piece(ctx, id); } eval_id(mymodel, id); return ret.c_str(); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index abe5c87818fd0..93d583b5ce151 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -64,7 +64,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "\n"); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0d3783d6758eb..6f312b38b9730 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -280,7 +280,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { @@ -288,14 +288,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, 
guidance_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "'\n"); } @@ -451,7 +451,7 @@ int main(int argc, char ** argv) { //printf("\n---\n"); //printf("resetting: '"); //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); + // printf("%s", llama_token_to_piece(ctx, embd[i])); //} //printf("'\n"); //printf("\n---\n"); @@ -504,7 +504,7 @@ int main(int argc, char ** argv) { input_size = embd_guidance.size(); //fprintf(stderr, "\n---------------------\n"); //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); //} //fprintf(stderr, "\n---------------------\n"); } else { @@ -663,7 +663,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - printf("%s", llama_token_to_str(ctx, id).c_str()); + printf("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stdout); } @@ -679,7 +679,7 @@ int main(int argc, char ** argv) { if (params.antiprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); + last_output += llama_token_to_piece(ctx, id); } is_antiprompt = false; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3db61b7541171..573bc4ef988a6 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -87,7 +87,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_str(ctx, next_token); + auto next_token_str = llama_token_to_piece(ctx, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_str(ctx2, next_token); + auto next_token_str = llama_token_to_piece(ctx2, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3300553f9b397..615801fe53b43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_str(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } @@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line, // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); + std::string out = token == -1 ? 
"" : llama_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) if (out.size() == 1 && (out[0] & 0x80) == 0x80) @@ -566,7 +566,7 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_str(ctx, embd.back()); + // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -613,7 +613,7 @@ struct llama_server_context { const completion_token_output token_with_probs = nextToken(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; if (params.n_probs > 0) @@ -1248,7 +1248,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { struct token_translator { llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); } + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); } }; @@ -1358,7 +1358,7 @@ int main(int argc, char **argv) while (llama.has_next_token) { const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok); stop_pos = llama.findStoppingStrings(llama.generated_text, token_text.size(), STOP_FULL); @@ -1389,7 +1389,7 @@ int main(int argc, char **argv) if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { continue; } - const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); size_t pos = std::min(sent_count, llama.generated_text.size()); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 132f7fbf912bb..4ee85faca9f4a 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { } // print the new token : - printf("%s", llama_token_to_str(ctx, new_token_id).c_str()); + printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // push this new token for next evaluation diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 79b117df72fd3..12d153417968b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) { void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_str(ctx, token).c_str()); + printf("%s", llama_token_to_piece(ctx, token).c_str()); } void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { @@ -2202,7 +2202,7 @@ int 
tokenize_file(struct llama_context * lctx, const char * filename, std::vecto const char * in = buf.data(); const char * end = buf.data() + buf.size(); for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_str(lctx, out[i]); + std::string s = llama_token_to_piece(lctx, out[i]); int len = s.length(); if (in >= end) { printf("%s: unexpected end of original text.\n", __func__); diff --git a/llama.cpp b/llama.cpp index 0453dd9cf6f7b..a9a2b4d5c5f50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -796,12 +796,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -3374,6 +3374,11 @@ struct llm_tokenizer_bpe { static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + // OG tokenizer behavior: + // + // tokenizer.encode('', add_bos=True) returns [1] + // tokenizer.encode('', add_bos=False) returns [] + if (bos && vocab.special_bos_id != -1) { output.push_back(vocab.special_bos_id); } @@ -3382,11 +3387,12 @@ static std::vector llama_tokenize_internal(const llama_vocab & return output; } - raw_text = " " + raw_text; - switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { + // without adding this leading whitespace, we do not get the same results as the original tokenizer + raw_text = " " + raw_text; + llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); tokenizer.tokenize(raw_text, output); @@ -4079,16 +4085,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const std::string text = llama_token_to_text(ctx, id); + const llama_token id = candidates->data[i].id; + const std::string piece = llama_token_to_str(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; } - } else if (text.empty() || text[0] == 0) { + } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -4292,10 +4298,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string text = llama_token_to_text(ctx, token); + const std::string piece = llama_token_to_str(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; 
++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -6089,12 +6095,12 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_str_with_model(&ctx->model, token, buf, length); +int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { + return llama_token_to_piece_with_model(&ctx->model, token, buf, length); } -// does not write null-terminator to str -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { +// does not write null-terminator to buf +int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_model_n_vocab(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; diff --git a/llama.h b/llama.h index b77dd7735fdf0..f9a7300eaa3e2 100644 --- a/llama.h +++ b/llama.h @@ -381,15 +381,17 @@ extern "C" { int n_max_tokens, bool add_bos); - // Token Id -> String. Uses the vocabulary in the provided context - // Does not write null terminator to the buffer - LLAMA_API int llama_token_to_str( + // Token Id -> Piece. + // Uses the vocabulary in the provided context. + // Does not write null terminator to the buffer. + // Use code is responsible to remove the leading whitespace of the first non-BOS token. + LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token, char * buf, int length); - LLAMA_API int llama_token_to_str_with_model( + LLAMA_API int llama_token_to_piece_with_model( const struct llama_model * model, llama_token token, char * buf, diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 8a86ad4e66f06..68e39259745a0 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,14 +6,6 @@ #include #include -static std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - static const std::map> & k_tests() { static std::map> _k_tests = { { "" , { }, }, diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 982615e6096c5..2d74808bbe586 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -13,7 +13,7 @@ tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') tests = [ - "" + "", " ", " ", " ", @@ -49,3 +49,10 @@ print('\nwithout bos:') print(tokenizer.encode(text, add_bos=False)) print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) + +print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' +print("'" + tokenizer.id_to_piece(29871) + "'") # '_' +print("'" + tokenizer.decode([15043]) + "'") # 'Hello' +print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' +print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' +print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp index bd607d12bb1cd..ce4f2898ce49a 100644 --- a/tests/test-tokenizer-1.cpp +++ b/tests/test-tokenizer-1.cpp @@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) { return result; } -static std::string unescape_whitespace(llama_context * ctx, const std::vector & tokens) { - 
std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); @@ -72,13 +64,13 @@ int main(int argc, char **argv) { const int n_vocab = llama_n_vocab(ctx); for (int i = 0; i < n_vocab; ++i) { - std::string forward = llama_token_to_str(ctx, i); + std::string forward = llama_token_to_piece(ctx, i); std::vector tokens = llama_tokenize(ctx, forward, false); if (tokens.size() == 1) { if (i != tokens[0]) { - std::string backward = llama_token_to_str(ctx, tokens[0]); + std::string backward = llama_token_to_piece(ctx, tokens[0]); fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", - __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str()); + __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str()); return 2; } } From 1e7a033f10891e502e19b19ebc7da918409fe21e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:42:33 +0300 Subject: [PATCH 04/12] common : add comments --- common/common.h | 6 ++++++ llama.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index cb1627fc62341..1c1acf98916b1 100644 --- a/common/common.h +++ b/common/common.h @@ -116,15 +116,21 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` // removes the leading space from the first non-BOS token std::string llama_detokenize( llama_context * ctx, diff --git a/llama.h b/llama.h index f9a7300eaa3e2..b084fe23c8fcc 100644 --- a/llama.h +++ b/llama.h @@ -384,7 +384,7 @@ extern "C" { // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. - // Use code is responsible to remove the leading whitespace of the first non-BOS token. + // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. 
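As a concrete sketch of that contract (an editor's illustration, not part of this patch: it simply mirrors the llama_detokenize helper this series adds to common.cpp, and the detokenize_example name is invented here):

#include <string>
#include <vector>
#include "llama.h"
#include "common.h"   // for the std::string llama_token_to_piece() wrapper

// Illustrative only: join token pieces into text, dropping the leading space
// that the SPM tokenizer attaches to the first non-BOS token.
static std::string detokenize_example(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos_id = llama_token_bos(ctx);
    std::string result;
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::string piece = llama_token_to_piece(ctx, tokens[i]);
        const bool is_first = (tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0);
        if (is_first && !piece.empty() && piece[0] == ' ') {
            piece = piece.substr(1); // " Hello" -> "Hello"
        }
        result += piece;
    }
    return result;
}

The leading space exists because tokenization now prefixes the raw text with whitespace to match the original LLaMA tokenizer; the declaration the comment refers to follows.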
LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token, From dfa058ef73972878092ceda2c235aada28c9cf99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:51:35 +0300 Subject: [PATCH 05/12] examples : no longer manually add leading space when tokenizing --- examples/main/main.cpp | 1 - examples/server/server.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6f312b38b9730..3cb79b79eb5f6 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -211,7 +211,6 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - params.cfg_negative_prompt.insert(0, 1, ' '); guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 615801fe53b43..e13bf76e654cb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -286,7 +286,6 @@ struct llama_server_context std::vector p; if (first) { - s.insert(0, 1, ' '); // add a space if it's the first p = ::llama_tokenize(ctx, s, add_bos); first = false; } @@ -309,7 +308,6 @@ struct llama_server_context else { auto s = json_prompt.template get(); - s.insert(0, 1, ' '); // always add a first space prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } From 70005bd5c9b96296a9339d60fd1e7e87ef97f4bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 18:05:59 +0300 Subject: [PATCH 06/12] tests : use Python to generate tokenizer tests for C++ --- tests/test-tokenizer-0.cpp | 69 +++++++++++++++++++------------------- tests/test-tokenizer-0.py | 12 +++++++ 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 68e39259745a0..f7c4dcf77987d 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,42 +6,34 @@ #include #include +// generate using test-tokenizer-0.py static const std::map> & k_tests() { static std::map> _k_tests = { - { "" , { }, }, - { " ", { 259, }, }, - { " ", { 1678, }, }, - { " ", { 268, }, }, - { "\t", { 29871, 12, }, }, - { "\n", { 29871, 13, }, }, - { "\t\n", { 29871, 12, 13, }, }, - { "Hello world", { 15043, 3186, }, }, - { " Hello world", { 29871, 15043, 3186, }, }, - { "Hello World", { 15043, 2787, }, }, - { " Hello World", { 29871, 15043, 2787, }, }, - { " Hello World!", { 29871, 15043, 2787, 29991, }, }, - { "Hello, world!", { 15043, 29892, 3186, 29991, }, }, - { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, }, - { " this is πŸ¦™.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", - { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, - { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 
29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 15043, }, }, - { " Hello", { 29871, 15043, }, }, - { " Hello", { 259, 15043, }, }, - { " Hello", { 1678, 15043, }, }, - { " Hello", { 268, 15043, }, }, - { " Hello\n Hello", { 268, 15043, 13, 1678, 15043, }, }, + { "" , { }, }, + { " " , { 259, }, }, + { " " , { 1678, }, }, + { " " , { 268, }, }, + { "\t" , { 29871, 12, }, }, + { "\n" , { 29871, 13, }, }, + { "\t\n" , { 29871, 12, 13, }, }, + { "Hello world" , { 15043, 3186, }, }, + { " Hello world" , { 29871, 15043, 3186, }, }, + { "Hello World" , { 15043, 2787, }, }, + { " Hello World" , { 29871, 15043, 2787, }, }, + { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello" , { 15043, }, }, + { " Hello" , { 29871, 15043, }, }, + { " Hello" , { 259, 15043, }, }, + { " Hello" , { 1678, 15043, }, }, + { " Hello" , { 268, 15043, }, }, + { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; @@ -99,7 +91,14 @@ int main(int argc, char **argv) { const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str()); + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str()); + printf("tok: "); + for (const auto & tok : res_bos) { + printf("%d ", tok); + } + printf("\n"); bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 2d74808bbe586..722ba81118f75 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -56,3 +56,15 @@ print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text, add_bos=False) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") From e4324cbd4d367fb9ddaaef98c0929acebbf22954 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 19:21:22 +0300 Subject: 
[PATCH 07/12] tests : add option to tokenize text files ggml-ci --- tests/test-tokenizer-0.cpp | 44 +++++++++++++++++++++++++++++++++++++- tests/test-tokenizer-0.py | 18 ++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index f7c4dcf77987d..5c58942085206 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -5,6 +5,7 @@ #include #include #include +#include // generate using test-tokenizer-0.py static const std::map> & k_tests() { @@ -41,12 +42,17 @@ static const std::map> & k_tests() { int main(int argc, char **argv) { if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); return 1; } const std::string fname = argv[1]; + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); llama_model * model; @@ -131,6 +137,42 @@ int main(int argc, char **argv) { } } + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + } + llama_free_model(model); llama_free(ctx); diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 722ba81118f75..a21e9ed70c5b8 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -6,6 +6,7 @@ parser = argparse.ArgumentParser() parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") args = parser.parse_args() dir_tokenizer = args.dir_tokenizer @@ -68,3 +69,20 @@ for x in res: print("%7d," % x, end='') print(" }, },") + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s, add_bos=True) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) From eb8b3264f6054a9f688c3df8ff441aca400ad426 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 00:41:44 +0300 Subject: [PATCH 08/12] tests : add test-tokenizer-1.py --- tests/test-tokenizer-0.py | 7 ++++ tests/test-tokenizer-1.py | 83 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tests/test-tokenizer-1.py diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index a21e9ed70c5b8..bc164ee296cb1 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -1,3 +1,5 @@ +# tests with SPM tokenizer + import os import sys import 
argparse @@ -70,6 +72,11 @@ print("%7d," % x, end='') print(" }, },") +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + fname_tok = args.fname_tok if fname_tok: print('tokenizing file: ', fname_tok) diff --git a/tests/test-tokenizer-1.py b/tests/test-tokenizer-1.py new file mode 100644 index 0000000000000..9c8c1c7d1d3ca --- /dev/null +++ b/tests/test-tokenizer-1.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import os +import sys +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) From ab3ba64f62e924391b61622423caf1e500bc28d7 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Sat, 26 Aug 2023 23:03:01 +0200 Subject: [PATCH 09/12] llama.cpp : fix LF token --- llama.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 8681c02562580..f67d6688bb253 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1636,6 +1636,7 @@ static void llm_load_hparams( // TODO: This should probably be in llama.h static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( llama_model_loader & ml, @@ -1737,7 +1738,11 @@ static void llm_load_vocab( } // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' - vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else { + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + } // 
special tokens GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); From dbcf470bc6606d67ff2916c6d3377741fad34090 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 00:44:49 +0300 Subject: [PATCH 10/12] hellaswag : move the concat space for clarity --- examples/perplexity/perplexity.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fd89852d6dedd..b596d062613d7 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -392,7 +392,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_data[i].context = prompt_lines[idx*6]; hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j=0; j < 4; j++) { - hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; + hs_data[i].ending[j] = prompt_lines[idx*6+2+j]; } // Delete the selected random example from the prompt @@ -417,7 +417,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t context_size = context_embd.size(); for (int i = 0; i < 4; ++i) { - ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos); + ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos); for (int k = 0; k < int(context_size); ++k) { if (ending_tokens[i][k] != context_embd[k]) { fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k); From 3bb0f84932a0e0302ef4deae54917a5bd18ae4c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 11:26:48 +0300 Subject: [PATCH 11/12] tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci --- tests/CMakeLists.txt | 6 +- tests/test-tokenizer-0-falcon.cpp | 180 ++++++++++++++++++ ...enizer-1.py => test-tokenizer-0-falcon.py} | 0 ...nizer-0.cpp => test-tokenizer-0-llama.cpp} | 4 +- ...kenizer-0.py => test-tokenizer-0-llama.py} | 0 5 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 tests/test-tokenizer-0-falcon.cpp rename tests/{test-tokenizer-1.py => test-tokenizer-0-falcon.py} (100%) rename tests/{test-tokenizer-0.cpp => test-tokenizer-0-llama.cpp} (97%) rename tests/{test-tokenizer-0.py => test-tokenizer-0-llama.py} (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2afaf86b11450..ca1f39d31b081 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,8 +25,10 @@ endfunction() llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-sampling.cpp) -llama_build_executable(test-tokenizer-0.cpp) -llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-llama.cpp) +llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-falcon.cpp) +#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1.cpp) # test-tokenizer-1 requires a BPE vocab. re-enable when we have one. 
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp new file mode 100644 index 0000000000000..3063e2c64c549 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.cpp @@ -0,0 +1,180 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-falcon.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 204, }, }, + { " " , { 258, }, }, + { " " , { 466, }, }, + { "\t" , { 192, }, }, + { "\n" , { 193, }, }, + { "\t\n" , { 19125, }, }, + { "Hello world" , { 9856, 1079, }, }, + { " Hello world" , { 23090, 1079, }, }, + { "Hello World" , { 9856, 2889, }, }, + { " Hello World" , { 23090, 2889, }, }, + { " Hello World!" , { 23090, 2889, 12, }, }, + { "Hello, world!" , { 9856, 23, 1079, 12, }, }, + { " Hello, world!" , { 23090, 23, 1079, 12, }, }, + { " this is πŸ¦™.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, + { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, + { "Hello" , { 9856, }, }, + { " Hello" , { 23090, }, }, + { " Hello" , { 204, 23090, }, }, + { " Hello" , { 258, 23090, }, }, + { " Hello" , { 466, 23090, }, }, + { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + const int n_vocab = llama_n_vocab(ctx); + + if (n_vocab != 65024) { + fprintf(stderr, "%s : expected 65024 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + 
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize(ctx, res).c_str(), + llama_detokenize(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 
diff --git a/tests/test-tokenizer-1.py b/tests/test-tokenizer-0-falcon.py
similarity index 100%
rename from tests/test-tokenizer-1.py
rename to tests/test-tokenizer-0-falcon.py
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0-llama.cpp
similarity index 97%
rename from tests/test-tokenizer-0.cpp
rename to tests/test-tokenizer-0-llama.cpp
index 5c58942085206..c28cd2753146f 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -7,7 +7,7 @@
 #include <map>
 #include <vector>
 
-// generate using test-tokenizer-0.py
+// generate using test-tokenizer-0-llama.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
         { ""                      , { }, },
@@ -171,6 +171,8 @@ int main(int argc, char **argv) {
 
             ofs << "\n";
         }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
     }
 
     llama_free_model(model);
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0-llama.py
similarity index 100%
rename from tests/test-tokenizer-0.py
rename to tests/test-tokenizer-0-llama.py

From 841983fe47f7488429c06bb6dabaa1274d01bca0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 27 Aug 2023 13:04:04 +0300
Subject: [PATCH 12/12] common : temporary separate llama_detokenize calls for
 SPM and BPE

---
 common/common.cpp                 | 14 +++++++++++++-
 common/common.h                   | 11 ++++++++++-
 tests/test-tokenizer-0-falcon.cpp | 12 +++++-------
 tests/test-tokenizer-0-llama.cpp  | 12 +++++-------
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4e40a52501446..0d91a6a35acaa 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -747,7 +747,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
     return std::string(result.data(), result.size());
 }
 
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
     const llama_token bos_id = llama_token_bos(ctx);
 
     std::string piece;
@@ -767,3 +767,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 
     return result;
 }
+
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    return result;
+}
diff --git a/common/common.h b/common/common.h
index 1c1acf98916b1..97fda2be78b51 100644
--- a/common/common.h
+++ b/common/common.h
@@ -129,9 +129,18 @@ std::string llama_token_to_piece(
     const struct llama_context * ctx,
                     llama_token   token);
 
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+// that takes into account the tokenizer type and decides how to handle the leading space
+//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // removes the leading space from the first non-BOS token
-std::string llama_detokenize(
+std::string llama_detokenize_spm(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);
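Until that unified llama_detokenize C API exists, callers must pick the matching variant themselves. A minimal sketch of a dispatching wrapper, mirroring the TODO above; the wrapper name is hypothetical, while llama_vocab_type and the vocab-type constants are used exactly as in the test changes below:

    // Hypothetical wrapper (not in the patch): choose the detokenizer based on
    // the vocab type reported by the context.
    static std::string llama_detokenize_any(llama_context * ctx, const std::vector<llama_token> & tokens) {
        if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
            return llama_detokenize_spm(ctx, tokens); // SPM: strip the leading space added at tokenization
        }
        return llama_detokenize_bpe(ctx, tokens);     // BPE: plain concatenation, nothing to strip
    }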
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 3063e2c64c549..836fb8ad27109 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -82,10 +82,8 @@ int main(int argc, char **argv) {
         }
     }
 
-    const int n_vocab = llama_n_vocab(ctx);
-
-    if (n_vocab != 65024) {
-        fprintf(stderr, "%s : expected 65024 tokens, got %d\n", __func__, n_vocab);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
+        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
         llama_free_model(model);
         llama_free(ctx);
         return 2;
@@ -98,7 +96,7 @@ int main(int argc, char **argv) {
 
         printf("\n");
         printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
         printf("tok: ");
         for (const auto & tok : res) {
             printf("%d ", tok);
@@ -116,8 +114,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_bpe(ctx, res).c_str(),
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index c28cd2753146f..8630742c612bf 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -82,10 +82,8 @@ int main(int argc, char **argv) {
         }
     }
 
-    const int n_vocab = llama_n_vocab(ctx);
-
-    if (n_vocab != 32000) {
-        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
+        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
         llama_free_model(model);
         llama_free(ctx);
         return 2;
@@ -99,7 +97,7 @@ int main(int argc, char **argv) {
 
         printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str());
+        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
         printf("tok: ");
         for (const auto & tok : res_bos) {
             printf("%d ", tok);
@@ -120,8 +118,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res_nobos).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_spm(ctx, res_nobos).c_str(),
+                llama_detokenize_spm(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);