From 5cad62bce41546ddae8908eeb5bb06476f4c5bd8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 15:55:23 +0300 Subject: [PATCH 01/12] tests : write a Python tokenizer test (wip) --- tests/test-tokenizer-0.cpp | 40 ++++++++++++++++++++------------------ tests/test-tokenizer-0.py | 18 +++++++++++++++++ 2 files changed, 39 insertions(+), 19 deletions(-) create mode 100644 tests/test-tokenizer-0.py diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 7e9ac9188d5c5..4bed054d6cae0 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -16,36 +16,38 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector> & k_tests() { static std::map> _k_tests = { - { " ", {1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, + { " ", { 1, 259, }, }, + { " ", { 1, 1678, }, }, + { " ", { 1, 268, }, }, + { "\t", { 1, 29871, 12, }, }, + { "\n", { 1, 29871, 13, }, }, + { "\t\n", { 1, 29871, 12, 13, }, }, { "Hello world", { 1, 15043, 3186, }, }, { " Hello world", { 1, 29871, 15043, 3186, }, }, { "Hello World", { 1, 15043, 2787, }, }, { " Hello World", { 1, 29871, 15043, 2787, }, }, { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, + { "Hello, world!", { 1, 15043, 29892, 3186, 29991, }, }, + { " Hello, world!", { 1, 29871, 15043, 29892, 3186, 29991, }, }, { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, + 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, + 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, + 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, + 136, 228, 162, 132, 228, 161, 140, }, }, { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 1, 15043 }, }, - { " Hello", { 1, 29871, 15043 }, }, - { " Hello", { 1, 259, 15043 }, }, - { " Hello", { 1, 1678, 15043 }, }, - { " Hello", { 1, 268, 15043 }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, }, + 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, + 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, + 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello", { 1, 15043, }, }, + { " Hello", { 1, 29871, 15043, }, }, + { " Hello", { 1, 259, 15043, }, }, + { " Hello", { 1, 1678, 15043, }, }, + { " Hello", { 1, 268, 15043, }, }, + { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py new file mode 100644 index 0000000000000..d21f8b5a13ae5 --- /dev/null +++ b/tests/test-tokenizer-0.py @@ -0,0 +1,18 @@ +import os +import sys +import argparse + +from 
sentencepiece import SentencePieceProcessor + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') + +text = 'Hello, world!' +print(text) +print(tokenizer.encode(text, add_bos=True)) +print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) From 5d0ffb69f5c6b8aa10ee2bb88c6a601a46df33d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:08:59 +0300 Subject: [PATCH 02/12] llama : prefix input text for tokenization with whitespace --- examples/embedding/embedding.cpp | 3 -- examples/main/main.cpp | 5 -- llama.cpp | 21 ++++---- tests/test-tokenizer-0.cpp | 88 +++++++++++++++++--------------- tests/test-tokenizer-0.py | 41 +++++++++++++-- 5 files changed, 95 insertions(+), 63 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38395c75b0b5b..abe5c87818fd0 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -56,9 +56,6 @@ int main(int argc, char ** argv) { int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - // tokenize the prompt auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4665b82fe7f97..0d3783d6758eb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -195,11 +195,6 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector embd_inp; - if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) { - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - } - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { diff --git a/llama.cpp b/llama.cpp index b0a3b5768f3dd..0453dd9cf6f7b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1635,7 +1635,7 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); static void llm_load_vocab( llama_model_loader & ml, @@ -3026,10 +3026,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(buf); } -static std::string llama_escape_whitespace(const std::string& text) { - std::string result = text; - replace_all(result, " ", "\xe2\x96\x81"); - return result; +static void llama_escape_whitespace(std::string & text) { + replace_all(text, " ", "\xe2\x96\x81"); } static void llama_unescape_whitespace(std::string & word) { @@ -3373,22 +3371,25 @@ struct llm_tokenizer_bpe { llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + if (raw_text.empty()) { return output; } - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } + raw_text = " " + raw_text; switch 
(vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { llm_tokenizer_spm tokenizer(vocab); - tokenizer.tokenize(llama_escape_whitespace(raw_text), output); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); } break; case LLAMA_VOCAB_TYPE_BPE: { diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 4bed054d6cae0..8a86ad4e66f06 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,7 +6,7 @@ #include #include -static std::string unescape_whitespace(llama_context* ctx, const std::vector& tokens) { +static std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { std::string result; for (size_t i = 0; i < tokens.size(); ++i) { result += llama_token_to_str(ctx, tokens[i]); @@ -16,38 +16,40 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector> & k_tests() { static std::map> _k_tests = { - { " ", { 1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, - { "Hello world", { 1, 15043, 3186, }, }, - { " Hello world", { 1, 29871, 15043, 3186, }, }, - { "Hello World", { 1, 15043, 2787, }, }, - { " Hello World", { 1, 29871, 15043, 2787, }, }, - { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, - { "Hello, world!", { 1, 15043, 29892, 3186, 29991, }, }, - { " Hello, world!", { 1, 29871, 15043, 29892, 3186, 29991, }, }, - { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, + { "" , { }, }, + { " ", { 259, }, }, + { " ", { 1678, }, }, + { " ", { 268, }, }, + { "\t", { 29871, 12, }, }, + { "\n", { 29871, 13, }, }, + { "\t\n", { 29871, 12, 13, }, }, + { "Hello world", { 15043, 3186, }, }, + { " Hello world", { 29871, 15043, 3186, }, }, + { "Hello World", { 15043, 2787, }, }, + { " Hello World", { 29871, 15043, 2787, }, }, + { " Hello World!", { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!", { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, + 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, + 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, + 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, + 136, 228, 162, 132, 228, 161, 140, }, }, { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, 
}, }, - { "Hello", { 1, 15043, }, }, - { " Hello", { 1, 29871, 15043, }, }, - { " Hello", { 1, 259, 15043, }, }, - { " Hello", { 1, 1678, 15043, }, }, - { " Hello", { 1, 268, 15043, }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043, }, }, + { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, + 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, + 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, + 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello", { 15043, }, }, + { " Hello", { 29871, 15043, }, }, + { " Hello", { 259, 15043, }, }, + { " Hello", { 1678, 15043, }, }, + { " Hello", { 268, 15043, }, }, + { " Hello\n Hello", { 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; @@ -102,15 +104,18 @@ int main(int argc, char **argv) { bool success = true; for (const auto & test_kv : k_tests()) { - // Add a space in front of the first character to match OG llama tokenizer behavior - std::vector res = llama_tokenize(ctx, " " + test_kv.first, true); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", - __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); + const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); + const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); - bool correct = res.size() == test_kv.second.size(); + fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str()); - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (res[i] != test_kv.second[i]) { + bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; + + for (int i = 0; i < (int) res_nobos.size() && correct; ++i) { + if (test_kv.second[i] != res_bos[i + 1]) { + correct = false; + } + if (test_kv.second[i] != res_nobos[i]) { correct = false; } } @@ -118,14 +123,15 @@ int main(int argc, char **argv) { if (!correct) { fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str()); + llama_detokenize(ctx, res_nobos).c_str(), + llama_detokenize(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : expected tokens: ", __func__); for (const auto & t : test_kv.second) { fprintf(stderr, "%6d, ", t); } fprintf(stderr, "\n"); fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { + for (const auto & t : res_nobos) { fprintf(stderr, "%6d, ", t); } fprintf(stderr, "\n"); diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index d21f8b5a13ae5..982615e6096c5 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -12,7 +12,40 @@ tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') -text = 'Hello, world!' 
-print(text) -print(tokenizer.encode(text, add_bos=True)) -print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) +tests = [ + "" + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + + +for text in tests: + print('text: ', text) + print('\nwith bos:') + print(tokenizer.encode(text, add_bos=True)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) + print('\nwithout bos:') + print(tokenizer.encode(text, add_bos=False)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) From 9668aa115c9d3204a8a04de0129488a91cb48440 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:35:45 +0300 Subject: [PATCH 03/12] llama : distinguish pieces from decoded text + fix detokenization --- common/common.cpp | 27 ++++++++++++-- common/common.h | 7 +++- examples/beam_search/beam_search.cpp | 6 ++-- examples/embd-input/embd-input-lib.cpp | 2 +- examples/embedding/embedding.cpp | 2 +- examples/main/main.cpp | 14 ++++---- examples/save-load-state/save-load-state.cpp | 4 +-- examples/server/server.cpp | 14 ++++---- examples/simple/simple.cpp | 4 +-- .../train-text-from-scratch.cpp | 4 +-- llama.cpp | 36 +++++++++++-------- llama.h | 10 +++--- tests/test-tokenizer-0.cpp | 8 ----- tests/test-tokenizer-0.py | 9 ++++- tests/test-tokenizer-1.cpp | 14 ++------ 15 files changed, 93 insertions(+), 68 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e50f60..4e40a52501446 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,12 +733,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +746,24 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + diff --git a/common/common.h b/common/common.h index ce61265f8c124..cb1627fc62341 100644 --- a/common/common.h +++ b/common/common.h @@ -121,6 +121,11 @@ std::vector llama_tokenize( const std::string & text, bool add_bos); 
-std::string llama_token_to_str( +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); + +// removes the leading space from the first non-BOS token +std::string llama_detokenize( + llama_context * ctx, + const std::vector & tokens); diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp index 1c04fabc21b3d..42c7c72542321 100644 --- a/examples/beam_search/beam_search.cpp +++ b/examples/beam_search/beam_search.cpp @@ -35,7 +35,7 @@ struct ostream_beam_view { std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]); + os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); } return os << ')'; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) for( auto id : tokens_list ) { - std::cout << llama_token_to_str(ctx, id); + std::cout << llama_token_to_piece(ctx, id); } std::cout << std::flush; @@ -175,7 +175,7 @@ int main(int argc, char ** argv) std::cout << "\n\n"; for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_str(ctx,token_id); + std::cout << llama_token_to_piece(ctx,token_id); } std::cout << std::endl; diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 8a6ad882e8fa8..036bdb3987f34 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) { if (id == llama_token_eos(ctx)) { ret = ""; } else { - ret = llama_token_to_str(ctx, id); + ret = llama_token_to_piece(ctx, id); } eval_id(mymodel, id); return ret.c_str(); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index abe5c87818fd0..93d583b5ce151 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -64,7 +64,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "\n"); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0d3783d6758eb..6f312b38b9730 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -280,7 +280,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { @@ -288,14 +288,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, 
guidance_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "'\n"); } @@ -451,7 +451,7 @@ int main(int argc, char ** argv) { //printf("\n---\n"); //printf("resetting: '"); //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); + // printf("%s", llama_token_to_piece(ctx, embd[i])); //} //printf("'\n"); //printf("\n---\n"); @@ -504,7 +504,7 @@ int main(int argc, char ** argv) { input_size = embd_guidance.size(); //fprintf(stderr, "\n---------------------\n"); //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); //} //fprintf(stderr, "\n---------------------\n"); } else { @@ -663,7 +663,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - printf("%s", llama_token_to_str(ctx, id).c_str()); + printf("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stdout); } @@ -679,7 +679,7 @@ int main(int argc, char ** argv) { if (params.antiprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); + last_output += llama_token_to_piece(ctx, id); } is_antiprompt = false; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3db61b7541171..573bc4ef988a6 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -87,7 +87,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_str(ctx, next_token); + auto next_token_str = llama_token_to_piece(ctx, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_str(ctx2, next_token); + auto next_token_str = llama_token_to_piece(ctx2, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3300553f9b397..615801fe53b43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_str(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } @@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line, // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); + std::string out = token == -1 ? 
"" : llama_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) if (out.size() == 1 && (out[0] & 0x80) == 0x80) @@ -566,7 +566,7 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_str(ctx, embd.back()); + // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -613,7 +613,7 @@ struct llama_server_context { const completion_token_output token_with_probs = nextToken(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; if (params.n_probs > 0) @@ -1248,7 +1248,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { struct token_translator { llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); } + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); } }; @@ -1358,7 +1358,7 @@ int main(int argc, char **argv) while (llama.has_next_token) { const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok); stop_pos = llama.findStoppingStrings(llama.generated_text, token_text.size(), STOP_FULL); @@ -1389,7 +1389,7 @@ int main(int argc, char **argv) if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { continue; } - const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); size_t pos = std::min(sent_count, llama.generated_text.size()); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 132f7fbf912bb..4ee85faca9f4a 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { } // print the new token : - printf("%s", llama_token_to_str(ctx, new_token_id).c_str()); + printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // push this new token for next evaluation diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 79b117df72fd3..12d153417968b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) { void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_str(ctx, token).c_str()); + printf("%s", llama_token_to_piece(ctx, token).c_str()); } void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { @@ -2202,7 +2202,7 @@ int 
tokenize_file(struct llama_context * lctx, const char * filename, std::vecto const char * in = buf.data(); const char * end = buf.data() + buf.size(); for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_str(lctx, out[i]); + std::string s = llama_token_to_piece(lctx, out[i]); int len = s.length(); if (in >= end) { printf("%s: unexpected end of original text.\n", __func__); diff --git a/llama.cpp b/llama.cpp index 0453dd9cf6f7b..a9a2b4d5c5f50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -796,12 +796,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -3374,6 +3374,11 @@ struct llm_tokenizer_bpe { static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + // OG tokenizer behavior: + // + // tokenizer.encode('', add_bos=True) returns [1] + // tokenizer.encode('', add_bos=False) returns [] + if (bos && vocab.special_bos_id != -1) { output.push_back(vocab.special_bos_id); } @@ -3382,11 +3387,12 @@ static std::vector llama_tokenize_internal(const llama_vocab & return output; } - raw_text = " " + raw_text; - switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { + // without adding this leading whitespace, we do not get the same results as the original tokenizer + raw_text = " " + raw_text; + llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); tokenizer.tokenize(raw_text, output); @@ -4079,16 +4085,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const std::string text = llama_token_to_text(ctx, id); + const llama_token id = candidates->data[i].id; + const std::string piece = llama_token_to_str(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; } - } else if (text.empty() || text[0] == 0) { + } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -4292,10 +4298,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string text = llama_token_to_text(ctx, token); + const std::string piece = llama_token_to_str(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; 
++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -6089,12 +6095,12 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_str_with_model(&ctx->model, token, buf, length); +int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { + return llama_token_to_piece_with_model(&ctx->model, token, buf, length); } -// does not write null-terminator to str -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { +// does not write null-terminator to buf +int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_model_n_vocab(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; diff --git a/llama.h b/llama.h index b77dd7735fdf0..f9a7300eaa3e2 100644 --- a/llama.h +++ b/llama.h @@ -381,15 +381,17 @@ extern "C" { int n_max_tokens, bool add_bos); - // Token Id -> String. Uses the vocabulary in the provided context - // Does not write null terminator to the buffer - LLAMA_API int llama_token_to_str( + // Token Id -> Piece. + // Uses the vocabulary in the provided context. + // Does not write null terminator to the buffer. + // Use code is responsible to remove the leading whitespace of the first non-BOS token. + LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token, char * buf, int length); - LLAMA_API int llama_token_to_str_with_model( + LLAMA_API int llama_token_to_piece_with_model( const struct llama_model * model, llama_token token, char * buf, diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 8a86ad4e66f06..68e39259745a0 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,14 +6,6 @@ #include #include -static std::string llama_detokenize(llama_context * ctx, const std::vector & tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - static const std::map> & k_tests() { static std::map> _k_tests = { { "" , { }, }, diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 982615e6096c5..2d74808bbe586 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -13,7 +13,7 @@ tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') tests = [ - "" + "", " ", " ", " ", @@ -49,3 +49,10 @@ print('\nwithout bos:') print(tokenizer.encode(text, add_bos=False)) print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) + +print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' +print("'" + tokenizer.id_to_piece(29871) + "'") # '_' +print("'" + tokenizer.decode([15043]) + "'") # 'Hello' +print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' +print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' +print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp index bd607d12bb1cd..ce4f2898ce49a 100644 --- a/tests/test-tokenizer-1.cpp +++ b/tests/test-tokenizer-1.cpp @@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) { return result; } -static std::string unescape_whitespace(llama_context * ctx, const std::vector & tokens) { - 
std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); @@ -72,13 +64,13 @@ int main(int argc, char **argv) { const int n_vocab = llama_n_vocab(ctx); for (int i = 0; i < n_vocab; ++i) { - std::string forward = llama_token_to_str(ctx, i); + std::string forward = llama_token_to_piece(ctx, i); std::vector tokens = llama_tokenize(ctx, forward, false); if (tokens.size() == 1) { if (i != tokens[0]) { - std::string backward = llama_token_to_str(ctx, tokens[0]); + std::string backward = llama_token_to_piece(ctx, tokens[0]); fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", - __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str()); + __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str()); return 2; } } From 1e7a033f10891e502e19b19ebc7da918409fe21e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:42:33 +0300 Subject: [PATCH 04/12] common : add comments --- common/common.h | 6 ++++++ llama.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index cb1627fc62341..1c1acf98916b1 100644 --- a/common/common.h +++ b/common/common.h @@ -116,15 +116,21 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` // removes the leading space from the first non-BOS token std::string llama_detokenize( llama_context * ctx, diff --git a/llama.h b/llama.h index f9a7300eaa3e2..b084fe23c8fcc 100644 --- a/llama.h +++ b/llama.h @@ -384,7 +384,7 @@ extern "C" { // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. - // Use code is responsible to remove the leading whitespace of the first non-BOS token. + // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. 
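As a concrete sketch of that contract (an editor's illustration, not part of this patch: it simply mirrors the llama_detokenize helper this series adds to common.cpp, and the detokenize_example name is invented here):

#include <string>
#include <vector>
#include "llama.h"
#include "common.h"   // for the std::string llama_token_to_piece() wrapper

// Illustrative only: join token pieces into text, dropping the leading space
// that the SPM tokenizer attaches to the first non-BOS token.
static std::string detokenize_example(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos_id = llama_token_bos(ctx);
    std::string result;
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::string piece = llama_token_to_piece(ctx, tokens[i]);
        const bool is_first = (tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0);
        if (is_first && !piece.empty() && piece[0] == ' ') {
            piece = piece.substr(1); // " Hello" -> "Hello"
        }
        result += piece;
    }
    return result;
}

The leading space exists because tokenization now prefixes the raw text with whitespace to match the original LLaMA tokenizer; the declaration the comment refers to follows.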
LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token, From dfa058ef73972878092ceda2c235aada28c9cf99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 17:51:35 +0300 Subject: [PATCH 05/12] examples : no longer manually add leading space when tokenizing --- examples/main/main.cpp | 1 - examples/server/server.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6f312b38b9730..3cb79b79eb5f6 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -211,7 +211,6 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - params.cfg_negative_prompt.insert(0, 1, ' '); guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 615801fe53b43..e13bf76e654cb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -286,7 +286,6 @@ struct llama_server_context std::vector p; if (first) { - s.insert(0, 1, ' '); // add a space if it's the first p = ::llama_tokenize(ctx, s, add_bos); first = false; } @@ -309,7 +308,6 @@ struct llama_server_context else { auto s = json_prompt.template get(); - s.insert(0, 1, ' '); // always add a first space prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } From 70005bd5c9b96296a9339d60fd1e7e87ef97f4bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 18:05:59 +0300 Subject: [PATCH 06/12] tests : use Python to generate tokenizer tests for C++ --- tests/test-tokenizer-0.cpp | 69 +++++++++++++++++++------------------- tests/test-tokenizer-0.py | 12 +++++++ 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 68e39259745a0..f7c4dcf77987d 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -6,42 +6,34 @@ #include #include +// generate using test-tokenizer-0.py static const std::map> & k_tests() { static std::map> _k_tests = { - { "" , { }, }, - { " ", { 259, }, }, - { " ", { 1678, }, }, - { " ", { 268, }, }, - { "\t", { 29871, 12, }, }, - { "\n", { 29871, 13, }, }, - { "\t\n", { 29871, 12, 13, }, }, - { "Hello world", { 15043, 3186, }, }, - { " Hello world", { 29871, 15043, 3186, }, }, - { "Hello World", { 15043, 2787, }, }, - { " Hello World", { 29871, 15043, 2787, }, }, - { " Hello World!", { 29871, 15043, 2787, 29991, }, }, - { "Hello, world!", { 15043, 29892, 3186, 29991, }, }, - { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, }, - { " this is πŸ¦™.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", - { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, - { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 
29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 15043, }, }, - { " Hello", { 29871, 15043, }, }, - { " Hello", { 259, 15043, }, }, - { " Hello", { 1678, 15043, }, }, - { " Hello", { 268, 15043, }, }, - { " Hello\n Hello", { 268, 15043, 13, 1678, 15043, }, }, + { "" , { }, }, + { " " , { 259, }, }, + { " " , { 1678, }, }, + { " " , { 268, }, }, + { "\t" , { 29871, 12, }, }, + { "\n" , { 29871, 13, }, }, + { "\t\n" , { 29871, 12, 13, }, }, + { "Hello world" , { 15043, 3186, }, }, + { " Hello world" , { 29871, 15043, 3186, }, }, + { "Hello World" , { 15043, 2787, }, }, + { " Hello World" , { 29871, 15043, 2787, }, }, + { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello" , { 15043, }, }, + { " Hello" , { 29871, 15043, }, }, + { " Hello" , { 259, 15043, }, }, + { " Hello" , { 1678, 15043, }, }, + { " Hello" , { 268, 15043, }, }, + { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, }; return _k_tests; @@ -99,7 +91,14 @@ int main(int argc, char **argv) { const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str()); + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str()); + printf("tok: "); + for (const auto & tok : res_bos) { + printf("%d ", tok); + } + printf("\n"); bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 2d74808bbe586..722ba81118f75 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -56,3 +56,15 @@ print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text, add_bos=False) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") From e4324cbd4d367fb9ddaaef98c0929acebbf22954 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 26 Aug 2023 19:21:22 +0300 Subject: 
[PATCH 07/12] tests : add option to tokenize text files ggml-ci --- tests/test-tokenizer-0.cpp | 44 +++++++++++++++++++++++++++++++++++++- tests/test-tokenizer-0.py | 18 ++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index f7c4dcf77987d..5c58942085206 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -5,6 +5,7 @@ #include #include #include +#include // generate using test-tokenizer-0.py static const std::map> & k_tests() { @@ -41,12 +42,17 @@ static const std::map> & k_tests() { int main(int argc, char **argv) { if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); return 1; } const std::string fname = argv[1]; + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); llama_model * model; @@ -131,6 +137,42 @@ int main(int argc, char **argv) { } } + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + } + llama_free_model(model); llama_free(ctx); diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index 722ba81118f75..a21e9ed70c5b8 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -6,6 +6,7 @@ parser = argparse.ArgumentParser() parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") args = parser.parse_args() dir_tokenizer = args.dir_tokenizer @@ -68,3 +69,20 @@ for x in res: print("%7d," % x, end='') print(" }, },") + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s, add_bos=True) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) From eb8b3264f6054a9f688c3df8ff441aca400ad426 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 00:41:44 +0300 Subject: [PATCH 08/12] tests : add test-tokenizer-1.py --- tests/test-tokenizer-0.py | 7 ++++ tests/test-tokenizer-1.py | 83 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tests/test-tokenizer-1.py diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py index a21e9ed70c5b8..bc164ee296cb1 100644 --- a/tests/test-tokenizer-0.py +++ b/tests/test-tokenizer-0.py @@ -1,3 +1,5 @@ +# tests with SPM tokenizer + import os import sys import 
argparse @@ -70,6 +72,11 @@ print("%7d," % x, end='') print(" }, },") +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + fname_tok = args.fname_tok if fname_tok: print('tokenizing file: ', fname_tok) diff --git a/tests/test-tokenizer-1.py b/tests/test-tokenizer-1.py new file mode 100644 index 0000000000000..9c8c1c7d1d3ca --- /dev/null +++ b/tests/test-tokenizer-1.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import os +import sys +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) From ab3ba64f62e924391b61622423caf1e500bc28d7 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Sat, 26 Aug 2023 23:03:01 +0200 Subject: [PATCH 09/12] llama.cpp : fix LF token --- llama.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 8681c02562580..f67d6688bb253 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1636,6 +1636,7 @@ static void llm_load_hparams( // TODO: This should probably be in llama.h static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( llama_model_loader & ml, @@ -1737,7 +1738,11 @@ static void llm_load_vocab( } // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' - vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else { + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + } // 
special tokens GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); From dbcf470bc6606d67ff2916c6d3377741fad34090 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 00:44:49 +0300 Subject: [PATCH 10/12] hellaswag : move the concat space for clarity --- examples/perplexity/perplexity.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fd89852d6dedd..b596d062613d7 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -392,7 +392,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_data[i].context = prompt_lines[idx*6]; hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j=0; j < 4; j++) { - hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; + hs_data[i].ending[j] = prompt_lines[idx*6+2+j]; } // Delete the selected random example from the prompt @@ -417,7 +417,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t context_size = context_embd.size(); for (int i = 0; i < 4; ++i) { - ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos); + ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos); for (int k = 0; k < int(context_size); ++k) { if (ending_tokens[i][k] != context_embd[k]) { fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k); From 3bb0f84932a0e0302ef4deae54917a5bd18ae4c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 11:26:48 +0300 Subject: [PATCH 11/12] tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci --- tests/CMakeLists.txt | 6 +- tests/test-tokenizer-0-falcon.cpp | 180 ++++++++++++++++++ ...enizer-1.py => test-tokenizer-0-falcon.py} | 0 ...nizer-0.cpp => test-tokenizer-0-llama.cpp} | 4 +- ...kenizer-0.py => test-tokenizer-0-llama.py} | 0 5 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 tests/test-tokenizer-0-falcon.cpp rename tests/{test-tokenizer-1.py => test-tokenizer-0-falcon.py} (100%) rename tests/{test-tokenizer-0.cpp => test-tokenizer-0-llama.cpp} (97%) rename tests/{test-tokenizer-0.py => test-tokenizer-0-llama.py} (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2afaf86b11450..ca1f39d31b081 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,8 +25,10 @@ endfunction() llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-sampling.cpp) -llama_build_executable(test-tokenizer-0.cpp) -llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-llama.cpp) +llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-falcon.cpp) +#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1.cpp) # test-tokenizer-1 requires a BPE vocab. re-enable when we have one. 
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp new file mode 100644 index 0000000000000..3063e2c64c549 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.cpp @@ -0,0 +1,180 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-falcon.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 204, }, }, + { " " , { 258, }, }, + { " " , { 466, }, }, + { "\t" , { 192, }, }, + { "\n" , { 193, }, }, + { "\t\n" , { 19125, }, }, + { "Hello world" , { 9856, 1079, }, }, + { " Hello world" , { 23090, 1079, }, }, + { "Hello World" , { 9856, 2889, }, }, + { " Hello World" , { 23090, 2889, }, }, + { " Hello World!" , { 23090, 2889, 12, }, }, + { "Hello, world!" , { 9856, 23, 1079, 12, }, }, + { " Hello, world!" , { 23090, 23, 1079, 12, }, }, + { " this is πŸ¦™.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, + { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, + { "Hello" , { 9856, }, }, + { " Hello" , { 23090, }, }, + { " Hello" , { 204, 23090, }, }, + { " Hello" , { 258, 23090, }, }, + { " Hello" , { 466, 23090, }, }, + { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + const int n_vocab = llama_n_vocab(ctx); + + if (n_vocab != 65024) { + fprintf(stderr, "%s : expected 65024 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + 
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize(ctx, res).c_str(), + llama_detokenize(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 
diff --git a/tests/test-tokenizer-1.py b/tests/test-tokenizer-0-falcon.py
similarity index 100%
rename from tests/test-tokenizer-1.py
rename to tests/test-tokenizer-0-falcon.py
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0-llama.cpp
similarity index 97%
rename from tests/test-tokenizer-0.cpp
rename to tests/test-tokenizer-0-llama.cpp
index 5c58942085206..c28cd2753146f 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -7,7 +7,7 @@
 #include <map>
 #include <vector>
 
-// generate using test-tokenizer-0.py
+// generate using test-tokenizer-0-llama.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
         { ""                      , { }, },
@@ -171,6 +171,8 @@ int main(int argc, char **argv) {
 
             ofs << "\n";
         }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
     }
 
     llama_free_model(model);
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0-llama.py
similarity index 100%
rename from tests/test-tokenizer-0.py
rename to tests/test-tokenizer-0-llama.py

From 841983fe47f7488429c06bb6dabaa1274d01bca0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 27 Aug 2023 13:04:04 +0300
Subject: [PATCH 12/12] common : temporary separate llama_detokenize calls for
 SPM and BPE

---
 common/common.cpp                 | 14 +++++++++++++-
 common/common.h                   | 11 ++++++++++-
 tests/test-tokenizer-0-falcon.cpp | 12 +++++-------
 tests/test-tokenizer-0-llama.cpp  | 12 +++++-------
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4e40a52501446..0d91a6a35acaa 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -747,7 +747,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
     return std::string(result.data(), result.size());
 }
 
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
     const llama_token bos_id = llama_token_bos(ctx);
 
     std::string piece;
@@ -767,3 +767,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 
     return result;
 }
+
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    return result;
+}
diff --git a/common/common.h b/common/common.h
index 1c1acf98916b1..97fda2be78b51 100644
--- a/common/common.h
+++ b/common/common.h
@@ -129,9 +129,18 @@ std::string llama_token_to_piece(
     const struct llama_context * ctx,
                     llama_token   token);
 
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+// that takes into account the tokenizer type and decides how to handle the leading space
+//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // removes the leading space from the first non-BOS token
-std::string llama_detokenize(
+std::string llama_detokenize_spm(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);
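Until that unified llama_detokenize C API exists, callers must pick the matching variant themselves. A minimal sketch of a dispatching wrapper, mirroring the TODO above; the wrapper name is hypothetical, while llama_vocab_type and the vocab-type constants are used exactly as in the test changes below:

    // Hypothetical wrapper (not in the patch): choose the detokenizer based on
    // the vocab type reported by the context.
    static std::string llama_detokenize_any(llama_context * ctx, const std::vector<llama_token> & tokens) {
        if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
            return llama_detokenize_spm(ctx, tokens); // SPM: strip the leading space added at tokenization
        }
        return llama_detokenize_bpe(ctx, tokens);     // BPE: plain concatenation, nothing to strip
    }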
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 3063e2c64c549..836fb8ad27109 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -82,10 +82,8 @@ int main(int argc, char **argv) {
         }
     }
 
-    const int n_vocab = llama_n_vocab(ctx);
-
-    if (n_vocab != 65024) {
-        fprintf(stderr, "%s : expected 65024 tokens, got %d\n", __func__, n_vocab);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
+        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
         llama_free_model(model);
         llama_free(ctx);
         return 2;
@@ -98,7 +96,7 @@ int main(int argc, char **argv) {
 
         printf("\n");
         printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
         printf("tok: ");
         for (const auto & tok : res) {
             printf("%d ", tok);
@@ -116,8 +114,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_bpe(ctx, res).c_str(),
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index c28cd2753146f..8630742c612bf 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -82,10 +82,8 @@ int main(int argc, char **argv) {
         }
     }
 
-    const int n_vocab = llama_n_vocab(ctx);
-
-    if (n_vocab != 32000) {
-        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
+        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
         llama_free_model(model);
         llama_free(ctx);
         return 2;
@@ -99,7 +97,7 @@ int main(int argc, char **argv) {
 
         printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str());
+        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
         printf("tok: ");
         for (const auto & tok : res_bos) {
             printf("%d ", tok);
@@ -120,8 +118,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res_nobos).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_spm(ctx, res_nobos).c_str(),
+                llama_detokenize_spm(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);