
Commit e4324cb

tests : add option to tokenize text files
ggml-ci
1 parent 70005bd commit e4324cb

2 files changed: +61 -1 lines changed

tests/test-tokenizer-0.cpp

+43 -1
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
 
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }
 
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
 
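
The C++ path above writes its result to <text-file>.tokcpp as space-separated token IDs on a single line. A minimal Python sketch for loading such a file back into a list of IDs, based only on the format shown in the diff (the helper name is hypothetical and not part of this commit):

# hypothetical helper, not part of this commit: load the space-separated
# token IDs written by test-tokenizer-0.cpp (*.tokcpp) or test-tokenizer-0.py (*.tok)
def read_token_file(path):
    with open(path, 'r') as f:
        return [int(tok) for tok in f.read().split()]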

tests/test-tokenizer-0.py

+18
@@ -6,6 +6,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()
 
 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@
     for x in res:
         print("%7d," % x, end='')
     print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
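
Together, the two changes make it possible to tokenize the same text file with both implementations: the Python script writes <text-file>.tok and the C++ test writes <text-file>.tokcpp, so the outputs can be compared directly. A short comparison sketch, hypothetical and not part of the commit ('input.txt' is a placeholder file name):

# hypothetical comparison, not part of this commit: check that the Python
# reference tokenization (*.tok) matches the C++ tokenization (*.tokcpp)
fname = 'input.txt'  # placeholder text file name
with open(fname + '.tok') as f:
    ref = [int(t) for t in f.read().split()]
with open(fname + '.tokcpp') as f:
    cpp = [int(t) for t in f.read().split()]
print('match' if ref == cpp else 'mismatch: %d vs %d tokens' % (len(ref), len(cpp)))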
