|
5 | 5 | #include <string>
|
6 | 6 | #include <map>
|
7 | 7 | #include <vector>
|
| 8 | +#include <fstream> |
8 | 9 |
|
9 | 10 | // generate using test-tokenizer-0.py
|
10 | 11 | static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
41 | 42 |
|
42 | 43 | int main(int argc, char **argv) {
|
43 | 44 | if (argc < 2) {
|
44 |
| - fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]); |
| 45 | + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); |
45 | 46 | return 1;
|
46 | 47 | }
|
47 | 48 |
|
48 | 49 | const std::string fname = argv[1];
|
49 | 50 |
|
| 51 | + std::string fname_text; |
| 52 | + if (argc > 2) { |
| 53 | + fname_text = argv[2]; |
| 54 | + } |
| 55 | + |
50 | 56 | fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
51 | 57 |
|
52 | 58 | llama_model * model;
|
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
|
131 | 137 | }
|
132 | 138 | }
|
133 | 139 |
|
| 140 | + if (!fname_text.empty()) { |
| 141 | + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); |
| 142 | + |
| 143 | + std::string text; |
| 144 | + { |
| 145 | + std::ifstream ifs(fname_text); |
| 146 | + if (!ifs) { |
| 147 | + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); |
| 148 | + return 1; |
| 149 | + } |
| 150 | + text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>()); |
| 151 | + } |
| 152 | + |
| 153 | + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); |
| 154 | + |
| 155 | + const std::vector<llama_token> res = llama_tokenize(ctx, text, true); |
| 156 | + |
| 157 | + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); |
| 158 | + |
| 159 | + { |
| 160 | + const std::string fname_out = fname_text + ".tokcpp"; |
| 161 | + |
| 162 | + std::ofstream ofs(fname_out); |
| 163 | + if (!ofs) { |
| 164 | + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); |
| 165 | + return 1; |
| 166 | + } |
| 167 | + |
| 168 | + for (const auto & tok : res) { |
| 169 | + ofs << tok << " "; |
| 170 | + } |
| 171 | + |
| 172 | + ofs << "\n"; |
| 173 | + } |
| 174 | + } |
| 175 | + |
134 | 176 | llama_free_model(model);
|
135 | 177 | llama_free(ctx);
|
136 | 178 |
|
|
0 commit comments