
Commit e4324cb

tests : add option to tokenize text files
ggml-ci
1 parent 70005bd commit e4324cb

2 files changed: +61 -1 lines changed

tests/test-tokenizer-0.cpp

+43 -1
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
 
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }
 
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
 
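
The C++ path above writes its result to <text-file>.tokcpp as space-separated token IDs on a single line. A minimal Python sketch for loading such a file back into a list of IDs, based only on the format shown in the diff (the helper name is hypothetical and not part of this commit):

# hypothetical helper, not part of this commit: load the space-separated
# token IDs written by test-tokenizer-0.cpp (*.tokcpp) or test-tokenizer-0.py (*.tok)
def read_token_file(path):
    with open(path, 'r') as f:
        return [int(tok) for tok in f.read().split()]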

tests/test-tokenizer-0.py

+18
@@ -6,6 +6,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()
 
 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@
     for x in res:
         print("%7d," % x, end='')
     print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
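
Together, the two changes make it possible to tokenize the same text file with both implementations: the Python script writes <text-file>.tok and the C++ test writes <text-file>.tokcpp, so the outputs can be compared directly. A short comparison sketch, hypothetical and not part of the commit ('input.txt' is a placeholder file name):

# hypothetical comparison, not part of this commit: check that the Python
# reference tokenization (*.tok) matches the C++ tokenization (*.tokcpp)
fname = 'input.txt'  # placeholder text file name
with open(fname + '.tok') as f:
    ref = [int(t) for t in f.read().split()]
with open(fname + '.tokcpp') as f:
    cpp = [int(t) for t in f.read().split()]
print('match' if ref == cpp else 'mismatch: %d vs %d tokens' % (len(ref), len(cpp)))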
