Commit 46ef5b5

llama : fix whitespace escaping in tokenizer (ggml-org#2724)
1 parent: c63bb1d

File tree: 3 files changed (+16, -21 lines)


llama.cpp (+3, -10)

@@ -2253,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
 }
 
 static std::string llama_escape_whitespace(const std::string& text) {
-    std::string result;
-    bool escaping = false;
-    result += "\xe2\x96\x81";
+    std::string result = "\xe2\x96\x81";
     for (size_t offs = 0; offs < text.length(); ++offs) {
         if (text[offs] == ' ') {
-            if (!escaping) {
-                result += "\xe2\x96\x81";
-                escaping = true;
-            }
-        }
-        else {
-            escaping = false;
+            result += "\xe2\x96\x81";
+        } else {
             result += text[offs];
         }
     }
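The behavioral change: the old routine collapsed every run of spaces into a single U+2581 marker ("\xe2\x96\x81", the SentencePiece whitespace symbol), so strings with consecutive spaces did not survive a tokenize/detokenize round trip. After the fix, each space maps to exactly one marker. A minimal standalone sketch of the patched behavior (illustration only, compiled outside llama.cpp; the helper name and sample string are just for demonstration):

#include <cstdio>
#include <string>

// Mirrors the patched llama_escape_whitespace(): prepend one U+2581 marker,
// then emit one marker per space instead of collapsing runs of spaces.
static std::string escape_whitespace(const std::string & text) {
    std::string result = "\xe2\x96\x81";
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
            result += "\xe2\x96\x81";
        } else {
            result += text[offs];
        }
    }
    return result;
}

int main() {
    // "  Hello" (two spaces) now escapes to "▁▁▁Hello"; the pre-fix code
    // would have produced "▁▁Hello", silently dropping one space.
    printf("%s\n", escape_whitespace("  Hello").c_str());
    return 0;
}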

tests/test-tokenizer-0.cpp (+10, -1)

@@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<lla
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
         { " ",      { 1, 259, }, },
+        { "  ",     { 1, 1678, }, },
+        { "   ",    { 1, 268, }, },
         { "\t",     { 1, 29871, 12, }, },
         { "\n",     { 1, 29871, 13, }, },
         { "\t\n",   { 1, 29871, 12, 13, }, },
@@ -38,6 +40,12 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
           243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
           313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
           313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello",                { 1, 15043 }, },
+        { " Hello",               { 1, 29871, 15043 }, },
+        { "  Hello",              { 1, 259, 15043 }, },
+        { "   Hello",             { 1, 1678, 15043 }, },
+        { "    Hello",            { 1, 268, 15043 }, },
+        { "    Hello\n    Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
     };
 
     return _k_tests;
@@ -106,7 +114,8 @@ int main(int argc, char **argv) {
 
         if (!correct) {
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
-           fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
+           fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+               unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);

tests/test-tokenizer-1.cpp (+3, -10)

@@ -11,18 +11,11 @@
 #include <locale>
 
 static std::string escape_whitespace(const std::string& text) {
-    std::string result;
-    bool escaping = false;
-    result += "\xe2\x96\x81";
+    std::string result = "\xe2\x96\x81";
     for (size_t offs = 0; offs < text.length(); ++offs) {
         if (text[offs] == ' ') {
-            if (!escaping) {
-                result += "\xe2\x96\x81";
-                escaping = true;
-            }
-        }
-        else {
-            escaping = false;
+            result += "\xe2\x96\x81";
+        } else {
             result += text[offs];
         }
     }
