Skip to content

Commit 917dc8c

Browse files
jaime-m-p and ggerganov authored
Tokenizer SPM fixes for phi-3 and llama-spm (#7375)
* Update brute force test: special tokens
* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.
* Fix special tokens rtrim
* server : fix test regexes

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent fabf30b commit 917dc8c

File tree

5 files changed

+98
-14
lines changed

5 files changed

+98
-14
lines changed

convert-hf-to-gguf.py

+32
Original file line numberDiff line numberDiff line change
@@ -1740,6 +1740,38 @@ def set_vocab(self):
17401740
scores[token_id] = -1000.0
17411741
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
17421742

1743+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1744+
if tokenizer_config_file.is_file():
1745+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1746+
tokenizer_config_json = json.load(f)
1747+
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
1748+
for token_id, foken_data in added_tokens_decoder.items():
1749+
token_id = int(token_id)
1750+
token = foken_data["content"].encode("utf-8")
1751+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1752+
assert(tokens[token_id] == token)
1753+
tokens[token_id] = token
1754+
scores[token_id] = -1000.0
1755+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
1756+
if foken_data.get("special"):
1757+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
1758+
1759+
tokenizer_file = self.dir_model / 'tokenizer.json'
1760+
if tokenizer_file.is_file():
1761+
with open(tokenizer_file, "r", encoding="utf-8") as f:
1762+
tokenizer_json = json.load(f)
1763+
added_tokens = tokenizer_json.get("added_tokens", [])
1764+
for foken_data in added_tokens:
1765+
token_id = int(foken_data["id"])
1766+
token = foken_data["content"].encode("utf-8")
1767+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1768+
assert(tokens[token_id] == token)
1769+
tokens[token_id] = token
1770+
scores[token_id] = -1000.0
1771+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
1772+
if foken_data.get("special"):
1773+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
1774+
17431775
self.gguf_writer.add_tokenizer_model("llama")
17441776
self.gguf_writer.add_tokenizer_pre("default")
17451777
self.gguf_writer.add_token_list(tokens)

examples/server/tests/features/server.feature

+5-5
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ Feature: llama.cpp server
3737

3838
Examples: Prompts
3939
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
40-
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
41-
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
40+
| I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not |
41+
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not |
4242

4343
Scenario: Completion prompt truncated
4444
Given a prompt:
@@ -67,8 +67,8 @@ Feature: llama.cpp server
6767

6868
Examples: Prompts
6969
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
70-
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
71-
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
70+
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not |
71+
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | |
7272

7373

7474
Scenario Outline: OAI Compatibility w/ response format
@@ -84,7 +84,7 @@ Feature: llama.cpp server
8484
| response_format | n_predicted | re_content |
8585
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
8686
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
87-
| {"type": "json_object"} | 10 | \{ " Jacky. |
87+
| {"type": "json_object"} | 10 | \{ " Saragine. |
8888

8989

9090
Scenario: Tokenize / Detokenize

examples/server/tests/features/slotsave.feature

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
2626
# Since we have cache, this should only process the last tokens
2727
Given a user prompt "What is the capital of Germany?"
2828
And a completion request with no api error
29-
Then 24 tokens are predicted matching (Thank|special)
29+
Then 24 tokens are predicted matching (Thank|special|Lily)
3030
And 7 prompt tokens are processed
3131
# Loading the original cache into slot 0,
3232
# we should only be processing 1 prompt token and get the same output
@@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
4141
Given a user prompt "What is the capital of Germany?"
4242
And using slot id 1
4343
And a completion request with no api error
44-
Then 24 tokens are predicted matching (Thank|special)
44+
Then 24 tokens are predicted matching (Thank|special|Lily)
4545
And 1 prompt tokens are processed
4646

4747
Scenario: Erase Slot

llama.cpp

+27-4
Original file line numberDiff line numberDiff line change
@@ -4553,7 +4553,8 @@ static void llm_load_vocab(
45534553
(t.first == "<|eot_id|>" ||
45544554
t.first == "<|im_end|>" ||
45554555
t.first == "<|end|>" ||
4556-
t.first == "<end_of_turn>"
4556+
t.first == "<end_of_turn>" ||
4557+
t.first == "<|endoftext|>"
45574558
)
45584559
) {
45594560
vocab.special_eot_id = t.second;
@@ -12502,6 +12503,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1250212503
output.push_back(vocab.special_bos_id);
1250312504
}
1250412505

12506+
static const bool rtrim = true; //TODO: as param
12507+
bool is_prev_special = false;
12508+
bool special_token_rtrim = false;
12509+
1250512510
for (const auto & fragment : fragment_buffer) {
1250612511
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
1250712512
// without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12511,9 +12516,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1251112516
// and passing 'add space prefix' as bool argument
1251212517
//
1251312518
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12514-
if (&fragment == &fragment_buffer.front()) {
12515-
if (vocab.add_space_prefix) {
12516-
raw_text = " " + raw_text; // prefix with space if the first token is not special
12519+
12520+
if (special_token_rtrim) {
12521+
size_t num_whitespaces = 0;
12522+
while (isspace(raw_text[num_whitespaces])) {
12523+
num_whitespaces++;
12524+
}
12525+
if (num_whitespaces == raw_text.size()) {
12526+
continue; // skip if all whitespaces
12527+
}
12528+
raw_text = raw_text.substr(num_whitespaces);
12529+
}
12530+
12531+
if (vocab.add_space_prefix) {
12532+
if (!output.size() || is_prev_special) { // prefix with space if first token
12533+
raw_text = " " + raw_text;
1251712534
}
1251812535
}
1251912536

@@ -12525,6 +12542,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1252512542
tokenizer.tokenize(raw_text, output);
1252612543
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1252712544
output.push_back(fragment.token);
12545+
is_prev_special = true;
12546+
// phi-3 special tokens without rtrim, works fine for llama-spm too
12547+
special_token_rtrim = rtrim
12548+
&& fragment.token != vocab.special_bos_id
12549+
&& fragment.token != vocab.special_unk_id
12550+
&& fragment.token != vocab.special_eos_id;
1252812551
}
1252912552
}
1253012553

tests/test-tokenizer-random.py

+32-3
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
153153
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
154154
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
155155
'Cửa Việt', # llama-3, ignore_merges = true
156-
'<s>a', # TODO: Phi-3 fail
156+
'<s>a', # Phi-3 fail
157+
'<unk><|endoftext|><s>', # Phi-3 fail  (missing trailing comma would implicitly concatenate with the next list element)
157158
'a\na', # TODO: Bert fail
158159
]
159160

160161

162+
def generator_random_special_tokens(special_tokens: list[str], iterations=100, k=500) -> Iterator[str]:
    """Yield `iterations` pseudo-random strings built from special tokens.

    Each yielded string is the concatenation of `k` tokens drawn uniformly
    (with replacement) from the deduplicated union of `special_tokens` and a
    fixed set of plain-text filler pieces, so special tokens are exercised
    both adjacent to each other and embedded in ordinary text.

    Output is fully deterministic: iteration `m` reseeds the RNG with `m`,
    so any single sample can be reproduced in isolation.
    """
    # Deduplicate and mix in fillers; sort so the pool order (and therefore
    # the RNG draws) is stable across runs and Python versions.
    pool = sorted(set(special_tokens) | {" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"})
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)  # per-iteration seed: sample m is reproducible on its own
        yield "".join(rand.choices(pool, k=k))
161173
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
162174
"""Brute force check all vocab words"""
163175
yield from vocab
@@ -289,14 +301,31 @@ def func_tokenize1(text: str):
289301
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
290302
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
291303
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
304+
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
292305
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
293306
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
294307
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
295-
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
308+
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
296309
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
297310

298311
model.free()
299312

300313

301314
if __name__ == "__main__":
    # NOTE(review): the regular CLI entry point is disabled in favor of a
    # hard-coded debug run over two local tokenizer checkouts below.
    # main()

    # Local directory layout expected by this debug harness:
    #   ./models/tokenizers/<name>/   -- HF tokenizer files
    #   ./models/ggml-vocab-<name>.gguf -- converted GGUF vocab
    path_tokenizers = "./models/tokenizers/"
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    # Alternative: discover every tokenizer present on disk instead of the
    # explicit list below.
    # import os
    # tokenizers = os.listdir(path_tokenizers)
    tokenizers = [
        "llama-spm", # SPM
        "phi-3", # SPM
    ]

    for tokenizer in tokenizers:
        # Banner separating each tokenizer's output in the combined log.
        print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa
        vocab_file = path_vocab_format % tokenizer
        dir_tokenizer = path_tokenizers + "/" + tokenizer
        # Re-invoke the test driver as if called with these CLI arguments.
        main([vocab_file, dir_tokenizer, "--verbose"])

0 commit comments

Comments (0)