commit bf06349
parent e2583cb

    Revert "Work on the BPE tokenizer (ggml-org#3252)"

    This reverts commit ff5a3f0.

15 files changed, 227 insertions(+), 852 deletions(-)

.gitignore (+1, -2)

@@ -91,5 +91,4 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1-llama
-tests/test-tokenizer-1-bpe
+tests/test-tokenizer-1

Makefile (+2, -7)

@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -62,10 +62,8 @@ test: $(TEST_TARGETS)
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \

@@ -672,9 +670,6 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
common/common.cpp (+0, -1)

@@ -923,7 +923,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
         result += piece;
     }
 
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return result;
 }
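
The deleted NOTE points at a real pitfall: one UTF-8 character can be split across several BPE pieces, so the bytes must be collected first and decoded once at the end. A minimal Python illustration of why (not part of the commit):

    # The three UTF-8 bytes of "日" (U+65E5), split across two BPE pieces.
    pieces = [b"\xe6\x97", b"\xa5"]

    # Decoding each piece on its own mangles the character...
    per_piece = "".join(p.decode("utf-8", errors="replace") for p in pieces)

    # ...while decoding once after collecting every piece recovers it.
    collected = b"".join(pieces).decode("utf-8")

    assert collected == "日"
    assert per_piece != collected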

convert-falcon-hf-to-gguf.py (+40, -7)

@@ -20,6 +20,28 @@
 import gguf
 
 
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
+
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
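
For context: bytes_to_unicode() comes from OpenAI's GPT-2 encoder and builds a bijection between all 256 byte values and printable unicode characters, so raw bytes can live inside a text-based vocabulary file. A short sketch of its properties, assuming the function above is in scope (the asserts are illustrative, not from the commit):

    enc = bytes_to_unicode()
    dec = {v: k for k, v in enc.items()}  # inverse map: unicode character -> byte value

    assert len(enc) == 256                            # every byte value gets a character
    assert all(dec[enc[b]] == b for b in range(256))  # the mapping is lossless
    assert enc[ord("A")] == "A"                       # printable ASCII maps to itself
    assert enc[ord(" ")] == "\u0120"                  # space becomes "Ġ", familiar from GPT-2 vocabs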
@@ -111,8 +133,6 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -128,15 +148,28 @@
 assert max(tokenizer.vocab.values()) < vocab_size
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i])
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
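
To see what the restored loop produces, here is a hypothetical run over one vocabulary entry (toy data; byte_decoder is built exactly as in the diff above):

    byte_decoder = {v: k for k, v in bytes_to_unicode().items()}

    token = "\u0120hello"                             # a GPT-2-style vocab spells " hello" as "Ġhello"
    text = bytearray(byte_decoder[c] for c in token)  # the try-branch of the loop
    assert bytes(text) == b" hello"

    i = 1234                                          # an id absent from reverse_vocab...
    pad = bytearray(f"[PAD{i}]".encode("utf8"))       # ...gets the arbitrary padding token
    assert bytes(pad) == b"[PAD1234]"

The same bytes_to_unicode()/byte_decoder change is applied to convert-gptneox-hf-to-gguf.py and convert-starcoder-hf-to-gguf.py below.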

convert-gptneox-hf-to-gguf.py (+41, -7)

@@ -19,6 +19,29 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
 
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0

@@ -107,8 +130,6 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -124,15 +145,28 @@
 assert max(tokenizer.vocab.values()) < vocab_size
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

convert-starcoder-hf-to-gguf.py (+40, -7)

@@ -20,6 +20,28 @@
 import gguf
 
 
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
+
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):

@@ -95,8 +117,6 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -112,15 +132,28 @@
 assert max(tokenizer.vocab.values()) < vocab_size
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

convert.py (+19, -5)

@@ -338,15 +338,29 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
         from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
-
-        for i, _ in enumerate(tokenizer):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        score = 0.0
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
+            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
+                if i == 0 and text == b'<unk>':
+                    toktype = gguf.TokenType.UNKNOWN
+                elif i == 1 or i == 2:
+                    toktype = gguf.TokenType.CONTROL
+                elif i >= 3 and text.startswith(b'<0x'):
+                    toktype = gguf.TokenType.BYTE
+                else:
+                    toktype = gguf.TokenType.NORMAL
+            else:
+                toktype = gguf.TokenType.NORMAL
+            yield text, score, toktype
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.bpe_tokens()