Skip to content

Commit 777f42b

Browse files
authored
Improve handling of special tokens in GGML to GGUF converter (#2725)
* Improve UNK, BOS, EOS token handling when converting without metadata.
* Allow importing as a module.
* Remove some obsolete code and minor cleanups.
* Set default UNK token mapping from -1 to 0 in llama.cpp.
* Try to handle overflow due to buggy Windows Python with a better error message.
1 parent 46ef5b5 commit 777f42b

File tree

2 files changed

+31
-14
lines changed

2 files changed

+31
-14
lines changed

convert-llama-ggmlv3-to-gguf.py

Lines changed: 30 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
1-
import sys, struct, math, argparse
1+
import sys, struct, math, argparse, warnings
22
from pathlib import Path
33

44
import numpy as np
55

66
import gguf
77

8+
warnings.filterwarnings('error')
9+
810
# Note: Does not support GGML_QKK_64
911
QK_K = 256
1012
# Items here are (block size, type size)
@@ -215,25 +217,32 @@ def add_vocab(self, gguf_writer):
215217
if self.vocab_override is not None:
216218
vo = self.vocab_override
217219
print('* Adding vocab item(s)')
218-
for (idx, vitem) in enumerate(vo.all_tokens()):
219-
if len(vitem) == 3:
220-
tokens.append(vitem[0])
221-
scores.append(vitem[1])
222-
toktypes.append(vitem[2])
223-
else:
224-
# Maybe try to guess the token type here?
225-
tokens.append(vitem[0])
226-
scores.append(vitem[1])
220+
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
221+
tokens.append(vbytes)
222+
scores.append(score)
223+
toktypes.append(ttype)
227224
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
228225
gguf_writer.add_token_list(tokens)
229226
gguf_writer.add_token_scores(scores)
230227
if len(toktypes) > 0:
231228
gguf_writer.add_token_types(toktypes)
232229
return
233230
print(f'* Adding {hp.n_vocab} vocab item(s)')
231+
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
234232
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
235233
tt = 1 # Normal
236-
if len(vbytes) == 0:
234+
# Special handling for UNK, BOS, EOS tokens.
235+
if tokid <= 2:
236+
if tokid == 0:
237+
vbytes = b'<unk>'
238+
tt = 2
239+
elif tokid == 1:
240+
vbytes = b'<s>'
241+
tt = 3
242+
else:
243+
vbytes = b'</s>'
244+
tt = 3
245+
elif len(vbytes) == 0:
237246
tt = 3 # Control
238247
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
239248
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
@@ -246,6 +255,9 @@ def add_vocab(self, gguf_writer):
246255
gguf_writer.add_token_list(tokens)
247256
gguf_writer.add_token_scores(scores)
248257
gguf_writer.add_token_types(toktypes)
258+
gguf_writer.add_unk_token_id(0)
259+
gguf_writer.add_bos_token_id(1)
260+
gguf_writer.add_eos_token_id(2)
249261

250262
def add_tensors(self, gguf_writer):
251263
nm = self.name_map
@@ -315,7 +327,11 @@ def main():
315327
data = np.memmap(cfg.input, mode = 'r')
316328
model = GGMLV3Model()
317329
print('* Scanning GGML input file')
318-
offset = model.load(data, 0)
330+
try:
331+
offset = model.load(data, 0)
332+
except OverflowError:
333+
print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
334+
raise
319335
print(f'* GGML model hyperparameters: {model.hyperparameters}')
320336
vocab_override = None
321337
params_override = None
@@ -330,4 +346,5 @@ def main():
330346
converter.save()
331347
print(f'* Successful completion. Output saved to: {cfg.output}')
332348

333-
main()
349+
if __name__ == '__main__':
350+
main()

llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -703,7 +703,7 @@ struct llama_vocab {
703703
// default LLaMA special tokens
704704
id special_bos_id = 1;
705705
id special_eos_id = 2;
706-
id special_unk_id = -1;
706+
id special_unk_id = 0;
707707
id special_sep_id = -1;
708708
id special_pad_id = -1;
709709

0 commit comments

Comments (0)