
Commit 63687c1

pcuenca authored and hodlen committed
convert-hf : fix exception in sentencepiece with added tokens (ggml-org#6320)
1 parent 01f69de commit 63687c1
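The exception arose because hparams['vocab_size'] includes entries from added_tokens.json, while the sentencepiece model itself only knows the base vocabulary; asking it for a piece beyond that range throws. A minimal sketch of the failure mode, assuming a hypothetical tokenizer.model path and example sizes:

# Hypothetical repro of the pre-fix behavior: the config's vocab_size
# exceeds the base sentencepiece vocab when added tokens exist.
from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor("tokenizer.model")  # hypothetical path
base_size = tokenizer.vocab_size()   # size of the base sentencepiece vocab
vocab_size = base_size + 2           # pretend added_tokens.json adds 2 tokens

for token_id in range(vocab_size):   # the old loop bound
    # sentencepiece only covers ids < base_size; once token_id reaches
    # base_size, id_to_piece raises an out-of-range error.
    piece = tokenizer.id_to_piece(token_id)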

1 file changed (+8, -4)

convert-hf-to-gguf.py

@@ -331,7 +331,7 @@ def _set_vocab_sentencepiece(self):
         tokenizer = SentencePieceProcessor(str(tokenizer_path))
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
-        for token_id in range(vocab_size):
+        for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.id_to_piece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.get_score(token_id)
@@ -356,9 +356,13 @@ def _set_vocab_sentencepiece(self):
                 added_tokens_json = json.load(f)
 
                 for key in added_tokens_json:
-                    tokens.append(key.encode("utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                    key = key.encode("utf-8")
+                    if key not in tokens:
+                        tokens.append(key)
+                        scores.append(-1000.0)
+                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
+        assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
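The fix also dedupes: a key from added_tokens.json that already exists in the base vocabulary is skipped rather than appended a second time, and the trailing assert checks that the merged token list matches hparams['vocab_size'], so any mismatch fails at conversion time instead of silently producing a GGUF file with a wrong-sized vocabulary.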
