Commit 5b61c04

Author: jaime-m-p

Fix added tokens
The file 'added_tokens.json' does not exist for phi-3 or llama-spm, so read the added tokens from 'tokenizer_config.json' first, then from 'tokenizer.json'.
1 parent 04aad94 commit 5b61c04

File tree: 1 file changed (+32, -0 lines)


convert-hf-to-gguf.py

Lines changed: 32 additions & 0 deletions
@@ -1776,6 +1776,38 @@ def set_vocab(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
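
For context, the two files read by the new code follow the Hugging Face tokenizer layout used in the diff: 'tokenizer_config.json' holds an "added_tokens_decoder" mapping of token id to token data, while 'tokenizer.json' holds an "added_tokens" list whose entries carry their own "id". The standalone sketch below mirrors that read order outside the converter; it is an illustration only, and the model_dir path and printed fields are assumptions, not part of this commit.

# Minimal sketch (not part of the commit): list added tokens the way the
# converter now reads them. 'model_dir' is a placeholder for a local HF model folder.
import json
from pathlib import Path

model_dir = Path("path/to/model")  # assumption: point this at a real model directory

# 'tokenizer_config.json': "added_tokens_decoder" maps token id -> token data
config_file = model_dir / "tokenizer_config.json"
if config_file.is_file():
    with open(config_file, "r", encoding="utf-8") as f:
        decoder = json.load(f).get("added_tokens_decoder", {})
    for token_id, data in decoder.items():
        print("tokenizer_config.json:", int(token_id), data["content"], bool(data.get("special")))

# 'tokenizer.json': "added_tokens" is a list of token data with an explicit "id"
tokenizer_file = model_dir / "tokenizer.json"
if tokenizer_file.is_file():
    with open(tokenizer_file, "r", encoding="utf-8") as f:
        added = json.load(f).get("added_tokens", [])
    for data in added:
        print("tokenizer.json:", int(data["id"]), data["content"], bool(data.get("special")))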
