@@ -1776,6 +1776,38 @@ def set_vocab(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
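
For reference, a minimal self-contained sketch of what the first new block does, using an illustrative `added_tokens_decoder` entry in the usual Hugging Face `tokenizer_config.json` shape; the inline enum, the sample vocab, and the `<|im_end|>` token are stand-ins for this sketch, not taken from the commit:

```python
import json
from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):
    # Stand-in for the converter's token-type enum (values mirror gguf.TokenType).
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

# Hypothetical base vocab as built from the SentencePiece model earlier in set_vocab().
tokens   = [b"<unk>", b"<s>", b"</s>", b"[PAD]"]
scores   = [0.0, -1.0, -2.0, -3.0]
toktypes = [SentencePieceTokenTypes.UNKNOWN,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.NORMAL]

# Illustrative tokenizer_config.json content: "added_tokens_decoder" maps string
# token ids to the token text ("content") and a "special" flag.
tokenizer_config_json = json.loads("""
{
  "added_tokens_decoder": {
    "3": { "content": "<|im_end|>", "special": true }
  }
}
""")

for token_id, data in tokenizer_config_json.get("added_tokens_decoder", {}).items():
    token_id = int(token_id)
    tokens[token_id] = data["content"].encode("utf-8")
    scores[token_id] = -1000.0  # same sentinel score the diff assigns to added tokens
    toktypes[token_id] = (SentencePieceTokenTypes.CONTROL if data.get("special")
                          else SentencePieceTokenTypes.USER_DEFINED)

print(tokens[3], toktypes[3].name)  # b'<|im_end|>' CONTROL
```

The second block in the diff applies the same override from `tokenizer.json`, where added tokens arrive as a list of objects carrying an explicit `"id"` field rather than a dict keyed by token id.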