File tree: 1 file changed (+8 -4 lines changed)
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,7 @@ def _set_vocab_sentencepiece(self):
331
331
tokenizer = SentencePieceProcessor (str (tokenizer_path ))
332
332
vocab_size = self .hparams .get ('vocab_size' , tokenizer .vocab_size ())
333
333
334
- for token_id in range (vocab_size ):
334
+ for token_id in range (tokenizer . vocab_size () ):
335
335
piece = tokenizer .id_to_piece (token_id )
336
336
text = piece .encode ("utf-8" )
337
337
score = tokenizer .get_score (token_id )
@@ -356,9 +356,13 @@ def _set_vocab_sentencepiece(self):
356
356
added_tokens_json = json .load (f )
357
357
358
358
for key in added_tokens_json :
359
- tokens .append (key .encode ("utf-8" ))
360
- scores .append (- 1000.0 )
361
- toktypes .append (SentencePieceTokenTypes .USER_DEFINED )
359
+ key = key .encode ("utf-8" )
360
+ if key not in tokens :
361
+ tokens .append (key )
362
+ scores .append (- 1000.0 )
363
+ toktypes .append (SentencePieceTokenTypes .USER_DEFINED )
364
+
365
+ assert len (tokens ) == vocab_size
362
366
363
367
self .gguf_writer .add_tokenizer_model ("llama" )
364
368
self .gguf_writer .add_token_list (tokens )
You can’t perform that action at this time.
0 commit comments