File tree Expand file tree Collapse file tree 1 file changed +9
-0
lines changed Expand file tree Collapse file tree 1 file changed +9
-0
lines changed Original file line number Diff line number Diff line change
1
+ import contextlib
1
2
import os
2
3
import warnings
3
4
from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
67
68
tokenizer .all_special_tokens_extended )
68
69
tokenizer_all_special_tokens = set (tokenizer .all_special_tokens )
69
70
tokenizer_len = len (tokenizer )
71
+
70
72
max_token_id = max (tokenizer .get_vocab ().values ())
73
+ # Some tokenizers (e.g., QwenTokenizer) add special tokens that are
74
+ # reflected in their vocab_size property but are missing from
75
+ # get_vocab(); when vocab_size is implemented (i.e., it does not raise
76
+ # NotImplementedError), we take the greater of the two values.
77
+ if hasattr (tokenizer , "vocab_size" ):
78
+ with contextlib .suppress (NotImplementedError ):
79
+ max_token_id = max (max_token_id , tokenizer .vocab_size )
71
80
72
81
class CachedTokenizer (tokenizer .__class__ ): # type: ignore
73
82
You can’t perform that action at this time.
0 commit comments