Skip to content

Commit 8bedcb3

Browse files
Merge pull request huggingface#4 from DaryaTereshchenko/changes_tokenizer
fix additional tokens list and run tests
2 parents 2603cf8 + 17c1198 commit 8bedcb3

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

src/transformers/models/prism/tokenization_prism.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,9 @@ def __init__(
169169
fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
170170
self.lang_code_to_token = {lang_code: f"<{lang_code}>" for lang_code in fairseq_language_code}
171171

172+
language_tokens = [self.get_lang_token(lang_code) for lang_code in fairseq_language_code]
172173
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
173-
for lang_code in fairseq_language_code:
174-
token = self.get_lang_token(lang_code)
175-
if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
176-
additional_special_tokens.append(token)
174+
self.additional_special_tokens = language_tokens + additional_special_tokens
177175

178176
self.vocab_file = vocab_file
179177
self.encoder = load_json(vocab_file)
@@ -215,6 +213,8 @@ def __init__(
215213
num_madeup_words=num_madeup_words,
216214
**kwargs,
217215
)
216+
217+
self.special_tokens_map['additional_special_tokens'] = self.additional_special_tokens
218218
self.set_src_lang_special_tokens(self._src_lang)
219219

220220
@property

0 commit comments

Comments (0)