
Commit 16ff98d

Auto-detect wordpiece tokenizer when model.type is missing (#1151)
* Auto-detect wordpiece tokenizer when model.type is missing
* Update test name
Parent: 761f257
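
For context, the affected checkpoints ship a tokenizer.json whose `model` section has no `type` field. Below is a trimmed, illustrative sketch (entries invented for illustration; the ids happen to match the test expectations further down) of what the new fallback keys on: a non-array, object-shaped `vocab` together with `continuing_subword_prefix` and `unk_token`:

```json
{
  "model": {
    "unk_token": "[UNK]",
    "continuing_subword_prefix": "##",
    "vocab": { "[UNK]": 100, "ah": 18257, "z": 195, "##z": 1584 }
  }
}
```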


3 files changed (+23, -3)


src/tokenizers.js (+4, -2)
```diff
@@ -364,13 +364,15 @@ export class TokenizerModel extends Callable {
                 return new BPE(config);
 
             default:
-                // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
-                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
+                // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties.
                 if (config.vocab) {
                     if (Array.isArray(config.vocab)) {
                         // config.vocab is of type `[string, number][]`
                         // @ts-ignore
                         return new Unigram(config, ...args);
+                    } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) {
+                        return new WordPieceTokenizer(config);
                     } else {
                         // @ts-ignore
                         return new LegacyTokenizerModel(config, ...args);
```
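
With this fallback in place, loading one of the affected checkpoints should resolve to a WordPiece model rather than the legacy path. A minimal usage sketch, not part of the commit, assuming the `@huggingface/transformers` package and the `"ah\u535a\u63a8zz"` string exercised by the `CHINESE_LATIN_MIXED` tests below:

```js
import { AutoTokenizer } from "@huggingface/transformers";

// `google-bert/bert-base-cased` ships a tokenizer.json whose `model` section
// lacks `type`; the new branch infers WordPiece from the object-shaped `vocab`
// plus `continuing_subword_prefix` and `unk_token`.
const tokenizer = await AutoTokenizer.from_pretrained("google-bert/bert-base-cased");

const ids = tokenizer.encode("ah\u535a\u63a8zz");
console.log(ids); // expected per the test below: [101, 18257, 100, 100, 195, 1584, 102]
```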

tests/models/bert/test_tokenization_bert.js (+9)
```diff
@@ -1332,4 +1332,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] test $ 1 r2 # 3 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] test [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "google-bert/bert-base-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "[UNK]", "[UNK]", "z", "##z"],
+      ids: [101, 18257, 100, 100, 195, 1584, 102],
+      decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
+    },
+  },
 };
```

tests/models/distilbert/test_tokenization_distilbert.js (+10, -1)
```diff
@@ -1,5 +1,5 @@
 import { DistilBertTokenizer } from "../../../src/tokenizers.js";
-import { BASE_TEST_STRINGS } from "../test_strings.js";
+import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
 
 export const TOKENIZER_CLASS = DistilBertTokenizer;
 export const TEST_CONFIG = {
@@ -303,4 +303,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] weird \uff5e edge \uff5e case [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "distilbert/distilbert-base-multilingual-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "\u535a", "\u63a8", "z", "##z"],
+      ids: [101, 69863, 2684, 4163, 194, 10305, 102],
+      decoded: "[CLS] ah \u535a \u63a8 zz [SEP]",
+    },
+  },
 };
```
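
The same hedged check for the multilingual distilbert case; unlike the cased English-only BERT vocab above, the multilingual vocab covers the two Chinese characters, so they map to real tokens instead of `[UNK]`:

```js
import { AutoTokenizer } from "@huggingface/transformers";

// Also a tokenizer.json without `model.type`; inference should again pick WordPiece.
const tokenizer = await AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased");

const ids = tokenizer.encode("ah\u535a\u63a8zz");
console.log(ids); // expected per the test above: [101, 69863, 2684, 4163, 194, 10305, 102]
```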
