
Commit 16ff98d

Auto-detect wordpiece tokenizer when model.type is missing (#1151)
* Auto-detect wordpiece tokenizer when model.type is missing
* Update test name
Parent: 761f257
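
For context, the affected checkpoints ship a tokenizer.json whose `model` section has no `type` field. Below is a trimmed, illustrative sketch (entries invented for illustration; the ids happen to match the test expectations further down) of what the new fallback keys on: a non-array, object-shaped `vocab` together with `continuing_subword_prefix` and `unk_token`:

```json
{
  "model": {
    "unk_token": "[UNK]",
    "continuing_subword_prefix": "##",
    "vocab": { "[UNK]": 100, "ah": 18257, "z": 195, "##z": 1584 }
  }
}
```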


3 files changed (+23, -3)


src/tokenizers.js (+4, -2)
```diff
@@ -364,13 +364,15 @@ export class TokenizerModel extends Callable {
                 return new BPE(config);
 
             default:
-                // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
-                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
+                // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties.
                 if (config.vocab) {
                     if (Array.isArray(config.vocab)) {
                         // config.vocab is of type `[string, number][]`
                         // @ts-ignore
                         return new Unigram(config, ...args);
+                    } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) {
+                        return new WordPieceTokenizer(config);
                     } else {
                         // @ts-ignore
                         return new LegacyTokenizerModel(config, ...args);
```
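
With this fallback in place, loading one of the affected checkpoints should resolve to a WordPiece model rather than the legacy path. A minimal usage sketch, not part of the commit, assuming the `@huggingface/transformers` package and the `"ah\u535a\u63a8zz"` string exercised by the `CHINESE_LATIN_MIXED` tests below:

```js
import { AutoTokenizer } from "@huggingface/transformers";

// `google-bert/bert-base-cased` ships a tokenizer.json whose `model` section
// lacks `type`; the new branch infers WordPiece from the object-shaped `vocab`
// plus `continuing_subword_prefix` and `unk_token`.
const tokenizer = await AutoTokenizer.from_pretrained("google-bert/bert-base-cased");

const ids = tokenizer.encode("ah\u535a\u63a8zz");
console.log(ids); // expected per the test below: [101, 18257, 100, 100, 195, 1584, 102]
```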

tests/models/bert/test_tokenization_bert.js (+9)
```diff
@@ -1332,4 +1332,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] test $ 1 r2 # 3 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] test [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "google-bert/bert-base-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "[UNK]", "[UNK]", "z", "##z"],
+      ids: [101, 18257, 100, 100, 195, 1584, 102],
+      decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
+    },
+  },
 };
```

tests/models/distilbert/test_tokenization_distilbert.js (+10, -1)
```diff
@@ -1,5 +1,5 @@
 import { DistilBertTokenizer } from "../../../src/tokenizers.js";
-import { BASE_TEST_STRINGS } from "../test_strings.js";
+import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
 
 export const TOKENIZER_CLASS = DistilBertTokenizer;
 export const TEST_CONFIG = {
@@ -303,4 +303,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] weird \uff5e edge \uff5e case [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "distilbert/distilbert-base-multilingual-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "\u535a", "\u63a8", "z", "##z"],
+      ids: [101, 69863, 2684, 4163, 194, 10305, 102],
+      decoded: "[CLS] ah \u535a \u63a8 zz [SEP]",
+    },
+  },
 };
```
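
The same hedged check for the multilingual distilbert case; unlike the cased English-only BERT vocab above, the multilingual vocab covers the two Chinese characters, so they map to real tokens instead of `[UNK]`:

```js
import { AutoTokenizer } from "@huggingface/transformers";

// Also a tokenizer.json without `model.type`; inference should again pick WordPiece.
const tokenizer = await AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased");

const ids = tokenizer.encode("ah\u535a\u63a8zz");
console.log(ids); // expected per the test above: [101, 69863, 2684, 4163, 194, 10305, 102]
```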
