Commit b9aebc8
[ML] fix NLP tokenization never_split handling around punctuation (#82982)
When multiple characters in a row might be part of a never_split token, we erroneously tokenized them. This commit handles that scenario, so `[[UNK]` is now tokenized as `[`, `[UNK]`.
1 parent 002f506 commit b9aebc8

File tree

2 files changed: 12 additions & 1 deletion


x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizer.java

Lines changed: 7 additions & 1 deletion
@@ -128,7 +128,13 @@ private List<DelimitedToken> mergeNeverSplitTokens(String originalText, List<Del
                     matchingTokens = new ArrayList<>();
                     current = neverSplitTokenTrieRoot;
                 }
-                mergedTokens.add(token);
+                childNode = current.getChild(token.getToken());
+                if (childNode == null) {
+                    mergedTokens.add(token);
+                } else {
+                    matchingTokens.add(token);
+                    current = childNode;
+                }
             } else if (childNode.isLeaf()) {
                 matchingTokens.add(token);
                 DelimitedToken mergedToken = DelimitedToken.mergeTokens(matchingTokens);
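
The shape of the fix: when the trie walk fails on a token, the merge loop now re-checks that token against the trie root instead of emitting it unconditionally, so after a stray `[` a second `[` can still begin a fresh `[UNK]` match. Below is a minimal, self-contained sketch of that merge strategy, assuming a simplified trie keyed on token strings and plain String tokens; the names loosely mirror BasicTokenizer, but this is an illustration, not the actual implementation (which works on DelimitedToken objects and merges offsets as well).

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch: merge pre-split tokens back into never_split entries via a trie.
class NeverSplitMergeSketch {

    static class TrieNode {
        final Map<String, TrieNode> children = new HashMap<>();
        boolean leaf;

        TrieNode getChild(String token) {
            return children.get(token);
        }
    }

    // Build a trie where each never_split entry is a path of sub-tokens,
    // e.g. "[UNK]" pre-split into "[", "UNK", "]".
    static TrieNode buildTrie(List<List<String>> neverSplitEntries) {
        TrieNode root = new TrieNode();
        for (List<String> entry : neverSplitEntries) {
            TrieNode node = root;
            for (String part : entry) {
                node = node.children.computeIfAbsent(part, k -> new TrieNode());
            }
            node.leaf = true;
        }
        return root;
    }

    static List<String> merge(List<String> tokens, TrieNode root) {
        List<String> merged = new ArrayList<>();
        List<String> matching = new ArrayList<>();
        TrieNode current = root;
        for (String token : tokens) {
            TrieNode child = current.getChild(token);
            if (child == null) {
                // Partial match failed: flush it as ordinary tokens...
                merged.addAll(matching);
                matching.clear();
                current = root;
                // ...then re-check THIS token from the root. This is the bug
                // the commit fixes: emitting the token unconditionally here
                // meant the second "[" in "[[UNK]" could never start a match.
                child = current.getChild(token);
                if (child == null) {
                    merged.add(token);
                } else if (child.leaf) {
                    merged.add(token); // entry is this single token
                } else {
                    matching.add(token);
                    current = child;
                }
            } else if (child.leaf) {
                // Full never_split entry matched: emit it as one token.
                matching.add(token);
                merged.add(String.join("", matching));
                matching.clear();
                current = root;
            } else {
                matching.add(token);
                current = child;
            }
        }
        merged.addAll(matching); // a trailing partial match stays unmerged
        return merged;
    }

    public static void main(String[] args) {
        TrieNode root = buildTrie(List.of(List.of("[", "UNK", "]")));
        // "Hello~[[UNK]" basic-tokenizes to: Hello, ~, [, [, UNK, ]
        System.out.println(merge(List.of("Hello", "~", "[", "[", "UNK", "]"), root));
        // prints: [Hello, ~, [, [UNK]]
    }
}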

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizerTests.java

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@ public void testNeverSplit_GivenNoLowerCase() {
         assertThat(tokenStrings(tokens), contains("Hello", "-", "[UNK]"));
         tokens = tokenizer.tokenize("Hello~[UNK][UNK]");
         assertThat(tokenStrings(tokens), contains("Hello", "~", "[UNK]", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[UNK]")), contains("Hello", "~", "[", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[[UNK]")), contains("Hello", "~", "[", "[", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[UNK]]")), contains("Hello", "~", "[UNK]", "]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[UNK]]]")), contains("Hello", "~", "[UNK]", "]", "]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[UNK]]")), contains("Hello", "~", "[", "[UNK]", "]"));
         tokens = tokenizer.tokenize("Hello-[unk]");
         assertThat(tokenStrings(tokens), contains("Hello", "-", "[", "unk", "]"));
     }
