diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizer.java
index 789710cefbfb2..561fd429422bf 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizer.java
@@ -128,7 +128,13 @@ private List<DelimitedToken> mergeNeverSplitTokens(String originalText, List<DelimitedToken> tokens) {
                     matchingTokens = new ArrayList<>();
                     current = neverSplitTokenTrieRoot;
                 }
-                mergedTokens.add(token);
+                childNode = current.getChild(token.getToken());
+                if (childNode == null) {
+                    mergedTokens.add(token);
+                } else {
+                    matchingTokens.add(token);
+                    current = childNode;
+                }
             } else if (childNode.isLeaf()) {
                 matchingTokens.add(token);
                 DelimitedToken mergedToken = DelimitedToken.mergeTokens(matchingTokens);
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizerTests.java
index 0e08f31989a90..effe3be0da5a6 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenizerTests.java
@@ -79,6 +79,11 @@ public void testNeverSplit_GivenNoLowerCase() {
         assertThat(tokenStrings(tokens), contains("Hello", "-", "[UNK]"));
         tokens = tokenizer.tokenize("Hello~[UNK][UNK]");
         assertThat(tokenStrings(tokens), contains("Hello", "~", "[UNK]", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[UNK]")), contains("Hello", "~", "[", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[[UNK]")), contains("Hello", "~", "[", "[", "[UNK]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[UNK]]")), contains("Hello", "~", "[UNK]", "]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[UNK]]]")), contains("Hello", "~", "[UNK]", "]", "]"));
+        assertThat(tokenStrings(tokenizer.tokenize("Hello~[[UNK]]")), contains("Hello", "~", "[", "[UNK]", "]"));
         tokens = tokenizer.tokenize("Hello-[unk]");
         assertThat(tokenStrings(tokens), contains("Hello", "-", "[", "unk", "]"));
     }
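
The patch above handles the case where a token aborts one attempted never-split match but should immediately start a new one. Below is a minimal standalone sketch of the fixed trie walk, under simplified assumptions: `Node` and `merge` are hypothetical stand-ins for `CharSeqTokenTrieNode` and `mergeNeverSplitTokens`, and plain strings replace `DelimitedToken` (the real code also merges character offsets). It illustrates the technique, not the Elasticsearch implementation.

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class NeverSplitSketch {
    // Simplified trie node; stand-in for CharSeqTokenTrieNode.
    static class Node {
        final Map<String, Node> children = new HashMap<>();
        Node getChild(String token) { return children.get(token); }
        boolean isLeaf() { return children.isEmpty(); }
        Node child(String token) { return children.computeIfAbsent(token, t -> new Node()); }
    }

    // Walk the token stream, merging any run that spells out a never-split
    // entry (e.g. "[", "UNK", "]" -> "[UNK]") back into a single token.
    static List<String> merge(List<String> tokens, Node root) {
        List<String> merged = new ArrayList<>();
        List<String> matching = new ArrayList<>();
        Node current = root;
        for (String token : tokens) {
            Node childNode = current.getChild(token);
            if (childNode == null) {
                if (current != root) {
                    // Abandoned a partial match: flush it and fall back to the root.
                    merged.addAll(matching);
                    matching = new ArrayList<>();
                    current = root;
                }
                // The fix: the current token may itself begin a new never-split
                // sequence, so look it up again from the root instead of
                // emitting it unconditionally.
                childNode = current.getChild(token);
                if (childNode == null) {
                    merged.add(token);
                } else {
                    matching.add(token);
                    current = childNode;
                }
            } else if (childNode.isLeaf()) {
                // Completed a never-split entry: emit it as one token.
                matching.add(token);
                merged.add(String.join("", matching));
                matching = new ArrayList<>();
                current = root;
            } else {
                matching.add(token);
                current = childNode;
            }
        }
        merged.addAll(matching); // flush any partial match at end of input
        return merged;
    }

    public static void main(String[] args) {
        Node root = new Node();
        root.child("[").child("UNK").child("]"); // never-split entry "[UNK]"
        // "Hello~[[UNK]" tokenizes to Hello, ~, [, [, UNK, ]. The second "["
        // aborts the first match; without the re-check it would be emitted
        // as-is and the following "[UNK]" would never be merged.
        System.out.println(merge(List.of("Hello", "~", "[", "[", "UNK", "]"), root));
        // prints: [Hello, ~, [, [UNK]]
    }
}
```

Running `main` reproduces the first new test case: without the re-check, the second "[" would be emitted while the walk stayed at the root, yielding "Hello", "~", "[", "[", "UNK", "]" instead of "Hello", "~", "[", "[UNK]".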