diff --git a/docs/changelog/116358.yaml b/docs/changelog/116358.yaml
new file mode 100644
index 0000000000000..58b44a1e9bcf5
--- /dev/null
+++ b/docs/changelog/116358.yaml
@@ -0,0 +1,5 @@
+pr: 116358
+summary: Update Deberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
index 31deac066cba2..01821f5582471 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
@@ -367,8 +367,10 @@ List<DelimitedToken.Encoded> tokenize(CharSequence inputSequence, IntToIntFuncti
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }
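
For context on the offset semantics the new comment describes, here is a minimal, self-contained sketch. It is not the Elasticsearch implementation: the Encoded record and byteFallback method are hypothetical stand-ins for DelimitedToken.Encoded and the byte-fallback branch patched above. It illustrates why the end offset must stay in character space: one input character can expand into several byte tokens, so every token reports the character span it came from, whereas an end derived from a byte index (like the removed startsAtBytes + i) can point past the input.

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

// Minimal sketch of byte-fallback offset semantics; assumptions noted above.
public class ByteFallbackOffsets {

    // Hypothetical stand-in for DelimitedToken.Encoded: the token text plus
    // the *character* offsets of the input span it covers.
    record Encoded(String token, int startChar, int endChar) {}

    static List<Encoded> byteFallback(String input, int startChar, int endChar) {
        byte[] bytes = input.substring(startChar, endChar).getBytes(StandardCharsets.UTF_8);
        List<Encoded> out = new ArrayList<>(bytes.length);
        for (byte b : bytes) {
            // One <0xNN> token per UTF-8 byte. There may be more tokens than
            // input characters, but each token still reports the offsets of
            // the original character span, never a byte position.
            out.add(new Encoded(String.format("<0x%02X>", b), startChar, endChar));
        }
        return out;
    }

    public static void main(String[] args) {
        // "€" is one character but three UTF-8 bytes, so the span [0, 1)
        // yields three byte tokens that all share the same character offsets:
        //   Encoded[token=<0xE2>, startChar=0, endChar=1]
        //   Encoded[token=<0x82>, startChar=0, endChar=1]
        //   Encoded[token=<0xAC>, startChar=0, endChar=1]
        byteFallback("€", 0, 1).forEach(System.out::println);
    }
}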