From 6455a7aa798000f15d55bb04f54a5cbd714e5afb Mon Sep 17 00:00:00 2001 From: Max Hniebergall <137079448+maxhniebergall@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:08:44 -0500 Subject: [PATCH] [ML] Update Deberta tokenizer (#116358) * Was using byte position for end of offset, but it seems like using char position is correct * Update docs/changelog/116358.yaml * Update UnigramTokenizer.java --------- Co-authored-by: Elastic Machine --- docs/changelog/116358.yaml | 5 +++++ .../xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/116358.yaml diff --git a/docs/changelog/116358.yaml b/docs/changelog/116358.yaml new file mode 100644 index 0000000000000..58b44a1e9bcf5 --- /dev/null +++ b/docs/changelog/116358.yaml @@ -0,0 +1,5 @@ +pr: 116358 +summary: Update Deberta tokenizer +area: Machine Learning +type: bug +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java index 31deac066cba2..01821f5582471 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java @@ -367,8 +367,10 @@ List tokenize(CharSequence inputSequence, IntToIntFuncti new DelimitedToken.Encoded( Strings.format("<0x%02X>", bytes[i]), pieces[i], + // even though we are changing the number of characters in the output, we don't + // need to change the offsets. The offsets refer to the input characters offsetCorrection.apply(node.startsAtCharPos), - offsetCorrection.apply(startsAtBytes + i) + offsetCorrection.apply(endsAtChars) ) ); }