From 6455a7aa798000f15d55bb04f54a5cbd714e5afb Mon Sep 17 00:00:00 2001
From: Max Hniebergall <137079448+maxhniebergall@users.noreply.github.com>
Date: Wed, 20 Nov 2024 15:08:44 -0500
Subject: [PATCH] [ML] Update Deberta tokenizer (#116358)

* The token end offset was computed from the byte position; use the char position instead, which appears to be correct

* Update docs/changelog/116358.yaml

* Update UnigramTokenizer.java

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
---
 docs/changelog/116358.yaml                                   | 5 +++++
 .../xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java  | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 docs/changelog/116358.yaml

diff --git a/docs/changelog/116358.yaml b/docs/changelog/116358.yaml
new file mode 100644
index 0000000000000..58b44a1e9bcf5
--- /dev/null
+++ b/docs/changelog/116358.yaml
@@ -0,0 +1,5 @@
+pr: 116358
+summary: Update Deberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
index 31deac066cba2..01821f5582471 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java
@@ -367,8 +367,10 @@ List<DelimitedToken.Encoded> tokenize(CharSequence inputSequence, IntToIntFuncti
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }