
Commit 334ad82

[ML] Fix end offset for first_non_blank_line char_filter (#73828)
When the input gets chopped by a char_filter immediately after a token, that token must be reported as ending at the very end of the original input; otherwise analysis will have incorrect offsets when multiple field values are analyzed in the same _analyze request. The pattern_replace char_filter already works this way. This PR changes the new first_non_blank_line char_filter to match.

Fixes elastic/kibana#101255
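To make the offset contract concrete: in the updated YAML test further down, the first analyzed value is " \nfirst line\nsecond line" (26 characters), of which the filter keeps only "first line". The final token "line" used to be reported with end offset 14, the end of "first line" within the original text; with this change it is reported as ending at 26, the length of the whole value, so tokens from the next value in the same _analyze request keep correct offsets.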
Parent: 6b7fea0

File tree

5 files changed (+114, -2 lines)


x-pack/plugin/build.gradle

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,8 @@ tasks.named("yamlRestCompatTest").configure {
     'ml/jobs_get_stats/Test get job stats after uploading data prompting the creation of some stats',
     'ml/jobs_get_stats/Test get job stats for closed job',
     'ml/jobs_get_stats/Test no exception on get job stats with missing index',
+    // TODO: remove the next one after backporting https://github.com/elastic/elasticsearch/pull/73828
+    'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
     'ml/post_data/Test POST data job api, flush, close and verify DataCounts doc',
     'ml/post_data/Test flush with skip_time',
     'ml/set_upgrade_mode/Setting upgrade mode to disabled from enabled',

x-pack/plugin/ml/qa/ml-with-security/build.gradle

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,8 @@ tasks.named("yamlRestTest").configure {
     'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
     'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer',
     'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages',
     // Remove tests that are expected to throw an exception, because we cannot then
     // know whether to expect an authorization exception or a validation exception
     'ml/calendar_crud/Test get calendar given missing',

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java

Lines changed: 3 additions & 0 deletions
@@ -93,6 +93,9 @@ private CharSequence process(CharSequence input) {
         }

         addOffCorrectMap(0, prevNewlineIndex + 1);
+        if (endIndex < input.length()) {
+            addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1);
+        }
         return input.subSequence(prevNewlineIndex + 1, endIndex);
     }
 }
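For context on what the added addOffCorrectMap call does: in Lucene, BaseCharFilter.addOffCorrectMap(off, cumulativeDiff) records that output offsets at or beyond off should be corrected by adding cumulativeDiff, and correctOffset() applies the recorded corrections when token offsets are mapped back to the original text. The sketch below is a deliberately simplified, hypothetical filter (named FirstLineOnlyCharFilter here; it is not the production FirstNonBlankLineCharFilter, which also skips leading blank lines) illustrating the same idea the new lines implement: when the output is cut short, the offset at the cut point is mapped to the very end of the original input.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.charfilter.BaseCharFilter;

// Hypothetical, simplified illustration of the offset-correction idea used by the fix:
// keep only the first line of the input and map the offset at the cut point to the
// end of the original input.
class FirstLineOnlyCharFilter extends BaseCharFilter {

    private Reader transformed; // filtered view of the input, built on first read

    FirstLineOnlyCharFilter(Reader in) {
        super(in);
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (transformed == null) {
            // Read the whole wrapped input (fine for an illustration; real filters may stream).
            StringBuilder sb = new StringBuilder();
            for (int c = input.read(); c != -1; c = input.read()) {
                sb.append((char) c);
            }
            String original = sb.toString();
            int newline = original.indexOf('\n');
            String firstLine = newline == -1 ? original : original.substring(0, newline);
            if (firstLine.length() < original.length()) {
                // Output offsets before the cut map 1:1; the offset *at* the cut maps to
                // the very end of the original input, so a token ending exactly at the cut
                // gets end offset original.length() after correctOffset().
                addOffCorrectMap(firstLine.length(), original.length() - firstLine.length());
            }
            transformed = new StringReader(firstLine);
        }
        return transformed.read(cbuf, off, len);
    }
}

With an input such as "first line\nsecond line" (22 characters), correctOffset(10) then returns 22 rather than 10, which is the behaviour the new unit-test assertion below checks against the real filter.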

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java

Lines changed: 4 additions & 1 deletion
@@ -121,8 +121,11 @@ public void testCorrect() throws IOException {
         assertThat(new String(output), equalTo(expectedOutput));

         int expectedOutputIndex = input.indexOf(expectedOutput);
-        for (int i = 0; i <= expectedOutput.length(); ++i) {
+        for (int i = 0; i < expectedOutput.length(); ++i) {
             assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i));
         }
+        // When the input gets chopped by a char filter immediately after a token, that token must be reported as
+        // ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets
+        assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length()));
     }
 }

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml

Lines changed: 103 additions & 1 deletion
@@ -111,5 +111,107 @@
   - match: { tokens.0.position: 0 }
   - match: { tokens.1.token: "line" }
   - match: { tokens.1.start_offset: 10 }
-  - match: { tokens.1.end_offset: 14 }
+  - match: { tokens.1.end_offset: 26 }
   - match: { tokens.1.position: 1 }
+
+---
+"Test 7.14 analyzer with multiple multiline messages":
+  - do:
+      indices.analyze:
+        body: >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              " \nfirst line\nsecond line",
+              " \nfirst line of second message\nsecond line of second message"
+            ]
+          }
+  - match: { tokens.0.token: "first" }
+  - match: { tokens.0.start_offset: 4 }
+  - match: { tokens.0.end_offset: 9 }
+  - match: { tokens.0.position: 0 }
+  - match: { tokens.1.token: "line" }
+  - match: { tokens.1.start_offset: 10 }
+  - match: { tokens.1.end_offset: 26 }
+  - match: { tokens.1.position: 1 }
+  - match: { tokens.2.token: "first" }
+  - match: { tokens.2.start_offset: 31 }
+  - match: { tokens.2.end_offset: 36 }
+  - match: { tokens.2.position: 102 }
+  - match: { tokens.3.token: "line" }
+  - match: { tokens.3.start_offset: 37 }
+  - match: { tokens.3.end_offset: 41 }
+  - match: { tokens.3.position: 103 }
+  - match: { tokens.4.token: "of" }
+  - match: { tokens.4.start_offset: 42 }
+  - match: { tokens.4.end_offset: 44 }
+  - match: { tokens.4.position: 104 }
+  - match: { tokens.5.token: "second" }
+  - match: { tokens.5.start_offset: 45 }
+  - match: { tokens.5.end_offset: 51 }
+  - match: { tokens.5.position: 105 }
+  - match: { tokens.6.token: "message" }
+  - match: { tokens.6.start_offset: 52 }
+  - match: { tokens.6.end_offset: 89 }
+  - match: { tokens.6.position: 106 }
+
+---
+"Test 7.14 analyzer with stop words in messages":
+  - do:
+      indices.analyze:
+        body: >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              "May 27, 2021 @ 19:51:15.288 UTC log message one",
+              "May 27, 2021 @ 19:52:25.288 UTC log message two"
+            ]
+          }
+  - match: { tokens.0.token: "log" }
+  - match: { tokens.0.start_offset: 32 }
+  - match: { tokens.0.end_offset: 35 }
+  - match: { tokens.0.position: 7 }
+  - match: { tokens.1.token: "message" }
+  - match: { tokens.1.start_offset: 36 }
+  - match: { tokens.1.end_offset: 43 }
+  - match: { tokens.1.position: 8 }
+  - match: { tokens.2.token: "one" }
+  - match: { tokens.2.start_offset: 44 }
+  - match: { tokens.2.end_offset: 47 }
+  - match: { tokens.2.position: 9 }
+  - match: { tokens.3.token: "log" }
+  - match: { tokens.3.start_offset: 80 }
+  - match: { tokens.3.end_offset: 83 }
+  - match: { tokens.3.position: 117 }
+  - match: { tokens.4.token: "message" }
+  - match: { tokens.4.start_offset: 84 }
+  - match: { tokens.4.end_offset: 91 }
+  - match: { tokens.4.position: 118 }
+  - match: { tokens.5.token: "two" }
+  - match: { tokens.5.start_offset: 92 }
+  - match: { tokens.5.end_offset: 95 }
+  - match: { tokens.5.position: 119 }
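A note on the expected numbers above (the arithmetic is editorial, not part of the change): in the multiline test the first value " \nfirst line\nsecond line" is 26 characters long, so with the fix the last kept token "line" is reported as ending at offset 26 rather than 14 (the end of "first line"). Offsets for the second value continue from the end of the first value plus one: "first" sits at offset 4 within its own value, hence 26 + 1 + 4 = 31 globally. Positions jump between values by what appears to be the default position_increment_gap of 100, which is why the second value's first token is at position 1 + 100 + 1 = 102. The stop-words test follows the same pattern: each value is 47 characters long, so the second value's "log" (offset 32 within its value) lands at 47 + 1 + 32 = 80.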
