
Commit 334ad82

[ML] Fix end offset for first_non_blank_line char_filter (#73828)
When the input gets chopped by a char_filter immediately after a token, that token must be reported as ending at the very end of the original input; otherwise analysis will have incorrect offsets when multiple field values are analyzed in the same _analyze request. The pattern_replace char_filter already works this way. This PR changes the new first_non_blank_line char_filter to match.

Fixes elastic/kibana#101255
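To make the offset contract concrete: in the updated YAML test further down, the first analyzed value is " \nfirst line\nsecond line" (26 characters), of which the filter keeps only "first line". The final token "line" used to be reported with end offset 14, the end of "first line" within the original text; with this change it is reported as ending at 26, the length of the whole value, so tokens from the next value in the same _analyze request keep correct offsets.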
Parent: 6b7fea0

File tree

5 files changed (+114, -2 lines)


x-pack/plugin/build.gradle

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,8 @@ tasks.named("yamlRestCompatTest").configure {
     'ml/jobs_get_stats/Test get job stats after uploading data prompting the creation of some stats',
     'ml/jobs_get_stats/Test get job stats for closed job',
     'ml/jobs_get_stats/Test no exception on get job stats with missing index',
+    // TODO: remove the next one after backporting https://github.com/elastic/elasticsearch/pull/73828
+    'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
     'ml/post_data/Test POST data job api, flush, close and verify DataCounts doc',
     'ml/post_data/Test flush with skip_time',
     'ml/set_upgrade_mode/Setting upgrade mode to disabled from enabled',

x-pack/plugin/ml/qa/ml-with-security/build.gradle

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,8 @@ tasks.named("yamlRestTest").configure {
     'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
     'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer',
     'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages',
     // Remove tests that are expected to throw an exception, because we cannot then
     // know whether to expect an authorization exception or a validation exception
     'ml/calendar_crud/Test get calendar given missing',

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java

Lines changed: 3 additions & 0 deletions
@@ -93,6 +93,9 @@ private CharSequence process(CharSequence input) {
         }

         addOffCorrectMap(0, prevNewlineIndex + 1);
+        if (endIndex < input.length()) {
+            addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1);
+        }
         return input.subSequence(prevNewlineIndex + 1, endIndex);
     }
 }
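For context on what the added addOffCorrectMap call does: in Lucene, BaseCharFilter.addOffCorrectMap(off, cumulativeDiff) records that output offsets at or beyond off should be corrected by adding cumulativeDiff, and correctOffset() applies the recorded corrections when token offsets are mapped back to the original text. The sketch below is a deliberately simplified, hypothetical filter (named FirstLineOnlyCharFilter here; it is not the production FirstNonBlankLineCharFilter, which also skips leading blank lines) illustrating the same idea the new lines implement: when the output is cut short, the offset at the cut point is mapped to the very end of the original input.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.charfilter.BaseCharFilter;

// Hypothetical, simplified illustration of the offset-correction idea used by the fix:
// keep only the first line of the input and map the offset at the cut point to the
// end of the original input.
class FirstLineOnlyCharFilter extends BaseCharFilter {

    private Reader transformed; // filtered view of the input, built on first read

    FirstLineOnlyCharFilter(Reader in) {
        super(in);
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (transformed == null) {
            // Read the whole wrapped input (fine for an illustration; real filters may stream).
            StringBuilder sb = new StringBuilder();
            for (int c = input.read(); c != -1; c = input.read()) {
                sb.append((char) c);
            }
            String original = sb.toString();
            int newline = original.indexOf('\n');
            String firstLine = newline == -1 ? original : original.substring(0, newline);
            if (firstLine.length() < original.length()) {
                // Output offsets before the cut map 1:1; the offset *at* the cut maps to
                // the very end of the original input, so a token ending exactly at the cut
                // gets end offset original.length() after correctOffset().
                addOffCorrectMap(firstLine.length(), original.length() - firstLine.length());
            }
            transformed = new StringReader(firstLine);
        }
        return transformed.read(cbuf, off, len);
    }
}

With an input such as "first line\nsecond line" (22 characters), correctOffset(10) then returns 22 rather than 10, which is the behaviour the new unit-test assertion below checks against the real filter.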

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java

Lines changed: 4 additions & 1 deletion
@@ -121,8 +121,11 @@ public void testCorrect() throws IOException {
         assertThat(new String(output), equalTo(expectedOutput));

         int expectedOutputIndex = input.indexOf(expectedOutput);
-        for (int i = 0; i <= expectedOutput.length(); ++i) {
+        for (int i = 0; i < expectedOutput.length(); ++i) {
             assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i));
         }
+        // When the input gets chopped by a char filter immediately after a token, that token must be reported as
+        // ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets
+        assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length()));
     }
 }

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml

Lines changed: 103 additions & 1 deletion
@@ -111,5 +111,107 @@
   - match: { tokens.0.position: 0 }
   - match: { tokens.1.token: "line" }
   - match: { tokens.1.start_offset: 10 }
-  - match: { tokens.1.end_offset: 14 }
+  - match: { tokens.1.end_offset: 26 }
   - match: { tokens.1.position: 1 }
+
+---
+"Test 7.14 analyzer with multiple multiline messages":
+  - do:
+      indices.analyze:
+        body: >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              " \nfirst line\nsecond line",
+              " \nfirst line of second message\nsecond line of second message"
+            ]
+          }
+  - match: { tokens.0.token: "first" }
+  - match: { tokens.0.start_offset: 4 }
+  - match: { tokens.0.end_offset: 9 }
+  - match: { tokens.0.position: 0 }
+  - match: { tokens.1.token: "line" }
+  - match: { tokens.1.start_offset: 10 }
+  - match: { tokens.1.end_offset: 26 }
+  - match: { tokens.1.position: 1 }
+  - match: { tokens.2.token: "first" }
+  - match: { tokens.2.start_offset: 31 }
+  - match: { tokens.2.end_offset: 36 }
+  - match: { tokens.2.position: 102 }
+  - match: { tokens.3.token: "line" }
+  - match: { tokens.3.start_offset: 37 }
+  - match: { tokens.3.end_offset: 41 }
+  - match: { tokens.3.position: 103 }
+  - match: { tokens.4.token: "of" }
+  - match: { tokens.4.start_offset: 42 }
+  - match: { tokens.4.end_offset: 44 }
+  - match: { tokens.4.position: 104 }
+  - match: { tokens.5.token: "second" }
+  - match: { tokens.5.start_offset: 45 }
+  - match: { tokens.5.end_offset: 51 }
+  - match: { tokens.5.position: 105 }
+  - match: { tokens.6.token: "message" }
+  - match: { tokens.6.start_offset: 52 }
+  - match: { tokens.6.end_offset: 89 }
+  - match: { tokens.6.position: 106 }
+
+---
+"Test 7.14 analyzer with stop words in messages":
+  - do:
+      indices.analyze:
+        body: >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              "May 27, 2021 @ 19:51:15.288 UTC log message one",
+              "May 27, 2021 @ 19:52:25.288 UTC log message two"
+            ]
+          }
+  - match: { tokens.0.token: "log" }
+  - match: { tokens.0.start_offset: 32 }
+  - match: { tokens.0.end_offset: 35 }
+  - match: { tokens.0.position: 7 }
+  - match: { tokens.1.token: "message" }
+  - match: { tokens.1.start_offset: 36 }
+  - match: { tokens.1.end_offset: 43 }
+  - match: { tokens.1.position: 8 }
+  - match: { tokens.2.token: "one" }
+  - match: { tokens.2.start_offset: 44 }
+  - match: { tokens.2.end_offset: 47 }
+  - match: { tokens.2.position: 9 }
+  - match: { tokens.3.token: "log" }
+  - match: { tokens.3.start_offset: 80 }
+  - match: { tokens.3.end_offset: 83 }
+  - match: { tokens.3.position: 117 }
+  - match: { tokens.4.token: "message" }
+  - match: { tokens.4.start_offset: 84 }
+  - match: { tokens.4.end_offset: 91 }
+  - match: { tokens.4.position: 118 }
+  - match: { tokens.5.token: "two" }
+  - match: { tokens.5.start_offset: 92 }
+  - match: { tokens.5.end_offset: 95 }
+  - match: { tokens.5.position: 119 }
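A note on the expected numbers above (the arithmetic is editorial, not part of the change): in the multiline test the first value " \nfirst line\nsecond line" is 26 characters long, so with the fix the last kept token "line" is reported as ending at offset 26 rather than 14 (the end of "first line"). Offsets for the second value continue from the end of the first value plus one: "first" sits at offset 4 within its own value, hence 26 + 1 + 4 = 31 globally. Positions jump between values by what appears to be the default position_increment_gap of 100, which is why the second value's first token is at position 1 + 100 + 1 = 102. The stop-words test follows the same pattern: each value is 47 characters long, so the second value's "log" (offset 32 within its value) lands at 47 + 1 + 32 = 80.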
