Skip to content

Commit 8a97833

Browse files
committed
Apply review comments.
1 parent 873af34 commit 8a97833

File tree

2 files changed

+50
-16
lines changed

2 files changed

+50
-16
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessor.java

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,21 @@
3535
import java.util.Map;
3636
import java.util.Objects;
3737

38-
3938
/**
40-
* Reads state documents of a stream, splits them and persists to an index via a bulk request
39+
* Reads state documents of a stream, splits them and persists to an index via a bulk request.
40+
*
41+
* Some types of state, for example data frame analytics state and categorizer state, are written multiple times with the same document id.
42+
* The code needs to make sure that even after .ml-state index rollover there are no duplicate documents across the .ml-state*
43+
* indices. Such duplicates are undesirable for at least two reasons:
44+
* 1. We deliberately have no mappings on the state index so we cannot sort and filter in a search
45+
* 2. The state documents are large, so having dead documents with duplicate IDs is suboptimal from a disk usage perspective
46+
*
47+
* In order to avoid duplicates the following sequence of steps is executed every time the document is about to get persisted:
48+
* 1. The first non-blank line is extracted from the given bytes. Lines are delimited by the new line character ('\n')
49+
* 2. Document id is extracted from this line.
50+
* 3. Document with this id is searched for in .ml-state* indices
51+
* 4. If the document is found, it is overwritten in place (i.e. in the same index) with the new content.
52+
* Otherwise, it is written to the index pointed by the current write alias, i.e. .ml-state-writei
4153
*/
4254
public class IndexingStateProcessor implements StateProcessor {
4355

@@ -100,7 +112,7 @@ private BytesReference splitAndPersist(BytesReference bytesRef, int searchFrom)
100112
// Ignore completely empty chunks
101113
if (nextZeroByte > splitFrom) {
102114
// No validation - assume the native process has formatted the state correctly
103-
findAppropriateIndexAndPersist(bytesRef.slice(splitFrom, nextZeroByte - splitFrom));
115+
findAppropriateIndexOrAliasAndPersist(bytesRef.slice(splitFrom, nextZeroByte - splitFrom));
104116
}
105117
splitFrom = nextZeroByte + 1;
106118
}
@@ -110,11 +122,16 @@ private BytesReference splitAndPersist(BytesReference bytesRef, int searchFrom)
110122
return bytesRef.slice(splitFrom, bytesRef.length() - splitFrom);
111123
}
112124

113-
void findAppropriateIndexAndPersist(BytesReference bytes) throws IOException {
114-
String stateDocId = extractDocId(bytes);
115-
if (stateDocId == null) {
125+
/**
126+
* Finds an appropriate index the document should be put in and then persists the document in that index.
127+
* For what is considered to be "appropriate" see the class documentation.
128+
*/
129+
void findAppropriateIndexOrAliasAndPersist(BytesReference bytes) throws IOException {
130+
String firstNonBlankLine = extractFirstNonBlankLine(bytes);
131+
if (firstNonBlankLine == null) {
116132
return;
117133
}
134+
String stateDocId = extractDocId(firstNonBlankLine);
118135
String indexOrAlias = getConcreteIndexOrWriteAlias(stateDocId);
119136
persist(indexOrAlias, bytes);
120137
}
@@ -142,11 +159,11 @@ private static int findNextZeroByte(BytesReference bytesRef, int searchFrom, int
142159
}
143160

144161
@SuppressWarnings("unchecked")
145-
static String extractDocId(BytesReference bytesRef) throws IOException {
146-
String firstNonBlankLine = extractFirstNonBlankLine(bytesRef);
147-
if (firstNonBlankLine == null) {
148-
return null;
149-
}
162+
/**
163+
* Extracts document id from the given {@code bytesRef}.
164+
* Only first non-blank line is parsed and document id is assumed to be a nested "index._id" field of type String.
165+
*/
166+
static String extractDocId(String firstNonBlankLine) throws IOException {
150167
try (XContentParser parser =
151168
JsonXContent.jsonXContent.createParser(
152169
NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, firstNonBlankLine)) {
@@ -162,19 +179,36 @@ static String extractDocId(BytesReference bytesRef) throws IOException {
162179
}
163180
}
164181

182+
/**
183+
* Extracts the first non-blank line from the given {@code bytesRef}.
184+
* Lines are separated by the new line character ('\n').
185+
* A line is considered blank if it only consists of space characters (' ').
186+
*/
165187
private static String extractFirstNonBlankLine(BytesReference bytesRef) {
166188
for (int searchFrom = 0; searchFrom < bytesRef.length();) {
167189
int newLineMarkerIndex = bytesRef.indexOf((byte) '\n', searchFrom);
168190
int searchTo = newLineMarkerIndex != -1 ? newLineMarkerIndex : bytesRef.length();
169-
String line = bytesRef.slice(searchFrom, searchTo - searchFrom).utf8ToString();
170-
if (line.isBlank() == false) {
171-
return line;
191+
if (isBlank(bytesRef, searchFrom, searchTo) == false) {
192+
return bytesRef.slice(searchFrom, searchTo - searchFrom).utf8ToString();
172193
}
173194
searchFrom = newLineMarkerIndex != -1 ? newLineMarkerIndex + 1 : bytesRef.length();
174195
}
175196
return null;
176197
}
177198

199+
/**
200+
* Checks whether the line pointed to by a pair of indexes: {@code from} (inclusive) and {@code to} (exclusive) is blank.
201+
* A line is considered blank if it only consists of space characters (' ').
202+
*/
203+
private static boolean isBlank(BytesReference bytesRef, int from, int to) {
204+
for (int i = from; i < to; ++i) {
205+
if (bytesRef.get(i) != ((byte) ' ')) {
206+
return false;
207+
}
208+
}
209+
return true;
210+
}
211+
178212
private String getConcreteIndexOrWriteAlias(String documentId) {
179213
Objects.requireNonNull(documentId);
180214
SearchRequest searchRequest =

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessorTests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.elasticsearch.action.bulk.BulkResponse;
1111
import org.elasticsearch.action.search.SearchRequest;
1212
import org.elasticsearch.action.search.SearchResponse;
13-
import org.elasticsearch.common.bytes.BytesArray;
1413
import org.elasticsearch.common.bytes.BytesReference;
1514
import org.elasticsearch.rest.RestStatus;
1615
import org.elasticsearch.search.SearchHit;
@@ -86,7 +85,8 @@ public void verifyNoMoreClientInteractions() {
8685
}
8786

8887
public void testExtractDocId() throws IOException {
89-
assertThat(IndexingStateProcessor.extractDocId(new BytesArray(STATE_SAMPLE.getBytes(StandardCharsets.UTF_8))), equalTo("1"));
88+
assertThat(IndexingStateProcessor.extractDocId("{ \"index\": {\"_index\": \"test\", \"_id\": \"1\" } }\n"), equalTo("1"));
89+
assertThat(IndexingStateProcessor.extractDocId("{ \"index\": {\"_id\": \"2\" } }\n"), equalTo("2"));
9090
}
9191

9292
private void testStateRead(SearchHits searchHits, String expectedIndexOrAlias) throws IOException {

0 commit comments

Comments
 (0)