Apply review comments.

przemekwitek · przemekwitek · commit 8a97833d65f9 · 2020-02-10T13:21:34.000+01:00
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessor.java
@@ -35,9 +35,21 @@
 import java.util.Map;
 import java.util.Objects;
 
-
 /**
- * Reads state documents of a stream, splits them and persists to an index via a bulk request
+ * Reads state documents of a stream, splits them and persists to an index via a bulk request.
+ *
+ * Some types of state, for example data frame analytics state and categorizer state, are written multiple times with the same document id.
+ * The code needs to make sure that even after .ml-state index rollover there are no duplicate documents across the .ml-state*
+ * indices. Such duplicates are undesirable for at least two reasons:
+ *  1. We deliberately have no mappings on the state index so we cannot sort and filter in a search
+ *  2. The state documents are large, so having dead documents with duplicate IDs is suboptimal from a disk usage perspective
+ *
+ * In order to avoid duplicates the following sequence of steps is executed every time the document is about to get persisted:
+ *  1. The first non-blank line is extracted from the given bytes. Lines are delimited by the new line character ('\n')
+ *  2. Document id is extracted from this line.
+ *  3. Document with this id is searched for in .ml-state* indices
+ *  4. If the document is found, it is overwritten in place (i.e. in the same index) with the new content.
+ *     Otherwise, it is written to the index pointed by the current write alias, i.e. .ml-state-writei
  */
 public class IndexingStateProcessor implements StateProcessor {
 
@@ -100,7 +112,7 @@ private BytesReference splitAndPersist(BytesReference bytesRef, int searchFrom)
             // Ignore completely empty chunks
             if (nextZeroByte > splitFrom) {
                 // No validation - assume the native process has formatted the state correctly
-                findAppropriateIndexAndPersist(bytesRef.slice(splitFrom, nextZeroByte - splitFrom));
+                findAppropriateIndexOrAliasAndPersist(bytesRef.slice(splitFrom, nextZeroByte - splitFrom));
             }
             splitFrom = nextZeroByte + 1;
         }
@@ -110,11 +122,16 @@ private BytesReference splitAndPersist(BytesReference bytesRef, int searchFrom)
         return bytesRef.slice(splitFrom, bytesRef.length() - splitFrom);
     }
 
-    void findAppropriateIndexAndPersist(BytesReference bytes) throws IOException {
-        String stateDocId = extractDocId(bytes);
-        if (stateDocId == null) {
+    /**
+     * Finds an appropriate index the document should be put in and then persists the document in that index.
+     * For what is considered to be "appropriate" see the class documentation.
+     */
+    void findAppropriateIndexOrAliasAndPersist(BytesReference bytes) throws IOException {
+        String firstNonBlankLine = extractFirstNonBlankLine(bytes);
+        if (firstNonBlankLine == null) {
             return;
         }
+        String stateDocId = extractDocId(firstNonBlankLine);
         String indexOrAlias = getConcreteIndexOrWriteAlias(stateDocId);
         persist(indexOrAlias, bytes);
     }
@@ -142,11 +159,11 @@ private static int findNextZeroByte(BytesReference bytesRef, int searchFrom, int
     }
 
     @SuppressWarnings("unchecked")
-    static String extractDocId(BytesReference bytesRef) throws IOException {
-        String firstNonBlankLine = extractFirstNonBlankLine(bytesRef);
-        if (firstNonBlankLine == null) {
-            return null;
-        }
+    /**
+     * Extracts document id from the given {@code bytesRef}.
+     * Only first non-blank line is parsed and document id is assumed to be a nested "index._id" field of type String.
+     */
+    static String extractDocId(String firstNonBlankLine) throws IOException {
         try (XContentParser parser =
                  JsonXContent.jsonXContent.createParser(
                      NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, firstNonBlankLine)) {
@@ -162,19 +179,36 @@ static String extractDocId(BytesReference bytesRef) throws IOException {
         }
     }
 
+    /**
+     * Extracts the first non-blank line from the given {@code bytesRef}.
+     * Lines are separated by the new line character ('\n').
+     * A line is considered blank if it only consists of space characters (' ').
+     */
     private static String extractFirstNonBlankLine(BytesReference bytesRef) {
         for (int searchFrom = 0; searchFrom < bytesRef.length();) {
             int newLineMarkerIndex = bytesRef.indexOf((byte) '\n', searchFrom);
             int searchTo = newLineMarkerIndex != -1 ? newLineMarkerIndex : bytesRef.length();
-            String line = bytesRef.slice(searchFrom, searchTo - searchFrom).utf8ToString();
-            if (line.isBlank() == false) {
-                return line;
+            if (isBlank(bytesRef, searchFrom, searchTo) == false) {
+                return bytesRef.slice(searchFrom, searchTo - searchFrom).utf8ToString();
             }
             searchFrom = newLineMarkerIndex != -1 ? newLineMarkerIndex + 1 : bytesRef.length();
         }
         return null;
     }
 
+    /**
+     * Checks whether the line pointed to by a pair of indexes: {@code from} (inclusive) and {@code to} (exclusive) is blank.
+     * A line is considered blank if it only consists of space characters (' ').
+     */
+    private static boolean isBlank(BytesReference bytesRef, int from, int to) {
+        for (int i = from; i < to; ++i) {
+            if (bytesRef.get(i) != ((byte) ' ')) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     private String getConcreteIndexOrWriteAlias(String documentId) {
         Objects.requireNonNull(documentId);
         SearchRequest searchRequest =
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/process/IndexingStateProcessorTests.java
@@ -10,7 +10,6 @@
 import org.elasticsearch.action.bulk.BulkResponse;
 import org.elasticsearch.action.search.SearchRequest;
 import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHit;
@@ -86,7 +85,8 @@ public void verifyNoMoreClientInteractions() {
     }
 
     public void testExtractDocId() throws IOException {
-        assertThat(IndexingStateProcessor.extractDocId(new BytesArray(STATE_SAMPLE.getBytes(StandardCharsets.UTF_8))), equalTo("1"));
+        assertThat(IndexingStateProcessor.extractDocId("{ \"index\": {\"_index\": \"test\", \"_id\": \"1\" } }\n"), equalTo("1"));
+        assertThat(IndexingStateProcessor.extractDocId("{ \"index\": {\"_id\": \"2\" } }\n"), equalTo("2"));
     }
 
     private void testStateRead(SearchHits searchHits, String expectedIndexOrAlias) throws IOException {