Skip to content

Commit dd50520

Browse files
authored
[ML] Fix snapshot upgrader so that if state is not fully written or parseable the task fails (#65755)
It is possible that the snapshot upgrader execution path continues before the old model state is fully read by the native process. To prevent this, a flush request is made after the state is loaded. This is to verify that all the state has been read by the native process. This allows the task to fail if reading the state fails and prevents some strange race conditions. Closes #65699
1 parent ccad78e commit dd50520

File tree

4 files changed

+89
-13
lines changed

4 files changed

+89
-13
lines changed

client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java

+2-7
Original file line numberDiff line numberDiff line change
@@ -2730,10 +2730,6 @@ private String createAndPutDatafeed(String jobId, String indexName) throws IOExc
27302730
}
27312731

27322732
public void createModelSnapshot(String jobId, String snapshotId) throws IOException {
2733-
createModelSnapshot(jobId, snapshotId, Version.CURRENT);
2734-
}
2735-
2736-
public void createModelSnapshot(String jobId, String snapshotId, Version minVersion) throws IOException {
27372733
String documentId = jobId + "_model_snapshot_" + snapshotId;
27382734
Job job = MachineLearningIT.buildJob(jobId);
27392735
highLevelClient().machineLearning().putJob(new PutJobRequest(job), RequestOptions.DEFAULT);
@@ -2747,7 +2743,7 @@ public void createModelSnapshot(String jobId, String snapshotId, Version minVers
27472743
"\"total_by_field_count\":3, \"total_over_field_count\":0, \"total_partition_field_count\":2," +
27482744
"\"bucket_allocation_failures_count\":0, \"memory_status\":\"ok\", \"log_time\":1541587919000, " +
27492745
"\"timestamp\":1519930800000}, \"latest_record_time_stamp\":1519931700000," +
2750-
"\"latest_result_time_stamp\":1519930800000, \"retain\":false, \"min_version\":\"" + minVersion.toString() + "\"}",
2746+
"\"latest_result_time_stamp\":1519930800000, \"retain\":false, \"min_version\":\"" + Version.CURRENT.toString() + "\"}",
27512747
XContentType.JSON);
27522748

27532749
highLevelClient().index(indexRequest, RequestOptions.DEFAULT);
@@ -2828,12 +2824,11 @@ public void testUpdateModelSnapshot() throws Exception {
28282824
getModelSnapshotsResponse2.snapshots().get(0).getDescription());
28292825
}
28302826

2831-
@AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/65699")
28322827
public void testUpgradeJobSnapshot() throws Exception {
28332828
String jobId = "test-upgrade-model-snapshot";
28342829
String snapshotId = "1541587919";
28352830

2836-
createModelSnapshot(jobId, snapshotId, Version.CURRENT);
2831+
createModelSnapshot(jobId, snapshotId);
28372832
MachineLearningClient machineLearningClient = highLevelClient().machineLearning();
28382833
UpgradeJobModelSnapshotRequest request = new UpgradeJobModelSnapshotRequest(jobId, snapshotId, null, true);
28392834
ElasticsearchException ex = expectThrows(ElasticsearchException.class,

client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -2336,7 +2336,6 @@ public void onFailure(Exception e) {
23362336
}
23372337
}
23382338

2339-
@AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/65699")
23402339
public void testUpgradeJobSnapshot() throws IOException, InterruptedException {
23412340
RestHighLevelClient client = highLevelClient();
23422341

@@ -2376,7 +2375,7 @@ public void testUpgradeJobSnapshot() throws IOException, InterruptedException {
23762375
// end::upgrade-job-model-snapshot-execute
23772376
fail("upgrade model snapshot should not have succeeded.");
23782377
} catch (ElasticsearchException ex) {
2379-
assertThat(ex.getMessage(), containsString("Expected persisted state but no state exists"));
2378+
assertThat(ex.getMessage(), containsString("Unexpected state [failed] while waiting for to be assigned to a node"));
23802379
}
23812380
UpgradeJobModelSnapshotResponse response = new UpgradeJobModelSnapshotResponse(true, "");
23822381

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/JobModelSnapshotUpgrader.java

+57-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.elasticsearch.threadpool.ThreadPool;
2424
import org.elasticsearch.xpack.core.ml.job.config.AnalysisConfig;
2525
import org.elasticsearch.xpack.core.ml.job.config.Job;
26+
import org.elasticsearch.xpack.core.ml.job.process.autodetect.output.FlushAcknowledgement;
2627
import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
2728
import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
2829
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
@@ -31,11 +32,13 @@
3132
import org.elasticsearch.xpack.ml.job.persistence.StateStreamer;
3233
import org.elasticsearch.xpack.ml.job.process.autodetect.output.JobSnapshotUpgraderResultProcessor;
3334
import org.elasticsearch.xpack.ml.job.process.autodetect.params.AutodetectParams;
35+
import org.elasticsearch.xpack.ml.job.process.autodetect.params.FlushJobParams;
3436
import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTask;
3537
import org.elasticsearch.xpack.ml.process.NativeStorageProvider;
3638
import org.elasticsearch.xpack.ml.process.writer.LengthEncodedWriter;
3739

3840
import java.io.IOException;
41+
import java.time.Duration;
3942
import java.util.HashMap;
4043
import java.util.Map;
4144
import java.util.Objects;
@@ -48,7 +51,7 @@
4851
import static org.elasticsearch.xpack.ml.MachineLearning.UTILITY_THREAD_POOL_NAME;
4952

5053
public final class JobModelSnapshotUpgrader {
51-
54+
private static final Duration FLUSH_PROCESS_CHECK_FREQUENCY = Duration.ofSeconds(1);
5255
private static final Logger logger = LogManager.getLogger(JobModelSnapshotUpgrader.class);
5356

5457
private final SnapshotUpgradeTask task;
@@ -97,7 +100,9 @@ void start() {
97100
params,
98101
autodetectExecutorService,
99102
(reason) -> {
100-
setTaskToFailed(reason, ActionListener.wrap(t -> {}, f -> {}));
103+
setTaskToFailed(reason, ActionListener.wrap(t -> {
104+
}, f -> {
105+
}));
101106
try {
102107
nativeStorageProvider.cleanupLocalTmpStorage(task.getDescription());
103108
} catch (IOException e) {
@@ -200,6 +205,24 @@ void writeHeader() throws IOException {
200205
process.writeRecord(record);
201206
}
202207

208+
FlushAcknowledgement waitFlushToCompletion(String flushId) throws Exception {
209+
logger.debug(() -> new ParameterizedMessage("[{}] [{}] waiting for flush [{}]", jobId, snapshotId, flushId));
210+
211+
FlushAcknowledgement flushAcknowledgement;
212+
try {
213+
flushAcknowledgement = processor.waitForFlushAcknowledgement(flushId, FLUSH_PROCESS_CHECK_FREQUENCY);
214+
while (flushAcknowledgement == null) {
215+
checkProcessIsAlive();
216+
checkResultsProcessorIsAlive();
217+
flushAcknowledgement = processor.waitForFlushAcknowledgement(flushId, FLUSH_PROCESS_CHECK_FREQUENCY);
218+
}
219+
} finally {
220+
processor.clearAwaitingFlush(flushId);
221+
}
222+
logger.debug(() -> new ParameterizedMessage("[{}] [{}] flush completed [{}]", jobId, snapshotId, flushId));
223+
return flushAcknowledgement;
224+
}
225+
203226
void restoreState() {
204227
try {
205228
process.restoreState(stateStreamer, params.modelSnapshot());
@@ -209,6 +232,31 @@ void restoreState() {
209232
ActionListener.wrap(t -> shutdown(e), f -> shutdown(e)));
210233
return;
211234
}
235+
submitOperation(() -> {
236+
String flushId = process.flushJob(FlushJobParams.builder().waitForNormalization(false).build());
237+
return waitFlushToCompletion(flushId);
238+
}, (aVoid, e) -> {
239+
Runnable nextStep;
240+
if (e != null) {
241+
logger.error(
242+
() -> new ParameterizedMessage(
243+
"[{}] [{}] failed to flush after writing old state",
244+
jobId,
245+
snapshotId
246+
),
247+
e);
248+
nextStep = () -> setTaskToFailed(
249+
"Failed to flush after writing old state due to: " + e.getMessage(),
250+
ActionListener.wrap(t -> shutdown(e), f -> shutdown(e))
251+
);
252+
} else {
253+
nextStep = this::requestStateWrite;
254+
}
255+
threadPool.executor(UTILITY_THREAD_POOL_NAME).execute(nextStep);
256+
});
257+
}
258+
259+
private void requestStateWrite() {
212260
task.updatePersistentTaskState(
213261
new SnapshotUpgradeTaskState(SnapshotUpgradeState.SAVING_NEW_STATE, task.getAllocationId(), ""),
214262
ActionListener.wrap(
@@ -282,6 +330,13 @@ private void checkProcessIsAlive() {
282330
}
283331
}
284332

333+
private void checkResultsProcessorIsAlive() {
334+
if (processor.isFailed()) {
335+
// Don't log here - it just causes double logging when the exception gets logged
336+
throw new ElasticsearchException("[{}] Unexpected death of the result processor", job.getId());
337+
}
338+
}
339+
285340
void shutdown(Exception e) {
286341
// No point in sending an action to the executor if the process has died
287342
if (process.isProcessAlive() == false) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/output/JobSnapshotUpgraderResultProcessor.java

+29-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.apache.logging.log4j.message.ParameterizedMessage;
1111
import org.elasticsearch.action.bulk.BulkResponse;
1212
import org.elasticsearch.action.support.WriteRequest;
13+
import org.elasticsearch.common.Nullable;
1314
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1415
import org.elasticsearch.xpack.core.ml.annotations.Annotation;
1516
import org.elasticsearch.xpack.core.ml.job.process.autodetect.output.FlushAcknowledgement;
@@ -28,6 +29,7 @@
2829
import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcess;
2930
import org.elasticsearch.xpack.ml.job.results.AutodetectResult;
3031

32+
import java.time.Duration;
3133
import java.util.Iterator;
3234
import java.util.List;
3335
import java.util.Objects;
@@ -52,6 +54,7 @@ public class JobSnapshotUpgraderResultProcessor {
5254
private final JobResultsPersister persister;
5355
private final AutodetectProcess process;
5456
private final JobResultsPersister.Builder bulkResultsPersister;
57+
private final FlushListener flushListener;
5558
private volatile boolean processKilled;
5659
private volatile boolean failed;
5760

@@ -64,6 +67,7 @@ public JobSnapshotUpgraderResultProcessor(String jobId,
6467
this.persister = Objects.requireNonNull(persister);
6568
this.process = Objects.requireNonNull(autodetectProcess);
6669
this.bulkResultsPersister = persister.bulkPersisterBuilder(jobId).shouldRetry(this::isAlive);
70+
this.flushListener = new FlushListener();
6771
}
6872

6973
public void process() {
@@ -204,10 +208,34 @@ void processResult(AutodetectResult result) {
204208
}
205209
FlushAcknowledgement flushAcknowledgement = result.getFlushAcknowledgement();
206210
if (flushAcknowledgement != null) {
207-
logUnexpectedResult(FlushAcknowledgement.TYPE.getPreferredName());
211+
LOGGER.debug(
212+
() -> new ParameterizedMessage(
213+
"[{}] [{}] Flush acknowledgement parsed from output for ID {}",
214+
jobId,
215+
snapshotId,
216+
flushAcknowledgement.getId()
217+
)
218+
);
219+
flushListener.acknowledgeFlush(flushAcknowledgement, null);
208220
}
209221
}
210222

223+
/**
224+
* Blocks until a flush is acknowledged or the timeout expires, whichever happens first.
225+
*
226+
* @param flushId the id of the flush request to wait for
227+
* @param timeout the timeout
228+
* @return The {@link FlushAcknowledgement} if the flush has completed or the parsing finished; {@code null} if the timeout expired
229+
*/
230+
@Nullable
231+
public FlushAcknowledgement waitForFlushAcknowledgement(String flushId, Duration timeout) throws Exception {
232+
return failed ? null : flushListener.waitForFlush(flushId, timeout);
233+
}
234+
235+
public void clearAwaitingFlush(String flushId) {
236+
flushListener.clear(flushId);
237+
}
238+
211239
public void awaitCompletion() throws TimeoutException {
212240
try {
213241
// Although the results won't take 30 minutes to finish, the pipe won't be closed
@@ -230,7 +258,6 @@ public void awaitCompletion() throws TimeoutException {
230258
}
231259
}
232260

233-
234261
/**
235262
* If failed then there was an error parsing the results that cannot be recovered from
236263
*

0 commit comments

Comments
 (0)