Skip to content

Commit e5251bc

Browse files
[ML] Skip execution of timed out inference requests waiting in queue (#80087)
If, by the time we get to execute an inference request, the action has already been notified, it means the request timed out while waiting in the queue. We should return early from the `doRun` method to avoid unnecessary work.
1 parent 5a41fa4 commit e5251bc

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java

+19
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,14 @@ public void onFailure(Exception e) {
326326

327327
@Override
328328
protected void doRun() throws Exception {
329+
if (notified.get()) {
330+
// Should not execute request as it has already timed out while waiting in the queue
331+
logger.debug(
332+
() -> new ParameterizedMessage("[{}] skipping inference on request [{}] as it has timed out", modelId, requestId)
333+
);
334+
return;
335+
}
336+
329337
final String requestIdStr = String.valueOf(requestId);
330338
try {
331339
// The request builder expects a list of inputs which are then batched.
@@ -378,6 +386,17 @@ private void processResult(
378386
logger.debug(
379387
() -> new ParameterizedMessage("[{}] retrieved result for request [{}]", processContext.task.getModelId(), requestId)
380388
);
389+
if (notified.get()) {
390+
// The request has timed out. No need to spend cycles processing the result.
391+
logger.debug(
392+
() -> new ParameterizedMessage(
393+
"[{}] skipping result processing for request [{}] as the request has timed out",
394+
processContext.task.getModelId(),
395+
requestId
396+
)
397+
);
398+
return;
399+
}
381400
InferenceResults results = inferenceResultsProcessor.processResult(tokenization, pyTorchResult);
382401
logger.debug(
383402
() -> new ParameterizedMessage("[{}] processed result for request [{}]", processContext.task.getModelId(), requestId)

0 commit comments

Comments (0)