Skip to content

Commit 8740e47

Browse files
committed
[ML] Move open job failure explanation out of root cause (#31925)
When an ML job could not be allocated to a node, the exception contained an explanation of why the job couldn't be allocated to each node in the cluster. For large clusters this was not particularly easy to read and made the error displayed in the UI look very scary. This commit changes the structure of the error to an outer ElasticsearchException with a high-level message and an inner IllegalStateException containing the detailed explanation. Because the root cause is defined as the innermost ElasticsearchException, the detailed explanation will no longer be the root cause (which is what Kibana displays). Fixes #29950
1 parent 6d210d7 commit 8740e47

File tree

3 files changed

+32
-19
lines changed

3 files changed

+32
-19
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportOpenJobAction.java

+11-6
Original file line numberDiff line numberDiff line change
@@ -684,10 +684,7 @@ public void validate(OpenJobAction.JobParams params, ClusterState clusterState)
684684
PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(), clusterState,
685685
maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs, maxMachineMemoryPercent, logger);
686686
if (assignment.getExecutorNode() == null) {
687-
String msg = "Could not open job because no suitable nodes were found, allocation explanation ["
688-
+ assignment.getExplanation() + "]";
689-
logger.warn("[{}] {}", params.getJobId(), msg);
690-
throw new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);
687+
throw makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
691688
}
692689
}
693690

@@ -791,9 +788,9 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
791788
// and this is why this class must only be used when opening a job
792789
if (assignment != null && assignment.equals(PersistentTasksCustomMetaData.INITIAL_ASSIGNMENT) == false &&
793790
assignment.isAssigned() == false) {
791+
OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
794792
// Assignment has failed on the master node despite passing our "fast fail" validation
795-
exception = new ElasticsearchStatusException("Could not open job because no suitable nodes were found, " +
796-
"allocation explanation [" + assignment.getExplanation() + "]", RestStatus.TOO_MANY_REQUESTS);
793+
exception = makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
797794
// The persistent task should be cancelled so that the observed outcome is the
798795
// same as if the "fast fail" validation on the coordinating node had failed
799796
shouldCancel = true;
@@ -819,4 +816,12 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
819816
}
820817
}
821818
}
819+
820+
static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
821+
String msg = "Could not open job because no suitable nodes were found, allocation explanation [" + explanation + "]";
822+
logger.warn("[{}] {}", jobId, msg);
823+
Exception detail = new IllegalStateException(msg);
824+
return new ElasticsearchStatusException("Could not open job because no ML nodes with sufficient capacity were found",
825+
RestStatus.TOO_MANY_REQUESTS, detail);
826+
}
822827
}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/BasicDistributedJobsIT.java

+8-4
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,14 @@ public void testMlIndicesNotAvailable() throws Exception {
366366

367367
Exception e = expectThrows(ElasticsearchStatusException.class,
368368
() -> client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet());
369-
assertTrue(e.getMessage(),
370-
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
371-
assertTrue(e.getMessage(), e.getMessage().endsWith("because not all primary shards are active for the following indices "
372-
+ "[.ml-state,.ml-anomalies-shared]]"));
369+
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
370+
IllegalStateException detail = (IllegalStateException) e.getCause();
371+
assertNotNull(detail);
372+
String detailedMessage = detail.getMessage();
373+
assertTrue(detailedMessage,
374+
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
375+
assertTrue(detailedMessage, detailedMessage.endsWith("because not all primary shards are active for the following indices " +
376+
"[.ml-state,.ml-anomalies-shared]]"));
373377

374378
logger.info("Start data node");
375379
String nonMlNode = internalCluster().startNode(Settings.builder()

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/TooManyJobsIT.java

+13-9
Original file line numberDiff line numberDiff line change
@@ -92,18 +92,22 @@ private void verifyMaxNumberOfJobsLimit(int numNodes, int maxNumberOfJobsPerNode
9292
});
9393
logger.info("Opened {}th job", i);
9494
} catch (ElasticsearchStatusException e) {
95-
assertTrue(e.getMessage(),
96-
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
95+
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
96+
IllegalStateException detail = (IllegalStateException) e.getCause();
97+
assertNotNull(detail);
98+
String detailedMessage = detail.getMessage();
99+
assertTrue(detailedMessage,
100+
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
97101
if (expectMemoryLimitBeforeCountLimit) {
98102
int expectedJobsAlreadyOpenOnNode = (i - 1) / numNodes;
99-
assertTrue(e.getMessage(),
100-
e.getMessage().endsWith("because this node has insufficient available memory. Available memory for ML [" +
101-
maxMlMemoryPerNode + "], memory required by existing jobs [" +
102-
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) +
103-
"], estimated memory required for this job [" + memoryFootprintPerJob + "]]"));
103+
assertTrue(detailedMessage,
104+
detailedMessage.endsWith("because this node has insufficient available memory. Available memory for ML [" +
105+
maxMlMemoryPerNode + "], memory required by existing jobs [" +
106+
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) + "], estimated memory required for this job [" +
107+
memoryFootprintPerJob + "]]"));
104108
} else {
105-
assertTrue(e.getMessage(), e.getMessage().endsWith("because this node is full. Number of opened jobs [" +
106-
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
109+
assertTrue(detailedMessage, detailedMessage.endsWith("because this node is full. Number of opened jobs [" +
110+
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
107111
}
108112
logger.info("good news everybody --> reached maximum number of allowed opened jobs, after trying to open the {}th job", i);
109113

0 commit comments

Comments
 (0)