|
5 | 5 | */
|
6 | 6 | package org.elasticsearch.xpack.ml.action;
|
7 | 7 |
|
| 8 | +import org.elasticsearch.ElasticsearchException; |
8 | 9 | import org.elasticsearch.ElasticsearchStatusException;
|
9 | 10 | import org.elasticsearch.ElasticsearchTimeoutException;
|
| 11 | +import org.elasticsearch.ResourceNotFoundException; |
10 | 12 | import org.elasticsearch.action.ActionListener;
|
11 | 13 | import org.elasticsearch.action.support.ActionFilters;
|
12 | 14 | import org.elasticsearch.action.support.master.AcknowledgedResponse;
|
|
33 | 35 | import org.elasticsearch.xpack.core.ml.action.SetUpgradeModeAction;
|
34 | 36 | import org.elasticsearch.xpack.ml.utils.TypedChainTaskExecutor;
|
35 | 37 |
|
| 38 | +import java.util.Comparator; |
36 | 39 | import java.util.List;
|
37 | 40 | import java.util.Set;
|
38 | 41 | import java.util.concurrent.atomic.AtomicBoolean;
|
39 | 42 | import java.util.stream.Collectors;
|
40 | 43 |
|
| 44 | +import static org.elasticsearch.ExceptionsHelper.rethrowAndSuppress; |
41 | 45 | import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN;
|
42 | 46 | import static org.elasticsearch.xpack.core.ClientHelper.executeAsyncWithOrigin;
|
43 | 47 | import static org.elasticsearch.xpack.core.ml.MlTasks.AWAITING_UPGRADE;
|
@@ -119,9 +123,20 @@ protected void masterOperation(SetUpgradeModeAction.Request request, ClusterStat
|
119 | 123 | .cluster()
|
120 | 124 | .prepareListTasks()
|
121 | 125 | .setActions(DATAFEED_TASK_NAME + "[c]", JOB_TASK_NAME + "[c]")
|
| 126 | + // There is a chance that we failed un-allocating a task due to allocation_id being changed |
| 127 | + // This call will time out in that case and return an error |
122 | 128 | .setWaitForCompletion(true)
|
123 | 129 | .setTimeout(request.timeout()).execute(ActionListener.wrap(
|
124 |
| - r -> wrappedListener.onResponse(new AcknowledgedResponse(true)), |
| 130 | + r -> { |
| 131 | + try { |
| 132 | + // Handle potential node timeouts, |
| 133 | + // these should be considered failures as tasks are still potentially executing |
| 134 | + rethrowAndSuppress(r.getNodeFailures()); |
| 135 | + wrappedListener.onResponse(new AcknowledgedResponse(true)); |
| 136 | + } catch (ElasticsearchException ex) { |
| 137 | + wrappedListener.onFailure(ex); |
| 138 | + } |
| 139 | + }, |
125 | 140 | wrappedListener::onFailure));
|
126 | 141 | },
|
127 | 142 | wrappedListener::onFailure
|
@@ -243,10 +258,19 @@ private void unassignPersistentTasks(PersistentTasksCustomMetaData tasksCustomMe
|
243 | 258 | .stream()
|
244 | 259 | .filter(persistentTask -> (persistentTask.getTaskName().equals(MlTasks.JOB_TASK_NAME) ||
|
245 | 260 | persistentTask.getTaskName().equals(MlTasks.DATAFEED_TASK_NAME)))
|
| 261 | + // We want to always have the same ordering of which tasks we un-allocate first. |
| 262 | + // However, the order in which the distributed tasks handle the un-allocation event is not guaranteed. |
| 263 | + .sorted(Comparator.comparing(PersistentTask::getTaskName)) |
246 | 264 | .collect(Collectors.toList());
|
247 | 265 |
|
248 | 266 | TypedChainTaskExecutor<PersistentTask<?>> chainTaskExecutor =
|
249 |
| - new TypedChainTaskExecutor<>(client.threadPool().executor(executor()), r -> true, ex -> true); |
| 267 | + new TypedChainTaskExecutor<>(client.threadPool().executor(executor()), |
| 268 | + r -> true, |
| 269 | + // Another process could modify tasks and thus we cannot find them via the allocation_id and name |
| 270 | + // If the task was removed from the node, all is well |
| 271 | + // We handle the case of allocation_id changing later in this transport class by timing out waiting for task completion |
| 272 | + // Consequently, if the exception is ResourceNotFoundException, continue execution; circuit break otherwise. |
| 273 | + ex -> ex instanceof ResourceNotFoundException == false); |
250 | 274 |
|
251 | 275 | for (PersistentTask<?> task : datafeedAndJobTasks) {
|
252 | 276 | chainTaskExecutor.add(
|
|
0 commit comments