17
17
import org .elasticsearch .cluster .metadata .IndexNameExpressionResolver ;
18
18
import org .elasticsearch .cluster .node .DiscoveryNodes ;
19
19
import org .elasticsearch .cluster .service .ClusterService ;
20
- import org .elasticsearch .common .Strings ;
21
20
import org .elasticsearch .common .inject .Inject ;
22
21
import org .elasticsearch .common .io .stream .StreamInput ;
23
22
import org .elasticsearch .common .settings .Settings ;
24
23
import org .elasticsearch .common .util .concurrent .AbstractRunnable ;
25
24
import org .elasticsearch .common .util .concurrent .AtomicArray ;
26
25
import org .elasticsearch .discovery .MasterNotDiscoveredException ;
26
+ import org .elasticsearch .persistent .PersistentTasksClusterService ;
27
27
import org .elasticsearch .persistent .PersistentTasksCustomMetaData ;
28
28
import org .elasticsearch .persistent .PersistentTasksService ;
29
29
import org .elasticsearch .tasks .Task ;
32
32
import org .elasticsearch .xpack .core .ml .MlTasks ;
33
33
import org .elasticsearch .xpack .core .ml .action .StopDatafeedAction ;
34
34
import org .elasticsearch .xpack .core .ml .datafeed .DatafeedState ;
35
+ import org .elasticsearch .xpack .core .ml .utils .ExceptionsHelper ;
35
36
import org .elasticsearch .xpack .ml .MachineLearning ;
36
37
import org .elasticsearch .xpack .ml .datafeed .DatafeedConfigReader ;
37
38
import org .elasticsearch .xpack .ml .datafeed .persistence .DatafeedConfigProvider ;
38
39
39
40
import java .io .IOException ;
40
41
import java .util .ArrayList ;
42
+ import java .util .Collection ;
41
43
import java .util .HashSet ;
42
44
import java .util .List ;
43
45
import java .util .Set ;
@@ -72,32 +74,46 @@ public TransportStopDatafeedAction(Settings settings, TransportService transport
72
74
* @param tasks Persistent task meta data
73
75
* @param startedDatafeedIds Started datafeed ids are added to this list
74
76
* @param stoppingDatafeedIds Stopping datafeed ids are added to this list
77
+ * @param notStoppedDatafeedIds Datafeed ids are added to this list for all datafeeds that are not stopped
75
78
*/
76
- static void sortDatafeedIdsByTaskState (Set <String > expandedDatafeedIds ,
79
+ static void sortDatafeedIdsByTaskState (Collection <String > expandedDatafeedIds ,
77
80
PersistentTasksCustomMetaData tasks ,
78
81
List <String > startedDatafeedIds ,
79
- List <String > stoppingDatafeedIds ) {
82
+ List <String > stoppingDatafeedIds ,
83
+ List <String > notStoppedDatafeedIds ) {
80
84
81
85
// Look up each datafeed's current state from the persistent task metadata and let the
// helper append the id to whichever caller-supplied output list(s) match that state.
for (String expandedDatafeedId : expandedDatafeedIds ) {
82
86
addDatafeedTaskIdAccordingToState (expandedDatafeedId , MlTasks .getDatafeedState (expandedDatafeedId , tasks ),
83
- startedDatafeedIds , stoppingDatafeedIds );
87
+ startedDatafeedIds , stoppingDatafeedIds , notStoppedDatafeedIds );
84
88
}
85
89
}
86
90
87
91
/*
 * Appends a single datafeed id to the caller-supplied output lists according to its state
 * (as of the added lines in this change):
 *   STARTING -> notStoppedDatafeedIds only
 *   STARTED  -> startedDatafeedIds and notStoppedDatafeedIds
 *   STOPPING -> stoppingDatafeedIds and notStoppedDatafeedIds
 *   STOPPED  -> no list
 * Any other state trips an assertion and is otherwise ignored.
 */
private static void addDatafeedTaskIdAccordingToState (String datafeedId ,
88
92
DatafeedState datafeedState ,
89
93
List <String > startedDatafeedIds ,
90
- List <String > stoppingDatafeedIds ) {
94
+ List <String > stoppingDatafeedIds ,
95
+ List <String > notStoppedDatafeedIds ) {
91
96
switch (datafeedState ) {
97
+ case STARTING :
98
+ // The STARTING state is not used anywhere at present, so this should never happen.
99
+ // At present datafeeds that have a persistent task that hasn't yet been assigned
100
+ // a state are reported as STOPPED (which is not great). It could be considered a
101
+ // breaking change to introduce the STARTING state though, so let's aim to do it in
102
+ // version 8. Also consider treating STARTING like STARTED for stop API behaviour.
103
+ notStoppedDatafeedIds .add (datafeedId );
104
+ break ;
92
105
case STARTED :
93
106
startedDatafeedIds .add (datafeedId );
107
+ notStoppedDatafeedIds .add (datafeedId );
94
108
break ;
95
109
case STOPPED :
// Already stopped: the id is deliberately not added to any list.
96
110
break ;
97
111
case STOPPING :
98
112
stoppingDatafeedIds .add (datafeedId );
113
+ notStoppedDatafeedIds .add (datafeedId );
99
114
break ;
100
115
default :
// With assertions disabled an unrecognised state falls through to the break and is ignored.
116
+ assert false : "Unexpected datafeed state " + datafeedState ;
101
117
break ;
102
118
}
103
119
}
@@ -123,17 +139,18 @@ protected void doExecute(Task task, StopDatafeedAction.Request request, ActionLi
123
139
124
140
List <String > startedDatafeeds = new ArrayList <>();
125
141
List <String > stoppingDatafeeds = new ArrayList <>();
126
- sortDatafeedIdsByTaskState (expandedIds , tasks , startedDatafeeds , stoppingDatafeeds );
142
+ List <String > notStoppedDatafeeds = new ArrayList <>();
143
+ sortDatafeedIdsByTaskState (expandedIds , tasks , startedDatafeeds , stoppingDatafeeds , notStoppedDatafeeds );
127
144
if (startedDatafeeds .isEmpty () && stoppingDatafeeds .isEmpty ()) {
128
145
listener .onResponse (new StopDatafeedAction .Response (true ));
129
146
return ;
130
147
}
131
148
request .setResolvedStartedDatafeedIds (startedDatafeeds .toArray (new String [startedDatafeeds .size ()]));
132
149
133
150
if (request .isForce ()) {
134
- forceStopDatafeed (request , listener , tasks , startedDatafeeds );
151
+ forceStopDatafeed (request , listener , tasks , notStoppedDatafeeds );
135
152
} else {
136
- normalStopDatafeed (task , request , listener , tasks , startedDatafeeds , stoppingDatafeeds );
153
+ normalStopDatafeed (task , request , listener , tasks , nodes , startedDatafeeds , stoppingDatafeeds );
137
154
}
138
155
},
139
156
listener ::onFailure
@@ -142,20 +159,20 @@ protected void doExecute(Task task, StopDatafeedAction.Request request, ActionLi
142
159
}
143
160
144
161
private void normalStopDatafeed (Task task , StopDatafeedAction .Request request , ActionListener <StopDatafeedAction .Response > listener ,
145
- PersistentTasksCustomMetaData tasks ,
162
+ PersistentTasksCustomMetaData tasks , DiscoveryNodes nodes ,
146
163
List <String > startedDatafeeds , List <String > stoppingDatafeeds ) {
147
- Set <String > executorNodes = new HashSet <>();
164
+ final Set <String > executorNodes = new HashSet <>();
148
165
for (String datafeedId : startedDatafeeds ) {
149
166
PersistentTasksCustomMetaData .PersistentTask <?> datafeedTask = MlTasks .getDatafeedTask (datafeedId , tasks );
150
167
if (datafeedTask == null ) {
151
168
// This should not happen, because startedDatafeeds was derived from the same tasks that is passed to this method
152
169
String msg = "Requested datafeed [" + datafeedId + "] be stopped, but datafeed's task could not be found." ;
153
170
assert datafeedTask != null : msg ;
154
171
logger .error (msg );
155
- } else if (datafeedTask .isAssigned () ) {
172
+ } else if (PersistentTasksClusterService . needsReassignment ( datafeedTask .getAssignment (), nodes ) == false ) {
156
173
executorNodes .add (datafeedTask .getExecutorNode ());
157
174
} else {
158
- // This is the easy case - the datafeed is not currently assigned to a node,
175
+ // This is the easy case - the datafeed is not currently assigned to a valid node,
159
176
// so can be gracefully stopped simply by removing its persistent task. (Usually
160
177
// a graceful stop cannot be achieved by simply removing the persistent task, but
161
178
// if the datafeed has no running code then graceful/forceful are the same.)
@@ -176,48 +193,62 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A
176
193
177
194
ActionListener <StopDatafeedAction .Response > finalListener = ActionListener .wrap (
178
195
r -> waitForDatafeedStopped (allDataFeedsToWaitFor , request , r , listener ),
179
- listener ::onFailure );
196
+ e -> {
197
+ if (ExceptionsHelper .unwrapCause (e ) instanceof FailedNodeException ) {
198
+ // A node has dropped out of the cluster since we started executing the requests.
199
+ // Since stopping an already stopped datafeed is not an error we can try again.
200
+ // The datafeeds that were running on the node that dropped out of the cluster
201
+ // will just have their persistent tasks cancelled. Datafeeds that were stopped
202
+ // by the previous attempt will be noops in the subsequent attempt.
203
+ doExecute (task , request , listener );
204
+ } else {
205
+ listener .onFailure (e );
206
+ }
207
+ });
180
208
181
209
super .doExecute (task , request , finalListener );
182
210
}
183
211
184
212
private void forceStopDatafeed (final StopDatafeedAction .Request request , final ActionListener <StopDatafeedAction .Response > listener ,
185
- PersistentTasksCustomMetaData tasks , final List <String > startedDatafeeds ) {
213
+ PersistentTasksCustomMetaData tasks , final List <String > notStoppedDatafeeds ) {
186
214
final AtomicInteger counter = new AtomicInteger ();
187
- final AtomicArray <Exception > failures = new AtomicArray <>(startedDatafeeds .size ());
215
+ final AtomicArray <Exception > failures = new AtomicArray <>(notStoppedDatafeeds .size ());
188
216
189
- for (String datafeedId : startedDatafeeds ) {
217
+ for (String datafeedId : notStoppedDatafeeds ) {
190
218
PersistentTasksCustomMetaData .PersistentTask <?> datafeedTask = MlTasks .getDatafeedTask (datafeedId , tasks );
191
219
if (datafeedTask != null ) {
192
220
persistentTasksService .sendRemoveRequest (datafeedTask .getId (),
193
221
new ActionListener <PersistentTasksCustomMetaData .PersistentTask <?>>() {
194
222
@ Override
195
223
public void onResponse (PersistentTasksCustomMetaData .PersistentTask <?> persistentTask ) {
196
- if (counter .incrementAndGet () == startedDatafeeds .size ()) {
224
+ if (counter .incrementAndGet () == notStoppedDatafeeds .size ()) {
197
225
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
198
226
}
199
227
}
200
228
201
229
@ Override
202
230
public void onFailure (Exception e ) {
203
231
final int slot = counter .incrementAndGet ();
204
- if ((e instanceof ResourceNotFoundException &&
205
- Strings .isAllOrWildcard (new String []{request .getDatafeedId ()})) == false ) {
232
+ // We validated that the datafeed names supplied in the request existed when we started processing the action.
233
+ // If the related tasks don't exist at this point then they must have been stopped by a simultaneous stop request.
234
+ // This is not an error.
235
+ if (ExceptionsHelper .unwrapCause (e ) instanceof ResourceNotFoundException == false ) {
206
236
failures .set (slot - 1 , e );
207
237
}
208
- if (slot == startedDatafeeds .size ()) {
238
+ if (slot == notStoppedDatafeeds .size ()) {
209
239
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
210
240
}
211
241
}
212
242
});
213
243
} else {
214
- // This should not happen, because startedDatafeeds was derived from the same tasks that is passed to this method
244
+ // This should not happen, because startedDatafeeds and stoppingDatafeeds
245
+ // were derived from the same tasks that were passed to this method
215
246
String msg = "Requested datafeed [" + datafeedId + "] be force-stopped, but datafeed's task could not be found." ;
216
247
assert datafeedTask != null : msg ;
217
248
logger .error (msg );
218
249
final int slot = counter .incrementAndGet ();
219
250
failures .set (slot - 1 , new RuntimeException (msg ));
220
- if (slot == startedDatafeeds .size ()) {
251
+ if (slot == notStoppedDatafeeds .size ()) {
221
252
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
222
253
}
223
254
}
@@ -233,24 +264,25 @@ protected void taskOperation(StopDatafeedAction.Request request, TransportStartD
233
264
threadPool .executor (MachineLearning .UTILITY_THREAD_POOL_NAME ).execute (new AbstractRunnable () {
234
265
@ Override
235
266
public void onFailure (Exception e ) {
236
- if ((e instanceof ResourceNotFoundException &&
237
- Strings .isAllOrWildcard (new String []{request .getDatafeedId ()}))) {
238
- datafeedTask .stop ("stop_datafeed (api)" , request .getStopTimeout ());
267
+ // We validated that the datafeed names supplied in the request existed when we started processing the action.
268
+ // If the related task for one of them doesn't exist at this point then it must have been removed by a
269
+ // simultaneous force stop request. This is not an error.
270
+ if (ExceptionsHelper .unwrapCause (e ) instanceof ResourceNotFoundException ) {
239
271
listener .onResponse (new StopDatafeedAction .Response (true ));
240
272
} else {
241
273
listener .onFailure (e );
242
274
}
243
275
}
244
276
245
277
@ Override
246
- protected void doRun () throws Exception {
278
+ protected void doRun () {
247
279
datafeedTask .stop ("stop_datafeed (api)" , request .getStopTimeout ());
248
280
listener .onResponse (new StopDatafeedAction .Response (true ));
249
281
}
250
282
});
251
283
},
252
284
e -> {
253
- if (e instanceof ResourceNotFoundException ) {
285
+ if (ExceptionsHelper . unwrapCause ( e ) instanceof ResourceNotFoundException ) {
254
286
// the task has disappeared so must have stopped
255
287
listener .onResponse (new StopDatafeedAction .Response (true ));
256
288
} else {
@@ -318,7 +350,7 @@ protected StopDatafeedAction.Response newResponse(StopDatafeedAction.Request req
318
350
.convertToElastic (failedNodeExceptions .get (0 ));
319
351
} else {
320
352
// This can happen when the actual task in the node no longer exists,
321
- // which means the datafeed(s) have already been closed .
353
+ // which means the datafeed(s) have already been stopped .
322
354
return new StopDatafeedAction .Response (true );
323
355
}
324
356
}
0 commit comments