16
16
import org .elasticsearch .cluster .ClusterState ;
17
17
import org .elasticsearch .cluster .node .DiscoveryNodes ;
18
18
import org .elasticsearch .cluster .service .ClusterService ;
19
- import org .elasticsearch .common .Strings ;
20
19
import org .elasticsearch .common .inject .Inject ;
21
20
import org .elasticsearch .common .util .concurrent .AbstractRunnable ;
22
21
import org .elasticsearch .common .util .concurrent .AtomicArray ;
23
22
import org .elasticsearch .discovery .MasterNotDiscoveredException ;
23
+ import org .elasticsearch .persistent .PersistentTasksClusterService ;
24
24
import org .elasticsearch .persistent .PersistentTasksCustomMetaData ;
25
25
import org .elasticsearch .persistent .PersistentTasksService ;
26
26
import org .elasticsearch .tasks .Task ;
34
34
import org .elasticsearch .xpack .ml .datafeed .persistence .DatafeedConfigProvider ;
35
35
36
36
import java .util .ArrayList ;
37
+ import java .util .Collection ;
37
38
import java .util .HashSet ;
38
39
import java .util .List ;
39
40
import java .util .Set ;
@@ -68,32 +69,46 @@ public TransportStopDatafeedAction(TransportService transportService, ThreadPool
68
69
* @param tasks Persistent task meta data
69
70
* @param startedDatafeedIds Started datafeed ids are added to this list
70
71
* @param stoppingDatafeedIds Stopping datafeed ids are added to this list
72
+ * @param notStoppedDatafeedIds Datafeed ids are added to this list for all datafeeds that are not stopped
71
73
*/
72
- static void sortDatafeedIdsByTaskState (Set <String > expandedDatafeedIds ,
74
+ static void sortDatafeedIdsByTaskState (Collection <String > expandedDatafeedIds ,
73
75
PersistentTasksCustomMetaData tasks ,
74
76
List <String > startedDatafeedIds ,
75
- List <String > stoppingDatafeedIds ) {
77
+ List <String > stoppingDatafeedIds ,
78
+ List <String > notStoppedDatafeedIds ) {
76
79
77
80
for (String expandedDatafeedId : expandedDatafeedIds ) {
78
81
addDatafeedTaskIdAccordingToState (expandedDatafeedId , MlTasks .getDatafeedState (expandedDatafeedId , tasks ),
79
- startedDatafeedIds , stoppingDatafeedIds );
82
+ startedDatafeedIds , stoppingDatafeedIds , notStoppedDatafeedIds );
80
83
}
81
84
}
82
85
83
86
private static void addDatafeedTaskIdAccordingToState (String datafeedId ,
84
87
DatafeedState datafeedState ,
85
88
List <String > startedDatafeedIds ,
86
- List <String > stoppingDatafeedIds ) {
89
+ List <String > stoppingDatafeedIds ,
90
+ List <String > notStoppedDatafeedIds ) {
87
91
switch (datafeedState ) {
92
+ case STARTING :
93
+ // The STARTING state is not used anywhere at present, so this should never happen.
94
+ // At present datafeeds that have a persistent task that hasn't yet been assigned
95
+ // a state are reported as STOPPED (which is not great). It could be considered a
96
+ // breaking change to introduce the STARTING state though, so let's aim to do it in
97
+ // version 8. Also consider treating STARTING like STARTED for stop API behaviour.
98
+ notStoppedDatafeedIds .add (datafeedId );
99
+ break ;
88
100
case STARTED :
89
101
startedDatafeedIds .add (datafeedId );
102
+ notStoppedDatafeedIds .add (datafeedId );
90
103
break ;
91
104
case STOPPED :
92
105
break ;
93
106
case STOPPING :
94
107
stoppingDatafeedIds .add (datafeedId );
108
+ notStoppedDatafeedIds .add (datafeedId );
95
109
break ;
96
110
default :
111
+ assert false : "Unexpected datafeed state " + datafeedState ;
97
112
break ;
98
113
}
99
114
}
@@ -118,17 +133,18 @@ protected void doExecute(Task task, StopDatafeedAction.Request request, ActionLi
118
133
119
134
List <String > startedDatafeeds = new ArrayList <>();
120
135
List <String > stoppingDatafeeds = new ArrayList <>();
121
- sortDatafeedIdsByTaskState (expandedIds , tasks , startedDatafeeds , stoppingDatafeeds );
136
+ List <String > notStoppedDatafeeds = new ArrayList <>();
137
+ sortDatafeedIdsByTaskState (expandedIds , tasks , startedDatafeeds , stoppingDatafeeds , notStoppedDatafeeds );
122
138
if (startedDatafeeds .isEmpty () && stoppingDatafeeds .isEmpty ()) {
123
139
listener .onResponse (new StopDatafeedAction .Response (true ));
124
140
return ;
125
141
}
126
142
request .setResolvedStartedDatafeedIds (startedDatafeeds .toArray (new String [startedDatafeeds .size ()]));
127
143
128
144
if (request .isForce ()) {
129
- forceStopDatafeed (request , listener , tasks , startedDatafeeds );
145
+ forceStopDatafeed (request , listener , tasks , notStoppedDatafeeds );
130
146
} else {
131
- normalStopDatafeed (task , request , listener , tasks , startedDatafeeds , stoppingDatafeeds );
147
+ normalStopDatafeed (task , request , listener , tasks , nodes , startedDatafeeds , stoppingDatafeeds );
132
148
}
133
149
},
134
150
listener ::onFailure
@@ -137,20 +153,20 @@ protected void doExecute(Task task, StopDatafeedAction.Request request, ActionLi
137
153
}
138
154
139
155
private void normalStopDatafeed (Task task , StopDatafeedAction .Request request , ActionListener <StopDatafeedAction .Response > listener ,
140
- PersistentTasksCustomMetaData tasks ,
156
+ PersistentTasksCustomMetaData tasks , DiscoveryNodes nodes ,
141
157
List <String > startedDatafeeds , List <String > stoppingDatafeeds ) {
142
- Set <String > executorNodes = new HashSet <>();
158
+ final Set <String > executorNodes = new HashSet <>();
143
159
for (String datafeedId : startedDatafeeds ) {
144
160
PersistentTasksCustomMetaData .PersistentTask <?> datafeedTask = MlTasks .getDatafeedTask (datafeedId , tasks );
145
161
if (datafeedTask == null ) {
146
162
// This should not happen, because startedDatafeeds was derived from the same tasks that is passed to this method
147
163
String msg = "Requested datafeed [" + datafeedId + "] be stopped, but datafeed's task could not be found." ;
148
164
assert datafeedTask != null : msg ;
149
165
logger .error (msg );
150
- } else if (datafeedTask .isAssigned () ) {
166
+ } else if (PersistentTasksClusterService . needsReassignment ( datafeedTask .getAssignment (), nodes ) == false ) {
151
167
executorNodes .add (datafeedTask .getExecutorNode ());
152
168
} else {
153
- // This is the easy case - the datafeed is not currently assigned to a node,
169
+ // This is the easy case - the datafeed is not currently assigned to a valid node,
154
170
// so can be gracefully stopped simply by removing its persistent task. (Usually
155
171
// a graceful stop cannot be achieved by simply removing the persistent task, but
156
172
// if the datafeed has no running code then graceful/forceful are the same.)
@@ -171,48 +187,62 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A
171
187
172
188
ActionListener <StopDatafeedAction .Response > finalListener = ActionListener .wrap (
173
189
r -> waitForDatafeedStopped (allDataFeedsToWaitFor , request , r , listener ),
174
- listener ::onFailure );
190
+ e -> {
191
+ if (ExceptionsHelper .unwrapCause (e ) instanceof FailedNodeException ) {
192
+ // A node has dropped out of the cluster since we started executing the requests.
193
+ // Since stopping an already stopped datafeed is not an error we can try again.
194
+ // The datafeeds that were running on the node that dropped out of the cluster
195
+ // will just have their persistent tasks cancelled. Datafeeds that were stopped
196
+ // by the previous attempt will be noops in the subsequent attempt.
197
+ doExecute (task , request , listener );
198
+ } else {
199
+ listener .onFailure (e );
200
+ }
201
+ });
175
202
176
203
super .doExecute (task , request , finalListener );
177
204
}
178
205
179
206
private void forceStopDatafeed (final StopDatafeedAction .Request request , final ActionListener <StopDatafeedAction .Response > listener ,
180
- PersistentTasksCustomMetaData tasks , final List <String > startedDatafeeds ) {
207
+ PersistentTasksCustomMetaData tasks , final List <String > notStoppedDatafeeds ) {
181
208
final AtomicInteger counter = new AtomicInteger ();
182
- final AtomicArray <Exception > failures = new AtomicArray <>(startedDatafeeds .size ());
209
+ final AtomicArray <Exception > failures = new AtomicArray <>(notStoppedDatafeeds .size ());
183
210
184
- for (String datafeedId : startedDatafeeds ) {
211
+ for (String datafeedId : notStoppedDatafeeds ) {
185
212
PersistentTasksCustomMetaData .PersistentTask <?> datafeedTask = MlTasks .getDatafeedTask (datafeedId , tasks );
186
213
if (datafeedTask != null ) {
187
214
persistentTasksService .sendRemoveRequest (datafeedTask .getId (),
188
215
new ActionListener <PersistentTasksCustomMetaData .PersistentTask <?>>() {
189
216
@ Override
190
217
public void onResponse (PersistentTasksCustomMetaData .PersistentTask <?> persistentTask ) {
191
- if (counter .incrementAndGet () == startedDatafeeds .size ()) {
218
+ if (counter .incrementAndGet () == notStoppedDatafeeds .size ()) {
192
219
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
193
220
}
194
221
}
195
222
196
223
@ Override
197
224
public void onFailure (Exception e ) {
198
225
final int slot = counter .incrementAndGet ();
199
- if ((ExceptionsHelper .unwrapCause (e ) instanceof ResourceNotFoundException &&
200
- Strings .isAllOrWildcard (new String []{request .getDatafeedId ()})) == false ) {
226
+ // We validated that the datafeed names supplied in the request existed when we started processing the action.
227
+ // If the related tasks don't exist at this point then they must have been stopped by a simultaneous stop request.
228
+ // This is not an error.
229
+ if (ExceptionsHelper .unwrapCause (e ) instanceof ResourceNotFoundException == false ) {
201
230
failures .set (slot - 1 , e );
202
231
}
203
- if (slot == startedDatafeeds .size ()) {
232
+ if (slot == notStoppedDatafeeds .size ()) {
204
233
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
205
234
}
206
235
}
207
236
});
208
237
} else {
209
- // This should not happen, because startedDatafeeds was derived from the same tasks that is passed to this method
238
+ // This should not happen, because startedDatafeeds and stoppingDatafeeds
239
+ // were derived from the same tasks that were passed to this method
210
240
String msg = "Requested datafeed [" + datafeedId + "] be force-stopped, but datafeed's task could not be found." ;
211
241
assert datafeedTask != null : msg ;
212
242
logger .error (msg );
213
243
final int slot = counter .incrementAndGet ();
214
244
failures .set (slot - 1 , new RuntimeException (msg ));
215
- if (slot == startedDatafeeds .size ()) {
245
+ if (slot == notStoppedDatafeeds .size ()) {
216
246
sendResponseOrFailure (request .getDatafeedId (), listener , failures );
217
247
}
218
248
}
@@ -228,17 +258,18 @@ protected void taskOperation(StopDatafeedAction.Request request, TransportStartD
228
258
threadPool .executor (MachineLearning .UTILITY_THREAD_POOL_NAME ).execute (new AbstractRunnable () {
229
259
@ Override
230
260
public void onFailure (Exception e ) {
231
- if ((e instanceof ResourceNotFoundException &&
232
- Strings .isAllOrWildcard (new String []{request .getDatafeedId ()}))) {
233
- datafeedTask .stop ("stop_datafeed (api)" , request .getStopTimeout ());
261
+ // We validated that the datafeed names supplied in the request existed when we started processing the action.
262
+ // If the related task for one of them doesn't exist at this point then it must have been removed by a
263
+ // simultaneous force stop request. This is not an error.
264
+ if (ExceptionsHelper .unwrapCause (e ) instanceof ResourceNotFoundException ) {
234
265
listener .onResponse (new StopDatafeedAction .Response (true ));
235
266
} else {
236
267
listener .onFailure (e );
237
268
}
238
269
}
239
270
240
271
@ Override
241
- protected void doRun () throws Exception {
272
+ protected void doRun () {
242
273
datafeedTask .stop ("stop_datafeed (api)" , request .getStopTimeout ());
243
274
listener .onResponse (new StopDatafeedAction .Response (true ));
244
275
}
@@ -312,8 +343,8 @@ protected StopDatafeedAction.Response newResponse(StopDatafeedAction.Request req
312
343
throw org .elasticsearch .ExceptionsHelper
313
344
.convertToElastic (failedNodeExceptions .get (0 ));
314
345
} else {
315
- // This can happen we the actual task in the node no longer exists,
316
- // which means the datafeed(s) have already been closed .
346
+ // This can happen when the actual task in the node no longer exists,
347
+ // which means the datafeed(s) have already been stopped .
317
348
return new StopDatafeedAction .Response (true );
318
349
}
319
350
}
0 commit comments