@@ -8,6 +8,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.apache.lucene.util.SetOnce;
 import org.elasticsearch.action.admin.indices.refresh.RefreshAction;
 import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
 import org.elasticsearch.action.search.SearchResponse;
@@ -90,19 +91,19 @@ public AnalyticsProcessManager(Client client,
         this.trainedModelProvider = Objects.requireNonNull(trainedModelProvider);
     }

-    public void runJob(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig config, DataFrameDataExtractorFactory dataExtractorFactory,
-                       Consumer<Exception> finishHandler) {
+    public void runJob(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig config, DataFrameDataExtractorFactory dataExtractorFactory) {
         executorServiceForJob.execute(() -> {
-            ProcessContext processContext = new ProcessContext(config.getId());
+            ProcessContext processContext = new ProcessContext(config);
             synchronized (processContextByAllocation) {
                 if (task.isStopping()) {
                     // The task was requested to stop before we created the process context
-                    finishHandler.accept(null);
+                    auditor.info(config.getId(), Messages.DATA_FRAME_ANALYTICS_AUDIT_FINISHED_ANALYSIS);
+                    task.markAsCompleted();
                     return;
                 }
                 if (processContextByAllocation.putIfAbsent(task.getAllocationId(), processContext) != null) {
-                    finishHandler.accept(
-                        ExceptionsHelper.serverError("[" + config.getId() + "] Could not create process as one already exists"));
+                    task.updateState(
+                        DataFrameAnalyticsState.FAILED, "[" + config.getId() + "] Could not create process as one already exists");
                     return;
                 }
             }
@@ -113,13 +114,13 @@ public void runJob(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig config,
             // Fetch existing model state (if any)
             BytesReference state = getModelState(config);

-            if (processContext.startProcess(dataExtractorFactory, config, task, state)) {
-                executorServiceForProcess.execute(() -> processResults(processContext));
-                executorServiceForProcess.execute(() -> processData(task, config, processContext.dataExtractor,
-                    processContext.process, processContext.resultProcessor, finishHandler, state));
+            if (processContext.startProcess(dataExtractorFactory, task, state)) {
+                executorServiceForProcess.execute(() -> processContext.resultProcessor.get().process(processContext.process.get()));
+                executorServiceForProcess.execute(() -> processData(task, processContext, state));
             } else {
                 processContextByAllocation.remove(task.getAllocationId());
-                finishHandler.accept(null);
+                auditor.info(config.getId(), Messages.DATA_FRAME_ANALYTICS_AUDIT_FINISHED_ANALYSIS);
+                task.markAsCompleted();
             }
         });
     }
@@ -140,26 +141,18 @@ private BytesReference getModelState(DataFrameAnalyticsConfig config) {
         }
     }

-    private void processResults(ProcessContext processContext) {
+    private void processData(DataFrameAnalyticsTask task, ProcessContext processContext, BytesReference state) {
+        DataFrameAnalyticsConfig config = processContext.config;
+        DataFrameDataExtractor dataExtractor = processContext.dataExtractor.get();
+        AnalyticsProcess<AnalyticsResult> process = processContext.process.get();
+        AnalyticsResultProcessor resultProcessor = processContext.resultProcessor.get();
         try {
-            processContext.resultProcessor.process(processContext.process);
-        } catch (Exception e) {
-            processContext.setFailureReason(e.getMessage());
-        }
-    }
-
-    private void processData(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig config, DataFrameDataExtractor dataExtractor,
-                             AnalyticsProcess<AnalyticsResult> process, AnalyticsResultProcessor resultProcessor,
-                             Consumer<Exception> finishHandler, BytesReference state) {
-
-        try {
-            ProcessContext processContext = processContextByAllocation.get(task.getAllocationId());
             writeHeaderRecord(dataExtractor, process);
             writeDataRows(dataExtractor, process, config.getAnalysis(), task.getProgressTracker());
             process.writeEndOfDataMessage();
             process.flushStream();

-            restoreState(config, state, process, finishHandler);
+            restoreState(task, config, state, process);

             LOGGER.info("[{}] Waiting for result processor to complete", config.getId());
             resultProcessor.awaitForCompletion();
@@ -168,26 +161,34 @@ private void processData(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig c
             refreshDest(config);
             LOGGER.info("[{}] Result processor has completed", config.getId());
         } catch (Exception e) {
-            if (task.isStopping() == false) {
-                String errorMsg = new ParameterizedMessage("[{}] Error while processing data [{}]", config.getId(), e.getMessage())
-                    .getFormattedMessage();
+            if (task.isStopping()) {
+                // Errors during task stopping are expected but we still want to log them just in case.
+                String errorMsg =
+                    new ParameterizedMessage(
+                        "[{}] Error while processing data [{}]; task is stopping", config.getId(), e.getMessage()).getFormattedMessage();
+                LOGGER.debug(errorMsg, e);
+            } else {
+                String errorMsg =
+                    new ParameterizedMessage("[{}] Error while processing data [{}]", config.getId(), e.getMessage()).getFormattedMessage();
                 LOGGER.error(errorMsg, e);
-                processContextByAllocation.get(task.getAllocationId()).setFailureReason(errorMsg);
+                processContext.setFailureReason(errorMsg);
             }
         } finally {
             closeProcess(task);

-            ProcessContext processContext = processContextByAllocation.remove(task.getAllocationId());
+            processContextByAllocation.remove(task.getAllocationId());
             LOGGER.debug("Removed process context for task [{}]; [{}] processes still running", config.getId(),
                 processContextByAllocation.size());

             if (processContext.getFailureReason() == null) {
                 // This results in marking the persistent task as complete
                 LOGGER.info("[{}] Marking task completed", config.getId());
-                finishHandler.accept(null);
+                auditor.info(config.getId(), Messages.DATA_FRAME_ANALYTICS_AUDIT_FINISHED_ANALYSIS);
+                task.markAsCompleted();
             } else {
                 LOGGER.error("[{}] Marking task failed; {}", config.getId(), processContext.getFailureReason());
                 task.updateState(DataFrameAnalyticsState.FAILED, processContext.getFailureReason());
+                // Note: We are not marking the task as failed here as we want the user to be able to inspect the failure reason.
             }
         }
     }
@@ -239,8 +240,8 @@ private void writeHeaderRecord(DataFrameDataExtractor dataExtractor, AnalyticsPr
         process.writeRecord(headerRecord);
     }

-    private void restoreState(DataFrameAnalyticsConfig config, @Nullable BytesReference state, AnalyticsProcess<AnalyticsResult> process,
-                              Consumer<Exception> failureHandler) {
+    private void restoreState(DataFrameAnalyticsTask task, DataFrameAnalyticsConfig config, @Nullable BytesReference state,
+                              AnalyticsProcess<AnalyticsResult> process) {
         if (config.getAnalysis().persistsState() == false) {
             LOGGER.debug("[{}] Analysis does not support state", config.getId());
             return;
@@ -258,7 +259,7 @@ private void restoreState(DataFrameAnalyticsConfig config, @Nullable BytesRefere
             process.restoreState(state);
         } catch (Exception e) {
             LOGGER.error(new ParameterizedMessage("[{}] Failed to restore state", process.getConfig().jobId()), e);
-            failureHandler.accept(ExceptionsHelper.serverError("Failed to restore state", e));
+            task.updateState(DataFrameAnalyticsState.FAILED, "Failed to restore state: " + e.getMessage());
         }
     }

@@ -293,9 +294,10 @@ private void closeProcess(DataFrameAnalyticsTask task) {

         ProcessContext processContext = processContextByAllocation.get(task.getAllocationId());
         try {
-            processContext.process.close();
+            processContext.process.get().close();
             LOGGER.info("[{}] Closed process", configId);
         } catch (Exception e) {
+            LOGGER.error("[" + configId + "] Error closing data frame analyzer process", e);
             String errorMsg = new ParameterizedMessage(
                 "[{}] Error closing data frame analyzer process [{}]", configId, e.getMessage()).getFormattedMessage();
             processContext.setFailureReason(errorMsg);
@@ -323,79 +325,76 @@ int getProcessContextCount() {

     class ProcessContext {

-        private final String id;
-        private volatile AnalyticsProcess<AnalyticsResult> process;
-        private volatile DataFrameDataExtractor dataExtractor;
-        private volatile AnalyticsResultProcessor resultProcessor;
-        private volatile boolean processKilled;
-        private volatile String failureReason;
+        private final DataFrameAnalyticsConfig config;
+        private final SetOnce<AnalyticsProcess<AnalyticsResult>> process = new SetOnce<>();
+        private final SetOnce<DataFrameDataExtractor> dataExtractor = new SetOnce<>();
+        private final SetOnce<AnalyticsResultProcessor> resultProcessor = new SetOnce<>();
+        private final SetOnce<String> failureReason = new SetOnce<>();

-        ProcessContext(String id) {
-            this.id = Objects.requireNonNull(id);
+        ProcessContext(DataFrameAnalyticsConfig config) {
+            this.config = Objects.requireNonNull(config);
         }

-        synchronized String getFailureReason() {
-            return failureReason;
+        String getFailureReason() {
+            return failureReason.get();
         }

-        synchronized void setFailureReason(String failureReason) {
-            // Only set the new reason if there isn't one already as we want to keep the first reason
-            if (this.failureReason == null && failureReason != null) {
-                this.failureReason = failureReason;
+        void setFailureReason(String failureReason) {
+            if (failureReason == null) {
+                return;
             }
+            // Only set the new reason if there isn't one already as we want to keep the first reason (most likely the root cause).
+            this.failureReason.trySet(failureReason);
         }

         synchronized void stop() {
-            LOGGER.debug("[{}] Stopping process", id);
-            processKilled = true;
-            if (dataExtractor != null) {
-                dataExtractor.cancel();
+            LOGGER.debug("[{}] Stopping process", config.getId());
+            if (dataExtractor.get() != null) {
+                dataExtractor.get().cancel();
             }
-            if (resultProcessor != null) {
-                resultProcessor.cancel();
+            if (resultProcessor.get() != null) {
+                resultProcessor.get().cancel();
             }
-            if (process != null) {
+            if (process.get() != null) {
                 try {
-                    process.kill();
+                    process.get().kill();
                 } catch (IOException e) {
-                    LOGGER.error(new ParameterizedMessage("[{}] Failed to kill process", id), e);
+                    LOGGER.error(new ParameterizedMessage("[{}] Failed to kill process", config.getId()), e);
                 }
             }
         }

         /**
          * @return {@code true} if the process was started or {@code false} if it was not because it was stopped in the meantime
          */
-        synchronized boolean startProcess(DataFrameDataExtractorFactory dataExtractorFactory, DataFrameAnalyticsConfig config,
-                                          DataFrameAnalyticsTask task, @Nullable BytesReference state) {
-            if (processKilled) {
+        synchronized boolean startProcess(DataFrameDataExtractorFactory dataExtractorFactory,
+                                          DataFrameAnalyticsTask task,
+                                          @Nullable BytesReference state) {
+            if (task.isStopping()) {
                 // The job was stopped before we started the process so no need to start it
                 return false;
             }

-            dataExtractor = dataExtractorFactory.newExtractor(false);
+            dataExtractor.set(dataExtractorFactory.newExtractor(false));
             AnalyticsProcessConfig analyticsProcessConfig =
-                createProcessConfig(config, dataExtractor, dataExtractorFactory.getExtractedFields());
+                createProcessConfig(dataExtractor.get(), dataExtractorFactory.getExtractedFields());
             LOGGER.trace("[{}] creating analytics process with config [{}]", config.getId(), Strings.toString(analyticsProcessConfig));
             // If we have no rows, that means there is no data so no point in starting the native process
             // just finish the task
             if (analyticsProcessConfig.rows() == 0) {
                 LOGGER.info("[{}] no data found to analyze. Will not start analytics native process.", config.getId());
                 return false;
             }
-            process = createProcess(task, config, analyticsProcessConfig, state);
-            DataFrameRowsJoiner dataFrameRowsJoiner = new DataFrameRowsJoiner(config.getId(), client,
-                dataExtractorFactory.newExtractor(true));
-            resultProcessor = new AnalyticsResultProcessor(
-                config, dataFrameRowsJoiner, task.getProgressTracker(), trainedModelProvider, auditor, dataExtractor.getFieldNames());
+            process.set(createProcess(task, config, analyticsProcessConfig, state));
+            resultProcessor.set(createResultProcessor(task, dataExtractorFactory));
             return true;
         }

-        private AnalyticsProcessConfig createProcessConfig(
-            DataFrameAnalyticsConfig config, DataFrameDataExtractor dataExtractor, ExtractedFields extractedFields) {
+        private AnalyticsProcessConfig createProcessConfig(DataFrameDataExtractor dataExtractor,
+                                                           ExtractedFields extractedFields) {
             DataFrameDataExtractor.DataSummary dataSummary = dataExtractor.collectDataSummary();
             Set<String> categoricalFields = dataExtractor.getCategoricalFields(config.getAnalysis());
-            AnalyticsProcessConfig processConfig = new AnalyticsProcessConfig(
+            return new AnalyticsProcessConfig(
                 config.getId(),
                 dataSummary.rows,
                 dataSummary.cols,
@@ -405,7 +404,14 @@ private AnalyticsProcessConfig createProcessConfig(
                 categoricalFields,
                 config.getAnalysis(),
                 extractedFields);
-            return processConfig;
+        }
+
+        private AnalyticsResultProcessor createResultProcessor(DataFrameAnalyticsTask task,
+                                                               DataFrameDataExtractorFactory dataExtractorFactory) {
+            DataFrameRowsJoiner dataFrameRowsJoiner =
+                new DataFrameRowsJoiner(config.getId(), client, dataExtractorFactory.newExtractor(true));
+            return new AnalyticsResultProcessor(
+                config, dataFrameRowsJoiner, task.getProgressTracker(), trainedModelProvider, auditor, dataExtractor.get().getFieldNames());
         }
     }
 }
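The heart of this change is swapping ProcessContext's volatile fields for Lucene's org.apache.lucene.util.SetOnce holders, which is why getFailureReason/setFailureReason can drop their synchronization while stop() and startProcess() keep it. The sketch below is a minimal, standalone illustration of the SetOnce behaviour the new code relies on (not code from this change; the class name SetOnceSketch is made up, and only the set(), trySet() and get() calls that appear in the diff above are used).

import org.apache.lucene.util.SetOnce;

// Illustration of SetOnce semantics as used by the new ProcessContext.
public class SetOnceSketch {
    public static void main(String[] args) {
        // get() returns null until a value has been set -- mirrors getFailureReason().
        SetOnce<String> failureReason = new SetOnce<>();
        System.out.println(failureReason.get());   // null

        // trySet() keeps the first value and ignores later attempts, so the first
        // (root-cause) failure reason wins without explicit locking.
        failureReason.trySet("first error");
        failureReason.trySet("second error");
        System.out.println(failureReason.get());   // first error

        // set() rejects a second value, which suits fields that must be assigned
        // exactly once inside the synchronized startProcess().
        SetOnce<Integer> process = new SetOnce<>();
        process.set(1);
        try {
            process.set(2);
        } catch (SetOnce.AlreadySetException e) {
            System.out.println("second set() rejected: " + e.getMessage());
        }
    }
}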