[ML][Data Frame] fixing _start?force=true bug #45660
@@ -235,7 +235,7 @@ public long getInProgressCheckpoint() {
        }
    }

-    public void setTaskStateStopped() {
+    public synchronized void setTaskStateStopped() {
        taskState.set(DataFrameTransformTaskState.STOPPED);
    }
@@ -256,8 +256,16 @@ public synchronized void start(Long startingCheckpoint, boolean force, ActionLis
            return;
        }
        if (getIndexer() == null) {
-            listener.onFailure(new ElasticsearchException("Task for transform [{}] not fully initialized. Try again later",
-                getTransformId()));
+            // If our state is failed AND the indexer is null, the user needs to _stop?force=true so that the indexer gets
+            // fully initialized.
+            // If we are NOT failed, then we can assume that `start` was just called early in the process.
+            String msg = taskState.get() == DataFrameTransformTaskState.FAILED ?
+                "It failed during the initialization process; force stop to allow reinitialization." :
+                "Try again later.";
+            listener.onFailure(new ElasticsearchStatusException("Task for transform [{}] not fully initialized. {}",
+                RestStatus.CONFLICT,
+                getTransformId(),
+                msg));
            return;
        }
        final IndexerState newState = getIndexer().start();

Review comment on the new error message: If we failed before even being fully initialized, there was probably something REALLY wrong with the cluster at the time. We should indicate to the user that …
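For context, here is a minimal caller-side sketch (not part of this PR) of how client code might tell the new CONFLICT response apart from the transient "Try again later" case. The `startTransform` and `forceStopTransform` helpers are hypothetical stand-ins for whatever eventually invokes the task's `start`:

```java
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.rest.RestStatus;

public class StartConflictHandlingSketch {

    public static void startWithForceStopFallback(String transformId) {
        try {
            startTransform(transformId); // hypothetical helper that ends up in DataFrameTransformTask#start
        } catch (ElasticsearchStatusException e) {
            if (e.status() == RestStatus.CONFLICT) {
                // Per the new message, the task failed before the indexer was initialized,
                // so a force stop is required before the transform can be started again.
                forceStopTransform(transformId); // hypothetical helper for _stop?force=true
                startTransform(transformId);
            } else {
                throw e;
            }
        }
    }

    // Hypothetical stubs so the sketch is self-contained; real callers go through the transport actions.
    private static void startTransform(String transformId) { /* ... */ }
    private static void forceStopTransform(String transformId) { /* ... */ }
}
```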
@@ -409,6 +417,13 @@ void persistStateToClusterState(DataFrameTransformState state,
    }

    synchronized void markAsFailed(String reason, ActionListener<Void> listener) {
+        // If we are already flagged as failed, this probably means that a second trigger started firing while we were attempting to
+        // flag the previously triggered indexer as failed. Exit early as we are already flagged as failed.
+        if (taskState.get() == DataFrameTransformTaskState.FAILED) {
+            logger.warn("[{}] is already failed but encountered new failure; reason [{}] ", getTransformId(), reason);
+            listener.onResponse(null);
+            return;
+        }
        // If the indexer is `STOPPING` this means that `DataFrameTransformTask#stop` was called previously, but something caused
        // the indexer to fail. Since `ClientDataFrameIndexer#doSaveState` will persist the state to the index once the indexer stops,
        // it is probably best to NOT change the internal state of the task and allow the normal stopping logic to continue.

Review comment on the early FAILED check: The only reason to move this up earlier in the stack is to prevent needless checks. Additionally, we don't want to accidentally return weird error messages as a task could potentially (however rarely) be …
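A standalone sketch of the early-exit pattern that comment describes, with assumed names rather than the real task class: checking the failure flag first means repeated failure triggers log and return before any of the heavier failure handling runs:

```java
import java.util.concurrent.atomic.AtomicReference;

public class FailureGuardSketch {

    enum TaskState { STARTED, STOPPED, FAILED }

    private final AtomicReference<TaskState> taskState = new AtomicReference<>(TaskState.STARTED);

    // Mirrors the shape of markAsFailed: if the task is already FAILED, log and bail out
    // before touching the scheduler, auditor, or persisted state.
    synchronized void markAsFailed(String reason, Runnable onDone) {
        if (taskState.get() == TaskState.FAILED) {
            System.out.println("already failed, ignoring new failure: " + reason);
            onDone.run();
            return;
        }
        taskState.set(TaskState.FAILED);
        // ... deregister the scheduler job, audit the failure, persist the new state ...
        onDone.run();
    }
}
```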
@@ -425,26 +440,13 @@ synchronized void markAsFailed(String reason, ActionListener<Void> listener) {
            listener.onResponse(null);
            return;
        }
-        // If we are already flagged as failed, this probably means that a second trigger started firing while we were attempting to
-        // flag the previously triggered indexer as failed. Exit early as we are already flagged as failed.
-        if (taskState.get() == DataFrameTransformTaskState.FAILED) {
-            logger.warn("[{}] is already failed but encountered new failure; reason [{}] ", getTransformId(), reason);
-            listener.onResponse(null);
-            return;
-        }
        auditor.error(transform.getId(), reason);
        // We should not keep retrying. Either the task will be stopped, or started
        // If it is started again, it is registered again.
        deregisterSchedulerJob();
-        DataFrameTransformState newState = new DataFrameTransformState(
-            DataFrameTransformTaskState.FAILED,
-            getIndexer() == null ? initialIndexerState : getIndexer().getState(),
-            getIndexer() == null ? initialPosition : getIndexer().getPosition(),
-            currentCheckpoint.get(),
-            reason,
-            getIndexer() == null ? null : getIndexer().getProgress());
+        taskState.set(DataFrameTransformTaskState.FAILED);
+        stateReason.set(reason);
+        DataFrameTransformState newState = getState();
        // Even though the indexer information is persisted to an index, we still need DataFrameTransformTaskState in the clusterstate
        // This keeps track of STARTED, FAILED, STOPPED
        // This is because a FAILED state could occur because we failed to read the config from the internal index, which would imply that

Review comment on the `getState()` call: this is safe to call now like this since …
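The replacement works because `taskState` and `stateReason` are updated before the snapshot is taken; a condensed sketch (assumed field and method names, not the real class) of that ordering:

```java
import java.util.concurrent.atomic.AtomicReference;

public class SnapshotOrderingSketch {

    enum TaskState { STARTED, STOPPED, FAILED }

    private final AtomicReference<TaskState> taskState = new AtomicReference<>(TaskState.STARTED);
    private final AtomicReference<String> stateReason = new AtomicReference<>(null);

    // A getState()-style snapshot only reflects the failure if the atomics are updated first,
    // which is why the two set(...) calls precede it in the new markAsFailed.
    String getStateDescription() {
        String reason = stateReason.get();
        return taskState.get() + (reason == null ? "" : " (" + reason + ")");
    }

    synchronized void markAsFailed(String reason) {
        taskState.set(TaskState.FAILED);
        stateReason.set(reason);
        String newState = getStateDescription(); // now observes FAILED plus the reason
        // ... persist newState to cluster state / the internal index ...
    }
}
```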
Review comment on making setTaskStateStopped synchronized: I made this synchronized as `start`, `stop`, and `markAsFailed` are all synchronized and we do not want task state flipping between states while we are attempting to transition it. `doSaveState` calls `setTaskStateStopped`, and since `synchronized` uses reentrant locks, `stop` calling `doSaveState` directly in certain scenarios should not be a problem.
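A minimal, self-contained illustration of the reentrancy point (assumed class and method names, not the real task): a `synchronized` method can call another `synchronized` method on the same object without deadlocking, because a thread that already holds the monitor can re-acquire it:

```java
public class ReentrantMonitorSketch {

    private String state = "STARTED";

    // Both methods synchronize on the same object's monitor.
    public synchronized void stop() {
        // Safe: the current thread already holds the monitor, and intrinsic locks are reentrant,
        // so calling another synchronized method on `this` does not deadlock.
        doSaveState();
        state = "STOPPED";
    }

    public synchronized void doSaveState() {
        System.out.println("persisting state: " + state);
    }

    public static void main(String[] args) {
        new ReentrantMonitorSketch().stop(); // completes without deadlock
    }
}
```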