-
Notifications
You must be signed in to change notification settings - Fork 25.2k
[ML] Wait for model process to stop in stop deployment #83644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 83644 | ||
summary: Wait for model process to stop in stop deployment | ||
area: Machine Learning | ||
type: bug | ||
issues: [] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,7 +39,6 @@ | |
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; | ||
import org.elasticsearch.xpack.ml.inference.allocation.TrainedModelAllocationClusterService; | ||
import org.elasticsearch.xpack.ml.inference.allocation.TrainedModelAllocationMetadata; | ||
import org.elasticsearch.xpack.ml.inference.allocation.TrainedModelAllocationService; | ||
import org.elasticsearch.xpack.ml.inference.deployment.TrainedModelDeploymentTask; | ||
|
||
import java.util.Collections; | ||
|
@@ -66,7 +65,6 @@ public class TransportStopTrainedModelDeploymentAction extends TransportTasksAct | |
|
||
private final Client client; | ||
private final IngestService ingestService; | ||
private final TrainedModelAllocationService trainedModelAllocationService; | ||
private final TrainedModelAllocationClusterService trainedModelAllocationClusterService; | ||
|
||
@Inject | ||
|
@@ -76,7 +74,6 @@ public TransportStopTrainedModelDeploymentAction( | |
ActionFilters actionFilters, | ||
Client client, | ||
IngestService ingestService, | ||
TrainedModelAllocationService trainedModelAllocationService, | ||
TrainedModelAllocationClusterService trainedModelAllocationClusterService | ||
) { | ||
super( | ||
|
@@ -91,7 +88,6 @@ public TransportStopTrainedModelDeploymentAction( | |
); | ||
this.client = new OriginSettingClient(client, ML_ORIGIN); | ||
this.ingestService = ingestService; | ||
this.trainedModelAllocationService = trainedModelAllocationService; | ||
this.trainedModelAllocationClusterService = trainedModelAllocationClusterService; | ||
} | ||
|
||
|
@@ -150,6 +146,7 @@ protected void doExecute( | |
} | ||
|
||
// NOTE, should only run on Master node | ||
assert clusterService.localNode().isMasterNode(); | ||
trainedModelAllocationClusterService.setModelAllocationToStopping( | ||
modelId, | ||
ActionListener.wrap( | ||
|
@@ -196,30 +193,25 @@ private void normalUndeploy( | |
) { | ||
request.setNodes(modelAllocation.getNodeRoutingTable().keySet().toArray(String[]::new)); | ||
ActionListener<StopTrainedModelDeploymentAction.Response> finalListener = ActionListener.wrap(r -> { | ||
waitForTaskRemoved(modelId, modelAllocation, request, r, ActionListener.wrap(waited -> { | ||
trainedModelAllocationService.deleteModelAllocation( | ||
modelId, | ||
ActionListener.wrap(deleted -> listener.onResponse(r), deletionFailed -> { | ||
logger.error( | ||
() -> new ParameterizedMessage( | ||
"[{}] failed to delete model allocation after nodes unallocated the deployment", | ||
modelId | ||
), | ||
assert clusterService.localNode().isMasterNode(); | ||
trainedModelAllocationClusterService.removeModelAllocation( | ||
modelId, | ||
ActionListener.wrap(deleted -> listener.onResponse(r), deletionFailed -> { | ||
logger.error( | ||
() -> new ParameterizedMessage( | ||
"[{}] failed to delete model allocation after nodes unallocated the deployment", | ||
modelId | ||
), | ||
deletionFailed | ||
); | ||
listener.onFailure( | ||
ExceptionsHelper.serverError( | ||
"failed to delete model allocation after nodes unallocated the deployment. Attempt to stop again", | ||
deletionFailed | ||
); | ||
listener.onFailure( | ||
ExceptionsHelper.serverError( | ||
"failed to delete model allocation after nodes unallocated the deployment. Attempt to stop again", | ||
deletionFailed | ||
) | ||
); | ||
}) | ||
); | ||
}, | ||
// TODO should we attempt to delete the deployment here? | ||
listener::onFailure | ||
)); | ||
|
||
) | ||
); | ||
}) | ||
); | ||
}, e -> { | ||
if (ExceptionsHelper.unwrapCause(e) instanceof FailedNodeException) { | ||
// A node has dropped out of the cluster since we started executing the requests. | ||
|
@@ -235,24 +227,6 @@ private void normalUndeploy( | |
super.doExecute(task, request, finalListener); | ||
} | ||
|
||
void waitForTaskRemoved( | ||
String modelId, | ||
TrainedModelAllocation trainedModelAllocation, | ||
StopTrainedModelDeploymentAction.Request request, | ||
StopTrainedModelDeploymentAction.Response response, | ||
ActionListener<StopTrainedModelDeploymentAction.Response> listener | ||
) { | ||
final Set<String> nodesOfConcern = trainedModelAllocation.getNodeRoutingTable().keySet(); | ||
client.admin() | ||
.cluster() | ||
.prepareListTasks(nodesOfConcern.toArray(String[]::new)) | ||
.setDetailed(true) | ||
.setWaitForCompletion(true) | ||
.setActions(modelId) | ||
.setTimeout(request.getTimeout()) | ||
.execute(ActionListener.wrap(complete -> listener.onResponse(response), listener::onFailure)); | ||
} | ||
|
||
@Override | ||
protected StopTrainedModelDeploymentAction.Response newResponse( | ||
StopTrainedModelDeploymentAction.Request request, | ||
|
@@ -275,7 +249,9 @@ protected void taskOperation( | |
TrainedModelDeploymentTask task, | ||
ActionListener<StopTrainedModelDeploymentAction.Response> listener | ||
) { | ||
task.stop("undeploy_trained_model (api)"); | ||
listener.onResponse(new StopTrainedModelDeploymentAction.Response(true)); | ||
task.stop( | ||
"undeploy_trained_model (api)", | ||
ActionListener.wrap(r -> listener.onResponse(new StopTrainedModelDeploymentAction.Response(true)), listener::onFailure) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is much cleaner and the execution path is more easily read. |
||
); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,9 +9,11 @@ | |
|
||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.apache.logging.log4j.message.ParameterizedMessage; | ||
import org.apache.lucene.util.SetOnce; | ||
import org.elasticsearch.ElasticsearchStatusException; | ||
import org.elasticsearch.action.ActionListener; | ||
import org.elasticsearch.action.support.master.AcknowledgedResponse; | ||
import org.elasticsearch.core.TimeValue; | ||
import org.elasticsearch.license.LicensedFeature; | ||
import org.elasticsearch.license.XPackLicenseState; | ||
|
@@ -80,15 +82,11 @@ public TaskParams getParams() { | |
return params; | ||
} | ||
|
||
public void stop(String reason) { | ||
logger.debug("[{}] Stopping due to reason [{}]", getModelId(), reason); | ||
licensedFeature.stopTracking(licenseState, "model-" + params.getModelId()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this still needs to be called. If you are concerned, maybe wrap the listener and call this on response/failure? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I'm not happy with this pattern. The task asks the node service to stop, then the node service calls back to […]. The problem is there are a few ways […]. I deleted these lines because […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Gotcha, we just need to be careful and make sure that |
||
stopped = true; | ||
stoppedReasonHolder.trySet(reason); | ||
trainedModelAllocationNodeService.stopDeploymentAndNotify(this, reason); | ||
public void stop(String reason, ActionListener<AcknowledgedResponse> listener) { | ||
trainedModelAllocationNodeService.stopDeploymentAndNotify(this, reason, listener); | ||
} | ||
|
||
public void stopWithoutNotification(String reason) { | ||
public void markAsStopped(String reason) { | ||
licensedFeature.stopTracking(licenseState, "model-" + params.getModelId()); | ||
logger.debug("[{}] Stopping due to reason [{}]", getModelId(), reason); | ||
stoppedReasonHolder.trySet(reason); | ||
|
@@ -106,7 +104,14 @@ public Optional<String> stoppedReason() { | |
@Override | ||
protected void onCancelled() { | ||
String reason = getReasonCancelled(); | ||
stop(reason); | ||
logger.info("[{}] task cancelled due to reason [{}]", getModelId(), reason); | ||
stop( | ||
reason, | ||
ActionListener.wrap( | ||
acknowledgedResponse -> {}, | ||
e -> logger.error(new ParameterizedMessage("[{}] error stopping the model after task cancellation", getModelId()), e) | ||
) | ||
); | ||
} | ||
|
||
public void infer(Map<String, Object> doc, InferenceConfigUpdate update, TimeValue timeout, ActionListener<InferenceResults> listener) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this bug was added in: #81259
The actions used to contain the model id. Regardless, the new stopping path is much cleaner.