[ML] only retry persistence failures when the failure is intermittent and stop retrying when analytics job is stopping #53725
Changes from 2 commits
Diff: ResultsPersisterService.java
@@ -9,6 +9,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.bulk.BulkAction;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequest;

@@ -33,6 +34,7 @@
import java.io.IOException;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;

@@ -42,6 +44,22 @@
import java.util.stream.Collectors;

public class ResultsPersisterService {
    /**
     * List of rest statuses that we consider irrecoverable
     */
    public static final Set<RestStatus> IRRECOVERABLE_REST_STATUSES = new HashSet<>(
        Arrays.asList(
            RestStatus.GONE,
            RestStatus.NOT_IMPLEMENTED,
            RestStatus.NOT_FOUND,
            RestStatus.BAD_REQUEST,
            RestStatus.UNAUTHORIZED,
            RestStatus.FORBIDDEN,
            RestStatus.METHOD_NOT_ALLOWED,
            RestStatus.NOT_ACCEPTABLE
        )
    );

[Review comment] nit: since the name is in capitals it should be immutable, i.e.
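Following up on that nit, a minimal sketch of one way to make the constant immutable (sketch only, assuming java.util.Collections is imported; Set.of would also work on a Java 9+ target):

    // Sketch only: wrap the set so any later mutation attempt throws UnsupportedOperationException.
    public static final Set<RestStatus> IRRECOVERABLE_REST_STATUSES =
        Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            RestStatus.GONE,
            RestStatus.NOT_IMPLEMENTED,
            RestStatus.NOT_FOUND,
            RestStatus.BAD_REQUEST,
            RestStatus.UNAUTHORIZED,
            RestStatus.FORBIDDEN,
            RestStatus.METHOD_NOT_ALLOWED,
            RestStatus.NOT_ACCEPTABLE
        )));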
    private static final Logger LOGGER = LogManager.getLogger(ResultsPersisterService.class);

    public static final Setting<Integer> PERSIST_RESULTS_MAX_RETRIES = Setting.intSetting(
@@ -124,9 +142,23 @@ private BulkResponse bulkIndexWithRetry(BulkRequest bulkRequest,
            if (bulkResponse.hasFailures() == false) {
                return bulkResponse;
            }

            for (BulkItemResponse itemResponse : bulkResponse.getItems()) {
                if (itemResponse.isFailed()) {
                    if (isIrrecoverable(itemResponse.getFailure().getCause())) {
                        Throwable unwrappedParticular = ExceptionsHelper.unwrapCause(itemResponse.getFailure().getCause());
                        LOGGER.warn(new ParameterizedMessage(
                            "[{}] experienced failure that cannot be automatically retried. Bulk failure message [{}]",
                            jobId,
                            bulkResponse.buildFailureMessage()),
                            unwrappedParticular);
                        throw new ElasticsearchException(
                            "{} experienced failure that cannot be automatically retried. See logs for bulk failures",
                            unwrappedParticular,
                            jobId);
                    }
                }
            }
            retryContext.nextIteration("index", bulkResponse.buildFailureMessage());

            // We should only retry the docs that failed.
            bulkRequest = buildNewRequestFromFailures(bulkRequest, bulkResponse);
        }
@@ -148,12 +180,28 @@ public SearchResponse searchWithRetry(SearchRequest searchRequest,
            } catch (ElasticsearchException e) {
                LOGGER.warn("[" + jobId + "] Exception while executing search action", e);
                failureMessage = e.getDetailedMessage();
                if (isIrrecoverable(e)) {
                    LOGGER.warn(new ParameterizedMessage("[{}] experienced irrecoverable failure", jobId), e);
                    throw new ElasticsearchException("{} experienced failure that cannot be automatically retried", e, jobId);
                }
            }

            retryContext.nextIteration("search", failureMessage);
        }
    }

[Review comment] Could you make this message consistent with others? "...failure that cannot be..."
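Per the comment above, a minimal sketch of the reworded warning, assuming it should mirror the bulk path's wording (sketch only, not the final committed change):

                if (isIrrecoverable(e)) {
                    // align the log wording with the bulk-indexing branch above
                    LOGGER.warn(new ParameterizedMessage(
                        "[{}] experienced failure that cannot be automatically retried", jobId), e);
                    throw new ElasticsearchException("{} experienced failure that cannot be automatically retried", e, jobId);
                }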
    /**
     * @param ex The exception to check
     * @return true when the failure will persist no matter how many times we retry.
     */
    static boolean isIrrecoverable(Exception ex) {
        Throwable t = ExceptionsHelper.unwrapCause(ex);
        if (t instanceof ElasticsearchException) {
            return IRRECOVERABLE_REST_STATUSES.contains(((ElasticsearchException) t).status());
        }
        return false;
    }

[Review comment] Should this be "private"?
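To illustrate the classification, a hedged example (hypothetical snippet, not part of the diff; assumes same-package access to the package-private method): ExceptionsHelper.unwrapCause strips Elasticsearch's wrapper exceptions, and only an ElasticsearchException whose REST status is in the set above is treated as fatal.

    // Hypothetical illustration of isIrrecoverable, not part of this PR.
    Exception fatal = new ElasticsearchStatusException("bad query", RestStatus.BAD_REQUEST);
    assert ResultsPersisterService.isIrrecoverable(fatal);                    // 400 is in the set -> fail fast

    Exception recoverable = new ElasticsearchStatusException("version conflict", RestStatus.CONFLICT);
    assert ResultsPersisterService.isIrrecoverable(recoverable) == false;     // 409 is not listed -> retried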
    /**
     * {@link RetryContext} object handles logic that is executed between consecutive retries of an action.
     *
Diff: ResultsPersisterServiceTests.java
@@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.utils.persistence;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.DocWriteRequest;
import org.elasticsearch.action.bulk.BulkAction;

@@ -27,8 +28,10 @@
import org.elasticsearch.common.CheckedConsumer;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndexPrimaryShardNotAllocatedException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xpack.core.ClientHelper;
@@ -131,7 +134,8 @@ public void testSearchWithRetries_SuccessAfterRetry() {
    }

    public void testSearchWithRetries_SuccessAfterRetryDueToException() {
        doThrow(new IndexNotFoundException("my-index")).doAnswer(withResponse(SEARCH_RESPONSE_SUCCESS))
        doThrow(new IndexPrimaryShardNotAllocatedException(new Index("my-index", "UUID")))
            .doAnswer(withResponse(SEARCH_RESPONSE_SUCCESS))
            .when(client).execute(eq(SearchAction.INSTANCE), eq(SEARCH_REQUEST), any());

        List<String> messages = new ArrayList<>();
@@ -206,6 +210,21 @@ public void testSearchWithRetries_Failure_ShouldNotRetryAfterRandomNumberOfRetri
        verify(client, times(maxRetries + 1)).execute(eq(SearchAction.INSTANCE), eq(SEARCH_REQUEST), any());
    }

    public void testSearchWithRetries_FailureOnIrrecoverableError() {
        resultsPersisterService.setMaxFailureRetries(5);

        doAnswer(withFailure(new ElasticsearchStatusException("bad search request", RestStatus.BAD_REQUEST)))
            .when(client).execute(eq(SearchAction.INSTANCE), eq(SEARCH_REQUEST), any());

        ElasticsearchException e =
            expectThrows(
                ElasticsearchException.class,
                () -> resultsPersisterService.searchWithRetry(SEARCH_REQUEST, JOB_ID, () -> true, (s) -> {}));
        assertThat(e.getMessage(), containsString("experienced failure that cannot be automatically retried"));

        verify(client, times(1)).execute(eq(SearchAction.INSTANCE), eq(SEARCH_REQUEST), any());
    }

[Review comment] I think

    private static Supplier<Boolean> shouldRetryUntil(int maxRetries) {
        return new Supplier<>() {
            int retries = 0;
@@ -240,6 +259,29 @@ public void testBulkRequestChangeOnFailures() {
        assertThat(lastMessage.get(), containsString("failed to index after [1] attempts. Will attempt again in"));
    }

    public void testBulkRequestChangeOnIrrecoverableFailures() {
        int maxFailureRetries = 10;
        resultsPersisterService.setMaxFailureRetries(maxFailureRetries);
        BulkItemResponse irrecoverable = new BulkItemResponse(
            2,
            DocWriteRequest.OpType.INDEX,
            new BulkItemResponse.Failure("my-index", "fail", new ElasticsearchStatusException("boom", RestStatus.BAD_REQUEST)));
        doAnswerWithResponses(
                new BulkResponse(new BulkItemResponse[]{irrecoverable, BULK_ITEM_RESPONSE_SUCCESS}, 0L),
                new BulkResponse(new BulkItemResponse[0], 0L))
            .when(client).execute(eq(BulkAction.INSTANCE), any(), any());

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.add(INDEX_REQUEST_FAILURE);
        bulkRequest.add(INDEX_REQUEST_SUCCESS);

        ElasticsearchException ex = expectThrows(ElasticsearchException.class,
            () -> resultsPersisterService.bulkIndexWithRetry(bulkRequest, JOB_ID, () -> true, (s) -> {}));

        verify(client, times(1)).execute(eq(BulkAction.INSTANCE), any(), any());
        assertThat(ex.getMessage(), containsString("experienced failure that cannot be automatically retried."));
    }

    public void testBulkRequestDoesNotRetryWhenSupplierIsFalse() {
        doAnswerWithResponses(
                new BulkResponse(new BulkItemResponse[]{BULK_ITEM_RESPONSE_FAILURE, BULK_ITEM_RESPONSE_SUCCESS}, 0L),
@@ -315,6 +357,15 @@ private static <Response> Answer<Response> withResponse(Response response) {
        };
    }

    @SuppressWarnings("unchecked")
    private static <Response> Answer<Response> withFailure(Exception failure) {
        return invocationOnMock -> {
            ActionListener<Response> listener = (ActionListener<Response>) invocationOnMock.getArguments()[2];
            listener.onFailure(failure);
            return null;
        };
    }

    private static ResultsPersisterService buildResultsPersisterService(OriginSettingClient client) {
        CheckedConsumer<Integer, InterruptedException> sleeper = millis -> {};
        ThreadPool tp = mock(ThreadPool.class);
[Review comment] Could this be final and passed in the constructor?

[Author reply] Since the retry persistence logic depends on information stored within AnalyticsResultProcessor, we would either have to make a Builder or a Factory class that we then pass to AnalyticsResultProcessor, which constructs a new joiner from that passed factory. This option just seemed simpler and less complex.

[Review comment] Ok, got it. Please also consider the following approach, which might be even less complex: add an isCancelled field and a cancel() method to the DataFrameRowsJoiner class. In AnalyticsResultProcessor::cancel(), you can also cancel the joiner. Then there is no need to pass the Supplier, as you've already passed the actual isCancelled value.
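The code block that originally accompanied that suggestion did not survive extraction. A minimal sketch of the proposed shape, assuming simplified, hypothetical signatures (the real DataFrameRowsJoiner and AnalyticsResultProcessor carry much more state):

    // Hypothetical sketch of the reviewer's suggestion, not the actual implementation.
    class DataFrameRowsJoiner {
        private volatile boolean isCancelled = false;

        void cancel() {
            isCancelled = true;
        }

        void processRowResults(/* row results */) {
            if (isCancelled) {
                return; // stop joining/persisting once the analytics job is stopping
            }
            // ... join row results with source data and persist ...
        }
    }

    class AnalyticsResultProcessor {
        private final DataFrameRowsJoiner rowsJoiner;

        AnalyticsResultProcessor(DataFrameRowsJoiner rowsJoiner) {
            this.rowsJoiner = rowsJoiner;
        }

        public void cancel() {
            // cancelling the processor also cancels the joiner,
            // so no Supplier<Boolean> needs to be threaded through
            rowsJoiner.cancel();
        }
    }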