Skip to content

Commit 6ace2ef

Browse files
author
Hendrik Muhs
authored
[Transform] improve irrecoverable error detection - part 2 (#52003)
base error handling on rest status instead of listing individual exception types relates to #51820
1 parent 78a1185 commit 6ace2ef

File tree

3 files changed

+231
-38
lines changed

3 files changed

+231
-38
lines changed

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/transforms/TransformIndexer.java

+51-32
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import org.elasticsearch.common.Strings;
2020
import org.elasticsearch.common.breaker.CircuitBreakingException;
2121
import org.elasticsearch.common.xcontent.XContentBuilder;
22-
import org.elasticsearch.index.IndexNotFoundException;
2322
import org.elasticsearch.index.query.BoolQueryBuilder;
2423
import org.elasticsearch.index.query.QueryBuilder;
2524
import org.elasticsearch.script.ScriptException;
@@ -42,7 +41,6 @@
4241
import org.elasticsearch.xpack.transform.checkpoint.CheckpointProvider;
4342
import org.elasticsearch.xpack.transform.notifications.TransformAuditor;
4443
import org.elasticsearch.xpack.transform.persistence.TransformConfigManager;
45-
import org.elasticsearch.xpack.transform.transforms.pivot.AggregationResultUtils;
4644
import org.elasticsearch.xpack.transform.transforms.pivot.Pivot;
4745
import org.elasticsearch.xpack.transform.utils.ExceptionRootCauseFinder;
4846

@@ -287,7 +285,7 @@ protected void onStart(long now, ActionListener<Boolean> listener) {
287285
// If the transform config index or the transform config is gone, something serious occurred
288286
// We are in an unknown state and should fail out
289287
if (failure instanceof ResourceNotFoundException) {
290-
updateConfigListener.onFailure(new TransformConfigReloadingException(msg, failure));
288+
updateConfigListener.onFailure(new TransformConfigLostOnReloadException(msg, failure));
291289
} else {
292290
auditor.warning(getJobId(), msg);
293291
updateConfigListener.onResponse(null);
@@ -477,37 +475,54 @@ synchronized void handleFailure(Exception e) {
477475

478476
if (unwrappedException instanceof CircuitBreakingException) {
479477
handleCircuitBreakingException((CircuitBreakingException) unwrappedException);
480-
} else if (unwrappedException instanceof ScriptException) {
478+
return;
479+
}
480+
481+
if (unwrappedException instanceof ScriptException) {
481482
handleScriptException((ScriptException) unwrappedException);
482-
// irrecoverable error without special handling
483-
} else if (unwrappedException instanceof BulkIndexingException && ((BulkIndexingException) unwrappedException).isIrrecoverable()) {
483+
return;
484+
}
485+
486+
if (unwrappedException instanceof BulkIndexingException && ((BulkIndexingException) unwrappedException).isIrrecoverable()) {
484487
handleIrrecoverableBulkIndexingException((BulkIndexingException) unwrappedException);
485-
} else if (unwrappedException instanceof IndexNotFoundException
486-
|| unwrappedException instanceof AggregationResultUtils.AggregationExtractionException
487-
|| unwrappedException instanceof TransformConfigReloadingException
488-
|| unwrappedException instanceof ResourceNotFoundException
489-
|| unwrappedException instanceof IllegalArgumentException) {
490-
failIndexer("task encountered irrecoverable failure: " + e.getMessage());
491-
} else if (context.getAndIncrementFailureCount() > context.getNumFailureRetries()) {
492-
failIndexer(
493-
"task encountered more than "
494-
+ context.getNumFailureRetries()
495-
+ " failures; latest failure: "
496-
+ ExceptionRootCauseFinder.getDetailedMessage(unwrappedException)
497-
);
498-
} else {
499-
// Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
500-
// times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
501-
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
502-
String message = ExceptionRootCauseFinder.getDetailedMessage(unwrappedException);
488+
return;
489+
}
503490

504-
auditor.warning(
505-
getJobId(),
506-
"Transform encountered an exception: " + message + " Will attempt again at next scheduled trigger."
507-
);
508-
lastAuditedExceptionMessage = message;
509-
}
491+
// irrecoverable error without special handling
492+
if (unwrappedException instanceof ElasticsearchException) {
493+
ElasticsearchException elasticsearchException = (ElasticsearchException) unwrappedException;
494+
if (ExceptionRootCauseFinder.IRRECOVERABLE_REST_STATUSES.contains(elasticsearchException.status())) {
495+
failIndexer("task encountered irrecoverable failure: " + elasticsearchException.getDetailedMessage());
496+
return;
510497
}
498+
}
499+
500+
if (unwrappedException instanceof IllegalArgumentException) {
501+
failIndexer("task encountered irrecoverable failure: " + e.getMessage());
502+
return;
503+
}
504+
505+
if (context.getAndIncrementFailureCount() > context.getNumFailureRetries()) {
506+
failIndexer(
507+
"task encountered more than "
508+
+ context.getNumFailureRetries()
509+
+ " failures; latest failure: "
510+
+ ExceptionRootCauseFinder.getDetailedMessage(unwrappedException)
511+
);
512+
return;
513+
}
514+
515+
// Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
516+
// times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
517+
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
518+
String message = ExceptionRootCauseFinder.getDetailedMessage(unwrappedException);
519+
520+
auditor.warning(
521+
getJobId(),
522+
"Transform encountered an exception: " + message + " Will attempt again at next scheduled trigger."
523+
);
524+
lastAuditedExceptionMessage = message;
525+
}
511526
}
512527

513528
/**
@@ -901,8 +916,12 @@ private RunState determineRunStateAtStart() {
901916
return RunState.PARTIAL_RUN_IDENTIFY_CHANGES;
902917
}
903918

904-
static class TransformConfigReloadingException extends ElasticsearchException {
905-
TransformConfigReloadingException(String msg, Throwable cause, Object... args) {
919+
/**
920+
* Thrown when the transform configuration disappeared permanently.
921+
* (not if reloading failed due to an intermittent problem)
922+
*/
923+
static class TransformConfigLostOnReloadException extends ResourceNotFoundException {
924+
TransformConfigLostOnReloadException(String msg, Throwable cause, Object... args) {
906925
super(msg, cause, args);
907926
}
908927
}

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/utils/ExceptionRootCauseFinder.java

+29-6
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,37 @@
77
package org.elasticsearch.xpack.transform.utils;
88

99
import org.elasticsearch.ElasticsearchException;
10-
import org.elasticsearch.ResourceNotFoundException;
1110
import org.elasticsearch.action.bulk.BulkItemResponse;
1211
import org.elasticsearch.action.search.SearchPhaseExecutionException;
1312
import org.elasticsearch.action.search.ShardSearchFailure;
14-
import org.elasticsearch.index.mapper.MapperParsingException;
13+
import org.elasticsearch.rest.RestStatus;
1514

15+
import java.util.Arrays;
1616
import java.util.Collection;
17+
import java.util.HashSet;
18+
import java.util.Set;
1719

1820
/**
1921
* Set of static utils to find the cause of a search exception.
2022
*/
2123
public final class ExceptionRootCauseFinder {
2224

25+
/**
26+
* List of rest statuses that we consider irrecoverable
27+
*/
28+
public static final Set<RestStatus> IRRECOVERABLE_REST_STATUSES = new HashSet<>(
29+
Arrays.asList(
30+
RestStatus.GONE,
31+
RestStatus.NOT_IMPLEMENTED,
32+
RestStatus.NOT_FOUND,
33+
RestStatus.BAD_REQUEST,
34+
RestStatus.UNAUTHORIZED,
35+
RestStatus.FORBIDDEN,
36+
RestStatus.METHOD_NOT_ALLOWED,
37+
RestStatus.NOT_ACCEPTABLE
38+
)
39+
);
40+
2341
/**
2442
* Unwrap the exception stack and return the most likely cause.
2543
*
@@ -61,17 +79,22 @@ public static String getDetailedMessage(Throwable t) {
6179
/**
6280
* Return the first irrecoverableException from a collection of bulk responses if there are any.
6381
*
64-
* @param failures a collection of bulk item responses
82+
* @param failures a collection of bulk item responses with failures
6583
* @return The first exception considered irrecoverable if there are any, null if no irrecoverable exception found
6684
*/
6785
public static Throwable getFirstIrrecoverableExceptionFromBulkResponses(Collection<BulkItemResponse> failures) {
6886
for (BulkItemResponse failure : failures) {
6987
Throwable unwrappedThrowable = org.elasticsearch.ExceptionsHelper.unwrapCause(failure.getFailure().getCause());
70-
if (unwrappedThrowable instanceof MapperParsingException
71-
|| unwrappedThrowable instanceof IllegalArgumentException
72-
|| unwrappedThrowable instanceof ResourceNotFoundException) {
88+
if (unwrappedThrowable instanceof IllegalArgumentException) {
7389
return unwrappedThrowable;
7490
}
91+
92+
if (unwrappedThrowable instanceof ElasticsearchException) {
93+
ElasticsearchException elasticsearchException = (ElasticsearchException) unwrappedThrowable;
94+
if (IRRECOVERABLE_REST_STATUSES.contains(elasticsearchException.status())) {
95+
return elasticsearchException;
96+
}
97+
}
7598
}
7699

77100
return null;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.transform.utils;
8+
9+
import org.elasticsearch.ElasticsearchSecurityException;
10+
import org.elasticsearch.ResourceNotFoundException;
11+
import org.elasticsearch.action.DocWriteRequest.OpType;
12+
import org.elasticsearch.action.bulk.BulkItemResponse;
13+
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
14+
import org.elasticsearch.index.mapper.MapperParsingException;
15+
import org.elasticsearch.index.shard.ShardId;
16+
import org.elasticsearch.index.translog.TranslogException;
17+
import org.elasticsearch.rest.RestStatus;
18+
import org.elasticsearch.test.ESTestCase;
19+
20+
import java.util.Collection;
21+
import java.util.HashMap;
22+
import java.util.Map;
23+
24+
public class ExceptionRootCauseFinderTests extends ESTestCase {
25+
public void testFetFirstIrrecoverableExceptionFromBulkResponses() {
26+
Map<Integer, BulkItemResponse> bulkItemResponses = new HashMap<>();
27+
28+
int id = 1;
29+
// 1
30+
bulkItemResponses.put(
31+
id,
32+
new BulkItemResponse(
33+
id++,
34+
OpType.INDEX,
35+
new BulkItemResponse.Failure("the_index", "id", new MapperParsingException("mapper parsing error"))
36+
)
37+
);
38+
// 2
39+
bulkItemResponses.put(
40+
id,
41+
new BulkItemResponse(
42+
id++,
43+
OpType.INDEX,
44+
new BulkItemResponse.Failure("the_index", "id", new ResourceNotFoundException("resource not found error"))
45+
)
46+
);
47+
// 3
48+
bulkItemResponses.put(
49+
id,
50+
new BulkItemResponse(
51+
id++,
52+
OpType.INDEX,
53+
new BulkItemResponse.Failure("the_index", "id", new IllegalArgumentException("illegal argument error"))
54+
)
55+
);
56+
// 4 not irrecoverable
57+
bulkItemResponses.put(
58+
id,
59+
new BulkItemResponse(
60+
id++,
61+
OpType.INDEX,
62+
new BulkItemResponse.Failure("the_index", "id", new EsRejectedExecutionException("es rejected execution"))
63+
)
64+
);
65+
// 5 not irrecoverable
66+
bulkItemResponses.put(
67+
id,
68+
new BulkItemResponse(
69+
id++,
70+
OpType.INDEX,
71+
new BulkItemResponse.Failure("the_index", "id", new TranslogException(new ShardId("the_index", "uid", 0), "translog error"))
72+
)
73+
);
74+
// 6
75+
bulkItemResponses.put(
76+
id,
77+
new BulkItemResponse(
78+
id++,
79+
OpType.INDEX,
80+
new BulkItemResponse.Failure(
81+
"the_index",
82+
"id",
83+
new ElasticsearchSecurityException("Authentication required", RestStatus.UNAUTHORIZED)
84+
)
85+
)
86+
);
87+
// 7
88+
bulkItemResponses.put(
89+
id,
90+
new BulkItemResponse(
91+
id++,
92+
OpType.INDEX,
93+
new BulkItemResponse.Failure(
94+
"the_index",
95+
"id",
96+
new ElasticsearchSecurityException("current license is non-compliant for [transform]", RestStatus.FORBIDDEN)
97+
)
98+
)
99+
);
100+
// 8 not irrecoverable
101+
bulkItemResponses.put(
102+
id,
103+
new BulkItemResponse(
104+
id++,
105+
OpType.INDEX,
106+
new BulkItemResponse.Failure(
107+
"the_index",
108+
"id",
109+
new ElasticsearchSecurityException("overloaded, to many requests", RestStatus.TOO_MANY_REQUESTS)
110+
)
111+
)
112+
);
113+
// 9 not irrecoverable
114+
bulkItemResponses.put(
115+
id,
116+
new BulkItemResponse(
117+
id++,
118+
OpType.INDEX,
119+
new BulkItemResponse.Failure(
120+
"the_index",
121+
"id",
122+
new ElasticsearchSecurityException("internal error", RestStatus.INTERNAL_SERVER_ERROR)
123+
)
124+
)
125+
);
126+
127+
assertFirstException(bulkItemResponses.values(), MapperParsingException.class, "mapper parsing error");
128+
bulkItemResponses.remove(1);
129+
assertFirstException(bulkItemResponses.values(), ResourceNotFoundException.class, "resource not found error");
130+
bulkItemResponses.remove(2);
131+
assertFirstException(bulkItemResponses.values(), IllegalArgumentException.class, "illegal argument error");
132+
bulkItemResponses.remove(3);
133+
assertFirstException(bulkItemResponses.values(), ElasticsearchSecurityException.class, "Authentication required");
134+
bulkItemResponses.remove(6);
135+
assertFirstException(
136+
bulkItemResponses.values(),
137+
ElasticsearchSecurityException.class,
138+
"current license is non-compliant for [transform]"
139+
);
140+
bulkItemResponses.remove(7);
141+
142+
assertNull(ExceptionRootCauseFinder.getFirstIrrecoverableExceptionFromBulkResponses(bulkItemResponses.values()));
143+
}
144+
145+
private static void assertFirstException(Collection<BulkItemResponse> bulkItemResponses, Class<?> expectedClass, String message) {
146+
Throwable t = ExceptionRootCauseFinder.getFirstIrrecoverableExceptionFromBulkResponses(bulkItemResponses);
147+
assertNotNull(t);
148+
assertEquals(t.getClass(), expectedClass);
149+
assertEquals(t.getMessage(), message);
150+
}
151+
}

0 commit comments

Comments
 (0)