Skip to content

Commit 5d35eaa

Browse files
author
Hendrik Muhs
committed
[Transform] improve irrecoverable error detection - part 2 (#52003)
base error handling on rest status instead of listing individual exception types relates to #51820
1 parent 3f151d1 commit 5d35eaa

File tree

3 files changed

+240
-38
lines changed

3 files changed

+240
-38
lines changed

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/transforms/TransformIndexer.java

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import org.elasticsearch.common.Strings;
2020
import org.elasticsearch.common.breaker.CircuitBreakingException;
2121
import org.elasticsearch.common.xcontent.XContentBuilder;
22-
import org.elasticsearch.index.IndexNotFoundException;
2322
import org.elasticsearch.index.query.BoolQueryBuilder;
2423
import org.elasticsearch.index.query.QueryBuilder;
2524
import org.elasticsearch.script.ScriptException;
@@ -42,7 +41,6 @@
4241
import org.elasticsearch.xpack.transform.checkpoint.CheckpointProvider;
4342
import org.elasticsearch.xpack.transform.notifications.TransformAuditor;
4443
import org.elasticsearch.xpack.transform.persistence.TransformConfigManager;
45-
import org.elasticsearch.xpack.transform.transforms.pivot.AggregationResultUtils;
4644
import org.elasticsearch.xpack.transform.transforms.pivot.Pivot;
4745
import org.elasticsearch.xpack.transform.utils.ExceptionRootCauseFinder;
4846

@@ -287,7 +285,7 @@ protected void onStart(long now, ActionListener<Boolean> listener) {
287285
// If the transform config index or the transform config is gone, something serious occurred
288286
// We are in an unknown state and should fail out
289287
if (failure instanceof ResourceNotFoundException) {
290-
updateConfigListener.onFailure(new TransformConfigReloadingException(msg, failure));
288+
updateConfigListener.onFailure(new TransformConfigLostOnReloadException(msg, failure));
291289
} else {
292290
auditor.warning(getJobId(), msg);
293291
updateConfigListener.onResponse(null);
@@ -477,37 +475,54 @@ synchronized void handleFailure(Exception e) {
477475

478476
if (unwrappedException instanceof CircuitBreakingException) {
479477
handleCircuitBreakingException((CircuitBreakingException) unwrappedException);
480-
} else if (unwrappedException instanceof ScriptException) {
478+
return;
479+
}
480+
481+
if (unwrappedException instanceof ScriptException) {
481482
handleScriptException((ScriptException) unwrappedException);
482-
// irrecoverable error without special handling
483-
} else if (unwrappedException instanceof BulkIndexingException && ((BulkIndexingException) unwrappedException).isIrrecoverable()) {
483+
return;
484+
}
485+
486+
if (unwrappedException instanceof BulkIndexingException && ((BulkIndexingException) unwrappedException).isIrrecoverable()) {
484487
handleIrrecoverableBulkIndexingException((BulkIndexingException) unwrappedException);
485-
} else if (unwrappedException instanceof IndexNotFoundException
486-
|| unwrappedException instanceof AggregationResultUtils.AggregationExtractionException
487-
|| unwrappedException instanceof TransformConfigReloadingException
488-
|| unwrappedException instanceof ResourceNotFoundException
489-
|| unwrappedException instanceof IllegalArgumentException) {
490-
failIndexer("task encountered irrecoverable failure: " + e.getMessage());
491-
} else if (context.getAndIncrementFailureCount() > context.getNumFailureRetries()) {
492-
failIndexer(
493-
"task encountered more than "
494-
+ context.getNumFailureRetries()
495-
+ " failures; latest failure: "
496-
+ ExceptionRootCauseFinder.getDetailedMessage(unwrappedException)
497-
);
498-
} else {
499-
// Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
500-
// times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
501-
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
502-
String message = ExceptionRootCauseFinder.getDetailedMessage(unwrappedException);
488+
return;
489+
}
503490

504-
auditor.warning(
505-
getJobId(),
506-
"Transform encountered an exception: " + message + " Will attempt again at next scheduled trigger."
507-
);
508-
lastAuditedExceptionMessage = message;
509-
}
491+
// irrecoverable error without special handling
492+
if (unwrappedException instanceof ElasticsearchException) {
493+
ElasticsearchException elasticsearchException = (ElasticsearchException) unwrappedException;
494+
if (ExceptionRootCauseFinder.IRRECOVERABLE_REST_STATUSES.contains(elasticsearchException.status())) {
495+
failIndexer("task encountered irrecoverable failure: " + elasticsearchException.getDetailedMessage());
496+
return;
510497
}
498+
}
499+
500+
if (unwrappedException instanceof IllegalArgumentException) {
501+
failIndexer("task encountered irrecoverable failure: " + e.getMessage());
502+
return;
503+
}
504+
505+
if (context.getAndIncrementFailureCount() > context.getNumFailureRetries()) {
506+
failIndexer(
507+
"task encountered more than "
508+
+ context.getNumFailureRetries()
509+
+ " failures; latest failure: "
510+
+ ExceptionRootCauseFinder.getDetailedMessage(unwrappedException)
511+
);
512+
return;
513+
}
514+
515+
// Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
516+
// times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
517+
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
518+
String message = ExceptionRootCauseFinder.getDetailedMessage(unwrappedException);
519+
520+
auditor.warning(
521+
getJobId(),
522+
"Transform encountered an exception: " + message + " Will attempt again at next scheduled trigger."
523+
);
524+
lastAuditedExceptionMessage = message;
525+
}
511526
}
512527

513528
/**
@@ -901,8 +916,12 @@ private RunState determineRunStateAtStart() {
901916
return RunState.PARTIAL_RUN_IDENTIFY_CHANGES;
902917
}
903918

904-
static class TransformConfigReloadingException extends ElasticsearchException {
905-
TransformConfigReloadingException(String msg, Throwable cause, Object... args) {
919+
/**
920+
* Thrown when the transform configuration disappeared permanently.
921+
* (not if reloading failed due to an intermittent problem)
922+
*/
923+
static class TransformConfigLostOnReloadException extends ResourceNotFoundException {
924+
TransformConfigLostOnReloadException(String msg, Throwable cause, Object... args) {
906925
super(msg, cause, args);
907926
}
908927
}

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/utils/ExceptionRootCauseFinder.java

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,37 @@
77
package org.elasticsearch.xpack.transform.utils;
88

99
import org.elasticsearch.ElasticsearchException;
10-
import org.elasticsearch.ResourceNotFoundException;
1110
import org.elasticsearch.action.bulk.BulkItemResponse;
1211
import org.elasticsearch.action.search.SearchPhaseExecutionException;
1312
import org.elasticsearch.action.search.ShardSearchFailure;
14-
import org.elasticsearch.index.mapper.MapperParsingException;
13+
import org.elasticsearch.rest.RestStatus;
1514

15+
import java.util.Arrays;
1616
import java.util.Collection;
17+
import java.util.HashSet;
18+
import java.util.Set;
1719

1820
/**
1921
* Set of static utils to find the cause of a search exception.
2022
*/
2123
public final class ExceptionRootCauseFinder {
2224

25+
/**
26+
* List of rest statuses that we consider irrecoverable
27+
*/
28+
public static final Set<RestStatus> IRRECOVERABLE_REST_STATUSES = new HashSet<>(
29+
Arrays.asList(
30+
RestStatus.GONE,
31+
RestStatus.NOT_IMPLEMENTED,
32+
RestStatus.NOT_FOUND,
33+
RestStatus.BAD_REQUEST,
34+
RestStatus.UNAUTHORIZED,
35+
RestStatus.FORBIDDEN,
36+
RestStatus.METHOD_NOT_ALLOWED,
37+
RestStatus.NOT_ACCEPTABLE
38+
)
39+
);
40+
2341
/**
2442
* Unwrap the exception stack and return the most likely cause.
2543
*
@@ -61,17 +79,22 @@ public static String getDetailedMessage(Throwable t) {
6179
/**
6280
* Return the first irrecoverableException from a collection of bulk responses if there are any.
6381
*
64-
* @param failures a collection of bulk item responses
82+
* @param failures a collection of bulk item responses with failures
6583
* @return The first exception considered irrecoverable if there are any, null if no irrecoverable exception found
6684
*/
6785
public static Throwable getFirstIrrecoverableExceptionFromBulkResponses(Collection<BulkItemResponse> failures) {
6886
for (BulkItemResponse failure : failures) {
6987
Throwable unwrappedThrowable = org.elasticsearch.ExceptionsHelper.unwrapCause(failure.getFailure().getCause());
70-
if (unwrappedThrowable instanceof MapperParsingException
71-
|| unwrappedThrowable instanceof IllegalArgumentException
72-
|| unwrappedThrowable instanceof ResourceNotFoundException) {
88+
if (unwrappedThrowable instanceof IllegalArgumentException) {
7389
return unwrappedThrowable;
7490
}
91+
92+
if (unwrappedThrowable instanceof ElasticsearchException) {
93+
ElasticsearchException elasticsearchException = (ElasticsearchException) unwrappedThrowable;
94+
if (IRRECOVERABLE_REST_STATUSES.contains(elasticsearchException.status())) {
95+
return elasticsearchException;
96+
}
97+
}
7598
}
7699

77100
return null;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.transform.utils;
8+
9+
import org.elasticsearch.ElasticsearchSecurityException;
10+
import org.elasticsearch.ResourceNotFoundException;
11+
import org.elasticsearch.action.DocWriteRequest.OpType;
12+
import org.elasticsearch.action.bulk.BulkItemResponse;
13+
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
14+
import org.elasticsearch.index.mapper.MapperParsingException;
15+
import org.elasticsearch.index.shard.ShardId;
16+
import org.elasticsearch.index.translog.TranslogException;
17+
import org.elasticsearch.rest.RestStatus;
18+
import org.elasticsearch.test.ESTestCase;
19+
20+
import java.util.Collection;
21+
import java.util.HashMap;
22+
import java.util.Map;
23+
24+
public class ExceptionRootCauseFinderTests extends ESTestCase {
25+
public void testFetFirstIrrecoverableExceptionFromBulkResponses() {
26+
Map<Integer, BulkItemResponse> bulkItemResponses = new HashMap<>();
27+
28+
int id = 1;
29+
// 1
30+
bulkItemResponses.put(
31+
id,
32+
new BulkItemResponse(
33+
id++,
34+
OpType.INDEX,
35+
new BulkItemResponse.Failure("the_index", "type", "id", new MapperParsingException("mapper parsing error"))
36+
)
37+
);
38+
// 2
39+
bulkItemResponses.put(
40+
id,
41+
new BulkItemResponse(
42+
id++,
43+
OpType.INDEX,
44+
new BulkItemResponse.Failure("the_index", "type", "id", new ResourceNotFoundException("resource not found error"))
45+
)
46+
);
47+
// 3
48+
bulkItemResponses.put(
49+
id,
50+
new BulkItemResponse(
51+
id++,
52+
OpType.INDEX,
53+
new BulkItemResponse.Failure("the_index", "type", "id", new IllegalArgumentException("illegal argument error"))
54+
)
55+
);
56+
// 4 not irrecoverable
57+
bulkItemResponses.put(
58+
id,
59+
new BulkItemResponse(
60+
id++,
61+
OpType.INDEX,
62+
new BulkItemResponse.Failure("the_index", "type", "id", new EsRejectedExecutionException("es rejected execution"))
63+
)
64+
);
65+
// 5 not irrecoverable
66+
bulkItemResponses.put(
67+
id,
68+
new BulkItemResponse(
69+
id++,
70+
OpType.INDEX,
71+
new BulkItemResponse.Failure(
72+
"the_index",
73+
"type",
74+
"id",
75+
new TranslogException(new ShardId("the_index", "uid", 0), "translog error")
76+
)
77+
)
78+
);
79+
// 6
80+
bulkItemResponses.put(
81+
id,
82+
new BulkItemResponse(
83+
id++,
84+
OpType.INDEX,
85+
new BulkItemResponse.Failure(
86+
"the_index",
87+
"type",
88+
"id",
89+
new ElasticsearchSecurityException("Authentication required", RestStatus.UNAUTHORIZED)
90+
)
91+
)
92+
);
93+
// 7
94+
bulkItemResponses.put(
95+
id,
96+
new BulkItemResponse(
97+
id++,
98+
OpType.INDEX,
99+
new BulkItemResponse.Failure(
100+
"the_index",
101+
"type",
102+
"id",
103+
new ElasticsearchSecurityException("current license is non-compliant for [transform]", RestStatus.FORBIDDEN)
104+
)
105+
)
106+
);
107+
// 8 not irrecoverable
108+
bulkItemResponses.put(
109+
id,
110+
new BulkItemResponse(
111+
id++,
112+
OpType.INDEX,
113+
new BulkItemResponse.Failure(
114+
"the_index",
115+
"type",
116+
"id",
117+
new ElasticsearchSecurityException("overloaded, to many requests", RestStatus.TOO_MANY_REQUESTS)
118+
)
119+
)
120+
);
121+
// 9 not irrecoverable
122+
bulkItemResponses.put(
123+
id,
124+
new BulkItemResponse(
125+
id++,
126+
OpType.INDEX,
127+
new BulkItemResponse.Failure(
128+
"the_index",
129+
"type",
130+
"id",
131+
new ElasticsearchSecurityException("internal error", RestStatus.INTERNAL_SERVER_ERROR)
132+
)
133+
)
134+
);
135+
136+
assertFirstException(bulkItemResponses.values(), MapperParsingException.class, "mapper parsing error");
137+
bulkItemResponses.remove(1);
138+
assertFirstException(bulkItemResponses.values(), ResourceNotFoundException.class, "resource not found error");
139+
bulkItemResponses.remove(2);
140+
assertFirstException(bulkItemResponses.values(), IllegalArgumentException.class, "illegal argument error");
141+
bulkItemResponses.remove(3);
142+
assertFirstException(bulkItemResponses.values(), ElasticsearchSecurityException.class, "Authentication required");
143+
bulkItemResponses.remove(6);
144+
assertFirstException(
145+
bulkItemResponses.values(),
146+
ElasticsearchSecurityException.class,
147+
"current license is non-compliant for [transform]"
148+
);
149+
bulkItemResponses.remove(7);
150+
151+
assertNull(ExceptionRootCauseFinder.getFirstIrrecoverableExceptionFromBulkResponses(bulkItemResponses.values()));
152+
}
153+
154+
private static void assertFirstException(Collection<BulkItemResponse> bulkItemResponses, Class<?> expectedClass, String message) {
155+
Throwable t = ExceptionRootCauseFinder.getFirstIrrecoverableExceptionFromBulkResponses(bulkItemResponses);
156+
assertNotNull(t);
157+
assertEquals(t.getClass(), expectedClass);
158+
assertEquals(t.getMessage(), message);
159+
}
160+
}

0 commit comments

Comments
 (0)