Skip to content

Commit 394b280

Browse files
committed
Limit retries of failed allocations per index
Today if a shard fails during the initialization phase due to misconfiguration, broken disks, missing analyzers, not installed plugins, etc., Elasticsearch keeps on trying to initialize, or rather allocate, that shard. In the worst-case scenario this ends in an endless allocation loop. To prevent this loop and all its side effects, like spamming log files over and over again, this commit adds an allocation decider that stops allocating a shard that failed more than N times in a row to allocate. The number of retries can be configured via `index.allocation.max_retry` and its default is set to `5`. Once the setting is updated, shards with fewer failures than the number set per index will be allowed to allocate again. Internally we maintain a counter on the UnassignedInfo that is reset to `0` once the shard has been started. Relates to elastic#18417
1 parent d77c299 commit 394b280

File tree

10 files changed

+262
-17
lines changed

10 files changed

+262
-17
lines changed

core/src/main/java/org/elasticsearch/cluster/ClusterModule.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.elasticsearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider;
5050
import org.elasticsearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider;
5151
import org.elasticsearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider;
52+
import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
5253
import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
5354
import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
5455
import org.elasticsearch.cluster.routing.allocation.decider.SnapshotInProgressAllocationDecider;
@@ -79,6 +80,7 @@ public class ClusterModule extends AbstractModule {
7980
new Setting<>("cluster.routing.allocation.type", BALANCED_ALLOCATOR, Function.identity(), Property.NodeScope);
8081
public static final List<Class<? extends AllocationDecider>> DEFAULT_ALLOCATION_DECIDERS =
8182
Collections.unmodifiableList(Arrays.asList(
83+
MaxRetryAllocationDecider.class,
8284
SameShardAllocationDecider.class,
8385
FilterAllocationDecider.class,
8486
ReplicaAfterPrimaryActiveAllocationDecider.class,

core/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ public final class UnassignedInfo implements ToXContent, Writeable {
4848
public static final Setting<TimeValue> INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING =
4949
Setting.timeSetting("index.unassigned.node_left.delayed_timeout", DEFAULT_DELAYED_NODE_LEFT_TIMEOUT, Property.Dynamic,
5050
Property.IndexScope);
51-
5251
/**
5352
* Reason why the shard is in unassigned state.
5453
* <p>
@@ -103,7 +102,11 @@ public enum Reason {
103102
/**
104103
* A better replica location is identified and causes the existing replica allocation to be cancelled.
105104
*/
106-
REALLOCATED_REPLICA;
105+
REALLOCATED_REPLICA,
106+
/**
107+
* Unassigned as a result of a failed primary while the replica was initializing.
108+
*/
109+
PRIMARY_FAILED;
107110
}
108111

109112
private final Reason reason;
@@ -112,6 +115,7 @@ public enum Reason {
112115
private final long lastComputedLeftDelayNanos; // how long to delay shard allocation, not serialized (always positive, 0 means no delay)
113116
private final String message;
114117
private final Throwable failure;
118+
private final int failedAllocations;
115119

116120
/**
117121
* creates an UnassignedInfo object based on the **current** time
@@ -120,7 +124,7 @@ public enum Reason {
120124
* @param message more information about cause.
121125
**/
122126
public UnassignedInfo(Reason reason, String message) {
123-
this(reason, message, null, System.nanoTime(), System.currentTimeMillis());
127+
this(reason, message, null, reason == Reason.ALLOCATION_FAILED ? 1 : 0, System.nanoTime(), System.currentTimeMillis());
124128
}
125129

126130
/**
@@ -130,13 +134,16 @@ public UnassignedInfo(Reason reason, String message) {
130134
* @param unassignedTimeNanos the time to use as the base for any delayed re-assignment calculation
131135
* @param unassignedTimeMillis the time of unassignment used to display to in our reporting.
132136
*/
133-
public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Throwable failure, long unassignedTimeNanos, long unassignedTimeMillis) {
137+
public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Throwable failure, int failedAllocations, long unassignedTimeNanos, long unassignedTimeMillis) {
134138
this.reason = reason;
135139
this.unassignedTimeMillis = unassignedTimeMillis;
136140
this.unassignedTimeNanos = unassignedTimeNanos;
137141
this.lastComputedLeftDelayNanos = 0L;
138142
this.message = message;
139143
this.failure = failure;
144+
this.failedAllocations = failedAllocations;
145+
assert failedAllocations > 0 && reason == Reason.ALLOCATION_FAILED || failedAllocations == 0 && reason != Reason.ALLOCATION_FAILED:
146+
"failedAllocations: " + 0 + " for reason " + reason;
140147
assert !(message == null && failure != null) : "provide a message if a failure exception is provided";
141148
}
142149

@@ -147,17 +154,19 @@ public UnassignedInfo(UnassignedInfo unassignedInfo, long newComputedLeftDelayNa
147154
this.lastComputedLeftDelayNanos = newComputedLeftDelayNanos;
148155
this.message = unassignedInfo.message;
149156
this.failure = unassignedInfo.failure;
157+
this.failedAllocations = unassignedInfo.failedAllocations;
150158
}
151159

152160
public UnassignedInfo(StreamInput in) throws IOException {
153161
this.reason = Reason.values()[(int) in.readByte()];
154162
this.unassignedTimeMillis = in.readLong();
155163
// As System.nanoTime() cannot be compared across different JVMs, reset it to now.
156-
// This means that in master failover situations, elapsed delay time is forgotten.
164+
// This means that in master fail-over situations, elapsed delay time is forgotten.
157165
this.unassignedTimeNanos = System.nanoTime();
158166
this.lastComputedLeftDelayNanos = 0L;
159167
this.message = in.readOptionalString();
160168
this.failure = in.readThrowable();
169+
this.failedAllocations = in.readVInt();
161170
}
162171

163172
public void writeTo(StreamOutput out) throws IOException {
@@ -166,12 +175,18 @@ public void writeTo(StreamOutput out) throws IOException {
166175
// Do not serialize unassignedTimeNanos as System.nanoTime() cannot be compared across different JVMs
167176
out.writeOptionalString(message);
168177
out.writeThrowable(failure);
178+
out.writeVInt(failedAllocations);
169179
}
170180

171181
public UnassignedInfo readFrom(StreamInput in) throws IOException {
172182
return new UnassignedInfo(in);
173183
}
174184

185+
/**
186+
* Returns the number of previously failed allocations of this shard.
187+
*/
188+
public int getNumFailedAllocations() {
    return this.failedAllocations;
}
189+
175190
/**
176191
* The reason why the shard is unassigned.
177192
*/
@@ -325,7 +340,11 @@ public String shortSummary() {
325340
StringBuilder sb = new StringBuilder();
326341
sb.append("[reason=").append(reason).append("]");
327342
sb.append(", at[").append(DATE_TIME_FORMATTER.printer().print(unassignedTimeMillis)).append("]");
343+
if (failedAllocations > 0) {
344+
sb.append(", failed_attemps[").append(failedAllocations).append("]");
345+
}
328346
String details = getDetails();
347+
329348
if (details != null) {
330349
sb.append(", details[").append(details).append("]");
331350
}
@@ -342,6 +361,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
342361
builder.startObject("unassigned_info");
343362
builder.field("reason", reason);
344363
builder.field("at", DATE_TIME_FORMATTER.printer().print(unassignedTimeMillis));
364+
if (failedAllocations > 0) {
365+
builder.field("failed_attemps", failedAllocations);
366+
}
345367
String details = getDetails();
346368
if (details != null) {
347369
builder.field("details", details);

core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,10 @@ public RoutingAllocation.Result applyFailedShards(ClusterState clusterState, Lis
222222
List<FailedRerouteAllocation.FailedShard> orderedFailedShards = new ArrayList<>(failedShards);
223223
orderedFailedShards.sort(Comparator.comparing(failedShard -> failedShard.shard.primary()));
224224
for (FailedRerouteAllocation.FailedShard failedShard : orderedFailedShards) {
225+
UnassignedInfo unassignedInfo = failedShard.shard.unassignedInfo();
226+
final int failedAllocations = unassignedInfo != null ? unassignedInfo.getNumFailedAllocations() : 0;
225227
changed |= applyFailedShard(allocation, failedShard.shard, true, new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShard.message, failedShard.failure,
226-
System.nanoTime(), System.currentTimeMillis()));
228+
failedAllocations + 1, System.nanoTime(), System.currentTimeMillis()));
227229
}
228230
if (!changed) {
229231
return new RoutingAllocation.Result(false, clusterState.routingTable(), clusterState.metaData());
@@ -437,7 +439,7 @@ private boolean deassociateDeadNodes(RoutingAllocation allocation) {
437439
// now, go over all the shards routing on the node, and fail them
438440
for (ShardRouting shardRouting : node.copyShards()) {
439441
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "node_left[" + node.nodeId() + "]", null,
440-
allocation.getCurrentNanoTime(), System.currentTimeMillis());
442+
0, allocation.getCurrentNanoTime(), System.currentTimeMillis());
441443
applyFailedShard(allocation, shardRouting, false, unassignedInfo);
442444
}
443445
// its a dead node, remove it, note, its important to remove it *after* we apply failed shard
@@ -457,8 +459,8 @@ private boolean failReplicasForUnassignedPrimary(RoutingAllocation allocation, S
457459
boolean changed = false;
458460
for (ShardRouting routing : replicas) {
459461
changed |= applyFailedShard(allocation, routing, false,
460-
new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, "primary failed while replica initializing",
461-
null, allocation.getCurrentNanoTime(), System.currentTimeMillis()));
462+
new UnassignedInfo(UnassignedInfo.Reason.PRIMARY_FAILED, "primary failed while replica initializing",
463+
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis()));
462464
}
463465
return changed;
464466
}

core/src/main/java/org/elasticsearch/cluster/routing/allocation/command/AllocateEmptyPrimaryAllocationCommand.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ public RerouteExplanation execute(RoutingAllocation allocation, boolean explain)
125125
// we need to move the unassigned info back to treat it as if it was index creation
126126
unassignedInfoToUpdate = new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED,
127127
"force empty allocation from previous reason " + shardRouting.unassignedInfo().getReason() + ", " + shardRouting.unassignedInfo().getMessage(),
128-
shardRouting.unassignedInfo().getFailure(), System.nanoTime(), System.currentTimeMillis());
128+
shardRouting.unassignedInfo().getFailure(), 0, System.nanoTime(), System.currentTimeMillis());
129129
}
130130

131131
initializeUnassignedShard(allocation, routingNodes, routingNode, shardRouting, unassignedInfoToUpdate);
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.cluster.routing.allocation.decider;
21+
22+
import org.elasticsearch.cluster.metadata.IndexMetaData;
23+
import org.elasticsearch.cluster.routing.RoutingNode;
24+
import org.elasticsearch.cluster.routing.ShardRouting;
25+
import org.elasticsearch.cluster.routing.UnassignedInfo;
26+
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
27+
import org.elasticsearch.common.inject.Inject;
28+
import org.elasticsearch.common.settings.Setting;
29+
import org.elasticsearch.common.settings.Settings;
30+
31+
/**
32+
* An allocation decider that prevents shards from being allocated on any node if the shards allocation has been retried N times without
33+
* success. This means if a shard has been INITIALIZING N times in a row without being moved to STARTED the shard will be ignored until
34+
* the setting for <tt>index.allocation.max_retry</tt> is raised. The default value is <tt>5</tt>.
35+
*/
36+
public class MaxRetryAllocationDecider extends AllocationDecider {
37+
38+
public static final Setting<Integer> SETTING_ALLOCATION_MAX_RETRY = Setting.intSetting("index.allocation.max_retry", 5, 0,
39+
Setting.Property.Dynamic, Setting.Property.IndexScope);
40+
41+
public static final String NAME = "max_retry";
42+
43+
/**
44+
* Initializes a new {@link MaxRetryAllocationDecider}
45+
*
46+
* @param settings {@link Settings} used by this {@link AllocationDecider}
47+
*/
48+
@Inject
49+
public MaxRetryAllocationDecider(Settings settings) {
50+
super(settings);
51+
}
52+
53+
@Override
54+
public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) {
55+
UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
56+
if (unassignedInfo != null && unassignedInfo.getNumFailedAllocations() > 0) {
57+
IndexMetaData indexSafe = allocation.metaData().getIndexSafe(shardRouting.index());
58+
int maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(indexSafe.getSettings());
59+
if (unassignedInfo.getNumFailedAllocations() >= maxRetry) {
60+
return allocation.decision(Decision.NO, NAME, "shard has already failed allocating ["
61+
+ unassignedInfo.getNumFailedAllocations() + "] times");
62+
}
63+
}
64+
return allocation.decision(Decision.YES, NAME, "shard has no previous failures");
65+
}
66+
67+
@Override
68+
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
69+
return canAllocate(shardRouting, allocation);
70+
}
71+
}

core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.cluster.metadata.IndexMetaData;
2222
import org.elasticsearch.cluster.routing.UnassignedInfo;
2323
import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
24+
import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
2425
import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
2526
import org.elasticsearch.common.settings.Setting.Property;
2627
import org.elasticsearch.gateway.PrimaryShardAllocator;
@@ -40,7 +41,6 @@
4041
import org.elasticsearch.index.store.FsDirectoryService;
4142
import org.elasticsearch.index.store.IndexStore;
4243
import org.elasticsearch.index.store.Store;
43-
import org.elasticsearch.index.IndexWarmer;
4444
import org.elasticsearch.indices.IndicesRequestCache;
4545

4646
import java.util.Arrays;
@@ -59,6 +59,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
5959
public static final Predicate<String> INDEX_SETTINGS_KEY_PREDICATE = (s) -> s.startsWith(IndexMetaData.INDEX_SETTING_PREFIX);
6060

6161
public static final Set<Setting<?>> BUILT_IN_INDEX_SETTINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
62+
MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY,
6263
IndexSettings.INDEX_TTL_DISABLE_PURGE_SETTING,
6364
IndexStore.INDEX_STORE_THROTTLE_TYPE_SETTING,
6465
IndexStore.INDEX_STORE_THROTTLE_MAX_BYTES_PER_SEC_SETTING,

core/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ public boolean processExistingRecoveries(RoutingAllocation allocation) {
108108
currentNode, nodeWithHighestMatch);
109109
it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REALLOCATED_REPLICA,
110110
"existing allocation of replica to [" + currentNode + "] cancelled, sync id match found on node [" + nodeWithHighestMatch + "]",
111-
null, allocation.getCurrentNanoTime(), System.currentTimeMillis()));
111+
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis()));
112112
changed = true;
113113
}
114114
}

core/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,19 @@ public void testReasonOrdinalOrder() {
6464
UnassignedInfo.Reason.NODE_LEFT,
6565
UnassignedInfo.Reason.REROUTE_CANCELLED,
6666
UnassignedInfo.Reason.REINITIALIZED,
67-
UnassignedInfo.Reason.REALLOCATED_REPLICA};
67+
UnassignedInfo.Reason.REALLOCATED_REPLICA,
68+
UnassignedInfo.Reason.PRIMARY_FAILED};
6869
for (int i = 0; i < order.length; i++) {
6970
assertThat(order[i].ordinal(), equalTo(i));
7071
}
7172
assertThat(UnassignedInfo.Reason.values().length, equalTo(order.length));
7273
}
7374

7475
public void testSerialization() throws Exception {
75-
UnassignedInfo meta = new UnassignedInfo(RandomPicks.randomFrom(random(), UnassignedInfo.Reason.values()), randomBoolean() ? randomAsciiOfLength(4) : null);
76+
UnassignedInfo.Reason reason = RandomPicks.randomFrom(random(), UnassignedInfo.Reason.values());
77+
UnassignedInfo meta = reason == UnassignedInfo.Reason.ALLOCATION_FAILED ?
78+
new UnassignedInfo(reason, randomBoolean() ? randomAsciiOfLength(4) : null, null, randomIntBetween(1, 100), System.nanoTime(), System.currentTimeMillis()):
79+
new UnassignedInfo(reason, randomBoolean() ? randomAsciiOfLength(4) : null);
7680
BytesStreamOutput out = new BytesStreamOutput();
7781
meta.writeTo(out);
7882
out.close();
@@ -82,6 +86,7 @@ public void testSerialization() throws Exception {
8286
assertThat(read.getUnassignedTimeInMillis(), equalTo(meta.getUnassignedTimeInMillis()));
8387
assertThat(read.getMessage(), equalTo(meta.getMessage()));
8488
assertThat(read.getDetails(), equalTo(meta.getDetails()));
89+
assertThat(read.getNumFailedAllocations(), equalTo(meta.getNumFailedAllocations()));
8590
}
8691

8792
public void testIndexCreated() {
@@ -273,7 +278,10 @@ public void testUnassignedDelayedOnlyOnNodeLeft() throws Exception {
273278
public void testUnassignedDelayOnlyNodeLeftNonNodeLeftReason() throws Exception {
274279
EnumSet<UnassignedInfo.Reason> reasons = EnumSet.allOf(UnassignedInfo.Reason.class);
275280
reasons.remove(UnassignedInfo.Reason.NODE_LEFT);
276-
UnassignedInfo unassignedInfo = new UnassignedInfo(RandomPicks.randomFrom(random(), reasons), null);
281+
UnassignedInfo.Reason reason = RandomPicks.randomFrom(random(), reasons);
282+
UnassignedInfo unassignedInfo = reason == UnassignedInfo.Reason.ALLOCATION_FAILED ?
283+
new UnassignedInfo(reason, null, null, 1, System.nanoTime(), System.currentTimeMillis()):
284+
new UnassignedInfo(reason, null);
277285
unassignedInfo = unassignedInfo.updateDelay(unassignedInfo.getUnassignedTimeInNanos() + 1, // add 1 tick delay
278286
Settings.builder().put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "10h").build(), Settings.EMPTY);
279287
long delay = unassignedInfo.getLastComputedLeftDelayNanos();
@@ -287,7 +295,7 @@ public void testUnassignedDelayOnlyNodeLeftNonNodeLeftReason() throws Exception
287295
*/
288296
public void testLeftDelayCalculation() throws Exception {
289297
final long baseTime = System.nanoTime();
290-
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "test", null, baseTime, System.currentTimeMillis());
298+
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "test", null, 0, baseTime, System.currentTimeMillis());
291299
final long totalDelayNanos = TimeValue.timeValueMillis(10).nanos();
292300
final Settings settings = Settings.builder().put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), TimeValue.timeValueNanos(totalDelayNanos)).build();
293301
unassignedInfo = unassignedInfo.updateDelay(baseTime, settings, Settings.EMPTY);

0 commit comments

Comments
 (0)