Skip to content

Commit 9a42e71

Browse files
committed
Do not cancel recovery for copy on broken node (#48265)
This change fixes a poisonous situation where an ongoing recovery was canceled because a better copy was found on a node that the cluster had previously tried allocating the shard to but failed. The solution is to keep track of the set of nodes that an allocation was failed on so that we can avoid canceling the current recovery for a copy on failed nodes. Closes #47974
1 parent 5e4501e commit 9a42e71

File tree

12 files changed

+281
-34
lines changed

12 files changed

+281
-34
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@ assert getByAllocationId(failedShard.shardId(), failedShard.allocationId().getId
547547
assert replicaShard != null : "failed to re-resolve " + routing + " when failing replicas";
548548
UnassignedInfo primaryFailedUnassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.PRIMARY_FAILED,
549549
"primary failed while replica initializing", null, 0, unassignedInfo.getUnassignedTimeInNanos(),
550-
unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT);
550+
unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT, Collections.emptySet());
551551
failShard(logger, replicaShard, primaryFailedUnassignedInfo, indexMetaData, routingChangesObserver);
552552
}
553553
}
@@ -873,7 +873,7 @@ public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, R
873873
UnassignedInfo newInfo = new UnassignedInfo(currInfo.getReason(), currInfo.getMessage(), currInfo.getFailure(),
874874
currInfo.getNumFailedAllocations(), currInfo.getUnassignedTimeInNanos(),
875875
currInfo.getUnassignedTimeInMillis(), currInfo.isDelayed(),
876-
allocationStatus);
876+
allocationStatus, currInfo.getFailedNodeIds());
877877
ShardRouting updatedShard = shard.updateUnassigned(newInfo, shard.recoverySource());
878878
changes.unassignedInfoUpdated(shard, newInfo);
879879
shard = updatedShard;

server/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java

+43-4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.elasticsearch.Version;
2424
import org.elasticsearch.cluster.ClusterState;
2525
import org.elasticsearch.cluster.metadata.MetaData;
26+
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
2627
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
2728
import org.elasticsearch.common.Nullable;
2829
import org.elasticsearch.common.io.stream.StreamInput;
@@ -39,8 +40,11 @@
3940
import java.io.IOException;
4041
import java.time.Instant;
4142
import java.time.ZoneOffset;
43+
import java.util.Collections;
44+
import java.util.List;
4245
import java.util.Locale;
4346
import java.util.Objects;
47+
import java.util.Set;
4448

4549
/**
4650
* Holds additional information as to why the shard is in unassigned state.
@@ -214,6 +218,7 @@ public String value() {
214218
private final String message;
215219
private final Exception failure;
216220
private final int failedAllocations;
221+
private final Set<String> failedNodeIds;
217222
private final AllocationStatus lastAllocationStatus; // result of the last allocation attempt for this shard
218223

219224
/**
@@ -224,7 +229,7 @@ public String value() {
224229
**/
225230
public UnassignedInfo(Reason reason, String message) {
226231
this(reason, message, null, reason == Reason.ALLOCATION_FAILED ? 1 : 0, System.nanoTime(), System.currentTimeMillis(), false,
227-
AllocationStatus.NO_ATTEMPT);
232+
AllocationStatus.NO_ATTEMPT, Collections.emptySet());
228233
}
229234

230235
/**
@@ -235,9 +240,11 @@ public UnassignedInfo(Reason reason, String message) {
235240
* @param unassignedTimeMillis the time of unassignment used to display to in our reporting.
236241
* @param delayed if allocation of this shard is delayed due to INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.
237242
* @param lastAllocationStatus the result of the last allocation attempt for this shard
243+
* @param failedNodeIds a set of nodeIds that failed to complete allocations for this shard
238244
*/
239245
public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Exception failure, int failedAllocations,
240-
long unassignedTimeNanos, long unassignedTimeMillis, boolean delayed, AllocationStatus lastAllocationStatus) {
246+
long unassignedTimeNanos, long unassignedTimeMillis, boolean delayed, AllocationStatus lastAllocationStatus,
247+
Set<String> failedNodeIds) {
241248
this.reason = Objects.requireNonNull(reason);
242249
this.unassignedTimeMillis = unassignedTimeMillis;
243250
this.unassignedTimeNanos = unassignedTimeNanos;
@@ -246,6 +253,7 @@ public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Excepti
246253
this.failure = failure;
247254
this.failedAllocations = failedAllocations;
248255
this.lastAllocationStatus = Objects.requireNonNull(lastAllocationStatus);
256+
this.failedNodeIds = Collections.unmodifiableSet(failedNodeIds);
249257
assert (failedAllocations > 0) == (reason == Reason.ALLOCATION_FAILED) :
250258
"failedAllocations: " + failedAllocations + " for reason " + reason;
251259
assert !(message == null && failure != null) : "provide a message if a failure exception is provided";
@@ -263,6 +271,11 @@ public UnassignedInfo(StreamInput in) throws IOException {
263271
this.failure = in.readException();
264272
this.failedAllocations = in.readVInt();
265273
this.lastAllocationStatus = AllocationStatus.readFrom(in);
274+
if (in.getVersion().onOrAfter(Version.V_7_5_0)) {
275+
this.failedNodeIds = Collections.unmodifiableSet(in.readSet(StreamInput::readString));
276+
} else {
277+
this.failedNodeIds = Collections.emptySet();
278+
}
266279
}
267280

268281
public void writeTo(StreamOutput out) throws IOException {
@@ -280,6 +293,9 @@ public void writeTo(StreamOutput out) throws IOException {
280293
out.writeException(failure);
281294
out.writeVInt(failedAllocations);
282295
lastAllocationStatus.writeTo(out);
296+
if (out.getVersion().onOrAfter(Version.V_7_5_0)) {
297+
out.writeCollection(failedNodeIds, StreamOutput::writeString);
298+
}
283299
}
284300

285301
/**
@@ -354,6 +370,19 @@ public AllocationStatus getLastAllocationStatus() {
354370
return lastAllocationStatus;
355371
}
356372

373+
/**
374+
* A set of nodeIds that failed to complete allocations for this shard. {@link org.elasticsearch.gateway.ReplicaShardAllocator}
375+
* uses this set to avoid repeatedly canceling ongoing recoveries for copies on those nodes although they can perform noop recoveries.
376+
* This set will be discarded when a shard moves to started. And if a shard is failed while started (i.e., from started to unassigned),
377+
* the currently assigned node won't be added to this set.
378+
*
379+
* @see org.elasticsearch.gateway.ReplicaShardAllocator#processExistingRecoveries(RoutingAllocation)
380+
* @see org.elasticsearch.cluster.routing.allocation.AllocationService#applyFailedShards(ClusterState, List, List)
381+
*/
382+
public Set<String> getFailedNodeIds() {
383+
return failedNodeIds;
384+
}
385+
357386
/**
358387
* Calculates the delay left based on current time (in nanoseconds) and the delay defined by the index settings.
359388
* Only relevant if shard is effectively delayed (see {@link #isDelayed()})
@@ -410,6 +439,9 @@ public String shortSummary() {
410439
if (failedAllocations > 0) {
411440
sb.append(", failed_attempts[").append(failedAllocations).append("]");
412441
}
442+
if (failedNodeIds.isEmpty() == false) {
443+
sb.append(", failed_nodes[").append(failedNodeIds).append("]");
444+
}
413445
sb.append(", delayed=").append(delayed);
414446
String details = getDetails();
415447

@@ -433,6 +465,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
433465
if (failedAllocations > 0) {
434466
builder.field("failed_attempts", failedAllocations);
435467
}
468+
if (failedNodeIds.isEmpty() == false) {
469+
builder.field("failed_nodes", failedNodeIds);
470+
}
436471
builder.field("delayed", delayed);
437472
String details = getDetails();
438473
if (details != null) {
@@ -466,13 +501,16 @@ public boolean equals(Object o) {
466501
if (reason != that.reason) {
467502
return false;
468503
}
469-
if (message != null ? !message.equals(that.message) : that.message != null) {
504+
if (Objects.equals(message, that.message) == false) {
470505
return false;
471506
}
472507
if (lastAllocationStatus != that.lastAllocationStatus) {
473508
return false;
474509
}
475-
return !(failure != null ? !failure.equals(that.failure) : that.failure != null);
510+
if (Objects.equals(failure, that.failure) == false) {
511+
return false;
512+
}
513+
return failedNodeIds.equals(that.failedNodeIds);
476514
}
477515

478516
@Override
@@ -484,6 +522,7 @@ public int hashCode() {
484522
result = 31 * result + (message != null ? message.hashCode() : 0);
485523
result = 31 * result + (failure != null ? failure.hashCode() : 0);
486524
result = 31 * result + lastAllocationStatus.hashCode();
525+
result = 31 * result + failedNodeIds.hashCode();
487526
return result;
488527
}
489528

server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

+16-5
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@
4545
import java.util.ArrayList;
4646
import java.util.Collections;
4747
import java.util.Comparator;
48+
import java.util.HashSet;
4849
import java.util.Iterator;
4950
import java.util.List;
5051
import java.util.Map;
52+
import java.util.Set;
5153
import java.util.function.Function;
5254
import java.util.stream.Collectors;
5355

@@ -193,10 +195,18 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
193195
shardToFail.shardId(), shardToFail, failedShard);
194196
}
195197
int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
198+
final Set<String> failedNodeIds;
199+
if (failedShard.unassignedInfo() != null) {
200+
failedNodeIds = new HashSet<>(failedShard.unassignedInfo().getFailedNodeIds().size() + 1);
201+
failedNodeIds.addAll(failedShard.unassignedInfo().getFailedNodeIds());
202+
failedNodeIds.add(failedShard.currentNodeId());
203+
} else {
204+
failedNodeIds = Collections.emptySet();
205+
}
196206
String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
197207
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message,
198208
failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
199-
AllocationStatus.NO_ATTEMPT);
209+
AllocationStatus.NO_ATTEMPT, failedNodeIds);
200210
if (failedShardEntry.markAsStale()) {
201211
allocation.removeAllocationId(failedShard);
202212
}
@@ -288,8 +298,8 @@ private void removeDelayMarkers(RoutingAllocation allocation) {
288298
if (newComputedLeftDelayNanos == 0) {
289299
unassignedIterator.updateUnassigned(new UnassignedInfo(unassignedInfo.getReason(), unassignedInfo.getMessage(),
290300
unassignedInfo.getFailure(), unassignedInfo.getNumFailedAllocations(), unassignedInfo.getUnassignedTimeInNanos(),
291-
unassignedInfo.getUnassignedTimeInMillis(), false, unassignedInfo.getLastAllocationStatus()),
292-
shardRouting.recoverySource(), allocation.changes());
301+
unassignedInfo.getUnassignedTimeInMillis(), false, unassignedInfo.getLastAllocationStatus(),
302+
unassignedInfo.getFailedNodeIds()), shardRouting.recoverySource(), allocation.changes());
293303
}
294304
}
295305
}
@@ -307,7 +317,7 @@ private void resetFailedAllocationCounter(RoutingAllocation allocation) {
307317
UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.getReason(), unassignedInfo.getMessage(),
308318
unassignedInfo.getFailure(), 0, unassignedInfo.getUnassignedTimeInNanos(),
309319
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(),
310-
unassignedInfo.getLastAllocationStatus()), shardRouting.recoverySource(), allocation.changes());
320+
unassignedInfo.getLastAllocationStatus(), Collections.emptySet()), shardRouting.recoverySource(), allocation.changes());
311321
}
312322
}
313323

@@ -416,7 +426,8 @@ private void disassociateDeadNodes(RoutingAllocation allocation) {
416426
final IndexMetaData indexMetaData = allocation.metaData().getIndexSafe(shardRouting.index());
417427
boolean delayed = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetaData.getSettings()).nanos() > 0;
418428
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "node_left [" + node.nodeId() + "]",
419-
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), delayed, AllocationStatus.NO_ATTEMPT);
429+
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), delayed, AllocationStatus.NO_ATTEMPT,
430+
Collections.emptySet());
420431
allocation.routingNodes().failShard(logger, shardRouting, unassignedInfo, indexMetaData, allocation.changes());
421432
}
422433
// its a dead node, remove it, note, its important to remove it *after* we apply failed shard

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ private void failAllocationOfNewPrimaries(RoutingAllocation allocation) {
152152
if (shardRouting.primary() && unassignedInfo.getLastAllocationStatus() == AllocationStatus.NO_ATTEMPT) {
153153
unassignedIterator.updateUnassigned(new UnassignedInfo(unassignedInfo.getReason(), unassignedInfo.getMessage(),
154154
unassignedInfo.getFailure(), unassignedInfo.getNumFailedAllocations(), unassignedInfo.getUnassignedTimeInNanos(),
155-
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), AllocationStatus.DECIDERS_NO),
155+
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), AllocationStatus.DECIDERS_NO,
156+
unassignedInfo.getFailedNodeIds()),
156157
shardRouting.recoverySource(), allocation.changes());
157158
}
158159
}

server/src/main/java/org/elasticsearch/cluster/routing/allocation/command/AllocateEmptyPrimaryAllocationCommand.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.index.shard.ShardNotFoundException;
3939

4040
import java.io.IOException;
41+
import java.util.Collections;
4142
import java.util.Optional;
4243

4344
/**
@@ -139,7 +140,7 @@ public RerouteExplanation execute(RoutingAllocation allocation, boolean explain)
139140
", " + shardRouting.unassignedInfo().getMessage();
140141
unassignedInfoToUpdate = new UnassignedInfo(UnassignedInfo.Reason.FORCED_EMPTY_PRIMARY, unassignedInfoMessage,
141142
shardRouting.unassignedInfo().getFailure(), 0, System.nanoTime(), System.currentTimeMillis(), false,
142-
shardRouting.unassignedInfo().getLastAllocationStatus());
143+
shardRouting.unassignedInfo().getLastAllocationStatus(), Collections.emptySet());
143144
}
144145

145146
initializeUnassignedShard(allocation, routingNodes, routingNode, shardRouting, unassignedInfoToUpdate,

server/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java

+13-4
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,12 @@
4343
import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData;
4444

4545
import java.util.ArrayList;
46+
import java.util.Collections;
4647
import java.util.Comparator;
4748
import java.util.HashMap;
4849
import java.util.List;
4950
import java.util.Map;
51+
import java.util.Set;
5052

5153
import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING;
5254

@@ -95,7 +97,7 @@ public void processExistingRecoveries(RoutingAllocation allocation) {
9597
continue;
9698
}
9799

98-
MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryNode, primaryStore, shardStores, false);
100+
MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, true, primaryNode, primaryStore, shardStores, false);
99101
if (matchingNodes.getNodeWithHighestMatch() != null) {
100102
DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId());
101103
DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch();
@@ -106,11 +108,13 @@ && canPerformOperationBasedRecovery(primaryStore, shardStores, currentNode) == f
106108
// we found a better match that can perform noop recovery, cancel the existing allocation.
107109
logger.debug("cancelling allocation of replica on [{}], can perform a noop recovery on node [{}]",
108110
currentNode, nodeWithHighestMatch);
111+
final Set<String> failedNodeIds =
112+
shard.unassignedInfo() == null ? Collections.emptySet() : shard.unassignedInfo().getFailedNodeIds();
109113
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.REALLOCATED_REPLICA,
110114
"existing allocation of replica to [" + currentNode + "] cancelled, can perform a noop recovery on ["+
111115
nodeWithHighestMatch + "]",
112116
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), false,
113-
UnassignedInfo.AllocationStatus.NO_ATTEMPT);
117+
UnassignedInfo.AllocationStatus.NO_ATTEMPT, failedNodeIds);
114118
// don't cancel shard in the loop as it will cause a ConcurrentModificationException
115119
shardCancellationActions.add(() -> routingNodes.failShard(logger, shard, unassignedInfo,
116120
metaData.getIndexSafe(shard.index()), allocation.changes()));
@@ -186,7 +190,8 @@ public AllocateUnassignedDecision makeAllocationDecision(final ShardRouting unas
186190
return AllocateUnassignedDecision.NOT_TAKEN;
187191
}
188192

189-
MatchingNodes matchingNodes = findMatchingNodes(unassignedShard, allocation, primaryNode, primaryStore, shardStores, explain);
193+
MatchingNodes matchingNodes = findMatchingNodes(
194+
unassignedShard, allocation, false, primaryNode, primaryStore, shardStores, explain);
190195
assert explain == false || matchingNodes.nodeDecisions != null : "in explain mode, we must have individual node decisions";
191196

192197
List<NodeAllocationResult> nodeDecisions = augmentExplanationsWithStoreInfo(result.v2(), matchingNodes.nodeDecisions);
@@ -297,14 +302,18 @@ private static TransportNodesListShardStoreMetaData.StoreFilesMetaData findStore
297302
return nodeFilesStore.storeFilesMetaData();
298303
}
299304

300-
private MatchingNodes findMatchingNodes(ShardRouting shard, RoutingAllocation allocation,
305+
private MatchingNodes findMatchingNodes(ShardRouting shard, RoutingAllocation allocation, boolean noMatchFailedNodes,
301306
DiscoveryNode primaryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
302307
AsyncShardFetch.FetchResult<NodeStoreFilesMetaData> data,
303308
boolean explain) {
304309
Map<DiscoveryNode, MatchingNode> matchingNodes = new HashMap<>();
305310
Map<String, NodeAllocationResult> nodeDecisions = explain ? new HashMap<>() : null;
306311
for (Map.Entry<DiscoveryNode, NodeStoreFilesMetaData> nodeStoreEntry : data.getData().entrySet()) {
307312
DiscoveryNode discoNode = nodeStoreEntry.getKey();
313+
if (noMatchFailedNodes && shard.unassignedInfo() != null &&
314+
shard.unassignedInfo().getFailedNodeIds().contains(discoNode.getId())) {
315+
continue;
316+
}
308317
TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData();
309318
// we don't have any files at all, it is an empty index
310319
if (storeFilesMetaData.isEmpty()) {

0 commit comments

Comments
 (0)