Skip to content

Commit 255015d

Browse files
committed
Ignore shard started requests when primary term does not match (#37899)
This commit changes the StartedShardEntry so that it also contains the primary term of the shard to start. This way the master node can also check that the primary term from the start request is equal to the shard's current primary term in the cluster state, and it can ignore any shard started request that concerns a previous instance of the shard that had been allocated to the same node. Such situations are likely to happen with frozen (or restored) indices and with the replication of closed indices, because with replicated closed indices the shards are initialized again after the index is closed and can potentially be reinitialized once more if the index is reopened as a frozen index. In such cases the lifecycle of the shards would be something like: * shard is STARTED * index is closed * shard is INITIALIZING (index state is CLOSED, primary term is X) * index is reopened * shard is INITIALIZING again (index state is OPENED, potentially frozen, primary term is X+1) Adding the primary term to the shard started request allows the master node to discard any StartedShardEntry request it receives that concerns the shard with primary term X, because the shard has been moved/reinitialized in the meantime under primary term X+1. Relates to #33888
1 parent 18f5c7a commit 255015d

File tree

7 files changed

+252
-82
lines changed

7 files changed

+252
-82
lines changed

server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -493,12 +493,20 @@ public int hashCode() {
493493
}
494494
}
495495

496-
public void shardStarted(final ShardRouting shardRouting, final String message, Listener listener) {
497-
shardStarted(shardRouting, message, listener, clusterService.state());
496+
public void shardStarted(final ShardRouting shardRouting,
497+
final long primaryTerm,
498+
final String message,
499+
final Listener listener) {
500+
shardStarted(shardRouting, primaryTerm, message, listener, clusterService.state());
498501
}
499-
public void shardStarted(final ShardRouting shardRouting, final String message, Listener listener, ClusterState currentState) {
500-
StartedShardEntry shardEntry = new StartedShardEntry(shardRouting.shardId(), shardRouting.allocationId().getId(), message);
501-
sendShardAction(SHARD_STARTED_ACTION_NAME, currentState, shardEntry, listener);
502+
503+
public void shardStarted(final ShardRouting shardRouting,
504+
final long primaryTerm,
505+
final String message,
506+
final Listener listener,
507+
final ClusterState currentState) {
508+
StartedShardEntry entry = new StartedShardEntry(shardRouting.shardId(), shardRouting.allocationId().getId(), primaryTerm, message);
509+
sendShardAction(SHARD_STARTED_ACTION_NAME, currentState, entry, listener);
502510
}
503511

504512
private static class ShardStartedTransportHandler implements TransportRequestHandler<StartedShardEntry> {
@@ -543,7 +551,7 @@ public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState,
543551
List<ShardRouting> shardRoutingsToBeApplied = new ArrayList<>(tasks.size());
544552
Set<ShardRouting> seenShardRoutings = new HashSet<>(); // to prevent duplicates
545553
for (StartedShardEntry task : tasks) {
546-
ShardRouting matched = currentState.getRoutingTable().getByAllocationId(task.shardId, task.allocationId);
554+
final ShardRouting matched = currentState.getRoutingTable().getByAllocationId(task.shardId, task.allocationId);
547555
if (matched == null) {
548556
// tasks that correspond to non-existent shards are marked as successful. The reason is that we resend shard started
549557
// events on every cluster state publishing that does not contain the shard as started yet. This means that old stale
@@ -552,6 +560,19 @@ public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState,
552560
logger.debug("{} ignoring shard started task [{}] (shard does not exist anymore)", task.shardId, task);
553561
builder.success(task);
554562
} else {
563+
if (matched.primary() && task.primaryTerm > 0) {
564+
final IndexMetaData indexMetaData = currentState.metaData().index(task.shardId.getIndex());
565+
assert indexMetaData != null;
566+
final long currentPrimaryTerm = indexMetaData.primaryTerm(task.shardId.id());
567+
if (currentPrimaryTerm != task.primaryTerm) {
568+
assert currentPrimaryTerm > task.primaryTerm : "received a primary term with a higher term than in the " +
569+
"current cluster state (received [" + task.primaryTerm + "] but current is [" + currentPrimaryTerm + "])";
570+
logger.debug("{} ignoring shard started task [{}] (primary term {} does not match current term {})",
571+
task.shardId, task, task.primaryTerm, currentPrimaryTerm);
572+
builder.success(task);
573+
continue;
574+
}
575+
}
555576
if (matched.initializing() == false) {
556577
assert matched.active() : "expected active shard routing for task " + task + " but found " + matched;
557578
// same as above, this might have been a stale in-flight request, so we just ignore.
@@ -596,15 +617,20 @@ public void onFailure(String source, Exception e) {
596617
public static class StartedShardEntry extends TransportRequest {
597618
final ShardId shardId;
598619
final String allocationId;
620+
final long primaryTerm;
599621
final String message;
600622

601623
StartedShardEntry(StreamInput in) throws IOException {
602624
super(in);
603625
shardId = ShardId.readShardId(in);
604626
allocationId = in.readString();
605627
if (in.getVersion().before(Version.V_6_3_0)) {
606-
final long primaryTerm = in.readVLong();
628+
primaryTerm = in.readVLong();
607629
assert primaryTerm == UNASSIGNED_PRIMARY_TERM : "shard is only started by itself: primary term [" + primaryTerm + "]";
630+
} else if (in.getVersion().onOrAfter(Version.V_6_7_0)) {
631+
primaryTerm = in.readVLong();
632+
} else {
633+
primaryTerm = UNASSIGNED_PRIMARY_TERM;
608634
}
609635
this.message = in.readString();
610636
if (in.getVersion().before(Version.V_6_3_0)) {
@@ -613,9 +639,10 @@ public static class StartedShardEntry extends TransportRequest {
613639
}
614640
}
615641

616-
public StartedShardEntry(ShardId shardId, String allocationId, String message) {
642+
public StartedShardEntry(final ShardId shardId, final String allocationId, final long primaryTerm, final String message) {
617643
this.shardId = shardId;
618644
this.allocationId = allocationId;
645+
this.primaryTerm = primaryTerm;
619646
this.message = message;
620647
}
621648

@@ -626,6 +653,8 @@ public void writeTo(StreamOutput out) throws IOException {
626653
out.writeString(allocationId);
627654
if (out.getVersion().before(Version.V_6_3_0)) {
628655
out.writeVLong(0L);
656+
} else if (out.getVersion().onOrAfter(Version.V_6_7_0)) {
657+
out.writeVLong(primaryTerm);
629658
}
630659
out.writeString(message);
631660
if (out.getVersion().before(Version.V_6_3_0)) {
@@ -635,8 +664,8 @@ public void writeTo(StreamOutput out) throws IOException {
635664

636665
@Override
637666
public String toString() {
638-
return String.format(Locale.ROOT, "StartedShardEntry{shardId [%s], allocationId [%s], message [%s]}",
639-
shardId, allocationId, message);
667+
return String.format(Locale.ROOT, "StartedShardEntry{shardId [%s], allocationId [%s], primary term [%d], message [%s]}",
668+
shardId, allocationId, primaryTerm, message);
640669
}
641670
}
642671

server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -575,13 +575,14 @@ private void createShard(DiscoveryNodes nodes, RoutingTable routingTable, ShardR
575575
}
576576

577577
try {
578-
logger.debug("{} creating shard", shardRouting.shardId());
578+
final long primaryTerm = state.metaData().index(shardRouting.index()).primaryTerm(shardRouting.id());
579+
logger.debug("{} creating shard with primary term [{}]", shardRouting.shardId(), primaryTerm);
579580
RecoveryState recoveryState = new RecoveryState(shardRouting, nodes.getLocalNode(), sourceNode);
580581
indicesService.createShard(
581582
shardRouting,
582583
recoveryState,
583584
recoveryTargetService,
584-
new RecoveryListener(shardRouting),
585+
new RecoveryListener(shardRouting, primaryTerm),
585586
repositoriesService,
586587
failedShardHandler,
587588
globalCheckpointSyncer,
@@ -598,9 +599,10 @@ private void updateShard(DiscoveryNodes nodes, ShardRouting shardRouting, Shard
598599
"local shard has a different allocation id but wasn't cleaning by removeShards. "
599600
+ "cluster state: " + shardRouting + " local: " + currentRoutingEntry;
600601

602+
final long primaryTerm;
601603
try {
602604
final IndexMetaData indexMetaData = clusterState.metaData().index(shard.shardId().getIndex());
603-
final long primaryTerm = indexMetaData.primaryTerm(shard.shardId().id());
605+
primaryTerm = indexMetaData.primaryTerm(shard.shardId().id());
604606
final Set<String> inSyncIds = indexMetaData.inSyncAllocationIds(shard.shardId().id());
605607
final IndexShardRoutingTable indexShardRoutingTable = routingTable.shardRoutingTable(shardRouting.shardId());
606608
final Set<String> pre60AllocationIds = indexShardRoutingTable.assignedShards()
@@ -633,7 +635,7 @@ private void updateShard(DiscoveryNodes nodes, ShardRouting shardRouting, Shard
633635
shardRouting.shardId(), state, nodes.getMasterNode());
634636
}
635637
if (nodes.getMasterNode() != null) {
636-
shardStateAction.shardStarted(shardRouting, "master " + nodes.getMasterNode() +
638+
shardStateAction.shardStarted(shardRouting, primaryTerm, "master " + nodes.getMasterNode() +
637639
" marked shard as initializing, but shard state is [" + state + "], mark shard as started",
638640
SHARD_STATE_ACTION_LISTENER, clusterState);
639641
}
@@ -673,15 +675,24 @@ private static DiscoveryNode findSourceNodeForPeerRecovery(Logger logger, Routin
673675

674676
private class RecoveryListener implements PeerRecoveryTargetService.RecoveryListener {
675677

678+
/**
679+
* ShardRouting with which the shard was created
680+
*/
676681
private final ShardRouting shardRouting;
677682

678-
private RecoveryListener(ShardRouting shardRouting) {
683+
/**
684+
* Primary term with which the shard was created
685+
*/
686+
private final long primaryTerm;
687+
688+
private RecoveryListener(final ShardRouting shardRouting, final long primaryTerm) {
679689
this.shardRouting = shardRouting;
690+
this.primaryTerm = primaryTerm;
680691
}
681692

682693
@Override
683-
public void onRecoveryDone(RecoveryState state) {
684-
shardStateAction.shardStarted(shardRouting, "after " + state.getRecoverySource(), SHARD_STATE_ACTION_LISTENER);
694+
public void onRecoveryDone(final RecoveryState state) {
695+
shardStateAction.shardStarted(shardRouting, primaryTerm, "after " + state.getRecoverySource(), SHARD_STATE_ACTION_LISTENER);
685696
}
686697

687698
@Override

0 commit comments

Comments
 (0)