Skip to content

Commit 6959203

Browse files
authored
StartedShardUpdateTask task listener (#82854)
Today, shard-started tasks executed by the master have a separate ClusterStateTaskListener to feed the result back to the requestor. It would be preferable to use the task itself as the listener.
1 parent 7d0d409 commit 6959203

File tree

3 files changed

+205
-139
lines changed

3 files changed

+205
-139
lines changed

server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java

Lines changed: 69 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import org.apache.logging.log4j.LogManager;
1212
import org.apache.logging.log4j.Logger;
1313
import org.apache.logging.log4j.message.ParameterizedMessage;
14-
import org.apache.logging.log4j.util.MessageSupplier;
1514
import org.elasticsearch.ElasticsearchException;
1615
import org.elasticsearch.ExceptionsHelper;
1716
import org.elasticsearch.action.ActionListener;
@@ -616,43 +615,20 @@ public void messageReceived(StartedShardEntry request, TransportChannel channel,
616615
SHARD_STARTED_ACTION_NAME,
617616
request
618617
);
618+
619+
var update = new StartedShardUpdateTask(request, listener);
620+
619621
clusterService.submitStateUpdateTask(
620622
"shard-started " + request,
621-
request,
623+
update,
622624
ClusterStateTaskConfig.build(Priority.URGENT),
623625
shardStartedClusterStateTaskExecutor,
624-
new ClusterStateTaskListener() {
625-
@Override
626-
public void onFailure(Exception e) {
627-
final MessageSupplier msg = () -> new ParameterizedMessage(
628-
"{} unexpected failure while starting shard [{}]",
629-
request.shardId,
630-
request
631-
);
632-
if (e instanceof FailedToCommitClusterStateException) {
633-
logger.debug(msg, e);
634-
} else {
635-
logger.error(msg, e);
636-
}
637-
listener.onFailure(e);
638-
}
639-
640-
@Override
641-
public void onNoLongerMaster() {
642-
logger.debug("{} no longer master while starting shard [{}]", request.shardId, request);
643-
listener.onFailure(new NotMasterException("shard-started"));
644-
}
645-
646-
@Override
647-
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
648-
listener.onResponse(TransportResponse.Empty.INSTANCE);
649-
}
650-
}
626+
update
651627
);
652628
}
653629
}
654630

655-
public static class ShardStartedClusterStateTaskExecutor implements ClusterStateTaskExecutor<StartedShardEntry> {
631+
public static class ShardStartedClusterStateTaskExecutor implements ClusterStateTaskExecutor<StartedShardUpdateTask> {
656632
private final AllocationService allocationService;
657633
private final RerouteService rerouteService;
658634

@@ -662,52 +638,54 @@ public ShardStartedClusterStateTaskExecutor(AllocationService allocationService,
662638
}
663639

664640
@Override
665-
public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState, List<StartedShardEntry> tasks) throws Exception {
666-
ClusterTasksResult.Builder<StartedShardEntry> builder = ClusterTasksResult.builder();
667-
List<StartedShardEntry> tasksToBeApplied = new ArrayList<>();
641+
public ClusterTasksResult<StartedShardUpdateTask> execute(ClusterState currentState, List<StartedShardUpdateTask> tasks)
642+
throws Exception {
643+
ClusterTasksResult.Builder<StartedShardUpdateTask> builder = ClusterTasksResult.builder();
644+
List<StartedShardUpdateTask> tasksToBeApplied = new ArrayList<>();
668645
List<ShardRouting> shardRoutingsToBeApplied = new ArrayList<>(tasks.size());
669646
Set<ShardRouting> seenShardRoutings = new HashSet<>(); // to prevent duplicates
670647
final Map<Index, IndexLongFieldRange> updatedTimestampRanges = new HashMap<>();
671-
for (StartedShardEntry task : tasks) {
672-
final ShardRouting matched = currentState.getRoutingTable().getByAllocationId(task.shardId, task.allocationId);
648+
for (StartedShardUpdateTask task : tasks) {
649+
StartedShardEntry entry = task.getEntry();
650+
final ShardRouting matched = currentState.getRoutingTable().getByAllocationId(entry.shardId, entry.allocationId);
673651
if (matched == null) {
674652
// tasks that correspond to non-existent shards are marked as successful. The reason is that we resend shard started
675653
// events on every cluster state publishing that does not contain the shard as started yet. This means that old stale
676654
// requests might still be in flight even after the shard has already been started or failed on the master. We just
677655
// ignore these requests for now.
678-
logger.debug("{} ignoring shard started task [{}] (shard does not exist anymore)", task.shardId, task);
656+
logger.debug("{} ignoring shard started task [{}] (shard does not exist anymore)", entry.shardId, entry);
679657
builder.success(task);
680658
} else {
681-
if (matched.primary() && task.primaryTerm > 0) {
682-
final IndexMetadata indexMetadata = currentState.metadata().index(task.shardId.getIndex());
659+
if (matched.primary() && entry.primaryTerm > 0) {
660+
final IndexMetadata indexMetadata = currentState.metadata().index(entry.shardId.getIndex());
683661
assert indexMetadata != null;
684-
final long currentPrimaryTerm = indexMetadata.primaryTerm(task.shardId.id());
685-
if (currentPrimaryTerm != task.primaryTerm) {
686-
assert currentPrimaryTerm > task.primaryTerm
662+
final long currentPrimaryTerm = indexMetadata.primaryTerm(entry.shardId.id());
663+
if (currentPrimaryTerm != entry.primaryTerm) {
664+
assert currentPrimaryTerm > entry.primaryTerm
687665
: "received a primary term with a higher term than in the "
688666
+ "current cluster state (received ["
689-
+ task.primaryTerm
667+
+ entry.primaryTerm
690668
+ "] but current is ["
691669
+ currentPrimaryTerm
692670
+ "])";
693671
logger.debug(
694672
"{} ignoring shard started task [{}] (primary term {} does not match current term {})",
695-
task.shardId,
696-
task,
697-
task.primaryTerm,
673+
entry.shardId,
674+
entry,
675+
entry.primaryTerm,
698676
currentPrimaryTerm
699677
);
700678
builder.success(task);
701679
continue;
702680
}
703681
}
704682
if (matched.initializing() == false) {
705-
assert matched.active() : "expected active shard routing for task " + task + " but found " + matched;
683+
assert matched.active() : "expected active shard routing for task " + entry + " but found " + matched;
706684
// same as above, this might have been a stale in-flight request, so we just ignore.
707685
logger.debug(
708686
"{} ignoring shard started task [{}] (shard exists but is not initializing: {})",
709-
task.shardId,
710-
task,
687+
entry.shardId,
688+
entry,
711689
matched
712690
);
713691
builder.success(task);
@@ -716,29 +694,29 @@ public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState,
716694
if (seenShardRoutings.contains(matched)) {
717695
logger.trace(
718696
"{} ignoring shard started task [{}] (already scheduled to start {})",
719-
task.shardId,
720-
task,
697+
entry.shardId,
698+
entry,
721699
matched
722700
);
723701
tasksToBeApplied.add(task);
724702
} else {
725-
logger.debug("{} starting shard {} (shard started task: [{}])", task.shardId, matched, task);
703+
logger.debug("{} starting shard {} (shard started task: [{}])", entry.shardId, matched, entry);
726704
tasksToBeApplied.add(task);
727705
shardRoutingsToBeApplied.add(matched);
728706
seenShardRoutings.add(matched);
729707

730708
// expand the timestamp range recorded in the index metadata if needed
731-
final Index index = task.shardId.getIndex();
709+
final Index index = entry.shardId.getIndex();
732710
IndexLongFieldRange currentTimestampMillisRange = updatedTimestampRanges.get(index);
733711
final IndexMetadata indexMetadata = currentState.metadata().index(index);
734712
if (currentTimestampMillisRange == null) {
735713
currentTimestampMillisRange = indexMetadata.getTimestampRange();
736714
}
737715
final IndexLongFieldRange newTimestampMillisRange;
738716
newTimestampMillisRange = currentTimestampMillisRange.extendWithShardRange(
739-
task.shardId.id(),
717+
entry.shardId.id(),
740718
indexMetadata.getNumberOfShards(),
741-
task.timestampRange
719+
entry.timestampRange
742720
);
743721
if (newTimestampMillisRange != currentTimestampMillisRange) {
744722
updatedTimestampRanges.put(index, newTimestampMillisRange);
@@ -872,6 +850,43 @@ public int hashCode() {
872850
}
873851
}
874852

853+
public static class StartedShardUpdateTask implements ClusterStateTaskListener {
854+
855+
private final StartedShardEntry entry;
856+
private final ActionListener<TransportResponse.Empty> listener;
857+
858+
public StartedShardUpdateTask(StartedShardEntry entry, ActionListener<TransportResponse.Empty> listener) {
859+
this.entry = entry;
860+
this.listener = listener;
861+
}
862+
863+
public StartedShardEntry getEntry() {
864+
return entry;
865+
}
866+
867+
@Override
868+
public void onFailure(Exception e) {
869+
if (e instanceof NotMasterException) {
870+
logger.debug(() -> new ParameterizedMessage("{} no longer master while starting shard [{}]", entry.shardId, entry));
871+
} else if (e instanceof FailedToCommitClusterStateException) {
872+
logger.debug(() -> new ParameterizedMessage("{} unexpected failure while starting shard [{}]", entry.shardId, entry), e);
873+
} else {
874+
logger.error(() -> new ParameterizedMessage("{} unexpected failure while starting shard [{}]", entry.shardId, entry), e);
875+
}
876+
listener.onFailure(e);
877+
}
878+
879+
@Override
880+
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
881+
listener.onResponse(TransportResponse.Empty.INSTANCE);
882+
}
883+
884+
@Override
885+
public String toString() {
886+
return "StartedShardUpdateTask{entry=" + entry + ", listener=" + listener + "}";
887+
}
888+
}
889+
875890
public static class NoLongerPrimaryShardException extends ElasticsearchException {
876891

877892
public NoLongerPrimaryShardException(ShardId shardId, String msg) {

0 commit comments

Comments (0)