Skip to content

Commit 6d976e1

Browse files
committed
Resolve some coordination-layer TODOs (#54511)
This commit removes a handful of TODO comments in the cluster coordination layer that no longer apply. Relates #32006
1 parent 1fe2705 commit 6d976e1

File tree

9 files changed

+13
-37
lines changed

9 files changed

+13
-37
lines changed

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java

-1
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,6 @@ String getDescription() {
160160
if (INITIAL_MASTER_NODES_SETTING.get(Settings.EMPTY).equals(INITIAL_MASTER_NODES_SETTING.get(settings))) {
161161
bootstrappingDescription = "[" + INITIAL_MASTER_NODES_SETTING.getKey() + "] is empty on this node";
162162
} else {
163-
// TODO update this when we can bootstrap on only a quorum of the initial nodes
164163
bootstrappingDescription = String.format(Locale.ROOT,
165164
"this node must discover master-eligible nodes %s to bootstrap a cluster",
166165
INITIAL_MASTER_NODES_SETTING.get(settings));

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

+3-6
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
121121
private final NodeRemovalClusterStateTaskExecutor nodeRemovalExecutor;
122122
private final Supplier<CoordinationState.PersistedState> persistedStateSupplier;
123123
private final NoMasterBlockService noMasterBlockService;
124-
// TODO: the following field is package-private as some tests require access to it
125-
// These tests can be rewritten to use public methods once Coordinator is more feature-complete
126-
final Object mutex = new Object();
124+
final Object mutex = new Object(); // package-private to allow tests to call methods that assert that the mutex is held
127125
private final SetOnce<CoordinationState> coordinationState = new SetOnce<>(); // initialized on start-up (see doStart)
128126
private volatile ClusterState applierState; // the state that should be exposed to the cluster state applier
129127

@@ -885,7 +883,7 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura
885883

886884
Metadata.Builder metadataBuilder = Metadata.builder(currentState.metadata());
887885
// automatically generate a UID for the metadata if we need to
888-
metadataBuilder.generateClusterUuidIfNeeded(); // TODO generate UUID in bootstrapping tool?
886+
metadataBuilder.generateClusterUuidIfNeeded();
889887
metadataBuilder.coordinationMetadata(coordinationMetadata);
890888

891889
coordinationState.get().setInitialState(ClusterState.builder(currentState).metadata(metadataBuilder).build());
@@ -1192,7 +1190,7 @@ private void startElectionScheduler() {
11921190
return;
11931191
}
11941192

1195-
final TimeValue gracePeriod = TimeValue.ZERO; // TODO variable grace period
1193+
final TimeValue gracePeriod = TimeValue.ZERO;
11961194
electionScheduler = electionSchedulerFactory.startElectionScheduler(gracePeriod, new Runnable() {
11971195
@Override
11981196
public void run() {
@@ -1225,7 +1223,6 @@ public String toString() {
12251223
}
12261224

12271225
public Iterable<DiscoveryNode> getFoundPeers() {
1228-
// TODO everyone takes this and adds the local node. Maybe just add the local node here?
12291226
return peerFinder.getFoundPeers();
12301227
}
12311228

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/FollowersChecker.java

-10
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ private void handleFollowerCheck(FollowerCheckRequest request, TransportChannel
170170
FastResponseState responder = this.fastResponseState;
171171

172172
if (responder.mode == Mode.FOLLOWER && responder.term == request.term) {
173-
// TODO trigger a term bump if we voted for a different leader in this term
174173
logger.trace("responding to {} on fast path", request);
175174
transportChannel.sendResponse(Empty.INSTANCE);
176175
return;
@@ -205,15 +204,6 @@ public String toString() {
205204
});
206205
}
207206

208-
// TODO in the PoC a faulty node was considered non-faulty again if it sent us a PeersRequest:
209-
// - node disconnects, detected faulty, removal is enqueued
210-
// - node reconnects, pings us, finds we are master, requests to join, all before removal is applied
211-
// - join is processed before removal, but we do not publish to known-faulty nodes so the joining node does not receive this publication
212-
// - it doesn't start its leader checker since it receives nothing to cause it to become a follower
213-
// Apparently this meant that it remained a candidate for too long, leading to a test failure. At the time this logic was added, we did
214-
// not have gossip-based discovery which would (I think) have retried this joining process a short time later. It's therefore possible
215-
// that this is no longer required, so it's omitted here until we can be sure if it's necessary or not.
216-
217207
/**
218208
* @return nodes in the current cluster state which have failed their follower checks.
219209
*/

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/InMemoryPersistedState.java

-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import org.elasticsearch.cluster.ClusterState;
2222

2323
public class InMemoryPersistedState implements CoordinationState.PersistedState {
24-
// TODO add support and tests for behaviour with persistence-layer failures
2524

2625
private long currentTerm;
2726
private ClusterState acceptedState;

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/PreVoteCollector.java

-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ public class PreVoteCollector {
6363
this.updateMaxTermSeen = updateMaxTermSeen;
6464
this.electionStrategy = electionStrategy;
6565

66-
// TODO does this need to be on the generic threadpool or can it use SAME?
6766
transportService.registerRequestHandler(REQUEST_PRE_VOTE_ACTION_NAME, Names.GENERIC, false, false,
6867
PreVoteRequest::new,
6968
(request, channel, task) -> channel.sendResponse(handlePreVoteRequest(request)));

Diff for: server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java

-1
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,6 @@ void sendPublishRequest() {
244244
assert state == PublicationTargetState.NOT_STARTED : state + " -> " + PublicationTargetState.SENT_PUBLISH_REQUEST;
245245
state = PublicationTargetState.SENT_PUBLISH_REQUEST;
246246
Publication.this.sendPublishRequest(discoveryNode, publishRequest, new PublishResponseHandler());
247-
// TODO Can this ^ fail with an exception? Target should be failed if so.
248247
assert publicationCompletedIffAllTargetsInactiveOrCancelled();
249248
}
250249

Diff for: server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java

+2-4
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,6 @@ public void testUnresponsiveLeaderDetectedEventually() {
449449
logger.info("--> blackholing leader {}", originalLeader);
450450
originalLeader.blackhole();
451451

452-
// This stabilisation time bound is undesirably long. TODO try and reduce it.
453452
cluster.stabilise(Math.max(
454453
// first wait for all the followers to notice the leader has gone
455454
(defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING))
@@ -686,7 +685,7 @@ public void testAckListenerReceivesNacksIfLeaderStandsDown() {
686685
// cluster has two nodes in mode LEADER, in different terms ofc, and the one in the lower term won’t be able to publish anything
687686
leader.heal();
688687
AckCollector ackCollector = leader.submitValue(randomLong());
689-
cluster.stabilise(); // TODO: check if can find a better bound here
688+
cluster.stabilise();
690689
assertTrue("expected nack from " + leader, ackCollector.hasAckedUnsuccessfully(leader));
691690
assertTrue("expected nack from " + follower0, ackCollector.hasAckedUnsuccessfully(follower0));
692691
assertTrue("expected nack from " + follower1, ackCollector.hasAckedUnsuccessfully(follower1));
@@ -739,14 +738,13 @@ public void testSettingInitialConfigurationTriggersElection() {
739738
cluster.stabilise(
740739
// the first election should succeed, because only one node knows of the initial configuration and therefore can win a
741740
// pre-voting round and proceed to an election, so there cannot be any collisions
742-
defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) // TODO this wait is unnecessary, we could trigger the election immediately
741+
defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING)
743742
// Allow two round-trip for pre-voting and voting
744743
+ 4 * DEFAULT_DELAY_VARIABILITY
745744
// Then a commit of the new leader's first cluster state
746745
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
747746
// Then allow time for all the other nodes to join, each of which might cause a reconfiguration
748747
+ (cluster.size() - 1) * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY
749-
// TODO Investigate whether 4 publications is sufficient due to batching? A bound linear in the number of nodes isn't great.
750748
);
751749
}
752750
}

Diff for: server/src/test/java/org/elasticsearch/cluster/coordination/PreVoteCollectorTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public String toString() {
112112
assert electionOccurred == false;
113113
electionOccurred = true;
114114
}, l -> {
115-
}, ElectionStrategy.DEFAULT_INSTANCE); // TODO need tests that check that the max term seen is updated
115+
}, ElectionStrategy.DEFAULT_INSTANCE);
116116
preVoteCollector.update(getLocalPreVoteResponse(), null);
117117
}
118118

Diff for: test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java

+7-12
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,6 @@ class Cluster implements Releasable {
244244

245245
final List<ClusterNode> clusterNodes;
246246
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(
247-
// TODO does ThreadPool need a node name any more?
248247
Settings.builder().put(NODE_NAME_SETTING.getKey(), "deterministic-task-queue").build(), random());
249248
private boolean disruptStorage;
250249

@@ -289,8 +288,13 @@ class Cluster implements Releasable {
289288
initialNodeCount, masterEligibleNodeIds, initialConfiguration);
290289
}
291290

292-
List<ClusterNode> addNodesAndStabilise(int newNodesCount) {
293-
final List<ClusterNode> addedNodes = addNodes(newNodesCount);
291+
void addNodesAndStabilise(int newNodesCount) {
292+
293+
// The stabilisation time bound is O(#new nodes) which isn't ideal; it's possible that the real bound is O(1) since node-join
294+
// events are batched together, but in practice we have not seen problems in this area so have not invested the time needed to
295+
// investigate this more closely.
296+
297+
addNodes(newNodesCount);
294298
stabilise(
295299
// The first pinging discovers the master
296300
defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)
@@ -299,8 +303,6 @@ List<ClusterNode> addNodesAndStabilise(int newNodesCount) {
299303
// Commit a new cluster state with the new node(s). Might be split into multiple commits, and each might need a
300304
// followup reconfiguration
301305
+ newNodesCount * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
302-
// TODO Investigate whether 4 publications is sufficient due to batching? A bound linear in the number of nodes isn't great.
303-
return addedNodes;
304306
}
305307

306308
List<ClusterNode> addNodes(int newNodesCount) {
@@ -331,7 +333,6 @@ void runRandomly() {
331333
*/
332334
void runRandomly(boolean allowReboots, boolean coolDown, long delayVariability) {
333335

334-
// TODO supporting (preserving?) existing disruptions needs implementing if needed, for now we just forbid it
335336
assertThat("may reconnect disconnected nodes, probably unexpected", disconnectedNodes, empty());
336337
assertThat("may reconnect blackholed nodes, probably unexpected", blackholedNodes, empty());
337338

@@ -449,11 +450,6 @@ public String toString() {
449450
deterministicTaskQueue.runRandomTask();
450451
}
451452
}
452-
453-
// TODO other random steps:
454-
// - reboot a node
455-
// - abdicate leadership
456-
457453
} catch (CoordinationStateRejectedException | UncheckedIOException ignored) {
458454
// This is ok: it just means a message couldn't currently be handled.
459455
}
@@ -1254,7 +1250,6 @@ static class AckCollector implements ClusterStatePublisher.AckListener {
12541250

12551251
@Override
12561252
public void onCommit(TimeValue commitTime) {
1257-
// TODO we only currently care about per-node acks
12581253
}
12591254

12601255
@Override

0 commit comments

Comments
 (0)