Skip to content

Commit 3e76b15

Browse files
committed
Treat ack timeout more like a publish timeout (#31303)
This commit changes the ack timeout mechanism so that its behavior is closer to the publish timeout, i.e., it only comes into play after committing a cluster state. This ensures for example that an index creation request with a low (ack) timeout value does not return before the cluster state that contains information about the newly created index is even committed.
1 parent 8305332 commit 3e76b15

File tree

6 files changed

+207
-33
lines changed

6 files changed

+207
-33
lines changed

server/src/main/java/org/elasticsearch/cluster/service/MasterService.java

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
import org.elasticsearch.discovery.Discovery;
5151
import org.elasticsearch.threadpool.ThreadPool;
5252

53-
import java.util.ArrayList;
5453
import java.util.Arrays;
5554
import java.util.Collections;
5655
import java.util.List;
@@ -365,28 +364,11 @@ public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
365364
}
366365

367366
public Discovery.AckListener createAckListener(ThreadPool threadPool, ClusterState newClusterState) {
368-
ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();
369-
370-
//timeout straightaway, otherwise we could wait forever as the timeout thread has not started
371-
nonFailedTasks.stream().filter(task -> task.listener instanceof AckedClusterStateTaskListener).forEach(task -> {
372-
final AckedClusterStateTaskListener ackedListener = (AckedClusterStateTaskListener) task.listener;
373-
if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
374-
ackedListener.onAckTimeout();
375-
} else {
376-
try {
377-
ackListeners.add(new AckCountDownListener(ackedListener, newClusterState.version(), newClusterState.nodes(),
378-
threadPool));
379-
} catch (EsRejectedExecutionException ex) {
380-
if (logger.isDebugEnabled()) {
381-
logger.debug("Couldn't schedule timeout thread - node might be shutting down", ex);
382-
}
383-
//timeout straightaway, otherwise we could wait forever as the timeout thread has not started
384-
ackedListener.onAckTimeout();
385-
}
386-
}
387-
});
388-
389-
return new DelegatingAckListener(ackListeners);
367+
return new DelegatingAckListener(nonFailedTasks.stream()
368+
.filter(task -> task.listener instanceof AckedClusterStateTaskListener)
369+
.map(task -> new AckCountDownListener((AckedClusterStateTaskListener) task.listener, newClusterState.version(),
370+
newClusterState.nodes(), threadPool))
371+
.collect(Collectors.toList()));
390372
}
391373

392374
public boolean clusterStateUnchanged() {
@@ -549,6 +531,13 @@ private DelegatingAckListener(List<Discovery.AckListener> listeners) {
549531
this.listeners = listeners;
550532
}
551533

534+
@Override
535+
public void onCommit(TimeValue commitTime) {
536+
for (Discovery.AckListener listener : listeners) {
537+
listener.onCommit(commitTime);
538+
}
539+
}
540+
552541
@Override
553542
public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
554543
for (Discovery.AckListener listener : listeners) {
@@ -564,14 +553,16 @@ private static class AckCountDownListener implements Discovery.AckListener {
564553
private final AckedClusterStateTaskListener ackedTaskListener;
565554
private final CountDown countDown;
566555
private final DiscoveryNode masterNode;
556+
private final ThreadPool threadPool;
567557
private final long clusterStateVersion;
568-
private final Future<?> ackTimeoutCallback;
558+
private volatile Future<?> ackTimeoutCallback;
569559
private Exception lastFailure;
570560

571561
AckCountDownListener(AckedClusterStateTaskListener ackedTaskListener, long clusterStateVersion, DiscoveryNodes nodes,
572562
ThreadPool threadPool) {
573563
this.ackedTaskListener = ackedTaskListener;
574564
this.clusterStateVersion = clusterStateVersion;
565+
this.threadPool = threadPool;
575566
this.masterNode = nodes.getMasterNode();
576567
int countDown = 0;
577568
for (DiscoveryNode node : nodes) {
@@ -581,8 +572,27 @@ private static class AckCountDownListener implements Discovery.AckListener {
581572
}
582573
}
583574
logger.trace("expecting {} acknowledgements for cluster_state update (version: {})", countDown, clusterStateVersion);
584-
this.countDown = new CountDown(countDown);
585-
this.ackTimeoutCallback = threadPool.schedule(ackedTaskListener.ackTimeout(), ThreadPool.Names.GENERIC, () -> onTimeout());
575+
this.countDown = new CountDown(countDown + 1); // we also wait for onCommit to be called
576+
}
577+
578+
@Override
579+
public void onCommit(TimeValue commitTime) {
580+
TimeValue ackTimeout = ackedTaskListener.ackTimeout();
581+
if (ackTimeout == null) {
582+
ackTimeout = TimeValue.ZERO;
583+
}
584+
final TimeValue timeLeft = TimeValue.timeValueNanos(Math.max(0, ackTimeout.nanos() - commitTime.nanos()));
585+
if (timeLeft.nanos() == 0L) {
586+
onTimeout();
587+
} else if (countDown.countDown()) {
588+
finish();
589+
} else {
590+
this.ackTimeoutCallback = threadPool.schedule(timeLeft, ThreadPool.Names.GENERIC, this::onTimeout);
591+
// re-check if onNodeAck has not completed while we were scheduling the timeout
592+
if (countDown.isCountedDown()) {
593+
FutureUtils.cancel(ackTimeoutCallback);
594+
}
595+
}
586596
}
587597

588598
@Override
@@ -599,12 +609,16 @@ public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
599609
}
600610

601611
if (countDown.countDown()) {
602-
logger.trace("all expected nodes acknowledged cluster_state update (version: {})", clusterStateVersion);
603-
FutureUtils.cancel(ackTimeoutCallback);
604-
ackedTaskListener.onAllNodesAcked(lastFailure);
612+
finish();
605613
}
606614
}
607615

616+
private void finish() {
617+
logger.trace("all expected nodes acknowledged cluster_state update (version: {})", clusterStateVersion);
618+
FutureUtils.cancel(ackTimeoutCallback);
619+
ackedTaskListener.onAllNodesAcked(lastFailure);
620+
}
621+
608622
public void onTimeout() {
609623
if (countDown.fastForward()) {
610624
logger.trace("timeout waiting for acknowledgement for cluster_state update (version: {})", clusterStateVersion);

server/src/main/java/org/elasticsearch/discovery/Discovery.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.elasticsearch.common.Nullable;
2626
import org.elasticsearch.common.component.LifecycleComponent;
2727
import org.elasticsearch.common.io.stream.StreamInput;
28+
import org.elasticsearch.common.unit.TimeValue;
2829

2930
import java.io.IOException;
3031

@@ -48,6 +49,19 @@ public interface Discovery extends LifecycleComponent {
4849
void publish(ClusterChangedEvent clusterChangedEvent, AckListener ackListener);
4950

5051
interface AckListener {
52+
/**
53+
* Should be called when the discovery layer has committed the clusters state (i.e. even if this publication fails,
54+
* it is guaranteed to appear in future publications).
55+
* @param commitTime the time it took to commit the cluster state
56+
*/
57+
void onCommit(TimeValue commitTime);
58+
59+
/**
60+
* Should be called whenever the discovery layer receives confirmation from a node that it has successfully applied
61+
* the cluster state. In case of failures, an exception should be provided as parameter.
62+
* @param node the node
63+
* @param e the optional exception
64+
*/
5165
void onNodeAck(DiscoveryNode node, @Nullable Exception e);
5266
}
5367

server/src/main/java/org/elasticsearch/discovery/single/SingleNodeDiscovery.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.cluster.service.MasterService;
3131
import org.elasticsearch.common.component.AbstractLifecycleComponent;
3232
import org.elasticsearch.common.settings.Settings;
33+
import org.elasticsearch.common.unit.TimeValue;
3334
import org.elasticsearch.discovery.Discovery;
3435
import org.elasticsearch.discovery.DiscoveryStats;
3536
import org.elasticsearch.transport.TransportService;
@@ -61,6 +62,7 @@ public SingleNodeDiscovery(final Settings settings, final TransportService trans
6162
public synchronized void publish(final ClusterChangedEvent event,
6263
final AckListener ackListener) {
6364
clusterState = event.state();
65+
ackListener.onCommit(TimeValue.ZERO);
6466
CountDownLatch latch = new CountDownLatch(1);
6567

6668
ClusterApplyListener listener = new ClusterApplyListener() {

server/src/main/java/org/elasticsearch/discovery/zen/PublishClusterStateAction.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@ public void publish(final ClusterChangedEvent clusterChangedEvent, final int min
158158
}
159159

160160
try {
161-
innerPublish(clusterChangedEvent, nodesToPublishTo, sendingController, sendFullVersion, serializedStates, serializedDiffs);
161+
innerPublish(clusterChangedEvent, nodesToPublishTo, sendingController, ackListener, sendFullVersion, serializedStates,
162+
serializedDiffs);
162163
} catch (Discovery.FailedToCommitClusterStateException t) {
163164
throw t;
164165
} catch (Exception e) {
@@ -173,8 +174,9 @@ public void publish(final ClusterChangedEvent clusterChangedEvent, final int min
173174
}
174175

175176
private void innerPublish(final ClusterChangedEvent clusterChangedEvent, final Set<DiscoveryNode> nodesToPublishTo,
176-
final SendingController sendingController, final boolean sendFullVersion,
177-
final Map<Version, BytesReference> serializedStates, final Map<Version, BytesReference> serializedDiffs) {
177+
final SendingController sendingController, final Discovery.AckListener ackListener,
178+
final boolean sendFullVersion, final Map<Version, BytesReference> serializedStates,
179+
final Map<Version, BytesReference> serializedDiffs) {
178180

179181
final ClusterState clusterState = clusterChangedEvent.state();
180182
final ClusterState previousState = clusterChangedEvent.previousState();
@@ -195,8 +197,12 @@ private void innerPublish(final ClusterChangedEvent clusterChangedEvent, final S
195197

196198
sendingController.waitForCommit(discoverySettings.getCommitTimeout());
197199

200+
final long commitTime = System.nanoTime() - publishingStartInNanos;
201+
202+
ackListener.onCommit(TimeValue.timeValueNanos(commitTime));
203+
198204
try {
199-
long timeLeftInNanos = Math.max(0, publishTimeout.nanos() - (System.nanoTime() - publishingStartInNanos));
205+
long timeLeftInNanos = Math.max(0, publishTimeout.nanos() - commitTime);
200206
final BlockingClusterStatePublishResponseHandler publishResponseHandler = sendingController.getPublishResponseHandler();
201207
sendingController.setPublishingTimedOut(!publishResponseHandler.awaitAllNodes(TimeValue.timeValueNanos(timeLeftInNanos)));
202208
if (sendingController.getPublishingTimedOut()) {

server/src/test/java/org/elasticsearch/cluster/service/MasterServiceTests.java

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.logging.log4j.Logger;
2323
import org.elasticsearch.ExceptionsHelper;
2424
import org.elasticsearch.Version;
25+
import org.elasticsearch.cluster.AckedClusterStateUpdateTask;
2526
import org.elasticsearch.cluster.ClusterChangedEvent;
2627
import org.elasticsearch.cluster.ClusterName;
2728
import org.elasticsearch.cluster.ClusterState;
@@ -39,6 +40,7 @@
3940
import org.elasticsearch.common.settings.Settings;
4041
import org.elasticsearch.common.unit.TimeValue;
4142
import org.elasticsearch.common.util.concurrent.BaseFuture;
43+
import org.elasticsearch.discovery.Discovery;
4244
import org.elasticsearch.test.ESTestCase;
4345
import org.elasticsearch.test.MockLogAppender;
4446
import org.elasticsearch.test.junit.annotations.TestLogging;
@@ -65,6 +67,7 @@
6567
import java.util.concurrent.atomic.AtomicBoolean;
6668
import java.util.concurrent.atomic.AtomicInteger;
6769
import java.util.concurrent.atomic.AtomicReference;
70+
import java.util.function.BiConsumer;
6871

6972
import static java.util.Collections.emptyMap;
7073
import static java.util.Collections.emptySet;
@@ -680,6 +683,132 @@ public void onFailure(String source, Exception e) {
680683
mockAppender.assertAllExpectationsMatched();
681684
}
682685

686+
public void testAcking() throws InterruptedException {
687+
final DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
688+
final DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
689+
final DiscoveryNode node3 = new DiscoveryNode("node3", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
690+
TimedMasterService timedMasterService = new TimedMasterService(Settings.builder().put("cluster.name",
691+
MasterServiceTests.class.getSimpleName()).build(), threadPool);
692+
ClusterState initialClusterState = ClusterState.builder(new ClusterName(MasterServiceTests.class.getSimpleName()))
693+
.nodes(DiscoveryNodes.builder()
694+
.add(node1)
695+
.add(node2)
696+
.add(node3)
697+
.localNodeId(node1.getId())
698+
.masterNodeId(node1.getId()))
699+
.blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).build();
700+
final AtomicReference<BiConsumer<ClusterChangedEvent, Discovery.AckListener>> publisherRef = new AtomicReference<>();
701+
timedMasterService.setClusterStatePublisher((cce, l) -> publisherRef.get().accept(cce, l));
702+
timedMasterService.setClusterStateSupplier(() -> initialClusterState);
703+
timedMasterService.start();
704+
705+
706+
// check that we don't time out before even committing the cluster state
707+
{
708+
final CountDownLatch latch = new CountDownLatch(1);
709+
710+
publisherRef.set((clusterChangedEvent, ackListener) -> {
711+
throw new Discovery.FailedToCommitClusterStateException("mock exception");
712+
});
713+
714+
timedMasterService.submitStateUpdateTask("test2", new AckedClusterStateUpdateTask<Void>(null, null) {
715+
@Override
716+
public ClusterState execute(ClusterState currentState) {
717+
return ClusterState.builder(currentState).build();
718+
}
719+
720+
@Override
721+
public TimeValue ackTimeout() {
722+
return TimeValue.ZERO;
723+
}
724+
725+
@Override
726+
public TimeValue timeout() {
727+
return null;
728+
}
729+
730+
@Override
731+
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
732+
fail();
733+
}
734+
735+
@Override
736+
protected Void newResponse(boolean acknowledged) {
737+
fail();
738+
return null;
739+
}
740+
741+
@Override
742+
public void onFailure(String source, Exception e) {
743+
latch.countDown();
744+
}
745+
746+
@Override
747+
public void onAckTimeout() {
748+
fail();
749+
}
750+
});
751+
752+
latch.await();
753+
}
754+
755+
// check that we timeout if commit took too long
756+
{
757+
final CountDownLatch latch = new CountDownLatch(2);
758+
759+
final TimeValue ackTimeout = TimeValue.timeValueMillis(randomInt(100));
760+
761+
publisherRef.set((clusterChangedEvent, ackListener) -> {
762+
ackListener.onCommit(TimeValue.timeValueMillis(ackTimeout.millis() + randomInt(100)));
763+
ackListener.onNodeAck(node1, null);
764+
ackListener.onNodeAck(node2, null);
765+
ackListener.onNodeAck(node3, null);
766+
});
767+
768+
timedMasterService.submitStateUpdateTask("test2", new AckedClusterStateUpdateTask<Void>(null, null) {
769+
@Override
770+
public ClusterState execute(ClusterState currentState) {
771+
return ClusterState.builder(currentState).build();
772+
}
773+
774+
@Override
775+
public TimeValue ackTimeout() {
776+
return ackTimeout;
777+
}
778+
779+
@Override
780+
public TimeValue timeout() {
781+
return null;
782+
}
783+
784+
@Override
785+
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
786+
latch.countDown();
787+
}
788+
789+
@Override
790+
protected Void newResponse(boolean acknowledged) {
791+
fail();
792+
return null;
793+
}
794+
795+
@Override
796+
public void onFailure(String source, Exception e) {
797+
fail();
798+
}
799+
800+
@Override
801+
public void onAckTimeout() {
802+
latch.countDown();
803+
}
804+
});
805+
806+
latch.await();
807+
}
808+
809+
timedMasterService.close();
810+
}
811+
683812
static class TimedMasterService extends MasterService {
684813

685814
public volatile Long currentTimeOverride = null;

server/src/test/java/org/elasticsearch/discovery/zen/PublishClusterStateActionTests.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import org.elasticsearch.common.io.stream.StreamOutput;
4343
import org.elasticsearch.common.settings.ClusterSettings;
4444
import org.elasticsearch.common.settings.Settings;
45+
import org.elasticsearch.common.unit.TimeValue;
4546
import org.elasticsearch.discovery.Discovery;
4647
import org.elasticsearch.discovery.DiscoverySettings;
4748
import org.elasticsearch.node.Node;
@@ -815,9 +816,16 @@ public AssertingAckListener publishState(PublishClusterStateAction action, Clust
815816
public static class AssertingAckListener implements Discovery.AckListener {
816817
private final List<Tuple<DiscoveryNode, Throwable>> errors = new CopyOnWriteArrayList<>();
817818
private final CountDownLatch countDown;
819+
private final CountDownLatch commitCountDown;
818820

819821
public AssertingAckListener(int nodeCount) {
820822
countDown = new CountDownLatch(nodeCount);
823+
commitCountDown = new CountDownLatch(1);
824+
}
825+
826+
@Override
827+
public void onCommit(TimeValue commitTime) {
828+
commitCountDown.countDown();
821829
}
822830

823831
@Override
@@ -830,6 +838,7 @@ public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
830838

831839
public void await(long timeout, TimeUnit unit) throws InterruptedException {
832840
assertThat(awaitErrors(timeout, unit), emptyIterable());
841+
assertTrue(commitCountDown.await(timeout, unit));
833842
}
834843

835844
public List<Tuple<DiscoveryNode, Throwable>> awaitErrors(long timeout, TimeUnit unit) throws InterruptedException {

0 commit comments

Comments
 (0)