Skip to content

Commit a8376f4

Browse files
committed
Assert no exceptions during state application (#47090)
Today we log and swallow exceptions thrown during cluster state application, but such exceptions should never occur. This commit adds assertions to enforce that, and updates the Javadocs to explain why. Relates #47038.
1 parent a439743 commit a8376f4

File tree

7 files changed

+53
-6
lines changed

7 files changed

+53
-6
lines changed

server/src/main/java/org/elasticsearch/cluster/ClusterStateApplier.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@
2828
public interface ClusterStateApplier {
2929

3030
/**
31-
* Called when a new cluster state ({@link ClusterChangedEvent#state()} needs to be applied
31+
* Called when a new cluster state ({@link ClusterChangedEvent#state()}) needs to be applied. The cluster state to be applied is already
32+
* committed when this method is called, so an applier must be prepared to deal with any state it receives without throwing
33+
* an exception. Throwing an exception from an applier is very bad because it will stop the application of this state before it has
34+
* reached all the other appliers, and will likely result in another attempt to apply the same (or very similar) cluster state which
35+
* might continue until this node is removed from the cluster.
3236
*/
3337
void applyClusterState(ClusterChangedEvent event);
3438
}

server/src/main/java/org/elasticsearch/cluster/service/ClusterApplierService.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ private static boolean assertNotCalledFromClusterStateApplier(String reason) {
390390
return true;
391391
}
392392

393-
protected void runTask(UpdateTask task) {
393+
private void runTask(UpdateTask task) {
394394
if (!lifecycle.started()) {
395395
logger.debug("processing [{}]: ignoring, cluster applier service not started", task.source);
396396
return;
@@ -447,6 +447,9 @@ protected void runTask(UpdateTask task) {
447447
"failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]",
448448
executionTime, newClusterState.version(), newClusterState.stateUUID(), task.source), e);
449449
}
450+
// failing to apply a cluster state with an exception indicates a bug in validation or in one of the appliers; if we
451+
// continue we will retry with the same cluster state but that might not help.
452+
assert applicationMayFail();
450453
task.listener.onFailure(task.source, e);
451454
}
452455
}
@@ -667,4 +670,8 @@ protected long currentTimeInMillis() {
667670
return threadPool.relativeTimeInMillis();
668671
}
669672

673+
// overridden by tests that need to check behaviour in the event of an application failure
674+
protected boolean applicationMayFail() {
675+
return false;
676+
}
670677
}

server/src/main/java/org/elasticsearch/common/settings/AbstractScopedSettings.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ public synchronized Settings applySettings(Settings newSettings) {
193193
} catch (Exception ex) {
194194
logger.warn("failed to apply settings", ex);
195195
throw ex;
196-
} finally {
197196
}
198197
return lastSettingsApplied = newSettings;
199198
}

server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -532,23 +532,32 @@ private void updateIndices(ClusterChangedEvent event) {
532532
final IndexMetaData newIndexMetaData = state.metaData().index(index);
533533
assert newIndexMetaData != null : "index " + index + " should have been removed by deleteIndices";
534534
if (ClusterChangedEvent.indexMetaDataChanged(currentIndexMetaData, newIndexMetaData)) {
535-
indexService.updateMetaData(currentIndexMetaData, newIndexMetaData);
535+
String reason = null;
536536
try {
537+
reason = "metadata update failed";
538+
try {
539+
indexService.updateMetaData(currentIndexMetaData, newIndexMetaData);
540+
} catch (Exception e) {
541+
assert false : e;
542+
throw e;
543+
}
544+
545+
reason = "mapping update failed";
537546
if (indexService.updateMapping(currentIndexMetaData, newIndexMetaData) && sendRefreshMapping) {
538547
nodeMappingRefreshAction.nodeMappingRefresh(state.nodes().getMasterNode(),
539548
new NodeMappingRefreshAction.NodeMappingRefreshRequest(newIndexMetaData.getIndex().getName(),
540549
newIndexMetaData.getIndexUUID(), state.nodes().getLocalNodeId())
541550
);
542551
}
543552
} catch (Exception e) {
544-
indicesService.removeIndex(indexService.index(), FAILURE, "removing index (mapping update failed)");
553+
indicesService.removeIndex(indexService.index(), FAILURE, "removing index (" + reason + ")");
545554

546555
// fail shards that would be created or updated by createOrUpdateShards
547556
RoutingNode localRoutingNode = state.getRoutingNodes().node(state.nodes().getLocalNodeId());
548557
if (localRoutingNode != null) {
549558
for (final ShardRouting shardRouting : localRoutingNode) {
550559
if (shardRouting.index().equals(index) && failedShardsCache.containsKey(shardRouting.shardId()) == false) {
551-
sendFailShard(shardRouting, "failed to update mapping for index", e, state);
560+
sendFailShard(shardRouting, "failed to update index (" + reason + ")", e, state);
552561
}
553562
}
554563
}

server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ public void testAckListenerReceivesNackFromFollower() {
586586
final ClusterNode follower0 = cluster.getAnyNodeExcept(leader);
587587
final ClusterNode follower1 = cluster.getAnyNodeExcept(leader, follower0);
588588

589+
follower0.allowClusterStateApplicationFailure();
589590
follower0.setClusterStateApplyResponse(ClusterStateApplyResponse.FAIL);
590591
AckCollector ackCollector = leader.submitValue(randomLong());
591592
cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
@@ -605,6 +606,7 @@ public void testAckListenerReceivesNackFromLeader() {
605606
final ClusterNode follower1 = cluster.getAnyNodeExcept(leader, follower0);
606607
final long startingTerm = leader.coordinator.getCurrentTerm();
607608

609+
leader.allowClusterStateApplicationFailure();
608610
leader.setClusterStateApplyResponse(ClusterStateApplyResponse.FAIL);
609611
AckCollector ackCollector = leader.submitValue(randomLong());
610612
cluster.runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY, "committing value");

server/src/test/java/org/elasticsearch/cluster/service/ClusterApplierServiceTests.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ public void testClusterStateApplierBubblesUpExceptionsInApplier() throws Interru
358358
clusterApplierService.addStateApplier(event -> {
359359
throw new RuntimeException("dummy exception");
360360
});
361+
clusterApplierService.allowClusterStateApplicationFailure();
361362

362363
CountDownLatch latch = new CountDownLatch(1);
363364
clusterApplierService.onNewClusterState("test", () -> ClusterState.builder(clusterApplierService.state()).build(),
@@ -386,6 +387,7 @@ public void testClusterStateApplierBubblesUpExceptionsInSettingsApplier() throws
386387
AtomicReference<Throwable> error = new AtomicReference<>();
387388
clusterApplierService.clusterSettings.addSettingsUpdateConsumer(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING,
388389
v -> {});
390+
clusterApplierService.allowClusterStateApplicationFailure();
389391

390392
CountDownLatch latch = new CountDownLatch(1);
391393
clusterApplierService.onNewClusterState("test", () -> ClusterState.builder(clusterApplierService.state())
@@ -496,6 +498,7 @@ static class TimedClusterApplierService extends ClusterApplierService {
496498

497499
final ClusterSettings clusterSettings;
498500
volatile Long currentTimeOverride = null;
501+
boolean applicationMayFail;
499502

500503
TimedClusterApplierService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) {
501504
super("test_node", settings, clusterSettings, threadPool);
@@ -509,6 +512,15 @@ protected long currentTimeInMillis() {
509512
}
510513
return super.currentTimeInMillis();
511514
}
515+
516+
@Override
517+
protected boolean applicationMayFail() {
518+
return this.applicationMayFail;
519+
}
520+
521+
void allowClusterStateApplicationFailure() {
522+
this.applicationMayFail = true;
523+
}
512524
}
513525

514526
}

test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,10 @@ void applyInitialConfiguration() {
11701170
private boolean isNotUsefullyBootstrapped() {
11711171
return getLocalNode().isMasterNode() == false || coordinator.isInitialConfigurationSet() == false;
11721172
}
1173+
1174+
void allowClusterStateApplicationFailure() {
1175+
clusterApplierService.allowClusterStateApplicationFailure();
1176+
}
11731177
}
11741178

11751179
private List<TransportAddress> provideSeedHosts(SeedHostsProvider.HostsResolver ignored) {
@@ -1280,6 +1284,7 @@ static class DisruptableClusterApplierService extends ClusterApplierService {
12801284
private final String nodeName;
12811285
private final DeterministicTaskQueue deterministicTaskQueue;
12821286
ClusterStateApplyResponse clusterStateApplyResponse = ClusterStateApplyResponse.SUCCEED;
1287+
private boolean applicationMayFail;
12831288

12841289
DisruptableClusterApplierService(String nodeName, Settings settings, ClusterSettings clusterSettings,
12851290
DeterministicTaskQueue deterministicTaskQueue, Function<Runnable, Runnable> runnableWrapper) {
@@ -1324,6 +1329,15 @@ public void onNewClusterState(String source, Supplier<ClusterState> clusterState
13241329
protected void connectToNodesAndWait(ClusterState newClusterState) {
13251330
// don't do anything, and don't block
13261331
}
1332+
1333+
@Override
1334+
protected boolean applicationMayFail() {
1335+
return this.applicationMayFail;
1336+
}
1337+
1338+
void allowClusterStateApplicationFailure() {
1339+
this.applicationMayFail = true;
1340+
}
13271341
}
13281342

13291343
protected DiscoveryNode createDiscoveryNode(int nodeIndex, boolean masterEligible) {

0 commit comments

Comments
 (0)