From 884dcfcc3ae83c247ec300b2b911bcb720d2e79a Mon Sep 17 00:00:00 2001 From: "ievgen.degtiarenko" Date: Fri, 24 Feb 2023 09:12:11 +0100 Subject: [PATCH 1/2] Fix flaky testRebalanceOnlyAfterAllShardsAreActive --- .../allocation/RebalanceAfterActiveTests.java | 68 ++++++------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java index 85e86d5592e10..07c46c1a89e04 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java @@ -13,13 +13,13 @@ import org.elasticsearch.Version; import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.TestShardRoutingRoleStrategies; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.cluster.node.DiscoveryNodes; -import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.RoutingNodes; import org.elasticsearch.cluster.routing.RoutingTable; @@ -34,11 +34,11 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.nullValue; +import static org.hamcrest.Matchers.oneOf; public class RebalanceAfterActiveTests extends ESAllocationTestCase { private final Logger logger = LogManager.getLogger(RebalanceAfterActiveTests.class); - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/94086") public void testRebalanceOnlyAfterAllShardsAreActive() { final long[] sizes = new long[5]; for (int i = 0; i < sizes.length; i++) { @@ -63,25 +63,20 @@ public Long getShardSize(ShardRouting shardRouting) { ); logger.info("Building initial routing table"); - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(5).numberOfReplicas(1)) - .build(); + var indexMetadata = IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(5).numberOfReplicas(1).build(); - RoutingTable initialRoutingTable = RoutingTable.builder(TestShardRoutingRoleStrategies.DEFAULT_ROLE_ONLY) - .addAsNew(metadata.index("test")) + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .metadata(Metadata.builder().put(indexMetadata, false)) + .routingTable(RoutingTable.builder(TestShardRoutingRoleStrategies.DEFAULT_ROLE_ONLY).addAsNew(indexMetadata)) .build(); - ClusterState clusterState = ClusterState.builder( - org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY) - ).metadata(metadata).routingTable(initialRoutingTable).build(); - assertThat(clusterState.routingTable().index("test").size(), equalTo(5)); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); - assertThat(clusterState.routingTable().index("test").shard(i).shard(0).state(), equalTo(UNASSIGNED)); - assertThat(clusterState.routingTable().index("test").shard(i).shard(1).state(), equalTo(UNASSIGNED)); - assertThat(clusterState.routingTable().index("test").shard(i).shard(0).currentNodeId(), nullValue()); - assertThat(clusterState.routingTable().index("test").shard(i).shard(1).currentNodeId(), nullValue()); + assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().state(), equalTo(UNASSIGNED)); + assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().currentNodeId(), nullValue()); + assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).state(), equalTo(UNASSIGNED)); + assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).currentNodeId(), nullValue()); } logger.info("start two nodes and fully start the shards"); @@ -93,6 +88,7 @@ public Long getShardSize(ShardRouting shardRouting) { for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().state(), equalTo(INITIALIZING)); + assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().getExpectedShardSize(), equalTo(sizes[i])); assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).state(), equalTo(UNASSIGNED)); } @@ -103,7 +99,7 @@ public Long getShardSize(ShardRouting shardRouting) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().state(), equalTo(STARTED)); assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).state(), equalTo(INITIALIZING)); - assertEquals(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).getExpectedShardSize(), sizes[i]); + assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).getExpectedShardSize(), equalTo(sizes[i])); } logger.info("now, start 8 more nodes, and check that no rebalancing/relocation have happened"); @@ -126,44 +122,20 @@ public Long getShardSize(ShardRouting shardRouting) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().state(), equalTo(STARTED)); assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).state(), equalTo(INITIALIZING)); - assertEquals(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).getExpectedShardSize(), sizes[i]); - + assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).getExpectedShardSize(), equalTo(sizes[i])); } logger.info("start the replica shards, rebalancing should start"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); - // we only allow one relocation at a time - assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(5)); - assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(5)); - for (int shardId = 0; shardId < clusterState.routingTable().index("test").size(); shardId++) { - int num = 0; - final IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(shardId); - for (int copy = 0; copy < shardRoutingTable.size(); copy++) { - ShardRouting routing = shardRoutingTable.shard(copy); - if (routing.state() == RELOCATING || routing.state() == INITIALIZING) { - assertEquals(routing.getExpectedShardSize(), sizes[shardId]); - num++; - } - } - assertTrue(num > 0); - } + // both primary and replica should not be rebalanced at once so 5 replicas should start moving + // unless we computed the balance where one of the indices already have both primary and replica on desired nodes + // in such case only 4 shards are immediately relocating + assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), oneOf(5, 6)); + assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), oneOf(4, 5)); - logger.info("complete relocation, other half of relocation should happen"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - // we now only relocate 3, since 2 remain where they are! - assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(7)); - assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(3)); - for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { - final IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(i); - for (int j = 0; j < shardRoutingTable.size(); j++) { - ShardRouting routing = shardRoutingTable.shard(j); - if (routing.state() == RELOCATING || routing.state() == INITIALIZING) { - assertEquals(routing.getExpectedShardSize(), sizes[i]); - } - } - } + logger.info("complete all relocations"); + clusterState = applyStartedShardsUntilNoChange(clusterState, strategy); logger.info("complete relocation, that's it!"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); From c2c13b5e21de91e2c7a9fb78132f1c776bb56f24 Mon Sep 17 00:00:00 2001 From: "ievgen.degtiarenko" Date: Fri, 24 Feb 2023 15:07:53 +0100 Subject: [PATCH 2/2] revert --- .../routing/allocation/RebalanceAfterActiveTests.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java index 07c46c1a89e04..78d0bab798538 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java @@ -73,10 +73,10 @@ public Long getShardSize(ShardRouting shardRouting) { assertThat(clusterState.routingTable().index("test").size(), equalTo(5)); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); - assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().state(), equalTo(UNASSIGNED)); - assertThat(clusterState.routingTable().index("test").shard(i).primaryShard().currentNodeId(), nullValue()); - assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).state(), equalTo(UNASSIGNED)); - assertThat(clusterState.routingTable().index("test").shard(i).replicaShards().get(0).currentNodeId(), nullValue()); + assertThat(clusterState.routingTable().index("test").shard(i).shard(0).state(), equalTo(UNASSIGNED)); + assertThat(clusterState.routingTable().index("test").shard(i).shard(1).state(), equalTo(UNASSIGNED)); + assertThat(clusterState.routingTable().index("test").shard(i).shard(0).currentNodeId(), nullValue()); + assertThat(clusterState.routingTable().index("test").shard(i).shard(1).currentNodeId(), nullValue()); } logger.info("start two nodes and fully start the shards");