Skip to content

Commit 0a39ebd

Browse files
committed
Avoid overshooting watermarks during relocation
Today the `DiskThresholdDecider` attempts to account for already-relocating shards when deciding how to allocate or relocate a shard. Its goal is to stop relocating shards onto a node before that node exceeds the low watermark, and to stop relocating shards away from a node as soon as the node drops below the high watermark. The decider handles multiple data paths by only accounting for relocating shards that affect the appropriate data path. However, this mechanism does not correctly account for _new_ relocating shards, which are unwittingly ignored. This means that we may evict far too many shards from a node above the high watermark, and may relocate far too many shards onto a node causing it to blow right past the low watermark and potentially other watermarks too. There are in fact two distinct issues that this PR fixes. New incoming shards have an unknown data path until the `ClusterInfoService` refreshes its statistics. New outgoing shards have a known data path, but we fail to account for the change of the corresponding `ShardRouting` from `STARTED` to `RELOCATING`, meaning that we fail to find the correct data path and treat the path as unknown here too. This PR also reworks the `MockDiskUsagesIT` test to avoid using fake data paths for all shards. With the changes here, the data paths are handled in tests as they are in production, except that their sizes are fake. Fixes elastic#45177 Backport of elastic#46079
1 parent 266de98 commit 0a39ebd

File tree

6 files changed

+383
-168
lines changed

6 files changed

+383
-168
lines changed

server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java

+13-7
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,11 @@ private void maybeRefresh() {
278278
}
279279
}
280280

281+
// allow tests to adjust the node stats on receipt
282+
List<NodeStats> adjustNodesStats(List<NodeStats> nodeStats) {
283+
return nodeStats;
284+
}
285+
281286
/**
282287
* Refreshes the ClusterInfo in a blocking fashion
283288
*/
@@ -287,12 +292,13 @@ public final ClusterInfo refresh() {
287292
}
288293
final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {
289294
@Override
290-
public void onResponse(NodesStatsResponse nodeStatses) {
291-
ImmutableOpenMap.Builder<String, DiskUsage> newLeastAvaiableUsages = ImmutableOpenMap.builder();
292-
ImmutableOpenMap.Builder<String, DiskUsage> newMostAvaiableUsages = ImmutableOpenMap.builder();
293-
fillDiskUsagePerNode(logger, nodeStatses.getNodes(), newLeastAvaiableUsages, newMostAvaiableUsages);
294-
leastAvailableSpaceUsages = newLeastAvaiableUsages.build();
295-
mostAvailableSpaceUsages = newMostAvaiableUsages.build();
295+
public void onResponse(NodesStatsResponse nodesStatsResponse) {
296+
ImmutableOpenMap.Builder<String, DiskUsage> leastAvailableUsagesBuilder = ImmutableOpenMap.builder();
297+
ImmutableOpenMap.Builder<String, DiskUsage> mostAvailableUsagesBuilder = ImmutableOpenMap.builder();
298+
fillDiskUsagePerNode(logger, adjustNodesStats(nodesStatsResponse.getNodes()),
299+
leastAvailableUsagesBuilder, mostAvailableUsagesBuilder);
300+
leastAvailableSpaceUsages = leastAvailableUsagesBuilder.build();
301+
mostAvailableSpaceUsages = mostAvailableUsagesBuilder.build();
296302
}
297303

298304
@Override
@@ -394,7 +400,7 @@ static void fillDiskUsagePerNode(Logger logger, List<NodeStats> nodeStatsArray,
394400
if (leastAvailablePath == null) {
395401
assert mostAvailablePath == null;
396402
mostAvailablePath = leastAvailablePath = info;
397-
} else if (leastAvailablePath.getAvailable().getBytes() > info.getAvailable().getBytes()){
403+
} else if (leastAvailablePath.getAvailable().getBytes() > info.getAvailable().getBytes()) {
398404
leastAvailablePath = info;
399405
} else if (mostAvailablePath.getAvailable().getBytes() < info.getAvailable().getBytes()) {
400406
mostAvailablePath = info;

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java

+26-6
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,36 @@ static long sizeOfRelocatingShards(RoutingNode node, RoutingAllocation allocatio
9090
boolean subtractShardsMovingAway, String dataPath) {
9191
ClusterInfo clusterInfo = allocation.clusterInfo();
9292
long totalSize = 0;
93-
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.RELOCATING, ShardRoutingState.INITIALIZING)) {
94-
String actualPath = clusterInfo.getDataPath(routing);
95-
if (dataPath.equals(actualPath)) {
96-
if (routing.initializing() && routing.relocatingNodeId() != null) {
97-
totalSize += getExpectedShardSize(routing, allocation, 0);
98-
} else if (subtractShardsMovingAway && routing.relocating()) {
93+
94+
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.INITIALIZING)) {
95+
if (routing.relocatingNodeId() == null) {
96+
// in practice the only initializing-but-not-relocating shards with a nonzero expected shard size will be ones created
97+
// by a resize (shrink/split/clone) operation which we expect to happen using hard links, so they shouldn't be taking
98+
// any additional space and can be ignored here
99+
continue;
100+
}
101+
102+
final String actualPath = clusterInfo.getDataPath(routing);
103+
// if we don't yet know the actual path of the incoming shard then conservatively assume it's going to the path with the least
104+
// free space
105+
if (actualPath == null || actualPath.equals(dataPath)) {
106+
totalSize += getExpectedShardSize(routing, allocation, 0);
107+
}
108+
}
109+
110+
if (subtractShardsMovingAway) {
111+
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.RELOCATING)) {
112+
String actualPath = clusterInfo.getDataPath(routing);
113+
if (actualPath == null) {
114+
// we might know the path of this shard from before when it was relocating
115+
actualPath = clusterInfo.getDataPath(routing.cancelRelocation());
116+
}
117+
if (dataPath.equals(actualPath)) {
99118
totalSize -= getExpectedShardSize(routing, allocation, 0);
100119
}
101120
}
102121
}
122+
103123
return totalSize;
104124
}
105125

server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java

+16-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.elasticsearch.cluster.ClusterState;
2727
import org.elasticsearch.cluster.DiskUsage;
2828
import org.elasticsearch.cluster.ESAllocationTestCase;
29-
import org.elasticsearch.cluster.MockInternalClusterInfoService.DevNullClusterInfo;
3029
import org.elasticsearch.cluster.metadata.IndexMetaData;
3130
import org.elasticsearch.cluster.metadata.MetaData;
3231
import org.elasticsearch.cluster.node.DiscoveryNode;
@@ -1028,4 +1027,20 @@ public void logShardStates(ClusterState state) {
10281027
rn.shardsWithState(RELOCATING),
10291028
rn.shardsWithState(STARTED));
10301029
}
1030+
1031+
/**
1032+
* ClusterInfo that always reports /dev/null for the shards' data paths.
1033+
*/
1034+
static class DevNullClusterInfo extends ClusterInfo {
1035+
DevNullClusterInfo(ImmutableOpenMap<String, DiskUsage> leastAvailableSpaceUsage,
1036+
ImmutableOpenMap<String, DiskUsage> mostAvailableSpaceUsage,
1037+
ImmutableOpenMap<String, Long> shardSizes) {
1038+
super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null);
1039+
}
1040+
1041+
@Override
1042+
public String getDataPath(ShardRouting shardRouting) {
1043+
return "/dev/null";
1044+
}
1045+
}
10311046
}

server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import org.elasticsearch.cluster.ClusterState;
2626
import org.elasticsearch.cluster.DiskUsage;
2727
import org.elasticsearch.cluster.ESAllocationTestCase;
28-
import org.elasticsearch.cluster.MockInternalClusterInfoService.DevNullClusterInfo;
28+
import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDeciderTests.DevNullClusterInfo;
2929
import org.elasticsearch.cluster.metadata.IndexMetaData;
3030
import org.elasticsearch.cluster.metadata.MetaData;
3131
import org.elasticsearch.cluster.node.DiscoveryNode;

0 commit comments

Comments
 (0)