
Commit eaa7bab

Handle negative free disk space in deciders
Today it is possible for the total size of all relocating shards to exceed the total amount of free disk space on a node. For instance, this may be caused by another user of the same disk increasing their disk usage, or by the way Elasticsearch double-counts relocations that are nearly complete, particularly if there are many concurrent relocations in progress. The `DiskThresholdDecider` treats negative free space similarly to zero free space, but it then fails when rendering the messages that explain its decision. This commit fixes its handling of negative free space.

Fixes #48380
1 parent f9a9dcb commit eaa7bab
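
For context, here is a minimal, self-contained sketch of the arithmetic described in the commit message: how the free-space figure the decider works with can go negative once in-flight relocations are charged against it. This is an illustration, not the Elasticsearch implementation, and the class and variable names are hypothetical.

    // Sketch: how the decider's view of free space can become negative.
    public class NegativeFreeBytesSketch {
        public static void main(String[] args) {
            long reportedFreeBytes = 10;    // free bytes as reported by the filesystem
            // Two relocations target this node at 8 bytes each; relocations that are
            // nearly complete may be double-counted, inflating this total.
            long sizeOfRelocatingShards = 8 + 8;

            // The decider charges incoming relocations against the reported free space.
            long freeBytes = reportedFreeBytes - sizeOfRelocatingShards;
            System.out.println("free bytes including relocations: " + freeBytes); // -6

            // Before this commit, such a negative value reached the code that renders
            // the human-readable decision message, which failed; the commit instead
            // short-circuits with an explicit NO decision built from the raw numbers.
        }
    }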

File tree

2 files changed: +93 -28 lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java (+26 lines)

@@ -145,6 +145,19 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
         // Cache the used disk percentage for displaying disk percentages consistent with documentation
         double usedDiskPercentage = usage.getUsedDiskAsPercentage();
         long freeBytes = usage.getFreeBytes();
+        if (freeBytes < 0L) {
+            final long sizeOfRelocatingShards = sizeOfRelocatingShards(node, false, usage.getPath(),
+                allocation.clusterInfo(), allocation.metaData(), allocation.routingTable());
+            logger.debug("fewer free bytes remaining than the size of all incoming shards: " +
+                    "usage {} on node {} including {} bytes of relocations, preventing allocation",
+                usage, node.nodeId(), sizeOfRelocatingShards);
+
+            return allocation.decision(Decision.NO, NAME,
+                "the node has fewer free bytes remaining than the total size of all incoming shards: " +
+                    "free space [%sB], relocating shards [%sB]",
+                freeBytes + sizeOfRelocatingShards, sizeOfRelocatingShards);
+        }
+
         ByteSizeValue freeBytesValue = new ByteSizeValue(freeBytes);
         if (logger.isTraceEnabled()) {
             logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);

@@ -242,6 +255,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
         // Secondly, check that allocating the shard to this node doesn't put it above the high watermark
         final long shardSize = getExpectedShardSize(shardRouting, 0L,
             allocation.clusterInfo(), allocation.metaData(), allocation.routingTable());
+        assert shardSize >= 0 : shardSize;
         double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
         long freeBytesAfterShard = freeBytes - shardSize;
         if (freeBytesAfterShard < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {

@@ -268,6 +282,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
                 diskThresholdSettings.getHighWatermarkRaw(), usedDiskThresholdHigh, freeSpaceAfterShard);
         }

+        assert freeBytesAfterShard >= 0 : freeBytesAfterShard;
         return allocation.decision(Decision.YES, NAME,
             "enough disk for shard on node, free: [%s], shard size: [%s], free after allocating shard: [%s]",
             freeBytesValue,

@@ -301,6 +316,17 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
             return allocation.decision(Decision.YES, NAME,
                 "this shard is not allocated on the most utilized disk and can remain");
         }
+        if (freeBytes < 0L) {
+            final long sizeOfRelocatingShards = sizeOfRelocatingShards(node, false, usage.getPath(),
+                allocation.clusterInfo(), allocation.metaData(), allocation.routingTable());
+            logger.debug("fewer free bytes remaining than the size of all incoming shards: " +
+                    "usage {} on node {} including {} bytes of relocations, shard cannot remain",
+                usage, node.nodeId(), sizeOfRelocatingShards);
+            return allocation.decision(Decision.NO, NAME,
+                "the shard cannot remain on this node because the node has fewer free bytes remaining than the total size of all " +
+                    "incoming shards: free space [%s], relocating shards [%s]",
+                freeBytes + sizeOfRelocatingShards, sizeOfRelocatingShards);
+        }
         if (freeBytes < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
             if (logger.isDebugEnabled()) {
                 logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",

server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java (+67 -28 lines)

@@ -56,13 +56,15 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicReference;

 import static java.util.Collections.emptyMap;
 import static java.util.Collections.singleton;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
+import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.nullValue;

@@ -653,18 +655,19 @@ public void testShardRelocationsTakenIntoAccount() {
         final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);

         DiskThresholdDecider decider = makeDecider(diskSettings);
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         AllocationDeciders deciders = new AllocationDeciders(
-            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(
-                Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
-            ), decider)));
+            new HashSet<>(Arrays.asList(
+                new SameShardAllocationDecider(Settings.EMPTY, clusterSettings),
+                new EnableAllocationDecider(
+                    Settings.builder().put(CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none").build(), clusterSettings),
+                decider)));

-        ClusterInfoService cis = () -> {
-            logger.info("--> calling fake getClusterInfo");
-            return clusterInfo;
-        };
+        final AtomicReference<ClusterInfo> clusterInfoReference = new AtomicReference<>(clusterInfo);
+        final ClusterInfoService cis = clusterInfoReference::get;

         AllocationService strategy = new AllocationService(deciders, new TestGatewayAllocator(),
-            new BalancedShardsAllocator(Settings.EMPTY), cis);
+                new BalancedShardsAllocator(Settings.EMPTY), cis);

         MetaData metaData = MetaData.builder()
             .put(IndexMetaData.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))

@@ -702,30 +705,66 @@ Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS
                 .add(newNode("node3"))
         ).build();

-        AllocationCommand relocate1 = new MoveAllocationCommand("test", 0, "node2", "node3");
-        AllocationCommands cmds = new AllocationCommands(relocate1);
+        {
+            AllocationCommand moveAllocationCommand = new MoveAllocationCommand("test", 0, "node2", "node3");
+            AllocationCommands cmds = new AllocationCommands(moveAllocationCommand);

-        clusterState = strategy.reroute(clusterState, cmds, false, false).getClusterState();
-        logShardStates(clusterState);
+            clusterState = strategy.reroute(clusterState, cmds, false, false).getClusterState();
+            logShardStates(clusterState);
+        }
+
+        final ImmutableOpenMap.Builder<String, DiskUsage> overfullUsagesBuilder = ImmutableOpenMap.builder();
+        overfullUsagesBuilder.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 40)); // 60% used
+        overfullUsagesBuilder.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 40)); // 60% used
+        overfullUsagesBuilder.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 0));  // 100% used
+        final ImmutableOpenMap<String, DiskUsage> overfullUsages = overfullUsagesBuilder.build();
+
+        final ImmutableOpenMap.Builder<String, Long> largerShardSizesBuilder = ImmutableOpenMap.builder();
+        largerShardSizesBuilder.put("[test][0][p]", 14L);
+        largerShardSizesBuilder.put("[test][0][r]", 14L);
+        largerShardSizesBuilder.put("[test2][0][p]", 2L);
+        largerShardSizesBuilder.put("[test2][0][r]", 2L);
+        final ImmutableOpenMap<String, Long> largerShardSizes = largerShardSizesBuilder.build();

-        AllocationCommand relocate2 = new MoveAllocationCommand("test2", 0, "node2", "node3");
-        cmds = new AllocationCommands(relocate2);
-
-        try {
-            // The shard for the "test" index is already being relocated to
-            // node3, which will put it over the low watermark when it
-            // completes, with shard relocations taken into account this should
-            // throw an exception about not being able to complete
-            strategy.reroute(clusterState, cmds, false, false);
-            fail("should not have been able to reroute the shard");
-        } catch (IllegalArgumentException e) {
-            assertThat("can't be allocated because there isn't enough room: " + e.getMessage(),
-                e.getMessage(),
-                containsString("the node is above the low watermark cluster setting " +
-                    "[cluster.routing.allocation.disk.watermark.low=0.7], using more disk space than the maximum " +
-                    "allowed [70.0%], actual free: [26.0%]"));
+        final ClusterInfo overfullClusterInfo = new DevNullClusterInfo(overfullUsages, overfullUsages, largerShardSizes);
+
+        {
+            AllocationCommand moveAllocationCommand = new MoveAllocationCommand("test2", 0, "node2", "node3");
+            AllocationCommands cmds = new AllocationCommands(moveAllocationCommand);
+
+            final ClusterState clusterStateThatRejectsCommands = clusterState;
+
+            assertThat(expectThrows(IllegalArgumentException.class,
+                () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false)).getMessage(),
+                containsString("the node is above the low watermark cluster setting " +
+                    "[cluster.routing.allocation.disk.watermark.low=0.7], using more disk space than the maximum " +
+                    "allowed [70.0%], actual free: [26.0%]"));
+
+            clusterInfoReference.set(overfullClusterInfo);
+
+            assertThat(expectThrows(IllegalArgumentException.class,
+                () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false)).getMessage(),
+                containsString("the node has fewer free bytes remaining than the total size of all incoming shards"));
+
+            clusterInfoReference.set(clusterInfo);
         }

+        {
+            AllocationCommand moveAllocationCommand = new MoveAllocationCommand("test2", 0, "node2", "node3");
+            AllocationCommands cmds = new AllocationCommands(moveAllocationCommand);
+
+            logger.info("--> before starting: {}", clusterState);
+            clusterState = startInitializingShardsAndReroute(strategy, clusterState);
+            logger.info("--> after starting: {}", clusterState);
+            clusterState = strategy.reroute(clusterState, cmds, false, false).getClusterState();
+            logger.info("--> after running another command: {}", clusterState);
+            logShardStates(clusterState);
+
+            clusterInfoReference.set(overfullClusterInfo);
+
+            clusterState = strategy.reroute(clusterState, "foo");
+            logger.info("--> after another reroute: {}", clusterState);
+        }
     }

     public void testCanRemainWithShardRelocatingAway() {
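One design choice in the test rework above: the fixed-lambda ClusterInfoService is replaced by one backed by an AtomicReference, so the simulated disk statistics can be swapped mid-test without rebuilding the AllocationService. A minimal sketch of the same pattern using a plain Supplier (class and variable names hypothetical, not the test code itself):

    import java.util.concurrent.atomic.AtomicReference;
    import java.util.function.Supplier;

    public class SwappableInfoSketch {
        public static void main(String[] args) {
            AtomicReference<String> infoRef = new AtomicReference<>("normal disk usages");
            Supplier<String> infoService = infoRef::get; // callers always observe the latest value

            System.out.println(infoService.get()); // normal disk usages
            infoRef.set("overfull disk usages");   // swap mid-test; no service rebuild needed
            System.out.println(infoService.get()); // overfull disk usages
        }
    }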
