Skip to content

Commit c592d24

Browse files
Bukhtawar authored and DaveCTurner committed
Auto-release flood-stage write block (#42559)
If a node exceeds the flood-stage disk watermark then we add a block to all of its indices to prevent further writes, as a last-ditch attempt to prevent the node completely exhausting its disk space. However, today this block remains in place until manually removed, and this block is a source of confusion for users who currently have ample disk space and did not even realise they nearly ran out at some point in the past. This commit changes our behaviour to automatically remove this block when a node drops below the high watermark again. The expectation is that the high watermark is some distance below the flood-stage watermark and therefore the disk space problem is truly resolved. Fixes #39334
1 parent 8880ada commit c592d24

File tree

6 files changed

+355
-34
lines changed

6 files changed

+355
-34
lines changed

docs/reference/modules/cluster/disk_allocator.asciidoc

+4-2
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,10 @@ Elasticsearch enforces a read-only index block
4040
(`index.blocks.read_only_allow_delete`) on every index that has one or more
4141
shards allocated on the node that has at least one disk exceeding the flood
4242
stage. This is a last resort to prevent nodes from running out of disk space.
43-
The index block must be released manually once there is enough disk space
44-
available to allow indexing operations to continue.
43+
The index block is automatically released once the disk utilization falls below
44+
the high watermark.
45+
The automatic release can however be disabled in 7.x through a system property
46+
`es.disk.auto_release_flood_stage_block`
4547

4648
NOTE: You can not mix the usage of percentage values and byte values within
4749
these settings. Either all are set to percentage values, or all are set to byte

server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java

+78-15
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
2424
import org.apache.logging.log4j.LogManager;
2525
import org.apache.logging.log4j.Logger;
26+
import org.apache.logging.log4j.message.ParameterizedMessage;
27+
import org.elasticsearch.Version;
2628
import org.elasticsearch.action.ActionListener;
2729
import org.elasticsearch.action.support.GroupedActionListener;
2830
import org.elasticsearch.client.Client;
@@ -33,10 +35,12 @@
3335
import org.elasticsearch.cluster.metadata.IndexMetaData;
3436
import org.elasticsearch.cluster.routing.RerouteService;
3537
import org.elasticsearch.cluster.routing.RoutingNode;
38+
import org.elasticsearch.cluster.routing.RoutingNodes;
3639
import org.elasticsearch.cluster.routing.ShardRouting;
3740
import org.elasticsearch.common.Priority;
3841
import org.elasticsearch.common.Strings;
3942
import org.elasticsearch.common.collect.ImmutableOpenMap;
43+
import org.elasticsearch.common.logging.DeprecationLogger;
4044
import org.elasticsearch.common.settings.ClusterSettings;
4145
import org.elasticsearch.common.settings.Settings;
4246
import org.elasticsearch.common.util.set.Sets;
@@ -47,6 +51,8 @@
4751
import java.util.concurrent.atomic.AtomicLong;
4852
import java.util.function.LongSupplier;
4953
import java.util.function.Supplier;
54+
import java.util.stream.Collectors;
55+
import java.util.stream.StreamSupport;
5056

5157
/**
5258
* Listens for a node to go over the high watermark and kicks off an empty
@@ -65,6 +71,7 @@ public class DiskThresholdMonitor {
6571
private final RerouteService rerouteService;
6672
private final AtomicLong lastRunTimeMillis = new AtomicLong(Long.MIN_VALUE);
6773
private final AtomicBoolean checkInProgress = new AtomicBoolean();
74+
private final DeprecationLogger deprecationLogger = new DeprecationLogger(logger);
6875

6976
public DiskThresholdMonitor(Settings settings, Supplier<ClusterState> clusterStateSupplier, ClusterSettings clusterSettings,
7077
Client client, LongSupplier currentTimeMillisSupplier, RerouteService rerouteService) {
@@ -73,6 +80,10 @@ public DiskThresholdMonitor(Settings settings, Supplier<ClusterState> clusterSta
7380
this.rerouteService = rerouteService;
7481
this.diskThresholdSettings = new DiskThresholdSettings(settings, clusterSettings);
7582
this.client = client;
83+
if (diskThresholdSettings.isAutoReleaseIndexEnabled() == false) {
84+
deprecationLogger.deprecated("[{}] will be removed in version {}",
85+
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, Version.V_7_4_0.major + 1);
86+
}
7687
}
7788

7889
/**
@@ -136,21 +147,33 @@ public void onNewInfo(ClusterInfo info) {
136147
}
137148
final ClusterState state = clusterStateSupplier.get();
138149
final Set<String> indicesToMarkReadOnly = new HashSet<>();
150+
RoutingNodes routingNodes = state.getRoutingNodes();
151+
Set<String> indicesNotToAutoRelease = new HashSet<>();
152+
markNodesMissingUsageIneligibleForRelease(routingNodes, usages, indicesNotToAutoRelease);
139153

140154
for (final ObjectObjectCursor<String, DiskUsage> entry : usages) {
141155
final String node = entry.key;
142156
final DiskUsage usage = entry.value;
143157
warnAboutDiskIfNeeded(usage);
158+
RoutingNode routingNode = routingNodes.node(node);
159+
// Only unblock index if all nodes that contain shards of it are below the high disk watermark
144160
if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() ||
145161
usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
146-
final RoutingNode routingNode = state.getRoutingNodes().node(node);
147162
if (routingNode != null) { // this might happen if we haven't got the full cluster-state yet?!
148163
for (ShardRouting routing : routingNode) {
149-
indicesToMarkReadOnly.add(routing.index().getName());
164+
String indexName = routing.index().getName();
165+
indicesToMarkReadOnly.add(indexName);
166+
indicesNotToAutoRelease.add(indexName);
150167
}
151168
}
152169
} else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() ||
153170
usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
171+
if (routingNode != null) {
172+
for (ShardRouting routing : routingNode) {
173+
String indexName = routing.index().getName();
174+
indicesNotToAutoRelease.add(indexName);
175+
}
176+
}
154177
if (lastRunTimeMillis.get() < currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
155178
reroute = true;
156179
explanation = "high disk watermark exceeded on one or more nodes";
@@ -182,7 +205,7 @@ public void onNewInfo(ClusterInfo info) {
182205
}
183206
}
184207

185-
final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 2);
208+
final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 3);
186209

187210
if (reroute) {
188211
logger.info("rerouting shards: [{}]", explanation);
@@ -197,30 +220,70 @@ public void onNewInfo(ClusterInfo info) {
197220
} else {
198221
listener.onResponse(null);
199222
}
223+
Set<String> indicesToAutoRelease = StreamSupport.stream(state.routingTable().indicesRouting()
224+
.spliterator(), false)
225+
.map(c -> c.key)
226+
.filter(index -> indicesNotToAutoRelease.contains(index) == false)
227+
.filter(index -> state.getBlocks().hasIndexBlock(index, IndexMetaData.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK))
228+
.collect(Collectors.toSet());
229+
230+
if (indicesToAutoRelease.isEmpty() == false) {
231+
if (diskThresholdSettings.isAutoReleaseIndexEnabled()) {
232+
logger.info("releasing read-only-allow-delete block on indices: [{}]", indicesToAutoRelease);
233+
updateIndicesReadOnly(indicesToAutoRelease, listener, false);
234+
} else {
235+
deprecationLogger.deprecated("[{}] will be removed in version {}",
236+
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, Version.V_7_4_0.major + 1);
237+
logger.debug("[{}] disabled, not releasing read-only-allow-delete block on indices: [{}]",
238+
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, indicesToAutoRelease);
239+
listener.onResponse(null);
240+
}
241+
} else {
242+
listener.onResponse(null);
243+
}
200244

201245
indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
202246
if (indicesToMarkReadOnly.isEmpty() == false) {
203-
markIndicesReadOnly(indicesToMarkReadOnly, ActionListener.wrap(r -> {
204-
setLastRunTimeMillis();
205-
listener.onResponse(r);
206-
}, e -> {
207-
logger.debug("marking indices readonly failed", e);
208-
setLastRunTimeMillis();
209-
listener.onFailure(e);
210-
}));
247+
updateIndicesReadOnly(indicesToMarkReadOnly, listener, true);
211248
} else {
212249
listener.onResponse(null);
213250
}
214251
}
215252

253+
private void markNodesMissingUsageIneligibleForRelease(RoutingNodes routingNodes, ImmutableOpenMap<String, DiskUsage> usages,
254+
Set<String> indicesToMarkIneligibleForAutoRelease) {
255+
for (RoutingNode routingNode : routingNodes) {
256+
if (usages.containsKey(routingNode.nodeId()) == false) {
257+
if (routingNode != null) {
258+
for (ShardRouting routing : routingNode) {
259+
String indexName = routing.index().getName();
260+
indicesToMarkIneligibleForAutoRelease.add(indexName);
261+
}
262+
}
263+
}
264+
}
265+
266+
}
267+
216268
private void setLastRunTimeMillis() {
217269
lastRunTimeMillis.getAndUpdate(l -> Math.max(l, currentTimeMillisSupplier.getAsLong()));
218270
}
219271

220-
protected void markIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener) {
272+
protected void updateIndicesReadOnly(Set<String> indicesToUpdate, ActionListener<Void> listener, boolean readOnly) {
221273
// set read-only block but don't block on the response
222-
client.admin().indices().prepareUpdateSettings(indicesToMarkReadOnly.toArray(Strings.EMPTY_ARRAY))
223-
.setSettings(Settings.builder().put(IndexMetaData.SETTING_READ_ONLY_ALLOW_DELETE, true).build())
224-
.execute(ActionListener.map(listener, r -> null));
274+
ActionListener<Void> wrappedListener = ActionListener.wrap(r -> {
275+
setLastRunTimeMillis();
276+
listener.onResponse(r);
277+
}, e -> {
278+
logger.debug(new ParameterizedMessage("setting indices [{}] read-only failed", readOnly), e);
279+
setLastRunTimeMillis();
280+
listener.onFailure(e);
281+
});
282+
Settings readOnlySettings = readOnly ? Settings.builder()
283+
.put(IndexMetaData.SETTING_READ_ONLY_ALLOW_DELETE, Boolean.TRUE.toString()).build() :
284+
Settings.builder().putNull(IndexMetaData.SETTING_READ_ONLY_ALLOW_DELETE).build();
285+
client.admin().indices().prepareUpdateSettings(indicesToUpdate.toArray(Strings.EMPTY_ARRAY))
286+
.setSettings(readOnlySettings)
287+
.execute(ActionListener.map(wrappedListener, r -> null));
225288
}
226289
}

server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java

+18
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,20 @@ public class DiskThresholdSettings {
7272
private volatile TimeValue rerouteInterval;
7373
private volatile Double freeDiskThresholdFloodStage;
7474
private volatile ByteSizeValue freeBytesThresholdFloodStage;
75+
private static final boolean autoReleaseIndexEnabled;
76+
public static final String AUTO_RELEASE_INDEX_ENABLED_KEY = "es.disk.auto_release_flood_stage_block";
77+
78+
static {
79+
final String property = System.getProperty(AUTO_RELEASE_INDEX_ENABLED_KEY);
80+
if (property == null) {
81+
autoReleaseIndexEnabled = true;
82+
} else if (Boolean.FALSE.toString().equals(property)){
83+
autoReleaseIndexEnabled = false;
84+
} else {
85+
throw new IllegalArgumentException(AUTO_RELEASE_INDEX_ENABLED_KEY + " may only be unset or set to [false] but was [" +
86+
property + "]");
87+
}
88+
}
7589

7690
public DiskThresholdSettings(Settings settings, ClusterSettings clusterSettings) {
7791
final String lowWatermark = CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.get(settings);
@@ -286,6 +300,10 @@ public ByteSizeValue getFreeBytesThresholdFloodStage() {
286300
return freeBytesThresholdFloodStage;
287301
}
288302

303+
public boolean isAutoReleaseIndexEnabled() {
304+
return autoReleaseIndexEnabled;
305+
}
306+
289307
public boolean includeRelocations() {
290308
return includeRelocations;
291309
}

0 commit comments

Comments
 (0)