Skip to content

Commit 1d2462e

Browse files
authored
Move monitoring collection timeouts to coordinator (#67084)
With #66993 there is now support for coordinator-side timeouts on a `BroadcastRequest`, which includes requests for node stats and recoveries. This commit adjusts Monitoring to use these coordinator-side timeouts where applicable, which will prevent partial stats responses from accumulating on the master while one or more nodes are not responding quickly enough. It also enhances the message logged on a timeout to include the IDs of the nodes which did not respond in time. Closes #60188.
1 parent 1cbccb1 commit 1d2462e

File tree

15 files changed

+359
-27
lines changed

15 files changed

+359
-27
lines changed

server/src/main/java/org/elasticsearch/action/support/broadcast/BroadcastOperationRequestBuilder.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919

2020
package org.elasticsearch.action.support.broadcast;
2121

22-
import org.elasticsearch.action.ActionType;
2322
import org.elasticsearch.action.ActionRequestBuilder;
23+
import org.elasticsearch.action.ActionType;
2424
import org.elasticsearch.action.support.IndicesOptions;
2525
import org.elasticsearch.client.ElasticsearchClient;
26+
import org.elasticsearch.common.unit.TimeValue;
2627

2728
public abstract class BroadcastOperationRequestBuilder<
2829
Request extends BroadcastRequest<Request>,
@@ -45,4 +46,10 @@ public final RequestBuilder setIndicesOptions(IndicesOptions indicesOptions) {
4546
request.indicesOptions(indicesOptions);
4647
return (RequestBuilder) this;
4748
}
49+
50+
/**
 * Applies {@code timeout} to the wrapped request by delegating to {@code request.timeout(timeout)}.
 *
 * @param timeout the timeout value handed to the underlying request
 * @return this builder, cast to {@code RequestBuilder}, so that further calls can be chained
 */
@SuppressWarnings("unchecked")
51+
public RequestBuilder setTimeout(TimeValue timeout) {
52+
request.timeout(timeout);
53+
return (RequestBuilder) this;
54+
}
4855
}

server/src/main/java/org/elasticsearch/action/support/nodes/NodesOperationRequestBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public final RequestBuilder setNodesIds(String... nodesIds) {
3939
}
4040

4141
@SuppressWarnings("unchecked")
42-
public final RequestBuilder setTimeout(TimeValue timeout) {
42+
public RequestBuilder setTimeout(TimeValue timeout) {
4343
request.timeout(timeout);
4444
return (RequestBuilder) this;
4545
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/GetJobsStatsAction.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ public boolean equals(Object obj) {
127127
return false;
128128
}
129129
Request other = (Request) obj;
130-
return Objects.equals(jobId, other.jobId) && Objects.equals(allowNoMatch, other.allowNoMatch);
130+
return Objects.equals(jobId, other.jobId)
131+
&& Objects.equals(allowNoMatch, other.allowNoMatch)
132+
&& Objects.equals(getTimeout(), other.getTimeout());
131133
}
132134
}
133135

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/Collector.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public Collection<MonitoringDoc> collect(final long timestamp, final long interv
8484
return doCollect(convertNode(timestamp, clusterService.localNode()), interval, clusterState);
8585
}
8686
} catch (ElasticsearchTimeoutException e) {
87-
logger.error((Supplier<?>) () -> new ParameterizedMessage("collector [{}] timed out when collecting data", name()));
87+
logger.error("collector [{}] timed out when collecting data: {}", name(), e.getMessage());
8888
} catch (Exception e) {
8989
logger.error((Supplier<?>) () -> new ParameterizedMessage("collector [{}] failed to collect data", name()), e);
9090
}
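x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/TimeoutUtils.java

Lines changed: 103 additions & 0 deletions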
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.monitoring.collector;
8+
9+
import org.elasticsearch.ElasticsearchException;
10+
import org.elasticsearch.ElasticsearchTimeoutException;
11+
import org.elasticsearch.action.FailedNodeException;
12+
import org.elasticsearch.action.support.DefaultShardOperationFailedException;
13+
import org.elasticsearch.action.support.broadcast.BroadcastResponse;
14+
import org.elasticsearch.action.support.nodes.BaseNodeResponse;
15+
import org.elasticsearch.action.support.nodes.BaseNodesResponse;
16+
import org.elasticsearch.action.support.tasks.BaseTasksResponse;
17+
import org.elasticsearch.common.unit.TimeValue;
18+
import org.elasticsearch.transport.ReceiveTimeoutTransportException;
19+
20+
import java.util.HashSet;
21+
import java.util.concurrent.TimeoutException;
22+
23+
/**
24+
* Utilities for identifying timeouts in responses to collection requests, since we prefer to fail the whole collection attempt if any of
25+
* the involved nodes times out.
26+
*/
27+
public final class TimeoutUtils {
28+
private TimeoutUtils() {
29+
}
30+
31+
/**
32+
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the
33+
* nodes that timed out and mentions {@code collectionTimeout}.
34+
*/
35+
public static <T extends BaseNodeResponse> void ensureNoTimeouts(TimeValue collectionTimeout, BaseNodesResponse<T> response) {
36+
HashSet<String> timedOutNodeIds = null;
37+
for (FailedNodeException failedNodeException : response.failures()) {
38+
if (isTimeoutFailure(failedNodeException)) {
39+
if (timedOutNodeIds == null) {
40+
timedOutNodeIds = new HashSet<>();
41+
}
42+
timedOutNodeIds.add(failedNodeException.nodeId());
43+
}
44+
}
45+
ensureNoTimeouts(collectionTimeout, timedOutNodeIds);
46+
}
47+
48+
/**
49+
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the
50+
* nodes that timed out and mentions {@code collectionTimeout}.
51+
*/
52+
public static void ensureNoTimeouts(TimeValue collectionTimeout, BaseTasksResponse response) {
53+
HashSet<String> timedOutNodeIds = null;
54+
for (ElasticsearchException nodeFailure : response.getNodeFailures()) {
55+
if (nodeFailure instanceof FailedNodeException) {
56+
FailedNodeException failedNodeException = (FailedNodeException) nodeFailure;
57+
if (isTimeoutFailure(failedNodeException)) {
58+
if (timedOutNodeIds == null) {
59+
timedOutNodeIds = new HashSet<>();
60+
}
61+
timedOutNodeIds.add(failedNodeException.nodeId());
62+
}
63+
}
64+
}
65+
ensureNoTimeouts(collectionTimeout, timedOutNodeIds);
66+
}
67+
68+
/**
69+
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the
70+
* nodes that timed out and mentions {@code collectionTimeout}.
71+
*/
72+
public static void ensureNoTimeouts(TimeValue collectionTimeout, BroadcastResponse response) {
73+
HashSet<String> timedOutNodeIds = null;
74+
for (DefaultShardOperationFailedException shardFailure : response.getShardFailures()) {
75+
final Throwable shardFailureCause = shardFailure.getCause();
76+
if (shardFailureCause instanceof FailedNodeException) {
77+
FailedNodeException failedNodeException = (FailedNodeException) shardFailureCause;
78+
if (isTimeoutFailure(failedNodeException)) {
79+
if (timedOutNodeIds == null) {
80+
timedOutNodeIds = new HashSet<>();
81+
}
82+
timedOutNodeIds.add(failedNodeException.nodeId());
83+
}
84+
}
85+
}
86+
ensureNoTimeouts(collectionTimeout, timedOutNodeIds);
87+
}
88+
89+
private static boolean isTimeoutFailure(FailedNodeException failedNodeException) {
90+
final Throwable cause = failedNodeException.getCause();
91+
return cause instanceof ElasticsearchTimeoutException
92+
|| cause instanceof TimeoutException
93+
|| cause instanceof ReceiveTimeoutTransportException;
94+
}
95+
96+
private static void ensureNoTimeouts(TimeValue collectionTimeout, HashSet<String> timedOutNodeIds) {
97+
if (timedOutNodeIds != null) {
98+
throw new ElasticsearchTimeoutException((timedOutNodeIds.size() == 1 ? "node " : "nodes ") + timedOutNodeIds +
99+
" did not respond within [" + collectionTimeout + "]");
100+
}
101+
}
102+
103+
}

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsCollector.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737

3838
import static org.elasticsearch.xpack.core.XPackSettings.SECURITY_ENABLED;
3939
import static org.elasticsearch.xpack.core.XPackSettings.TRANSPORT_SSL_ENABLED;
40+
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts;
4041

4142
/**
4243
* Collector for cluster stats.
@@ -82,13 +83,12 @@ protected boolean shouldCollect(final boolean isElectedMaster) {
8283
@Override
8384
protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
8485
final long interval,
85-
final ClusterState clusterState) throws Exception {
86-
final Supplier<ClusterStatsResponse> clusterStatsSupplier =
87-
() -> client.admin().cluster().prepareClusterStats().get(getCollectionTimeout());
86+
final ClusterState clusterState) {
8887
final Supplier<List<XPackFeatureSet.Usage>> usageSupplier =
8988
() -> new XPackUsageRequestBuilder(client).get().getUsages();
9089

91-
final ClusterStatsResponse clusterStats = clusterStatsSupplier.get();
90+
final ClusterStatsResponse clusterStats = client.admin().cluster().prepareClusterStats().setTimeout(getCollectionTimeout()).get();
91+
ensureNoTimeouts(getCollectionTimeout(), clusterStats);
9292

9393
final String clusterName = clusterService.getClusterName().value();
9494
final String clusterUuid = clusterUuid(clusterState);

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/indices/IndexRecoveryCollector.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.Objects;
2424

2525
import static org.elasticsearch.common.settings.Setting.boolSetting;
26+
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts;
2627

2728
/**
2829
* Collector for the Recovery API.
@@ -64,13 +65,16 @@ protected boolean shouldCollect(final boolean isElectedMaster) {
6465
@Override
6566
protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
6667
final long interval,
67-
final ClusterState clusterState) throws Exception {
68+
final ClusterState clusterState) {
6869
List<MonitoringDoc> results = new ArrayList<>(1);
6970
RecoveryResponse recoveryResponse = client.admin().indices().prepareRecoveries()
7071
.setIndices(getCollectionIndices())
7172
.setIndicesOptions(IndicesOptions.lenientExpandOpen())
7273
.setActiveOnly(getActiveRecoveriesOnly())
73-
.get(getCollectionTimeout());
74+
.setTimeout(getCollectionTimeout())
75+
.get();
76+
77+
ensureNoTimeouts(getCollectionTimeout(), recoveryResponse);
7478

7579
if (recoveryResponse.hasRecoveries()) {
7680
final String clusterUuid = clusterUuid(clusterState);

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/indices/IndexStatsCollector.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import java.util.Collections;
2525
import java.util.List;
2626

27+
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts;
28+
2729
/**
2830
* Collector for indices and singular index statistics.
2931
* <p>
@@ -54,7 +56,7 @@ protected boolean shouldCollect(final boolean isElectedMaster) {
5456
@Override
5557
protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
5658
final long interval,
57-
final ClusterState clusterState) throws Exception {
59+
final ClusterState clusterState) {
5860
final List<MonitoringDoc> results = new ArrayList<>();
5961
final IndicesStatsResponse indicesStatsResponse = client.admin().indices().prepareStats()
6062
.setIndices(getCollectionIndices())
@@ -71,7 +73,10 @@ protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
7173
.setQueryCache(true)
7274
.setRequestCache(true)
7375
.setBulk(true)
74-
.get(getCollectionTimeout());
76+
.setTimeout(getCollectionTimeout())
77+
.get();
78+
79+
ensureNoTimeouts(getCollectionTimeout(), indicesStatsResponse);
7580

7681
final long timestamp = timestamp();
7782
final String clusterUuid = clusterUuid(clusterState);

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/ml/JobStatsCollector.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.stream.Collectors;
2424

2525
import static org.elasticsearch.xpack.core.ClientHelper.MONITORING_ORIGIN;
26+
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts;
2627

2728
/**
2829
* Collector for Machine Learning Job Stats.
@@ -71,9 +72,10 @@ protected List<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
7172
final ClusterState clusterState) throws Exception {
7273
// fetch details about all jobs
7374
try (ThreadContext.StoredContext ignore = threadContext.stashWithOrigin(MONITORING_ORIGIN)) {
74-
final GetJobsStatsAction.Response jobs =
75-
client.execute(GetJobsStatsAction.INSTANCE, new GetJobsStatsAction.Request(Metadata.ALL))
76-
.actionGet(getCollectionTimeout());
75+
final GetJobsStatsAction.Request request = new GetJobsStatsAction.Request(Metadata.ALL).setTimeout(getCollectionTimeout());
76+
final GetJobsStatsAction.Response jobs = client.execute(GetJobsStatsAction.INSTANCE, request).actionGet();
77+
78+
ensureNoTimeouts(getCollectionTimeout(), jobs);
7779

7880
final long timestamp = timestamp();
7981
final String clusterUuid = clusterUuid(clusterState);

x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsCollector.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import java.util.Collections;
2424
import java.util.Objects;
2525

26+
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts;
27+
2628
/**
2729
* Collector for nodes statistics.
2830
* <p>
@@ -65,7 +67,7 @@ protected boolean shouldCollect(final boolean isElectedMaster) {
6567
@Override
6668
protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
6769
final long interval,
68-
final ClusterState clusterState) throws Exception {
70+
final ClusterState clusterState) {
6971
NodesStatsRequest request = new NodesStatsRequest("_local");
7072
request.indices(FLAGS);
7173
request.addMetrics(
@@ -74,8 +76,10 @@ protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node,
7476
NodesStatsRequest.Metric.PROCESS.metricName(),
7577
NodesStatsRequest.Metric.THREAD_POOL.metricName(),
7678
NodesStatsRequest.Metric.FS.metricName());
79+
request.timeout(getCollectionTimeout());
7780

78-
final NodesStatsResponse response = client.admin().cluster().nodesStats(request).actionGet(getCollectionTimeout());
81+
final NodesStatsResponse response = client.admin().cluster().nodesStats(request).actionGet();
82+
ensureNoTimeouts(getCollectionTimeout(), response);
7983

8084
// if there's a failure, then we failed to work with the
8185
// _local node (guaranteed a single exception)

x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsCollectorTests.java

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
*/
66
package org.elasticsearch.xpack.monitoring.collector.cluster;
77

8+
import org.elasticsearch.ElasticsearchTimeoutException;
89
import org.elasticsearch.Version;
910
import org.elasticsearch.action.ActionFuture;
11+
import org.elasticsearch.action.FailedNodeException;
1012
import org.elasticsearch.action.admin.cluster.stats.ClusterStatsIndices;
1113
import org.elasticsearch.action.admin.cluster.stats.ClusterStatsNodes;
1214
import org.elasticsearch.action.admin.cluster.stats.ClusterStatsRequestBuilder;
@@ -37,6 +39,7 @@
3739
import org.junit.Assert;
3840

3941
import java.util.Collection;
42+
import java.util.List;
4043
import java.util.Locale;
4144
import java.util.UUID;
4245

@@ -189,7 +192,8 @@ public void testDoCollect() throws Exception {
189192
when(mockClusterStatsResponse.getIndicesStats()).thenReturn(mockClusterStatsIndices);
190193

191194
final ClusterStatsRequestBuilder clusterStatsRequestBuilder = mock(ClusterStatsRequestBuilder.class);
192-
when(clusterStatsRequestBuilder.get(eq(timeout))).thenReturn(mockClusterStatsResponse);
195+
when(clusterStatsRequestBuilder.setTimeout(eq(timeout))).thenReturn(clusterStatsRequestBuilder);
196+
when(clusterStatsRequestBuilder.get()).thenReturn(mockClusterStatsResponse);
193197

194198
final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class);
195199
when(clusterAdminClient.prepareClusterStats()).thenReturn(clusterStatsRequestBuilder);
@@ -280,7 +284,7 @@ public void testDoCollectNoLicense() throws Exception {
280284
{
281285
indexNameExpressionResolver = mock(IndexNameExpressionResolver.class);
282286
when(indexNameExpressionResolver.concreteIndices(clusterState, IndicesOptions.lenientExpandOpen(), "apm-*"))
283-
.thenReturn(new Index[0]);
287+
.thenReturn(Index.EMPTY_ARRAY);
284288
}
285289

286290
final Client client = mock(Client.class);
@@ -296,7 +300,8 @@ public void testDoCollectNoLicense() throws Exception {
296300
when(mockClusterStatsResponse.getIndicesStats()).thenReturn(mockClusterStatsIndices);
297301

298302
final ClusterStatsRequestBuilder clusterStatsRequestBuilder = mock(ClusterStatsRequestBuilder.class);
299-
when(clusterStatsRequestBuilder.get(eq(timeout))).thenReturn(mockClusterStatsResponse);
303+
when(clusterStatsRequestBuilder.setTimeout(eq(timeout))).thenReturn(clusterStatsRequestBuilder);
304+
when(clusterStatsRequestBuilder.get()).thenReturn(mockClusterStatsResponse);
300305

301306
final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class);
302307
when(clusterAdminClient.prepareClusterStats()).thenReturn(clusterStatsRequestBuilder);
@@ -325,4 +330,58 @@ public void testDoCollectNoLicense() throws Exception {
325330
final ClusterStatsMonitoringDoc doc = (ClusterStatsMonitoringDoc) results.iterator().next();
326331
assertThat(doc.getLicense(), nullValue());
327332
}
333+
334+
public void testDoCollectThrowsTimeoutException() throws Exception {
335+
final TimeValue timeout;
336+
{
337+
final String clusterName = randomAlphaOfLength(10);
338+
whenClusterStateWithName(clusterName);
339+
final String clusterUUID = UUID.randomUUID().toString();
340+
whenClusterStateWithUUID(clusterUUID);
341+
timeout = TimeValue.timeValueSeconds(randomIntBetween(1, 120));
342+
withCollectionTimeout(ClusterStatsCollector.CLUSTER_STATS_TIMEOUT, timeout);
343+
}
344+
final IndexNameExpressionResolver indexNameExpressionResolver;
345+
{
346+
indexNameExpressionResolver = mock(IndexNameExpressionResolver.class);
347+
when(indexNameExpressionResolver.concreteIndices(clusterState, IndicesOptions.lenientExpandOpen(), "apm-*"))
348+
.thenReturn(Index.EMPTY_ARRAY);
349+
}
350+
351+
final Client client = mock(Client.class);
352+
{
353+
final ClusterStatsResponse mockClusterStatsResponse = mock(ClusterStatsResponse.class);
354+
final ClusterHealthStatus clusterStatus = randomFrom(ClusterHealthStatus.values());
355+
when(mockClusterStatsResponse.getStatus()).thenReturn(clusterStatus);
356+
when(mockClusterStatsResponse.getNodesStats()).thenReturn(mock(ClusterStatsNodes.class));
357+
when(mockClusterStatsResponse.failures()).thenReturn(List.of(new FailedNodeException("node", "msg",
358+
new ElasticsearchTimeoutException("timed out"))));
359+
360+
final ClusterStatsIndices mockClusterStatsIndices = mock(ClusterStatsIndices.class);
361+
362+
when(mockClusterStatsIndices.getIndexCount()).thenReturn(0);
363+
when(mockClusterStatsResponse.getIndicesStats()).thenReturn(mockClusterStatsIndices);
364+
365+
final ClusterStatsRequestBuilder clusterStatsRequestBuilder = mock(ClusterStatsRequestBuilder.class);
366+
when(clusterStatsRequestBuilder.setTimeout(eq(timeout))).thenReturn(clusterStatsRequestBuilder);
367+
when(clusterStatsRequestBuilder.get()).thenReturn(mockClusterStatsResponse);
368+
369+
final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class);
370+
when(clusterAdminClient.prepareClusterStats()).thenReturn(clusterStatsRequestBuilder);
371+
372+
final AdminClient adminClient = mock(AdminClient.class);
373+
when(adminClient.cluster()).thenReturn(clusterAdminClient);
374+
when(client.admin()).thenReturn(adminClient);
375+
}
376+
377+
final long interval = randomNonNegativeLong();
378+
final Settings.Builder settings = Settings.builder();
379+
final MonitoringDoc.Node node = MonitoringTestUtils.randomMonitoringNode(random());
380+
381+
final ClusterStatsCollector collector =
382+
new ClusterStatsCollector(settings.build(), clusterService, licenseState,
383+
client, licenseService, indexNameExpressionResolver);
384+
expectThrows(ElasticsearchTimeoutException.class, () -> collector.doCollect(node, interval, clusterState));
385+
}
386+
328387
}

0 commit comments

Comments
 (0)