Skip to content

Commit 37c8c0f

Browse files
authored
Improve logging for batched cluster state updates (#19255)
We've been slowly improving batch support in `ClusterService` so service won't need to implement this tricky logic themselves. These good changes are blessed but our logging infra didn't catch up and we now log things like: ``` [2016-07-04 21:51:22,318][DEBUG][cluster.service ] [node_sm0] processing [put-mapping [type1],put-mapping [type1]]: ``` Depending on the `source` string this can get quite ugly (mostly in the ZenDiscovery area). This PR adds some infra to improve logging, keeping the non-batched task the same. As result the above line looks like: ``` [2016-07-04 21:44:45,047][DEBUG][cluster.service ] [node_s0] processing [put-mapping[type0, type0, type0]]: execute ``` ZenDiscovery waiting on join moved from: ``` [2016-07-04 17:09:45,111][DEBUG][cluster.service ] [node_t0] processing [elected_as_master, [1] nodes joined),elected_as_master, [1] nodes joined)]: execute ``` To ``` [2016-07-04 22:03:30,142][DEBUG][cluster.service ] [node_t3] processing [elected_as_master ([3] nodes joined)[{node_t2}{R3hu3uoSQee0B6bkuw8pjw}{p9n28HDJQdiDMdh3tjxA5g}{127.0.0.1}{127.0.0.1:30107}, {node_t1}{ynYQfk7uR8qR5wKIysFlQg}{wa_OKuJHSl-Oyl9Gis-GXg}{127.0.0.1}{127.0.0.1:30106}, {node_t0}{pweq-2T4TlKPrEVAVW6bJw}{NPBSLXSTTguT1So0JsZY8g}{127.0.0.1}{127.0.0.1:30105}]]: execute ``` As a bonus, I removed all `zen-disco` prefixes to sources from that area.
1 parent 6861d35 commit 37c8c0f

File tree

8 files changed

+101
-39
lines changed

8 files changed

+101
-39
lines changed

core/src/main/java/org/elasticsearch/cluster/ClusterStateTaskExecutor.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,25 @@ default boolean runOnlyOnMaster() {
4646
default void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
4747
}
4848

49+
/**
50+
* Builds a concise description of a list of tasks (to be used in logging etc.).
51+
*
52+
* Note that the tasks given are not necessarily the same as those that will be passed to {@link #execute(ClusterState, List)}.
53+
* but are guaranteed to be a subset of them. This method can be called multiple times with different lists before execution.
54+
* This allows groupd task description but the submitting source.
55+
*/
56+
default String describeTasks(List<T> tasks) {
57+
return tasks.stream().map(T::toString).reduce((s1,s2) -> {
58+
if (s1.isEmpty()) {
59+
return s2;
60+
} else if (s2.isEmpty()) {
61+
return s1;
62+
} else {
63+
return s1 + ", " + s2;
64+
}
65+
}).orElse("");
66+
}
67+
4968
/**
5069
* Represents the result of a batched execution of cluster state update tasks
5170
* @param <T> the type of the cluster state update task

core/src/main/java/org/elasticsearch/cluster/ClusterStateUpdateTask.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ public final BatchResult<ClusterStateUpdateTask> execute(ClusterState currentSta
4646
return BatchResult.<ClusterStateUpdateTask>builder().successes(tasks).build(result);
4747
}
4848

49+
@Override
50+
public String describeTasks(List<ClusterStateUpdateTask> tasks) {
51+
return ""; // one of task, source is enough
52+
}
53+
4954
/**
5055
* Update the cluster state based on the current state. Return the *same instance* if no state
5156
* should be changed.

core/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ public ShardFailedTransportHandler(ClusterService clusterService, ShardFailedClu
183183
public void messageReceived(ShardRoutingEntry request, TransportChannel channel) throws Exception {
184184
logger.warn("{} received shard failed for {}", request.failure, request.shardRouting.shardId(), request);
185185
clusterService.submitStateUpdateTask(
186-
"shard-failed (" + request.shardRouting + "), message [" + request.message + "]",
186+
"shard-failed",
187187
request,
188188
ClusterStateTaskConfig.build(Priority.HIGH),
189189
shardFailedClusterStateTaskExecutor,
@@ -233,6 +233,11 @@ static class ShardFailedClusterStateTaskExecutor implements ClusterStateTaskExec
233233
this.logger = logger;
234234
}
235235

236+
@Override
237+
public String describeTasks(List<ShardRoutingEntry> tasks) {
238+
return tasks.stream().map(entry -> entry.getShardRouting().toString()).reduce((s1, s2) -> s1 + ", " + s2).orElse("");
239+
}
240+
236241
@Override
237242
public BatchResult<ShardRoutingEntry> execute(ClusterState currentState, List<ShardRoutingEntry> tasks) throws Exception {
238243
BatchResult.Builder<ShardRoutingEntry> batchResultBuilder = BatchResult.builder();
@@ -346,7 +351,7 @@ public ShardStartedTransportHandler(ClusterService clusterService, ShardStartedC
346351
public void messageReceived(ShardRoutingEntry request, TransportChannel channel) throws Exception {
347352
logger.debug("{} received shard started for [{}]", request.shardRouting.shardId(), request);
348353
clusterService.submitStateUpdateTask(
349-
"shard-started (" + request.shardRouting + "), reason [" + request.message + "]",
354+
"shard-started",
350355
request,
351356
ClusterStateTaskConfig.build(Priority.URGENT),
352357
shardStartedClusterStateTaskExecutor,
@@ -364,6 +369,11 @@ public ShardStartedClusterStateTaskExecutor(AllocationService allocationService,
364369
this.logger = logger;
365370
}
366371

372+
@Override
373+
public String describeTasks(List<ShardRoutingEntry> tasks) {
374+
return tasks.stream().map(entry -> entry.getShardRouting().toString()).reduce((s1, s2) -> s1 + ", " + s2).orElse("");
375+
}
376+
367377
@Override
368378
public BatchResult<ShardRoutingEntry> execute(ClusterState currentState, List<ShardRoutingEntry> tasks) throws Exception {
369379
BatchResult.Builder<ShardRoutingEntry> builder = BatchResult.builder();

core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataMappingService.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ static class RefreshTask {
8282
this.index = index;
8383
this.indexUUID = indexUUID;
8484
}
85+
86+
@Override
87+
public String toString() {
88+
return "[" + index + "][" + indexUUID + "]";
89+
}
8590
}
8691

8792
class RefreshTaskExecutor implements ClusterStateTaskExecutor<RefreshTask> {
@@ -198,7 +203,7 @@ private boolean refreshIndexMapping(IndexService indexService, IndexMetaData.Bui
198203
*/
199204
public void refreshMapping(final String index, final String indexUUID) {
200205
final RefreshTask refreshTask = new RefreshTask(index, indexUUID);
201-
clusterService.submitStateUpdateTask("refresh-mapping [" + index + "]",
206+
clusterService.submitStateUpdateTask("refresh-mapping",
202207
refreshTask,
203208
ClusterStateTaskConfig.build(Priority.HIGH),
204209
refreshExecutor,
@@ -347,10 +352,15 @@ private ClusterState applyRequest(ClusterState currentState, PutMappingClusterSt
347352

348353
return ClusterState.builder(currentState).metaData(builder).build();
349354
}
355+
356+
@Override
357+
public String describeTasks(List<PutMappingClusterStateUpdateRequest> tasks) {
358+
return tasks.stream().map(PutMappingClusterStateUpdateRequest::type).reduce((s1, s2) -> s1 + ", " + s2).orElse("");
359+
}
350360
}
351361

352362
public void putMapping(final PutMappingClusterStateUpdateRequest request, final ActionListener<ClusterStateUpdateResponse> listener) {
353-
clusterService.submitStateUpdateTask("put-mapping [" + request.type() + "]",
363+
clusterService.submitStateUpdateTask("put-mapping",
354364
request,
355365
ClusterStateTaskConfig.build(Priority.HIGH, request.masterNodeTimeout()),
356366
putMappingExecutor,

core/src/main/java/org/elasticsearch/cluster/service/ClusterService.java

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
import org.elasticsearch.cluster.routing.RoutingTable;
4343
import org.elasticsearch.common.Nullable;
4444
import org.elasticsearch.common.Priority;
45-
import org.elasticsearch.common.Strings;
4645
import org.elasticsearch.common.component.AbstractLifecycleComponent;
4746
import org.elasticsearch.common.logging.ESLogger;
4847
import org.elasticsearch.common.logging.Loggers;
@@ -512,33 +511,37 @@ public String source() {
512511

513512
<T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
514513
final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
515-
final ArrayList<String> sources = new ArrayList<>();
514+
final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
516515
synchronized (updateTasksPerExecutor) {
517516
List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
518517
if (pending != null) {
519518
for (UpdateTask<T> task : pending) {
520519
if (task.processed.getAndSet(true) == false) {
521-
logger.trace("will process [{}]", task.source);
520+
logger.trace("will process [{}[{}]]", task.source, task.task);
522521
toExecute.add(task);
523-
sources.add(task.source);
522+
processTasksBySource.computeIfAbsent(task.source, s -> new ArrayList<>()).add(task.task);
524523
} else {
525-
logger.trace("skipping [{}], already processed", task.source);
524+
logger.trace("skipping [{}[{}]], already processed", task.source, task.task);
526525
}
527526
}
528527
}
529528
}
530529
if (toExecute.isEmpty()) {
531530
return;
532531
}
533-
final String source = Strings.collectionToCommaDelimitedString(sources);
532+
final String tasksSummary = processTasksBySource.entrySet().stream().map(entry -> {
533+
String tasks = executor.describeTasks(entry.getValue());
534+
return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
535+
}).reduce((s1, s2) -> s1 + ", " + s2).orElse("");
536+
534537
if (!lifecycle.started()) {
535-
logger.debug("processing [{}]: ignoring, cluster_service not started", source);
538+
logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
536539
return;
537540
}
538-
logger.debug("processing [{}]: execute", source);
541+
logger.debug("processing [{}]: execute", tasksSummary);
539542
ClusterState previousClusterState = clusterState;
540543
if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
541-
logger.debug("failing [{}]: local node is no longer master", source);
544+
logger.debug("failing [{}]: local node is no longer master", tasksSummary);
542545
toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source));
543546
return;
544547
}
@@ -551,10 +554,10 @@ <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
551554
TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
552555
if (logger.isTraceEnabled()) {
553556
logger.trace("failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}", e, executionTime,
554-
previousClusterState.version(), source, previousClusterState.nodes().prettyPrint(),
557+
previousClusterState.version(), tasksSummary, previousClusterState.nodes().prettyPrint(),
555558
previousClusterState.routingTable().prettyPrint(), previousClusterState.getRoutingNodes().prettyPrint());
556559
}
557-
warnAboutSlowTaskIfNeeded(executionTime, source);
560+
warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
558561
batchResult = ClusterStateTaskExecutor.BatchResult.<T>builder()
559562
.failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
560563
.build(previousClusterState);
@@ -597,8 +600,8 @@ <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
597600
task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
598601
}
599602
TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
600-
logger.debug("processing [{}]: took [{}] no change in cluster_state", source, executionTime);
601-
warnAboutSlowTaskIfNeeded(executionTime, source);
603+
logger.debug("processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
604+
warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
602605
return;
603606
}
604607

@@ -640,18 +643,18 @@ <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
640643
newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);
641644

642645
if (logger.isTraceEnabled()) {
643-
logger.trace("cluster state updated, source [{}]\n{}", source, newClusterState.prettyPrint());
646+
logger.trace("cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint());
644647
} else if (logger.isDebugEnabled()) {
645-
logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), source);
648+
logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), tasksSummary);
646649
}
647650

648-
ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(source, newClusterState, previousClusterState);
651+
ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
649652
// new cluster state, notify all listeners
650653
final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
651654
if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
652655
String summary = nodesDelta.shortSummary();
653656
if (summary.length() > 0) {
654-
logger.info("{}, reason: {}", summary, source);
657+
logger.info("{}, reason: {}", summary, tasksSummary);
655658
}
656659
}
657660

@@ -665,7 +668,7 @@ <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
665668
try {
666669
clusterStatePublisher.accept(clusterChangedEvent, ackListener);
667670
} catch (Discovery.FailedToCommitClusterStateException t) {
668-
logger.warn("failing [{}]: failed to commit cluster state version [{}]", t, source, newClusterState.version());
671+
logger.warn("failing [{}]: failed to commit cluster state version [{}]", t, tasksSummary, newClusterState.version());
669672
proccessedListeners.forEach(task -> task.listener.onFailure(task.source, t));
670673
return;
671674
}
@@ -719,17 +722,17 @@ <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
719722
try {
720723
executor.clusterStatePublished(clusterChangedEvent);
721724
} catch (Exception e) {
722-
logger.error("exception thrown while notifying executor of new cluster state publication [{}]", e, source);
725+
logger.error("exception thrown while notifying executor of new cluster state publication [{}]", e, tasksSummary);
723726
}
724727

725728
TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
726-
logger.debug("processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})", source, executionTime,
727-
newClusterState.version(), newClusterState.stateUUID());
728-
warnAboutSlowTaskIfNeeded(executionTime, source);
729+
logger.debug("processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})", tasksSummary,
730+
executionTime, newClusterState.version(), newClusterState.stateUUID());
731+
warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
729732
} catch (Exception e) {
730733
TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
731734
logger.warn("failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}", e, executionTime,
732-
newClusterState.version(), newClusterState.stateUUID(), source, newClusterState.prettyPrint());
735+
newClusterState.version(), newClusterState.stateUUID(), tasksSummary, newClusterState.prettyPrint());
733736
// TODO: do we want to call updateTask.onFailure here?
734737
}
735738

core/src/main/java/org/elasticsearch/discovery/zen/NodeJoinController.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ public synchronized void handleJoinRequest(final DiscoveryNode node, final Membe
180180
electionContext.addIncomingJoin(node, callback);
181181
checkPendingJoinsAndElectIfNeeded();
182182
} else {
183-
clusterService.submitStateUpdateTask("zen-disco-join(node " + node + "])",
183+
clusterService.submitStateUpdateTask("zen-disco-node-join",
184184
node, ClusterStateTaskConfig.build(Priority.URGENT),
185185
joinTaskExecutor, new JoinTaskListener(callback, logger));
186186
}
@@ -279,7 +279,7 @@ assert isEnoughPendingJoins(getPendingMasterJoinsCount()) : "becoming a master b
279279
innerClose();
280280

281281
Map<DiscoveryNode, ClusterStateTaskListener> tasks = getPendingAsTasks();
282-
final String source = "zen-disco-join(elected_as_master, [" + tasks.size() + "] nodes joined)";
282+
final String source = "zen-disco-elected-as-master ([" + tasks.size() + "] nodes joined)";
283283

284284
tasks.put(BECOME_MASTER_TASK, joinProcessedListener);
285285
clusterService.submitStateUpdateTasks(source, tasks, ClusterStateTaskConfig.build(Priority.URGENT), joinTaskExecutor);
@@ -288,7 +288,7 @@ assert isEnoughPendingJoins(getPendingMasterJoinsCount()) : "becoming a master b
288288
public synchronized void closeAndProcessPending(String reason) {
289289
innerClose();
290290
Map<DiscoveryNode, ClusterStateTaskListener> tasks = getPendingAsTasks();
291-
final String source = "zen-disco-join(election stopped [" + reason + "] nodes joined";
291+
final String source = "zen-disco-process-pending-joins [" + reason + "]";
292292

293293
tasks.put(FINISH_ELECTION_NOT_MASTER_TASK, joinProcessedListener);
294294
clusterService.submitStateUpdateTasks(source, tasks, ClusterStateTaskConfig.build(Priority.URGENT), joinTaskExecutor);
@@ -381,12 +381,22 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
381381

382382
// a task indicated that the current node should become master, if no current master is known
383383
private static final DiscoveryNode BECOME_MASTER_TASK = new DiscoveryNode("_BECOME_MASTER_TASK_", LocalTransportAddress.buildUnique(),
384-
Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);
384+
Collections.emptyMap(), Collections.emptySet(), Version.CURRENT) {
385+
@Override
386+
public String toString() {
387+
return ""; // this is not really task , so don't log anything about it...
388+
}
389+
};
385390

386391
// a task that is used to process pending joins without explicitly becoming a master and listening to the results
387392
// this task is used when election is stop without the local node becoming a master per se (though it might
388393
private static final DiscoveryNode FINISH_ELECTION_NOT_MASTER_TASK = new DiscoveryNode("_NOT_MASTER_TASK_",
389-
LocalTransportAddress.buildUnique(), Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);
394+
LocalTransportAddress.buildUnique(), Collections.emptyMap(), Collections.emptySet(), Version.CURRENT) {
395+
@Override
396+
public String toString() {
397+
return ""; // this is not really task , so don't log anything about it...
398+
}
399+
};
390400

391401
class JoinTaskExecutor implements ClusterStateTaskExecutor<DiscoveryNode> {
392402

0 commit comments

Comments
 (0)