
Commit e155fc0

Avoid stack overflow in auto-follow coordinator (#44421)
This commit avoids a situation where we might stack overflow in the auto-follower coordinator. In the face of repeated failures to get the remote cluster state, we would previously be called back on the same thread and then recurse to try again. If this failure persists, the repeated callbacks on the same thread would lead to a stack overflow. The repeated failures can occur, for example, if the connect queue is full when we attempt to make a connection to the remote cluster. This commit avoids this by truncating the call stack if we are called back on the same thread as the initial request was made on.
1 parent f2cef27 commit e155fc0
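
The pattern behind the fix can be sketched in isolation. The class below is not the Elasticsearch code; the names (RetryingPoller, attempt, handle) and the always-failing stub are invented for illustration. It assumes only what the message above describes: an operation whose callback may run synchronously on the calling thread when the request fails before anything asynchronous happens (for example, because the connect queue is full).

import java.util.concurrent.Executor;
import java.util.function.BiConsumer;

// Hypothetical sketch, not Elasticsearch code: retry an operation whose callback may be
// invoked synchronously on the calling thread when the request fails up front.
final class RetryingPoller {

    private final Executor executor;

    RetryingPoller(final Executor executor) {
        this.executor = executor;
    }

    void poll() {
        // remember which thread issued the request
        final Thread thread = Thread.currentThread();
        attempt((response, error) -> {
            if (error != null) {
                if (thread == Thread.currentThread()) {
                    // synchronous failure: we are still on the requesting thread, so retrying
                    // directly would recurse; fork to the executor to truncate the stack
                    executor.execute(this::poll);
                } else {
                    // asynchronous callback: the stack is already short, retry in place
                    poll();
                }
                return;
            }
            handle(response);
        });
    }

    // stand-in for the real asynchronous call; this stub always fails before doing anything
    // asynchronous, so the callback runs on the caller's thread and poll() keeps rescheduling
    // itself through the executor instead of growing the stack
    void attempt(final BiConsumer<String, Exception> callback) {
        callback.accept(null, new RuntimeException("simulated synchronous failure"));
    }

    void handle(final String response) {
        // process a successful response
    }
}

In the change itself, the same comparison is made in AutoFollower#finalise, and the executor is the CCR thread pool executor handed in through Ccr#createComponents, as the diffs below show.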

File tree

3 files changed: 153 additions, 35 deletions

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/Ccr.java

Lines changed: 10 additions & 9 deletions

@@ -181,16 +181,17 @@ public Collection<Object> createComponents(
         CcrRestoreSourceService restoreSourceService = new CcrRestoreSourceService(threadPool, ccrSettings);
         this.restoreSourceService.set(restoreSourceService);
         return Arrays.asList(
+            ccrLicenseChecker,
+            restoreSourceService,
+            new CcrRepositoryManager(settings, clusterService, client),
+            new AutoFollowCoordinator(
+                settings,
+                client,
+                clusterService,
                 ccrLicenseChecker,
-                restoreSourceService,
-                new CcrRepositoryManager(settings, clusterService, client),
-                new AutoFollowCoordinator(
-                    settings,
-                    client,
-                    clusterService,
-                    ccrLicenseChecker,
-                    threadPool::relativeTimeInMillis,
-                    threadPool::absoluteTimeInMillis));
+                threadPool::relativeTimeInMillis,
+                threadPool::absoluteTimeInMillis,
+                threadPool.executor(Ccr.CCR_THREAD_POOL_NAME)));
     }
 
     @Override

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/AutoFollowCoordinator.java

Lines changed: 35 additions & 14 deletions

@@ -55,6 +55,7 @@
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.concurrent.Executor;
 import java.util.function.BiConsumer;
 import java.util.function.Consumer;
 import java.util.function.Function;
@@ -78,6 +79,7 @@ public class AutoFollowCoordinator extends AbstractLifecycleComponent implements
     private final CcrLicenseChecker ccrLicenseChecker;
     private final LongSupplier relativeMillisTimeProvider;
     private final LongSupplier absoluteMillisTimeProvider;
+    private final Executor executor;
 
     private volatile TimeValue waitForMetadataTimeOut;
     private volatile Map<String, AutoFollower> autoFollowers = Collections.emptyMap();
@@ -89,18 +91,20 @@ public class AutoFollowCoordinator extends AbstractLifecycleComponent implements
     private final LinkedHashMap<String, Tuple<Long, ElasticsearchException>> recentAutoFollowErrors;
 
     public AutoFollowCoordinator(
-            Settings settings,
-            Client client,
-            ClusterService clusterService,
-            CcrLicenseChecker ccrLicenseChecker,
-            LongSupplier relativeMillisTimeProvider,
-            LongSupplier absoluteMillisTimeProvider) {
+            final Settings settings,
+            final Client client,
+            final ClusterService clusterService,
+            final CcrLicenseChecker ccrLicenseChecker,
+            final LongSupplier relativeMillisTimeProvider,
+            final LongSupplier absoluteMillisTimeProvider,
+            final Executor executor) {
 
         this.client = client;
         this.clusterService = clusterService;
         this.ccrLicenseChecker = Objects.requireNonNull(ccrLicenseChecker, "ccrLicenseChecker");
         this.relativeMillisTimeProvider = relativeMillisTimeProvider;
         this.absoluteMillisTimeProvider = absoluteMillisTimeProvider;
+        this.executor = Objects.requireNonNull(executor);
         this.recentAutoFollowErrors = new LinkedHashMap<String, Tuple<Long, ElasticsearchException>>() {
             @Override
             protected boolean removeEldestEntry(final Map.Entry<String, Tuple<Long, ElasticsearchException>> eldest) {
@@ -210,7 +214,7 @@ void updateAutoFollowers(ClusterState followerClusterState) {
         Map<String, AutoFollower> newAutoFollowers = new HashMap<>(newRemoteClusters.size());
         for (String remoteCluster : newRemoteClusters) {
             AutoFollower autoFollower =
-                new AutoFollower(remoteCluster, this::updateStats, clusterService::state, relativeMillisTimeProvider) {
+                new AutoFollower(remoteCluster, this::updateStats, clusterService::state, relativeMillisTimeProvider, executor) {
 
                 @Override
                 void getRemoteClusterState(final String remoteCluster,
@@ -332,6 +336,7 @@ abstract static class AutoFollower {
         private final Consumer<List<AutoFollowResult>> statsUpdater;
         private final Supplier<ClusterState> followerClusterStateSupplier;
         private final LongSupplier relativeTimeProvider;
+        private final Executor executor;
 
         private volatile long lastAutoFollowTimeInMillis = -1;
         private volatile long metadataVersion = 0;
@@ -344,11 +349,13 @@ abstract static class AutoFollower {
         AutoFollower(final String remoteCluster,
                      final Consumer<List<AutoFollowResult>> statsUpdater,
                      final Supplier<ClusterState> followerClusterStateSupplier,
-                     LongSupplier relativeTimeProvider) {
+                     final LongSupplier relativeTimeProvider,
+                     final Executor executor) {
             this.remoteCluster = remoteCluster;
            this.statsUpdater = statsUpdater;
             this.followerClusterStateSupplier = followerClusterStateSupplier;
             this.relativeTimeProvider = relativeTimeProvider;
+            this.executor = Objects.requireNonNull(executor);
         }
 
         void start() {
@@ -387,6 +394,7 @@ void start() {
             this.autoFollowPatternsCountDown = new CountDown(patterns.size());
             this.autoFollowResults = new AtomicArray<>(patterns.size());
 
+            final Thread thread = Thread.currentThread();
             getRemoteClusterState(remoteCluster, metadataVersion + 1, (remoteClusterStateResponse, remoteError) -> {
                 // Also check removed flag here, as it may take a while for this remote cluster state api call to return:
                 if (removed) {
@@ -403,7 +411,7 @@ void start() {
                     }
                     ClusterState remoteClusterState = remoteClusterStateResponse.getState();
                     metadataVersion = remoteClusterState.metaData().version();
-                    autoFollowIndices(autoFollowMetadata, clusterState, remoteClusterState, patterns);
+                    autoFollowIndices(autoFollowMetadata, clusterState, remoteClusterState, patterns, thread);
                 } else {
                     assert remoteError != null;
                     if (remoteError instanceof NoSuchRemoteClusterException) {
@@ -414,7 +422,7 @@ void start() {
 
                     for (int i = 0; i < patterns.size(); i++) {
                         String autoFollowPatternName = patterns.get(i);
-                        finalise(i, new AutoFollowResult(autoFollowPatternName, remoteError));
+                        finalise(i, new AutoFollowResult(autoFollowPatternName, remoteError), thread);
                     }
                 }
             });
@@ -428,7 +436,8 @@ void stop() {
         private void autoFollowIndices(final AutoFollowMetadata autoFollowMetadata,
                                        final ClusterState clusterState,
                                        final ClusterState remoteClusterState,
-                                       final List<String> patterns) {
+                                       final List<String> patterns,
+                                       final Thread thread) {
             int i = 0;
             for (String autoFollowPatternName : patterns) {
                 final int slot = i;
@@ -439,7 +448,7 @@ private void autoFollowIndices(final AutoFollowMetadata autoFollowMetadata,
                 final List<Index> leaderIndicesToFollow =
                     getLeaderIndicesToFollow(autoFollowPattern, remoteClusterState, followedIndices);
                 if (leaderIndicesToFollow.isEmpty()) {
-                    finalise(slot, new AutoFollowResult(autoFollowPatternName));
+                    finalise(slot, new AutoFollowResult(autoFollowPatternName), thread);
                 } else {
                     List<Tuple<String, AutoFollowPattern>> patternsForTheSameRemoteCluster = autoFollowMetadata.getPatterns()
                         .entrySet().stream()
@@ -448,7 +457,7 @@ private void autoFollowIndices(final AutoFollowMetadata autoFollowMetadata,
                         .map(item -> new Tuple<>(item.getKey(), item.getValue()))
                         .collect(Collectors.toList());
 
-                    Consumer<AutoFollowResult> resultHandler = result -> finalise(slot, result);
+                    Consumer<AutoFollowResult> resultHandler = result -> finalise(slot, result, thread);
                     checkAutoFollowPattern(autoFollowPatternName, remoteCluster, autoFollowPattern, leaderIndicesToFollow, headers,
                         patternsForTheSameRemoteCluster, remoteClusterState.metaData(), clusterState.metaData(), resultHandler);
                 }
@@ -561,11 +570,23 @@ private void followLeaderIndex(String autoFollowPattenName,
             createAndFollow(headers, request, successHandler, onResult);
         }
 
-        private void finalise(int slot, AutoFollowResult result) {
+        private void finalise(int slot, AutoFollowResult result, final Thread thread) {
            assert autoFollowResults.get(slot) == null;
            autoFollowResults.set(slot, result);
            if (autoFollowPatternsCountDown.countDown()) {
                statsUpdater.accept(autoFollowResults.asList());
+                /*
+                 * In the face of a failure, we could be called back on the same thread. That is, it could be that we
+                 * never fired off the asynchronous remote cluster state call, instead failing beforehand. In this case,
+                 * we will recurse on the same thread. If there are repeated failures, we could blow the stack and
+                 * overflow. A real-world scenario in which this can occur is if the local connect queue is full. To
+                 * avoid this, if we are called back on the same thread, then we truncate the stack by forking to
+                 * another thread.
+                 */
+                if (thread == Thread.currentThread()) {
+                    executor.execute(this::start);
+                    return;
+                }
                start();
            }
        }
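
For reference, the recursion that the thread == Thread.currentThread() check above cuts off is easy to reproduce outside Elasticsearch. The snippet below is a hypothetical, self-contained demonstration (none of these names exist in the codebase): a failure handler that retries synchronously never lets the stack unwind, so persistent failures end in StackOverflowError. Forking the retry to the executor, as finalise now does, hands the next attempt a fresh stack.

// Hypothetical demonstration, unrelated to the CCR code: each failed attempt retries from
// inside the synchronously invoked failure handler, so frames pile up until the thread
// overflows.
public final class SynchronousRetryOverflow {

    private static long attempts = 0;

    public static void main(final String[] args) {
        try {
            attempt();
        } catch (StackOverflowError e) {
            System.out.println("stack overflowed after " + attempts + " synchronous retries");
        }
    }

    private static void attempt() {
        attempts++;
        // the operation fails before anything asynchronous happens,
        // so the failure handler runs on this same thread
        onFailure();
    }

    private static void onFailure() {
        // retrying directly recurses; the stack never unwinds between attempts
        attempt();
    }
}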
