Skip to content

Commit cfb0780

Browse files
authored
More robust timeout for repo analysis (#101184)
Replaces the transport-level timeout with an overall timeout on the whole repository analysis task to ensure that all child tasks terminate promptly. Relates #66992 Closes #101182
1 parent 6b87905 commit cfb0780

File tree

4 files changed

+79
-19
lines changed

4 files changed

+79
-19
lines changed

docs/changelog/101184.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 101184
2+
summary: More robust timeout for repo analysis
3+
area: Snapshot/Restore
4+
type: bug
5+
issues:
6+
- 101182

x-pack/plugin/snapshot-repo-test-kit/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/10_analyze.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -176,4 +176,5 @@ setup:
176176
- match: { status: 500 }
177177
- match: { error.type: repository_verification_exception }
178178
- match: { error.reason: "/.*test_repo_slow..analysis.failed.*/" }
179-
- match: { error.root_cause.0.type: receive_timeout_transport_exception }
179+
- match: { error.root_cause.0.type: repository_verification_exception }
180+
- match: { error.root_cause.0.reason: "/.*test_repo_slow..analysis.timed.out.after..1s.*/" }

x-pack/plugin/snapshot-repo-test-kit/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/testkit/RepositoryAnalysisFailureIT.java

+32-4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.elasticsearch.common.util.concurrent.CountDown;
2828
import org.elasticsearch.core.CheckedConsumer;
2929
import org.elasticsearch.core.Nullable;
30+
import org.elasticsearch.core.TimeValue;
3031
import org.elasticsearch.env.Environment;
3132
import org.elasticsearch.indices.recovery.RecoverySettings;
3233
import org.elasticsearch.plugins.Plugin;
@@ -61,6 +62,7 @@
6162
import java.util.stream.Collectors;
6263

6364
import static org.hamcrest.Matchers.anEmptyMap;
65+
import static org.hamcrest.Matchers.containsString;
6466
import static org.hamcrest.Matchers.equalTo;
6567
import static org.hamcrest.Matchers.nullValue;
6668

@@ -347,8 +349,26 @@ public BytesReference onCompareAndExchange(BytesRegister register, BytesReferenc
347349
}
348350
}
349351

350-
private RepositoryAnalyzeAction.Response analyseRepository(RepositoryAnalyzeAction.Request request) {
351-
return client().execute(RepositoryAnalyzeAction.INSTANCE, request).actionGet(30L, TimeUnit.SECONDS);
352+
public void testTimesOutSpinningRegisterAnalysis() {
353+
final RepositoryAnalyzeAction.Request request = new RepositoryAnalyzeAction.Request("test-repo");
354+
request.timeout(TimeValue.timeValueMillis(between(1, 1000)));
355+
356+
blobStore.setDisruption(new Disruption() {
357+
@Override
358+
public boolean compareAndExchangeReturnsWitness() {
359+
return false;
360+
}
361+
});
362+
final var exception = expectThrows(RepositoryVerificationException.class, () -> analyseRepository(request));
363+
assertThat(exception.getMessage(), containsString("analysis failed"));
364+
assertThat(
365+
asInstanceOf(RepositoryVerificationException.class, exception.getCause()).getMessage(),
366+
containsString("analysis timed out")
367+
);
368+
}
369+
370+
private void analyseRepository(RepositoryAnalyzeAction.Request request) {
371+
client().execute(RepositoryAnalyzeAction.INSTANCE, request).actionGet(30L, TimeUnit.SECONDS);
352372
}
353373

354374
private static void assertPurpose(OperationPurpose purpose) {
@@ -464,6 +484,10 @@ default boolean createBlobOnAbort() {
464484
return false;
465485
}
466486

487+
default boolean compareAndExchangeReturnsWitness() {
488+
return true;
489+
}
490+
467491
default BytesReference onCompareAndExchange(BytesRegister register, BytesReference expected, BytesReference updated) {
468492
return register.compareAndExchange(expected, updated);
469493
}
@@ -637,8 +661,12 @@ public void compareAndExchangeRegister(
637661
ActionListener<OptionalBytesReference> listener
638662
) {
639663
assertPurpose(purpose);
640-
final var register = registers.computeIfAbsent(key, ignored -> new BytesRegister());
641-
listener.onResponse(OptionalBytesReference.of(disruption.onCompareAndExchange(register, expected, updated)));
664+
if (disruption.compareAndExchangeReturnsWitness()) {
665+
final var register = registers.computeIfAbsent(key, ignored -> new BytesRegister());
666+
listener.onResponse(OptionalBytesReference.of(disruption.onCompareAndExchange(register, expected, updated)));
667+
} else {
668+
listener.onResponse(OptionalBytesReference.MISSING);
669+
}
642670
}
643671
}
644672

x-pack/plugin/snapshot-repo-test-kit/src/main/java/org/elasticsearch/repositories/blobstore/testkit/RepositoryAnalyzeAction.java

+39-14
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import org.apache.logging.log4j.LogManager;
1111
import org.apache.logging.log4j.Logger;
12+
import org.elasticsearch.ElasticsearchTimeoutException;
1213
import org.elasticsearch.ExceptionsHelper;
1314
import org.elasticsearch.TransportVersions;
1415
import org.elasticsearch.Version;
@@ -22,6 +23,7 @@
2223
import org.elasticsearch.action.support.ActionFilters;
2324
import org.elasticsearch.action.support.HandledTransportAction;
2425
import org.elasticsearch.action.support.RefCountingRunnable;
26+
import org.elasticsearch.action.support.SubscribableListener;
2527
import org.elasticsearch.cluster.ClusterState;
2628
import org.elasticsearch.cluster.node.DiscoveryNode;
2729
import org.elasticsearch.cluster.node.DiscoveryNodes;
@@ -364,6 +366,7 @@ public static class AsyncAction {
364366
private final DiscoveryNodes discoveryNodes;
365367
private final LongSupplier currentTimeMillisSupplier;
366368
private final ActionListener<Response> listener;
369+
private final SubscribableListener<Void> cancellationListener;
367370
private final long timeoutTimeMillis;
368371

369372
// choose the blob path nondeterministically to avoid clashes, assuming that the actual path doesn't matter for reproduction
@@ -394,15 +397,24 @@ public AsyncAction(
394397
this.discoveryNodes = discoveryNodes;
395398
this.currentTimeMillisSupplier = currentTimeMillisSupplier;
396399
this.timeoutTimeMillis = currentTimeMillisSupplier.getAsLong() + request.getTimeout().millis();
397-
this.listener = listener;
400+
401+
this.cancellationListener = new SubscribableListener<>();
402+
this.listener = ActionListener.runBefore(listener, () -> cancellationListener.onResponse(null));
398403

399404
responses = new ArrayList<>(request.blobCount);
400405
}
401406

402-
private void fail(Exception e) {
407+
private boolean setFirstFailure(Exception e) {
403408
if (failure.compareAndSet(null, e)) {
404409
transportService.getTaskManager().cancelTaskAndDescendants(task, "task failed", false, ActionListener.noop());
410+
return true;
405411
} else {
412+
return false;
413+
}
414+
}
415+
416+
private void fail(Exception e) {
417+
if (setFirstFailure(e) == false) {
406418
if (innerFailures.tryAcquire()) {
407419
final Throwable cause = ExceptionsHelper.unwrapCause(e);
408420
if (cause instanceof TaskCancelledException || cause instanceof ReceiveTimeoutTransportException) {
@@ -424,24 +436,34 @@ private boolean isRunning() {
424436
}
425437

426438
if (task.isCancelled()) {
427-
failure.compareAndSet(null, new RepositoryVerificationException(request.repositoryName, "verification cancelled"));
439+
setFirstFailure(new RepositoryVerificationException(request.repositoryName, "verification cancelled"));
428440
// if this CAS failed then we're failing for some other reason, nbd; also if the task is cancelled then its descendants are
429441
// also cancelled, so no further action is needed either way.
430442
return false;
431443
}
432444

433-
if (timeoutTimeMillis < currentTimeMillisSupplier.getAsLong()) {
434-
if (failure.compareAndSet(
435-
null,
436-
new RepositoryVerificationException(request.repositoryName, "analysis timed out after [" + request.getTimeout() + "]")
437-
)) {
438-
transportService.getTaskManager().cancelTaskAndDescendants(task, "timed out", false, ActionListener.noop());
439-
}
440-
// if this CAS failed then we're already failing for some other reason, nbd
441-
return false;
445+
return true;
446+
}
447+
448+
private class CheckForCancelListener implements ActionListener<Void> {
449+
@Override
450+
public void onResponse(Void unused) {
451+
// task complete, nothing to do
442452
}
443453

444-
return true;
454+
@Override
455+
public void onFailure(Exception e) {
456+
assert e instanceof ElasticsearchTimeoutException : e;
457+
if (isRunning()) {
458+
// if this CAS fails then we're already failing for some other reason, nbd
459+
setFirstFailure(
460+
new RepositoryVerificationException(
461+
request.repositoryName,
462+
"analysis timed out after [" + request.getTimeout() + "]"
463+
)
464+
);
465+
}
466+
}
445467
}
446468

447469
public void run() {
@@ -450,6 +472,9 @@ public void run() {
450472

451473
logger.info("running analysis of repository [{}] using path [{}]", request.getRepositoryName(), blobPath);
452474

475+
cancellationListener.addTimeout(request.getTimeout(), repository.threadPool(), EsExecutors.DIRECT_EXECUTOR_SERVICE);
476+
cancellationListener.addListener(new CheckForCancelListener());
477+
453478
final Random random = new Random(request.getSeed());
454479
final List<DiscoveryNode> nodes = getSnapshotNodes(discoveryNodes);
455480

@@ -536,7 +561,7 @@ private void runBlobAnalysis(Releasable ref, final BlobAnalyzeAction.Request req
536561
BlobAnalyzeAction.NAME,
537562
request,
538563
task,
539-
TransportRequestOptions.timeout(TimeValue.timeValueMillis(timeoutTimeMillis - currentTimeMillisSupplier.getAsLong())),
564+
TransportRequestOptions.EMPTY,
540565
new ActionListenerResponseHandler<>(ActionListener.releaseAfter(new ActionListener<>() {
541566
@Override
542567
public void onResponse(BlobAnalyzeAction.Response response) {

0 commit comments

Comments
 (0)