Skip to content

Commit 2567e2e

Browse files
committed
SearchRequest#allowPartialSearchResults does not handle successful retries
When set to false, allowPartialSearchResults option does not check if the shard failures have been reseted to null. The atomic array, that is used to record shard failures, is filled with a null value if a successful request on a shard happens after a failure on a shard of another replica. In this case the atomic array is not empty but contains only null values so this shouldn't be considered as a failure since all shards are successful (some replicas have failed but the retries on another replica succeeded). This change fixes this bug by checking the content of the atomic array and fails the request only if allowPartialSearchResults is set to false and at least one shard failure is not null. Closes elastic#40743
1 parent 4b44561 commit 2567e2e

File tree

3 files changed

+125
-22
lines changed

3 files changed

+125
-22
lines changed

server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -140,24 +140,29 @@ public final void executeNextPhase(SearchPhase currentPhase, SearchPhase nextPha
140140
} else {
141141
Boolean allowPartialResults = request.allowPartialSearchResults();
142142
assert allowPartialResults != null : "SearchRequest missing setting for allowPartialSearchResults";
143-
if (allowPartialResults == false && shardFailures.get() != null ){
144-
if (logger.isDebugEnabled()) {
145-
final ShardOperationFailedException[] shardSearchFailures = ExceptionsHelper.groupBy(buildShardFailures());
146-
Throwable cause = shardSearchFailures.length == 0 ? null :
147-
ElasticsearchException.guessRootCauses(shardSearchFailures[0].getCause())[0];
148-
logger.debug(() -> new ParameterizedMessage("{} shards failed for phase: [{}]",
149-
shardSearchFailures.length, getName()), cause);
150-
}
151-
onPhaseFailure(currentPhase, "Partial shards failure", null);
152-
} else {
153-
if (logger.isTraceEnabled()) {
154-
final String resultsFrom = results.getSuccessfulResults()
155-
.map(r -> r.getSearchShardTarget().toString()).collect(Collectors.joining(","));
156-
logger.trace("[{}] Moving to next phase: [{}], based on results from: {} (cluster state version: {})",
157-
currentPhase.getName(), nextPhase.getName(), resultsFrom, clusterStateVersion);
143+
if (allowPartialResults == false && shardFailures.get() != null) {
144+
// check if there are actual failures in the atomic array since
145+
// successful retries can reset the failures to null
146+
ShardOperationFailedException[] shardSearchFailures = buildShardFailures();
147+
if (shardSearchFailures.length > 0) {
148+
if (logger.isDebugEnabled()) {
149+
int numShardFailures = shardSearchFailures.length;
150+
shardSearchFailures = ExceptionsHelper.groupBy(shardSearchFailures);
151+
Throwable cause = ElasticsearchException.guessRootCauses(shardSearchFailures[0].getCause())[0];
152+
logger.debug(() -> new ParameterizedMessage("{} shards failed for phase: [{}]",
153+
numShardFailures, getName()), cause);
154+
}
155+
onPhaseFailure(currentPhase, "Partial shards failure", null);
156+
return;
158157
}
159-
executePhase(nextPhase);
160158
}
159+
if (logger.isTraceEnabled()) {
160+
final String resultsFrom = results.getSuccessfulResults()
161+
.map(r -> r.getSearchShardTarget().toString()).collect(Collectors.joining(","));
162+
logger.trace("[{}] Moving to next phase: [{}], based on results from: {} (cluster state version: {})",
163+
currentPhase.getName(), nextPhase.getName(), resultsFrom, clusterStateVersion);
164+
}
165+
executePhase(nextPhase);
161166
}
162167
}
163168

server/src/main/java/org/elasticsearch/action/search/InitialSearchPhase.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,13 @@ public final void run() {
139139
for (int index = 0; index < shardsIts.size(); index++) {
140140
final SearchShardIterator shardRoutings = shardsIts.get(index);
141141
if (shardRoutings.size() == 0) {
142-
if(missingShards.length() >0 ){
142+
if(missingShards.length() > 0){
143143
missingShards.append(", ");
144144
}
145145
missingShards.append(shardRoutings.shardId());
146146
}
147147
}
148-
if (missingShards.length() >0) {
148+
if (missingShards.length() > 0) {
149149
//Status red - shard is missing all copies and would produce partial results for an index search
150150
final String msg = "Search rejected due to missing shards ["+ missingShards +
151151
"]. Consider using `allow_partial_search_results` setting to bypass this error.";

server/src/test/java/org/elasticsearch/action/search/SearchAsyncActionTests.java

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.transport.TransportException;
3939
import org.elasticsearch.transport.TransportRequest;
4040
import org.elasticsearch.transport.TransportRequestOptions;
41+
import org.hamcrest.Matchers;
4142

4243
import java.io.IOException;
4344
import java.util.ArrayList;
@@ -56,6 +57,7 @@
5657

5758
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
5859
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentSet;
60+
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
5961

6062
public class SearchAsyncActionTests extends ESTestCase {
6163

@@ -371,6 +373,102 @@ protected void executeNext(Runnable runnable, Thread originalThread) {
371373
executor.shutdown();
372374
}
373375

376+
public void testAllowPartialResults() throws InterruptedException {
377+
SearchRequest request = new SearchRequest();
378+
request.allowPartialSearchResults(false);
379+
int numConcurrent = randomIntBetween(1, 5);
380+
request.setMaxConcurrentShardRequests(numConcurrent);
381+
int numShards = randomIntBetween(5, 10);
382+
AtomicBoolean searchPhaseDidRun = new AtomicBoolean(false);
383+
ActionListener<SearchResponse> responseListener = ActionListener.wrap(response -> {},
384+
(e) -> { throw new AssertionError("unexpected", e);} );
385+
DiscoveryNode primaryNode = new DiscoveryNode("node_1", buildNewFakeTransportAddress(), Version.CURRENT);
386+
// for the sake of this test we place the replica on the same node. ie. this is not a mistake since we limit per node now
387+
DiscoveryNode replicaNode = new DiscoveryNode("node_1", buildNewFakeTransportAddress(), Version.CURRENT);
388+
389+
AtomicInteger contextIdGenerator = new AtomicInteger(0);
390+
GroupShardsIterator<SearchShardIterator> shardsIter = getShardsIter("idx",
391+
new OriginalIndices(new String[]{"idx"}, SearchRequest.DEFAULT_INDICES_OPTIONS),
392+
numShards, true, primaryNode, replicaNode);
393+
int numShardAttempts = 0;
394+
for (SearchShardIterator it : shardsIter) {
395+
numShardAttempts += it.remaining();
396+
}
397+
CountDownLatch latch = new CountDownLatch(numShardAttempts);
398+
399+
SearchTransportService transportService = new SearchTransportService(null, null);
400+
Map<String, Transport.Connection> lookup = new HashMap<>();
401+
Map<ShardId, Boolean> seenShard = new ConcurrentHashMap<>();
402+
lookup.put(primaryNode.getId(), new MockConnection(primaryNode));
403+
lookup.put(replicaNode.getId(), new MockConnection(replicaNode));
404+
Map<String, AliasFilter> aliasFilters = Collections.singletonMap("_na_", new AliasFilter(null, Strings.EMPTY_ARRAY));
405+
AtomicInteger numRequests = new AtomicInteger(0);
406+
AtomicInteger numFailReplicas = new AtomicInteger(0);
407+
AbstractSearchAsyncAction<TestSearchPhaseResult> asyncAction =
408+
new AbstractSearchAsyncAction<>(
409+
"test",
410+
logger,
411+
transportService,
412+
(cluster, node) -> {
413+
assert cluster == null : "cluster was not null: " + cluster;
414+
return lookup.get(node); },
415+
aliasFilters,
416+
Collections.emptyMap(),
417+
Collections.emptyMap(),
418+
null,
419+
request,
420+
responseListener,
421+
shardsIter,
422+
new TransportSearchAction.SearchTimeProvider(0, 0, () -> 0),
423+
0,
424+
null,
425+
new InitialSearchPhase.ArraySearchPhaseResults<>(shardsIter.size()),
426+
request.getMaxConcurrentShardRequests(),
427+
SearchResponse.Clusters.EMPTY) {
428+
429+
@Override
430+
protected void executePhaseOnShard(SearchShardIterator shardIt, ShardRouting shard,
431+
SearchActionListener<TestSearchPhaseResult> listener) {
432+
seenShard.computeIfAbsent(shard.shardId(), (i) -> {
433+
numRequests.incrementAndGet(); // only count this once per shard copy
434+
return Boolean.TRUE;
435+
});
436+
new Thread(() -> {
437+
Transport.Connection connection = getConnection(null, shard.currentNodeId());
438+
TestSearchPhaseResult testSearchPhaseResult = new TestSearchPhaseResult(contextIdGenerator.incrementAndGet(),
439+
connection.getNode());
440+
if (shardIt.remaining() > 0) {
441+
numFailReplicas.incrementAndGet();
442+
listener.onFailure(new RuntimeException());
443+
} else {
444+
listener.onResponse(testSearchPhaseResult);
445+
}
446+
}).start();
447+
}
448+
449+
@Override
450+
protected SearchPhase getNextPhase(SearchPhaseResults<TestSearchPhaseResult> results, SearchPhaseContext context) {
451+
return new SearchPhase("test") {
452+
@Override
453+
public void run() {
454+
assertTrue(searchPhaseDidRun.compareAndSet(false, true));
455+
}
456+
};
457+
}
458+
459+
@Override
460+
protected void executeNext(Runnable runnable, Thread originalThread) {
461+
super.executeNext(runnable, originalThread);
462+
latch.countDown();
463+
}
464+
};
465+
asyncAction.start();
466+
latch.await();
467+
assertTrue(searchPhaseDidRun.get());
468+
assertEquals(numShards, numRequests.get());
469+
assertThat(numFailReplicas.get(), greaterThanOrEqualTo(1));
470+
}
471+
374472
static GroupShardsIterator<SearchShardIterator> getShardsIter(String index, OriginalIndices originalIndices, int numShards,
375473
boolean doReplicas, DiscoveryNode primaryNode, DiscoveryNode replicaNode) {
376474
ArrayList<SearchShardIterator> list = new ArrayList<>();
@@ -389,12 +487,12 @@ static GroupShardsIterator<SearchShardIterator> getShardsIter(String index, Orig
389487
RecoverySource.PeerRecoverySource.INSTANCE, new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "foobar"));
390488
if (replicaNode != null) {
391489
routing = routing.initialize(replicaNode.getId(), i + "r", 0);
392-
if (randomBoolean()) {
490+
//if (randomBoolean()) {
393491
routing.started();
394492
started.add(routing);
395-
} else {
396-
initializing.add(routing);
397-
}
493+
//} else {
494+
// initializing.add(routing);
495+
//}
398496
} else {
399497
unassigned.add(routing); // unused yet
400498
}

0 commit comments

Comments
 (0)