Skip to content

Commit 7d1eb52

Browse files
authored
[ML] fix random sampling background query consistency (#83676)
There was a consistency bug where the documents returned by the created scorer could change while looking at the same shard. This can occur if multiple weights are created from the same query. For scenarios like Significant Terms/Text, we need a consistent view of each shard when using the same probability and seed. This commit ensures this by creating a new random value supplier seeded by the shard hash & seed.
1 parent fffd444 commit 7d1eb52

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

server/src/main/java/org/elasticsearch/search/aggregations/bucket/sampler/random/RandomSamplingQuery.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
public final class RandomSamplingQuery extends Query {
3333

3434
private final double p;
35-
private final SplittableRandom splittableRandom;
3635
private final int seed;
3736
private final int hash;
3837

@@ -49,7 +48,6 @@ public RandomSamplingQuery(double p, int seed, int hash) {
4948
this.p = p;
5049
this.seed = seed;
5150
this.hash = hash;
52-
this.splittableRandom = new SplittableRandom(BitMixer.mix(hash, seed));
5351
}
5452

5553
@Override
@@ -78,7 +76,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio
7876

7977
@Override
8078
public Scorer scorer(LeafReaderContext context) {
81-
final SplittableRandom random = splittableRandom.split();
79+
final SplittableRandom random = new SplittableRandom(BitMixer.mix(hash, seed));
8280
int maxDoc = context.reader().maxDoc();
8381
return new ConstantScoreScorer(
8482
this,

server/src/test/java/org/elasticsearch/search/aggregations/bucket/sampler/random/RandomDocIDSetIteratorTests.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,12 @@
1111
import org.apache.lucene.search.DocIdSetIterator;
1212
import org.elasticsearch.test.ESTestCase;
1313

14+
import java.util.ArrayList;
15+
import java.util.List;
1416
import java.util.SplittableRandom;
1517

18+
import static org.hamcrest.Matchers.equalTo;
19+
1620
public class RandomDocIDSetIteratorTests extends ESTestCase {
1721

1822
public void testRandomSampler() {
@@ -43,4 +47,26 @@ public void testRandomSampler() {
4347
}
4448
}
4549

50+
public void testRandomSamplerConsistency() {
51+
int maxDoc = 10000;
52+
int seed = randomInt();
53+
54+
for (int i = 1; i < 100; i++) {
55+
double p = i / 100.0;
56+
SplittableRandom random = new SplittableRandom(seed);
57+
List<Integer> iterationOne = new ArrayList<>();
58+
RandomSamplingQuery.RandomSamplingIterator iter = new RandomSamplingQuery.RandomSamplingIterator(maxDoc, p, random::nextInt);
59+
while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
60+
iterationOne.add(iter.docID());
61+
}
62+
random = new SplittableRandom(seed);
63+
List<Integer> iterationTwo = new ArrayList<>();
64+
iter = new RandomSamplingQuery.RandomSamplingIterator(maxDoc, p, random::nextInt);
65+
while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
66+
iterationTwo.add(iter.docID());
67+
}
68+
assertThat(iterationOne, equalTo(iterationTwo));
69+
}
70+
}
71+
4672
}

0 commit comments

Comments
 (0)