
Add a simple random sampling query #25561


Closed · wants to merge 2 commits
@@ -685,6 +685,25 @@ public static GeoShapeQueryBuilder geoDisjointQuery(String name, String indexedS
return builder;
}

/**
* Creates a random sampler query
*
* @param probability The probability that a document is randomly sampled
*/
public static RandomSampleQueryBuilder randomSampleQuery(double probability) {
return new RandomSampleQueryBuilder(probability);
}

/**
* Creates a random sampler query
*
* @param probability The probability that a document is randomly sampled
* @param seed A seed to use with the random generator
Contributor:

maybe also document that the field is used as a source of randomness

*/
public static RandomSampleQueryBuilder randomSampleQuery(double probability, int seed, String field) {
return new RandomSampleQueryBuilder(probability, seed, field);
}
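
As a rough usage sketch of the two factory methods above (the SearchSourceBuilder wiring and the field name are assumptions for illustration, not part of this diff):

    // Hypothetical usage of the factory methods above; "some_keyword_field" is a placeholder.
    static SearchSourceBuilder sampledSearch() {
        QueryBuilder seeded = QueryBuilders.randomSampleQuery(0.1, 42, "some_keyword_field"); // reproducible: seed + field
        return new SearchSourceBuilder().query(seeded);   // or randomSampleQuery(0.1) for an unseeded ~10% sample
    }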

/**
* A filter to filter only documents where a field exists in them.
*
@@ -0,0 +1,361 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.query;


import java.io.IOException;
import java.util.Collection;
import java.util.Objects;

import com.carrotsearch.hppc.BitMixer;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.StringHelper;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;

/**
* A query that randomly matches documents with a user-provided probability. May
* optionally include a seed so that matches are reproducible.
*
* This query uses two strategies depending on the size of the segment.
* For small segments, this uses simple random sampling per-document. For larger
* segments, sampling is sped up by approximating the _gap_ between samples, then
* skipping forward that amount.
*
* Gap-sampling is based on the work of Jeffrey Vitter in "An Efficient Algorithm
* for Sequential Random Sampling" (http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf),
* and more recently documented by Erik Erlandson
* (http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/)
*/
public final class RandomSampleQuery extends Query {

private final double p;
private final int seed;
private final int salt;
private final IndexFieldData<?> fieldData;
private float LOG_INVERSE_P;
Contributor:

instance variables should use camelcase

Contributor:

can you make it final?


// Above this threshold, it is probably faster to just use simple random sampling
private static final double PROBABILITY_THRESHOLD = 0.5;
private static final float EPSILON = 1e-10f;

RandomSampleQuery(double p, int seed, int salt, IndexFieldData<?> fieldData) {
assert(p > 0.0 && p < 1.0);
Contributor:

we should use actual exceptions instead of assertions since we are validating some user input

this.p = p;
this.seed = seed;
this.salt = salt;
this.fieldData = fieldData;
LOG_INVERSE_P = (float)Math.log(1-p);
}

private int getGap(DocIdRNG rng, int doc) throws IOException {
float u = Math.max(rng.getFloat(doc), EPSILON);
return (int)(Math.log(u) / LOG_INVERSE_P) + 1;
}
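
For intuition, a worked example of the gap formula above (numbers are illustrative only): with p = 0.01, LOG_INVERSE_P = log(0.99) ≈ -0.01005, so a uniform draw of u = 0.5 gives a gap of (int)(log(0.5) / -0.01005) + 1 = 69, and on average the gap works out to roughly 1/p = 100 documents, which is why skipping by gaps beats testing every document when the probability is low.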

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) {
return new ConstantScoreWeight(this, boost) {
@Override
public String toString() {
return "weight(" + RandomSampleQuery.this + ")";
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {

DocIdRNG rng = new DocIdRNG(seed, salt, fieldData, context);
int maxDoc = context.reader().maxDoc();

// For small doc sets, it's easier/more accurate to just sample directly
// instead of sampling gaps. Or, if the probability is high, faster to use SRS
if (maxDoc < 100 || p > PROBABILITY_THRESHOLD) {
return new ConstantScoreScorer(this, score(), new RandomSamplingDocIdSetIterator(maxDoc, p, rng));
} else {
return new ConstantScoreScorer(this, score(), new RandomGapSamplingDocIdSetIterator(maxDoc, p, rng));
}

}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
Contributor:

I'm wondering why you decided to override this optional API. Is this impl expected to be faster than pulling the iterator and calling next in a loop? (this is what the default impl does)

final float score = score();
final int maxDoc = context.reader().maxDoc();
DocIdRNG rng = new DocIdRNG(seed, salt, fieldData, context);

// For small doc sets, it's easier/more accurate to just sample directly
// instead of sampling gaps. Or, if the probability is high, faster to use SRS
if (maxDoc < 100 || p > PROBABILITY_THRESHOLD) {
return new BulkScorer() {
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
max = Math.min(max, maxDoc);
FakeScorer scorer = new FakeScorer();
scorer.score = score;
collector.setScorer(scorer);

for (int current = min; current < max; current++) {
if (rng.getFloat(current) <= p) {
scorer.doc = current;
if (acceptDocs == null || acceptDocs.get(current)) {
collector.collect(current);
}
}
}
return max == maxDoc ? DocIdSetIterator.NO_MORE_DOCS : max;
}
@Override
public long cost() {
return maxDoc;
}
};
} else {
return new BulkScorer() {
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
max = Math.min(max, maxDoc);
FakeScorer scorer = new FakeScorer();
scorer.score = score;
collector.setScorer(scorer);

int current = min;
while (current < max) {
int gap = getGap(rng, current);
current = current + gap;
if (current >= maxDoc) {
return DocIdSetIterator.NO_MORE_DOCS;
}
scorer.doc = current;
if (acceptDocs == null || acceptDocs.get(current)) {
collector.collect(current);
}
}
return max == maxDoc ? DocIdSetIterator.NO_MORE_DOCS : max;
}
@Override
public long cost() {
return maxDoc;
}
};
}

}
};
}

@Override
public String toString(String field) {
return "RandomSample[p=" + this.p + "](*:*)";
}

@Override
public boolean equals(Object o) {
if (o == null || getClass() != o.getClass()) {
return false;
}
RandomSampleQuery other = (RandomSampleQuery)o;
return Objects.equals(this.p, other.p) &&
Objects.equals(this.seed, other.seed);
Contributor:

I think we need to compare the salt and fieldData instances as well?

}

@Override
public int hashCode() {
return classHash();
}
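
A minimal sketch of what that review comment suggests (comparing salt and fieldData as well, and folding all fields into hashCode); field names follow this class, but this is only an illustration:

    // Sketch of equals/hashCode covering all four fields, per the review comment above.
    @Override
    public boolean equals(Object o) {
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        RandomSampleQuery other = (RandomSampleQuery) o;
        return p == other.p
            && seed == other.seed
            && salt == other.salt
            && Objects.equals(fieldData, other.fieldData);
    }

    @Override
    public int hashCode() {
        return Objects.hash(classHash(), p, seed, salt, fieldData);
    }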

/**
* A DocIDSetIter that samples on each document. Empirically, this tends to
* be more accurate when the segment is small. It may also be faster
* when the probability is high, since many docs are collected and gap approximation
* uses more expensive math
*/
private static class RandomSamplingDocIdSetIterator extends DocIdSetIterator {
int doc = -1;
final int maxDoc;
final DocIdRNG rng;
final double p;

RandomSamplingDocIdSetIterator(int maxDoc, double p, DocIdRNG rng) {
this.maxDoc = maxDoc;
this.rng = rng;
this.p = p;
}

@Override
public int docID() {
return doc;
}

@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}

@Override
public int advance(int target) throws IOException {
doc = target;
while (doc < maxDoc) {
if (rng.getFloat(doc) <= p) {
return doc;
}
doc = doc + 1;
}
return NO_MORE_DOCS;
Contributor:

one issue I have with this is that the next matching document depends on the target. Would it be possible to fix it so that the set of matching documents does not depend on how the scorer was advanced?

Contributor:

ie. could the random number be only a function of seed and doc?

Member (@rjernst, Jul 6, 2017):

And can docid be the default, but the value be optionally pulled from a field? See #25240 (comment). Otherwise there is no way to have real consistency with a seed (and IMO seed should only be allowed when a field is specified).

Contributor Author:

@jpountz Not entirely sure I understand. Is the issue that the same seeded rng may match different documents depending on how many times it was invoked (due to different scorer positions)? If that's the case... does it matter? Every document has the same probability p to be matched, independent of other documents. Technically it's influenced by the prior docs due to not being a truly random generator, but the result is essentially the same from a sampling perspective.

@rjernst Are you saying to re-seed the rng on each document (using its docid or field, etc)? Wouldn't that be rather expensive?

Member:

See RandomScoreFunction (it does not depend on the order of visiting docs); just substitute the uid field with either docid or a user-supplied field.

Contributor Author:

Gotcha, I see how it works now (not using Random at all, just hashing/mixing). Will make changes similar to @jpountz's recent PR for the random_score :)

Contributor:

@polyfractal what I meant is that the matches of a given query should be the same regardless of how the query is consumed. For instance, for a given source of randomness and seed, documents 5 and 12 should match but no other documents. Then calling advance(10) or advance(11) should both return 12. However, the impl seems a bit broken here since the next matching document is computed using the target.
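
What the thread converges on is making the draw a pure function of the seed and a per-document value (docid or a field) rather than of iteration order; a minimal sketch, mirroring the DocIdRNG further down and using hppc's BitMixer as elsewhere in this file:

    // Sketch: the random draw depends only on (seed, doc), never on how far the
    // iterator has advanced, so advance(10) and advance(11) see the same matches.
    static float draw(int seed, int doc) {
        int hash = com.carrotsearch.hppc.BitMixer.mix(doc + 1, seed); // hash/mix instead of java.util.Random
        return (hash & 0x00FFFFFF) / (float) (1 << 24);               // lower 24 bits -> [0, 1)
    }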

}

@Override
public long cost() {
return maxDoc;
}
}

/**
* A DocIDSetIter that approximates the gaps between sampled documents, and advances
* according to the gap. This is more efficient, especially for low probabilities,
* because it can skip by many documents entirely.
*/
private class RandomGapSamplingDocIdSetIterator extends DocIdSetIterator {
final int maxDoc;
final DocIdRNG rng;
final double p;
int doc = -1;

RandomGapSamplingDocIdSetIterator(int maxDoc, double p, DocIdRNG rng) {
this.maxDoc = maxDoc;
this.rng = rng;
this.p = p;
}

@Override
public int docID() {
return doc;
}

@Override
public int nextDoc() throws IOException {
return advance(doc);
}

@Override
public int advance(int target) throws IOException {
// Keep approximating gaps until we hit or surpass the target
while (doc <= target) {
doc += getGap(rng, doc);
}
if (doc >= maxDoc) {
doc = NO_MORE_DOCS;
}
return doc;
}

@Override
public long cost() {
return (long)(maxDoc * p);
}
}

/**
* This is a copy of Lucene's FakeScorer since it is package-private
*/
final class FakeScorer extends Scorer {
float score;
int doc = -1;
int freq = 1;

FakeScorer() {
super(null);
}

@Override
public int docID() {
return doc;
}

@Override
public int freq() {
return freq;
}

@Override
public float score() {
return score;
}

@Override
public DocIdSetIterator iterator() {
throw new UnsupportedOperationException();
}

@Override
public Weight getWeight() {
throw new UnsupportedOperationException();
}

@Override
public Collection<ChildScorer> getChildren() {
throw new UnsupportedOperationException();
}
}

class DocIdRNG {

final SortedBinaryDocValues values;
final LeafReaderContext context;
final int saltedSeed;

DocIdRNG(int seed, int salt, IndexFieldData<?> fieldData, LeafReaderContext context) {
this.context = context;
this.saltedSeed = BitMixer.mix(seed, salt);

if (fieldData != null) {
AtomicFieldData leafData = fieldData.load(context);
values = leafData.getBytesValues();
if (values == null) {
throw new NullPointerException("failed to get fielddata");
}
} else {
values = null;
}
}

float getFloat(int docId) throws IOException {
int hash;
if (values == null) {
hash = BitMixer.mix(context.docBase + docId + 1);
Contributor:

not really wrong since we do not require things to be reproducible in that case, but I'd rather like to use context.reader().maxDoc() instead of context.docBase so that matches only depend on the current segment

} else if (values.advanceExact(docId)) {
hash = StringHelper.murmurhash3_x86_32(values.nextValue(), saltedSeed);
} else {
// field has no value
hash = saltedSeed;
}
return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0
}
}

}
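
As a self-contained illustration of the gap-sampling math used above (class name and numbers are illustrative only, not part of the PR):

    import java.util.Random;

    // Empirical check that the gap formula used in getGap() produces an average gap of
    // roughly 1/p, which is what makes skipping cheaper than per-document sampling
    // at low probabilities.
    public class GapSamplingDemo {
        public static void main(String[] args) {
            double p = 0.01;
            double logInverseP = Math.log(1 - p);
            Random r = new Random(42);
            long total = 0;
            int trials = 100_000;
            for (int i = 0; i < trials; i++) {
                double u = Math.max(r.nextDouble(), 1e-10);      // avoid log(0)
                int gap = (int) (Math.log(u) / logInverseP) + 1;
                total += gap;
            }
            System.out.println("average gap = " + (total / (double) trials)); // ~100 for p = 0.01
        }
    }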