
Commit 68ba571

Adds recall@k metric to rank eval API (#52889)
This change adds the recall@k metric and refactors precision@k to match the new metric. Recall@k is an important metric for learning to rank (LTR) use cases. Candidate generation or first-phase ranking functions are often optimized for high recall, in order to generate as many relevant candidates in the top k as possible for a second ranking phase. Adding this metric allows tuning that base query for LTR.

See: #51676
Backports: #52577
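To make the new metric concrete, here is a minimal, standalone sketch of the set-based precision@k and recall@k computations described above. It is illustrative only, not the implementation added by this commit; the names (RankMetricsSketch, precisionAtK, recallAtK, ratedHits, totalRelevant) are invented for the example, and the real precision metric additionally supports options such as ignoring unlabeled documents.

import java.util.List;

// Illustrative only: a plain-Java sketch of set-based precision@k and recall@k.
// Ratings are graded; a hit counts as relevant when its rating is >= threshold.
public class RankMetricsSketch {

    static double precisionAtK(List<Integer> ratedHits, int k, int threshold) {
        long relevantInTopK = ratedHits.stream().limit(k).filter(r -> r >= threshold).count();
        // Fraction of the retrieved top-k hits that are relevant.
        return (double) relevantInTopK / Math.min(k, ratedHits.size());
    }

    static double recallAtK(List<Integer> ratedHits, int totalRelevant, int k, int threshold) {
        long relevantInTopK = ratedHits.stream().limit(k).filter(r -> r >= threshold).count();
        // Fraction of all relevant documents that made it into the top-k hits.
        return totalRelevant == 0 ? 0.0 : (double) relevantInTopK / totalRelevant;
    }

    public static void main(String[] args) {
        // Ten ranked hits, four of them relevant; eight relevant documents exist in total.
        List<Integer> ratedHits = List.of(1, 0, 1, 0, 0, 1, 0, 0, 1, 0);
        System.out.println(precisionAtK(ratedHits, 10, 1)); // 0.4 (4 of the 10 hits are relevant)
        System.out.println(recallAtK(ratedHits, 8, 10, 1)); // 0.5 (4 of the 8 relevant docs were retrieved)
    }
}

A first-phase query tuned for recall tries to push that second number as close to 1.0 as possible, leaving precision and ordering to the second ranking phase.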

11 files changed: +745 -129 lines


client/rest-high-level/src/test/java/org/elasticsearch/client/RankEvalIT.java

+4 -3
@@ -28,6 +28,7 @@
 import org.elasticsearch.index.rankeval.ExpectedReciprocalRank;
 import org.elasticsearch.index.rankeval.MeanReciprocalRank;
 import org.elasticsearch.index.rankeval.PrecisionAtK;
+import org.elasticsearch.index.rankeval.RecallAtK;
 import org.elasticsearch.index.rankeval.RankEvalRequest;
 import org.elasticsearch.index.rankeval.RankEvalResponse;
 import org.elasticsearch.index.rankeval.RankEvalSpec;
@@ -130,9 +131,9 @@ private static List<RatedRequest> createTestEvaluationSpec() {
      */
     public void testMetrics() throws IOException {
         List<RatedRequest> specifications = createTestEvaluationSpec();
-        List<Supplier<EvaluationMetric>> metrics = Arrays.asList(PrecisionAtK::new, MeanReciprocalRank::new, DiscountedCumulativeGain::new,
-                () -> new ExpectedReciprocalRank(1));
-        double expectedScores[] = new double[] {0.4285714285714286, 0.75, 1.6408962261063627, 0.4407738095238095};
+        List<Supplier<EvaluationMetric>> metrics = Arrays.asList(PrecisionAtK::new, RecallAtK::new,
+                MeanReciprocalRank::new, DiscountedCumulativeGain::new, () -> new ExpectedReciprocalRank(1));
+        double expectedScores[] = new double[] {0.4285714285714286, 1.0, 0.75, 1.6408962261063627, 0.4407738095238095};
         int i = 0;
         for (Supplier<EvaluationMetric> metricSupplier : metrics) {
             RankEvalSpec spec = new RankEvalSpec(specifications, metricSupplier.get());
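For orientation, here is a hedged sketch of how the new metric might be driven through the high-level client outside of this test. It is assembled from the classes touched here (RankEvalSpec, RankEvalRequest, RecallAtK, RatedRequest, RatedDocument); the index name, field, ratings, and the helper class RecallAtKUsageSketch are made up, and accessor names such as getMetricScore may differ between client versions.

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.rankeval.RankEvalRequest;
import org.elasticsearch.index.rankeval.RankEvalResponse;
import org.elasticsearch.index.rankeval.RankEvalSpec;
import org.elasticsearch.index.rankeval.RatedDocument;
import org.elasticsearch.index.rankeval.RatedRequest;
import org.elasticsearch.index.rankeval.RecallAtK;
import org.elasticsearch.search.builder.SearchSourceBuilder;

// Hedged usage sketch, not part of this commit: index, field and ratings are invented.
public class RecallAtKUsageSketch {

    static double evaluateRecall(RestHighLevelClient client) throws IOException {
        // One rated query; documents rated >= the default threshold (1) count as relevant.
        List<RatedDocument> ratings = Arrays.asList(
                new RatedDocument("twitter", "1", 1),
                new RatedDocument("twitter", "2", 0));
        RatedRequest ratedRequest = new RatedRequest("jfk_query", ratings,
                new SearchSourceBuilder().query(QueryBuilders.matchQuery("text", "jfk")));

        // Evaluate the query with recall@k; the no-arg constructor uses the defaults (k=10, threshold=1).
        RankEvalSpec spec = new RankEvalSpec(Collections.singletonList(ratedRequest), new RecallAtK());
        RankEvalRequest request = new RankEvalRequest(spec, new String[] { "twitter" });
        RankEvalResponse response = client.rankEval(request, RequestOptions.DEFAULT);
        // Accessor name may vary by client version.
        return response.getMetricScore();
    }
}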

client/rest-high-level/src/test/java/org/elasticsearch/client/RestHighLevelClientTests.java

+6 -3
@@ -98,6 +98,7 @@
 import org.elasticsearch.index.rankeval.MeanReciprocalRank;
 import org.elasticsearch.index.rankeval.MetricDetail;
 import org.elasticsearch.index.rankeval.PrecisionAtK;
+import org.elasticsearch.index.rankeval.RecallAtK;
 import org.elasticsearch.join.aggregations.ChildrenAggregationBuilder;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHits;
@@ -696,7 +697,7 @@ public void testDefaultNamedXContents() {
 
     public void testProvidedNamedXContents() {
         List<NamedXContentRegistry.Entry> namedXContents = RestHighLevelClient.getProvidedNamedXContents();
-        assertEquals(57, namedXContents.size());
+        assertEquals(59, namedXContents.size());
         Map<Class<?>, Integer> categories = new HashMap<>();
         List<String> names = new ArrayList<>();
         for (NamedXContentRegistry.Entry namedXContent : namedXContents) {
@@ -710,13 +711,15 @@ public void testProvidedNamedXContents() {
         assertEquals(Integer.valueOf(3), categories.get(Aggregation.class));
         assertTrue(names.contains(ChildrenAggregationBuilder.NAME));
         assertTrue(names.contains(MatrixStatsAggregationBuilder.NAME));
-        assertEquals(Integer.valueOf(4), categories.get(EvaluationMetric.class));
+        assertEquals(Integer.valueOf(5), categories.get(EvaluationMetric.class));
         assertTrue(names.contains(PrecisionAtK.NAME));
+        assertTrue(names.contains(RecallAtK.NAME));
         assertTrue(names.contains(DiscountedCumulativeGain.NAME));
         assertTrue(names.contains(MeanReciprocalRank.NAME));
         assertTrue(names.contains(ExpectedReciprocalRank.NAME));
-        assertEquals(Integer.valueOf(4), categories.get(MetricDetail.class));
+        assertEquals(Integer.valueOf(5), categories.get(MetricDetail.class));
         assertTrue(names.contains(PrecisionAtK.NAME));
+        assertTrue(names.contains(RecallAtK.NAME));
         assertTrue(names.contains(MeanReciprocalRank.NAME));
         assertTrue(names.contains(DiscountedCumulativeGain.NAME));
         assertTrue(names.contains(ExpectedReciprocalRank.NAME));

docs/reference/search/rank-eval.asciidoc

+67 -14
@@ -203,20 +203,21 @@ will be used. The following metrics are supported:
 [[k-precision]]
 ===== Precision at K (P@k)
 
-This metric measures the number of relevant results in the top k search results.
-It's a form of the well-known
-https://en.wikipedia.org/wiki/Information_retrieval#Precision[Precision] metric
-that only looks at the top k documents. It is the fraction of relevant documents
-in those first k results. A precision at 10 (P@10) value of 0.6 then means six
-out of the 10 top hits are relevant with respect to the user's information need.
-
-P@k works well as a simple evaluation metric that has the benefit of being easy
-to understand and explain. Documents in the collection need to be rated as either
-relevant or irrelevant with respect to the current query. P@k does not take
-into account the position of the relevant documents within the top k results,
-so a ranking of ten results that contains one relevant result in position 10 is
-equally as good as a ranking of ten results that contains one relevant result
-in position 1.
+This metric measures the proportion of relevant results in the top k search results.
+It's a form of the well-known
+https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision[Precision]
+metric that only looks at the top k documents. It is the fraction of relevant
+documents in those first k results. A precision at 10 (P@10) value of 0.6 then
+means 6 out of the 10 top hits are relevant with respect to the user's
+information need.
+
+P@k works well as a simple evaluation metric that has the benefit of being easy
+to understand and explain. Documents in the collection need to be rated as either
+relevant or irrelevant with respect to the current query. P@k is a set-based
+metric and does not take into account the position of the relevant documents
+within the top k results, so a ranking of ten results that contains one
+relevant result in position 10 is equally as good as a ranking of ten results
+that contains one relevant result in position 1.
 
 [source,console]
 --------------------------------
@@ -253,6 +254,58 @@ If set to 'true', unlabeled documents are ignored and neither count as relevant
 |=======================================================================
 
 
+[float]
+[[k-recall]]
+===== Recall at K (R@k)
+
+This metric measures the total number of relevant results in the top k search
+results. It's a form of the well-known
+https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Recall[Recall]
+metric. It is the fraction of relevant documents in those first k results
+relative to all possible relevant results. A recall at 10 (R@10) value of 0.5 then
+means 4 out of 8 relevant documents, with respect to the user's information
+need, were retrieved in the 10 top hits.
+
+R@k works well as a simple evaluation metric that has the benefit of being easy
+to understand and explain. Documents in the collection need to be rated as either
+relevant or irrelevant with respect to the current query. R@k is a set-based
+metric and does not take into account the position of the relevant documents
+within the top k results, so a ranking of ten results that contains one
+relevant result in position 10 is equally as good as a ranking of ten results
+that contains one relevant result in position 1.
+
+[source,console]
+--------------------------------
+GET /twitter/_rank_eval
+{
+    "requests": [
+    {
+        "id": "JFK query",
+        "request": { "query": { "match_all": {}}},
+        "ratings": []
+    }],
+    "metric": {
+      "recall": {
+        "k" : 20,
+        "relevant_rating_threshold": 1
+      }
+    }
+}
+--------------------------------
+// TEST[setup:twitter]
+
+The `recall` metric takes the following optional parameters
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Parameter |Description
+|`k` |sets the maximum number of documents retrieved per query. This value will act in place of the usual `size` parameter
+in the query. Defaults to 10.
+|`relevant_rating_threshold` |sets the rating threshold above which documents are considered to be
+"relevant". Defaults to `1`.
+|=======================================================================
+
+
 [float]
 ===== Mean reciprocal rank
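
As a small illustration of the `relevant_rating_threshold` parameter documented above (for both `precision` and `recall`), the following standalone snippet, with invented names, shows how raising the threshold changes which graded ratings count as relevant and therefore changes the set-based scores; it is a sketch, not the module's implementation.

import java.util.List;

// Illustrative only: how relevant_rating_threshold turns graded ratings into a
// binary relevant/irrelevant decision before P@k and R@k are computed.
public class RatingThresholdSketch {

    static long relevantInTopK(List<Integer> ratedHits, int k, int threshold) {
        return ratedHits.stream().limit(k).filter(rating -> rating >= threshold).count();
    }

    public static void main(String[] args) {
        // Graded ratings (0-3) of the top five hits for some query.
        List<Integer> topFive = List.of(3, 0, 1, 2, 0);

        // Default threshold 1: ratings 3, 1 and 2 all count as relevant -> 3 relevant hits.
        System.out.println(relevantInTopK(topFive, 5, 1)); // 3, so P@5 = 3/5 = 0.6

        // Threshold 2: only ratings 3 and 2 count as relevant -> 2 relevant hits.
        System.out.println(relevantInTopK(topFive, 5, 2)); // 2, so P@5 = 2/5 = 0.4
    }
}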

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/MetricDetail.java

+1 -1
@@ -26,7 +26,7 @@
 import java.io.IOException;
 
 /**
- * Details about a specific {@link EvaluationMetric} that should be included in the resonse.
+ * Details about a specific {@link EvaluationMetric} that should be included in the response.
  */
 public interface MetricDetail extends ToXContentObject, NamedWriteable {
