Skip to content

Commit 2632d1d

Browse files
author
Christoph Büscher
committed
Rename ranking evaluation quality_level to metric_score (#32168)
The notion of "quality" is an overloaded term in the search ranking evaluation context. It's usually used to describe certain levels of "good" vs. "bad" of a search result with respect to the user's information need. We currently report the result of the ranking evaluation as `quality_level`, which is a bit misleading. This changes the response parameter name to `metric_score`, which fits better.
1 parent 82c9bc0 commit 2632d1d

File tree

20 files changed

+114
-117
lines changed

20 files changed

+114
-117
lines changed

client/rest-high-level/src/test/java/org/elasticsearch/client/RankEvalIT.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public void testRankEvalRequest() throws IOException {
8181
highLevelClient()::rankEval, highLevelClient()::rankEvalAsync);
8282
// the expected Prec@ for the first query is 5/7 and the expected Prec@ for the second is 1/7, divided by 2 to get the average
8383
double expectedPrecision = (1.0 / 7.0 + 5.0 / 7.0) / 2.0;
84-
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
84+
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
8585
Map<String, EvalQueryQuality> partialResults = response.getPartialResults();
8686
assertEquals(2, partialResults.size());
8787
EvalQueryQuality amsterdamQueryQuality = partialResults.get("amsterdam_query");

client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/SearchDocumentationIT.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -1136,14 +1136,14 @@ public void testRankEval() throws Exception {
11361136
// end::rank-eval-execute
11371137

11381138
// tag::rank-eval-response
1139-
double evaluationResult = response.getEvaluationResult(); // <1>
1139+
double evaluationResult = response.getMetricScore(); // <1>
11401140
assertEquals(1.0 / 3.0, evaluationResult, 0.0);
11411141
Map<String, EvalQueryQuality> partialResults =
11421142
response.getPartialResults();
11431143
EvalQueryQuality evalQuality =
11441144
partialResults.get("kimchy_query"); // <2>
11451145
assertEquals("kimchy_query", evalQuality.getId());
1146-
double qualityLevel = evalQuality.getQualityLevel(); // <3>
1146+
double qualityLevel = evalQuality.metricScore(); // <3>
11471147
assertEquals(1.0 / 3.0, qualityLevel, 0.0);
11481148
List<RatedSearchHit> hitsAndRatings = evalQuality.getHitsAndRatings();
11491149
RatedSearchHit ratedSearchHit = hitsAndRatings.get(0);

docs/reference/search/rank-eval.asciidoc

+3-3
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,10 @@ that shows potential errors of individual queries. The response has the followin
274274
--------------------------------
275275
{
276276
"rank_eval": {
277-
"quality_level": 0.4, <1>
277+
"metric_score": 0.4, <1>
278278
"details": {
279279
"my_query_id1": { <2>
280-
"quality_level": 0.6, <3>
280+
"metric_score": 0.6, <3>
281281
"unrated_docs": [ <4>
282282
{
283283
"_index": "my_index",
@@ -312,7 +312,7 @@ that shows potential errors of individual queries. The response has the followin
312312

313313
<1> the overall evaluation quality calculated by the defined metric
314314
<2> the `details` section contains one entry for every query in the original `requests` section, keyed by the search request id
315-
<3> the `quality_level` in the `details` section shows the contribution of this query to the global quality score
315+
<3> the `metric_score` in the `details` section shows the contribution of this query to the global metric score
316316
<4> the `unrated_docs` section contains an `_index` and `_id` entry for each document in the search result for this
317317
query that didn't have a ratings value. This can be used to ask the user to supply ratings for these documents
318318
<5> the `hits` section shows a grouping of the search results with their supplied rating

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGain.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,6 @@ public Optional<Integer> forcedSearchSize() {
126126
@Override
127127
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
128128
List<RatedDocument> ratedDocs) {
129-
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
130-
.collect(Collectors.toList());
131129
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
132130
List<Integer> ratingsInSearchHits = new ArrayList<>(ratedHits.size());
133131
int unratedResults = 0;
@@ -144,6 +142,8 @@ public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
144142
double idcg = 0;
145143

146144
if (normalize) {
145+
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
146+
.collect(Collectors.toList());
147147
Collections.sort(allRatings, Comparator.nullsLast(Collections.reverseOrder()));
148148
idcg = computeDCG(allRatings.subList(0, Math.min(ratingsInSearchHits.size(), allRatings.size())));
149149
if (idcg != 0) {

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/EvalQueryQuality.java

+13-13
Original file line numberDiff line numberDiff line change
@@ -41,35 +41,35 @@
4141
public class EvalQueryQuality implements ToXContentFragment, Writeable {
4242

4343
private final String queryId;
44-
private final double evaluationResult;
44+
private final double metricScore;
4545
private MetricDetail optionalMetricDetails;
4646
private final List<RatedSearchHit> ratedHits;
4747

48-
public EvalQueryQuality(String id, double evaluationResult) {
48+
public EvalQueryQuality(String id, double metricScore) {
4949
this.queryId = id;
50-
this.evaluationResult = evaluationResult;
50+
this.metricScore = metricScore;
5151
this.ratedHits = new ArrayList<>();
5252
}
5353

5454
public EvalQueryQuality(StreamInput in) throws IOException {
5555
this.queryId = in.readString();
56-
this.evaluationResult = in.readDouble();
56+
this.metricScore = in.readDouble();
5757
this.ratedHits = in.readList(RatedSearchHit::new);
5858
this.optionalMetricDetails = in.readOptionalNamedWriteable(MetricDetail.class);
5959
}
6060

6161
// only used for parsing internally
6262
private EvalQueryQuality(String queryId, ParsedEvalQueryQuality builder) {
6363
this.queryId = queryId;
64-
this.evaluationResult = builder.evaluationResult;
64+
this.metricScore = builder.evaluationResult;
6565
this.optionalMetricDetails = builder.optionalMetricDetails;
6666
this.ratedHits = builder.ratedHits;
6767
}
6868

6969
@Override
7070
public void writeTo(StreamOutput out) throws IOException {
7171
out.writeString(queryId);
72-
out.writeDouble(evaluationResult);
72+
out.writeDouble(metricScore);
7373
out.writeList(ratedHits);
7474
out.writeOptionalNamedWriteable(this.optionalMetricDetails);
7575
}
@@ -78,8 +78,8 @@ public String getId() {
7878
return queryId;
7979
}
8080

81-
public double getQualityLevel() {
82-
return evaluationResult;
81+
public double metricScore() {
82+
return metricScore;
8383
}
8484

8585
public void setMetricDetails(MetricDetail breakdown) {
@@ -101,7 +101,7 @@ public List<RatedSearchHit> getHitsAndRatings() {
101101
@Override
102102
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
103103
builder.startObject(queryId);
104-
builder.field(QUALITY_LEVEL_FIELD.getPreferredName(), this.evaluationResult);
104+
builder.field(METRIC_SCORE_FIELD.getPreferredName(), this.metricScore);
105105
builder.startArray(UNRATED_DOCS_FIELD.getPreferredName());
106106
for (DocumentKey key : EvaluationMetric.filterUnratedDocuments(ratedHits)) {
107107
builder.startObject();
@@ -122,7 +122,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
122122
return builder;
123123
}
124124

125-
private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
125+
static final ParseField METRIC_SCORE_FIELD = new ParseField("metric_score");
126126
private static final ParseField UNRATED_DOCS_FIELD = new ParseField("unrated_docs");
127127
private static final ParseField HITS_FIELD = new ParseField("hits");
128128
private static final ParseField METRIC_DETAILS_FIELD = new ParseField("metric_details");
@@ -136,7 +136,7 @@ private static class ParsedEvalQueryQuality {
136136
}
137137

138138
static {
139-
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, QUALITY_LEVEL_FIELD);
139+
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, METRIC_SCORE_FIELD);
140140
PARSER.declareObject((obj, value) -> obj.optionalMetricDetails = value, (p, c) -> parseMetricDetail(p),
141141
METRIC_DETAILS_FIELD);
142142
PARSER.declareObjectArray((obj, list) -> obj.ratedHits = list, (p, c) -> RatedSearchHit.parse(p), HITS_FIELD);
@@ -164,13 +164,13 @@ public final boolean equals(Object obj) {
164164
}
165165
EvalQueryQuality other = (EvalQueryQuality) obj;
166166
return Objects.equals(queryId, other.queryId) &&
167-
Objects.equals(evaluationResult, other.evaluationResult) &&
167+
Objects.equals(metricScore, other.metricScore) &&
168168
Objects.equals(ratedHits, other.ratedHits) &&
169169
Objects.equals(optionalMetricDetails, other.optionalMetricDetails);
170170
}
171171

172172
@Override
173173
public final int hashCode() {
174-
return Objects.hash(queryId, evaluationResult, ratedHits, optionalMetricDetails);
174+
return Objects.hash(queryId, metricScore, ratedHits, optionalMetricDetails);
175175
}
176176
}

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/EvaluationMetric.java

+12-13
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,22 @@
3939
public interface EvaluationMetric extends ToXContentObject, NamedWriteable {
4040

4141
/**
42-
* Returns a single metric representing the ranking quality of a set of returned
43-
* documents wrt. to a set of document ids labeled as relevant for this search.
42+
* Evaluates a single ranking evaluation case.
4443
*
4544
* @param taskId
46-
* the id of the query for which the ranking is currently evaluated
45+
* an identifier of the query for which the search ranking is
46+
* evaluated
4747
* @param hits
48-
* the result hits as returned by a search request
48+
* the search result hits
4949
* @param ratedDocs
50-
* the documents that were ranked by human annotators for this query
51-
* case
52-
* @return some metric representing the quality of the result hit list wrt. to
53-
* relevant doc ids.
50+
* the documents that contain the document rating for this query case
51+
* @return an {@link EvalQueryQuality} instance that contains the metric score
52+
* with respect to the provided search hits and ratings
5453
*/
5554
EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
5655

5756
/**
58-
* join hits with rated documents using the joint _index/_id document key
57+
* Joins hits with rated documents using the joint _index/_id document key.
5958
*/
6059
static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs) {
6160
Map<DocumentKey, RatedDocument> ratedDocumentMap = ratedDocs.stream()
@@ -74,19 +73,19 @@ static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocu
7473
}
7574

7675
/**
77-
* filter @link {@link RatedSearchHit} that don't have a rating
76+
* Filter {@link RatedSearchHit}s that do not have a rating.
7877
*/
7978
static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits) {
8079
return ratedHits.stream().filter(hit -> hit.getRating().isPresent() == false)
8180
.map(hit -> new DocumentKey(hit.getSearchHit().getIndex(), hit.getSearchHit().getId())).collect(Collectors.toList());
8281
}
8382

8483
/**
85-
* how evaluation metrics for particular search queries get combined for the overall evaluation score.
86-
* Defaults to averaging over the partial results.
84+
* Combine several {@link EvalQueryQuality} results into the overall evaluation score.
85+
* This defaults to averaging over the partial results, but can be overridden to obtain a different behavior.
8786
*/
8887
default double combine(Collection<EvalQueryQuality> partialResults) {
89-
return partialResults.stream().mapToDouble(EvalQueryQuality::getQualityLevel).sum() / partialResults.size();
88+
return partialResults.stream().mapToDouble(EvalQueryQuality::metricScore).sum() / partialResults.size();
9089
}
9190

9291
/**

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/MeanReciprocalRank.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,7 @@ public int getRelevantRatingThreshold() {
110110
* Compute ReciprocalRank based on provided relevant document IDs.
111111
**/
112112
@Override
113-
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
114-
List<RatedDocument> ratedDocs) {
113+
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs) {
115114
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
116115
int firstRelevant = -1;
117116
int rank = 1;

modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/RankEvalResponse.java

+9-10
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,15 @@
4848
public class RankEvalResponse extends ActionResponse implements ToXContentObject {
4949

5050
/** The overall evaluation result. */
51-
private double evaluationResult;
51+
private double metricScore;
5252
/** details about individual ranking evaluation queries, keyed by their id */
5353
private Map<String, EvalQueryQuality> details;
5454
/** exceptions for specific ranking evaluation queries, keyed by their id */
5555
private Map<String, Exception> failures;
5656

57-
public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> partialResults,
57+
public RankEvalResponse(double metricScore, Map<String, EvalQueryQuality> partialResults,
5858
Map<String, Exception> failures) {
59-
this.evaluationResult = qualityLevel;
59+
this.metricScore = metricScore;
6060
this.details = new HashMap<>(partialResults);
6161
this.failures = new HashMap<>(failures);
6262
}
@@ -65,8 +65,8 @@ public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> parti
6565
// only used in RankEvalAction#newResponse()
6666
}
6767

68-
public double getEvaluationResult() {
69-
return evaluationResult;
68+
public double getMetricScore() {
69+
return metricScore;
7070
}
7171

7272
public Map<String, EvalQueryQuality> getPartialResults() {
@@ -85,7 +85,7 @@ public String toString() {
8585
@Override
8686
public void writeTo(StreamOutput out) throws IOException {
8787
super.writeTo(out);
88-
out.writeDouble(evaluationResult);
88+
out.writeDouble(metricScore);
8989
out.writeVInt(details.size());
9090
for (String queryId : details.keySet()) {
9191
out.writeString(queryId);
@@ -101,7 +101,7 @@ public void writeTo(StreamOutput out) throws IOException {
101101
@Override
102102
public void readFrom(StreamInput in) throws IOException {
103103
super.readFrom(in);
104-
this.evaluationResult = in.readDouble();
104+
this.metricScore = in.readDouble();
105105
int partialResultSize = in.readVInt();
106106
this.details = new HashMap<>(partialResultSize);
107107
for (int i = 0; i < partialResultSize; i++) {
@@ -120,7 +120,7 @@ public void readFrom(StreamInput in) throws IOException {
120120
@Override
121121
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
122122
builder.startObject();
123-
builder.field("quality_level", evaluationResult);
123+
builder.field("metric_score", metricScore);
124124
builder.startObject("details");
125125
for (String key : details.keySet()) {
126126
details.get(key).toXContent(builder, params);
@@ -137,7 +137,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
137137
return builder;
138138
}
139139

140-
private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
141140
private static final ParseField DETAILS_FIELD = new ParseField("details");
142141
private static final ParseField FAILURES_FIELD = new ParseField("failures");
143142
@SuppressWarnings("unchecked")
@@ -147,7 +146,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
147146
((List<EvalQueryQuality>) a[1]).stream().collect(Collectors.toMap(EvalQueryQuality::getId, Function.identity())),
148147
((List<Tuple<String, Exception>>) a[2]).stream().collect(Collectors.toMap(Tuple::v1, Tuple::v2))));
149148
static {
150-
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), QUALITY_LEVEL_FIELD);
149+
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), EvalQueryQuality.METRIC_SCORE_FIELD);
151150
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> EvalQueryQuality.fromXContent(p, n),
152151
DETAILS_FIELD);
153152
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> {

modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainTests.java

+8-8
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public void testDCGAt() {
7676
hits[i].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
7777
}
7878
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
79-
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
79+
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
8080

8181
/**
8282
* Check with normalization: to get the maximal possible dcg, sort documents by
@@ -94,7 +94,7 @@ public void testDCGAt() {
9494
* idcg = 14.595390756454922 (sum of last column)
9595
*/
9696
dcg = new DiscountedCumulativeGain(true, null, 10);
97-
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
97+
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
9898
}
9999

100100
/**
@@ -127,7 +127,7 @@ public void testDCGAtSixMissingRatings() {
127127
}
128128
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
129129
EvalQueryQuality result = dcg.evaluate("id", hits, rated);
130-
assertEquals(12.779642067948913, result.getQualityLevel(), DELTA);
130+
assertEquals(12.779642067948913, result.metricScore(), DELTA);
131131
assertEquals(2, filterUnratedDocuments(result.getHitsAndRatings()).size());
132132

133133
/**
@@ -146,7 +146,7 @@ public void testDCGAtSixMissingRatings() {
146146
* idcg = 13.347184833073591 (sum of last column)
147147
*/
148148
dcg = new DiscountedCumulativeGain(true, null, 10);
149-
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
149+
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
150150
}
151151

152152
/**
@@ -184,7 +184,7 @@ public void testDCGAtFourMoreRatings() {
184184
}
185185
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
186186
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
187-
assertEquals(12.392789260714371, result.getQualityLevel(), DELTA);
187+
assertEquals(12.392789260714371, result.metricScore(), DELTA);
188188
assertEquals(1, filterUnratedDocuments(result.getHitsAndRatings()).size());
189189

190190
/**
@@ -204,7 +204,7 @@ public void testDCGAtFourMoreRatings() {
204204
* idcg = 13.347184833073591 (sum of last column)
205205
*/
206206
dcg = new DiscountedCumulativeGain(true, null, 10);
207-
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).getQualityLevel(), DELTA);
207+
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).metricScore(), DELTA);
208208
}
209209

210210
/**
@@ -223,13 +223,13 @@ public void testNoResults() throws Exception {
223223
SearchHit[] hits = new SearchHit[0];
224224
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
225225
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
226-
assertEquals(0.0d, result.getQualityLevel(), DELTA);
226+
assertEquals(0.0d, result.metricScore(), DELTA);
227227
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
228228

229229
// also check normalized
230230
dcg = new DiscountedCumulativeGain(true, null, 10);
231231
result = dcg.evaluate("id", hits, ratedDocs);
232-
assertEquals(0.0d, result.getQualityLevel(), DELTA);
232+
assertEquals(0.0d, result.metricScore(), DELTA);
233233
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
234234
}
235235

0 commit comments

Comments
 (0)