Skip to content

Commit c13f5f2

Browse files
committed
Term Vectors: support for distributed frequencies
Adds distributed frequencies support for the Term Vectors API. A new parameter called `dfs` is introduced which defaults to `false`. Closes #8144
1 parent 19514a2 commit c13f5f2

17 files changed

+774
-20
lines changed

docs/reference/docs/termvectors.asciidoc

+12-5
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,23 @@ omit :
7676
* sum of total term frequencies (the sum of total term frequencies of
7777
each term in this field)
7878

79+
[float]
80+
==== Distributed frequencies coming[1.5.0]
81+
82+
Setting `dfs` to `true` (default is `false`) will return the term statistics
83+
or the field statistics of the entire index, and not just at the shard. Use it
84+
with caution as distributed frequencies can have a serious performance impact.
85+
7986
[float]
8087
=== Behaviour
8188

8289
The term and field statistics are not accurate. Deleted documents
8390
are not taken into account. The information is only retrieved for the
84-
shard the requested document resides in. The term and field statistics
85-
are therefore only useful as relative measures whereas the absolute
86-
numbers have no meaning in this context. By default, when requesting
87-
term vectors of artificial documents, a shard to get the statistics from
88-
is randomly selected. Use `routing` only to hit a particular shard.
91+
shard the requested document resides in, unless `dfs` is set to `true`.
92+
The term and field statistics are therefore only useful as relative measures
93+
whereas the absolute numbers have no meaning in this context. By default,
94+
when requesting term vectors of artificial documents, a shard to get the statistics
95+
from is randomly selected. Use `routing` only to hit a particular shard.
8996

9097
[float]
9198
=== Example 1

rest-api-spec/api/termvector.json

+6
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
"default" : true,
3636
"required" : false
3737
},
38+
"dfs" : {
39+
"type" : "boolean",
40+
"description" : "Specifies if distributed frequencies should be returned instead shard frequencies.",
41+
"default" : false,
42+
"required" : false
43+
},
3844
"fields" : {
3945
"type" : "list",
4046
"description" : "A comma-separated list of fields to return.",

src/main/java/org/elasticsearch/action/ActionModule.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@
160160
import org.elasticsearch.action.support.ActionFilters;
161161
import org.elasticsearch.action.support.TransportAction;
162162
import org.elasticsearch.action.termvector.*;
163+
import org.elasticsearch.action.termvector.dfs.TransportDfsOnlyAction;
163164
import org.elasticsearch.action.update.TransportUpdateAction;
164165
import org.elasticsearch.action.update.UpdateAction;
165166
import org.elasticsearch.common.inject.AbstractModule;
@@ -280,7 +281,8 @@ protected void configure() {
280281

281282
registerAction(IndexAction.INSTANCE, TransportIndexAction.class);
282283
registerAction(GetAction.INSTANCE, TransportGetAction.class);
283-
registerAction(TermVectorAction.INSTANCE, TransportSingleShardTermVectorAction.class);
284+
registerAction(TermVectorAction.INSTANCE, TransportSingleShardTermVectorAction.class,
285+
TransportDfsOnlyAction.class);
284286
registerAction(MultiTermVectorsAction.INSTANCE, TransportMultiTermVectorsAction.class,
285287
TransportSingleShardMultiTermsVectorAction.class);
286288
registerAction(DeleteAction.INSTANCE, TransportDeleteAction.class,

src/main/java/org/elasticsearch/action/termvector/TermVectorRequest.java

+27-3
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,22 @@ public TermVectorRequest fieldStatistics(boolean fieldStatistics) {
292292
return this;
293293
}
294294

295+
/**
296+
* @return <code>true</code> if distributed frequencies should be returned. Otherwise
297+
* <code>false</code>
298+
*/
299+
public boolean dfs() {
300+
return flagsEnum.contains(Flag.Dfs);
301+
}
302+
303+
/**
304+
* Use distributed frequencies instead of shard statistics.
305+
*/
306+
public TermVectorRequest dfs(boolean dfs) {
307+
setFlag(Flag.Dfs, dfs);
308+
return this;
309+
}
310+
295311
/**
296312
* Return only term vectors for special selected fields. Returns for term
297313
* vectors for all fields if selectedFields == null
@@ -309,24 +325,30 @@ public TermVectorRequest selectedFields(String... fields) {
309325
return this;
310326
}
311327

328+
/**
329+
* Return whether term vectors should be generated real-time (default to true).
330+
*/
312331
public boolean realtime() {
313332
return this.realtime == null ? true : this.realtime;
314333
}
315334

335+
/**
336+
* Choose whether term vectors be generated real-time.
337+
*/
316338
public TermVectorRequest realtime(Boolean realtime) {
317339
this.realtime = realtime;
318340
return this;
319341
}
320342

321343
/**
322-
* Return the overridden analyzers at each field
344+
* Return the overridden analyzers at each field.
323345
*/
324346
public Map<String, String> perFieldAnalyzer() {
325347
return perFieldAnalyzer;
326348
}
327349

328350
/**
329-
* Override the analyzer used at each field when generating term vectors
351+
* Override the analyzer used at each field when generating term vectors.
330352
*/
331353
public TermVectorRequest perFieldAnalyzer(Map<String, String> perFieldAnalyzer) {
332354
this.perFieldAnalyzer = perFieldAnalyzer != null && perFieldAnalyzer.size() != 0 ? Maps.newHashMap(perFieldAnalyzer) : null;
@@ -444,7 +466,7 @@ public void writeTo(StreamOutput out) throws IOException {
444466
public static enum Flag {
445467
// Do not change the order of these flags we use
446468
// the ordinal for encoding! Only append to the end!
447-
Positions, Offsets, Payloads, FieldStatistics, TermStatistics
469+
Positions, Offsets, Payloads, FieldStatistics, TermStatistics, Dfs
448470
}
449471

450472
/**
@@ -477,6 +499,8 @@ public static void parseRequest(TermVectorRequest termVectorRequest, XContentPar
477499
termVectorRequest.termStatistics(parser.booleanValue());
478500
} else if (currentFieldName.equals("field_statistics") || currentFieldName.equals("fieldStatistics")) {
479501
termVectorRequest.fieldStatistics(parser.booleanValue());
502+
} else if (currentFieldName.equals("dfs")) {
503+
termVectorRequest.dfs(parser.booleanValue());
480504
} else if (currentFieldName.equals("per_field_analyzer") || currentFieldName.equals("perFieldAnalyzer")) {
481505
termVectorRequest.perFieldAnalyzer(readPerFieldAnalyzer(parser.map()));
482506
} else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.

src/main/java/org/elasticsearch/action/termvector/TermVectorRequestBuilder.java

+46-2
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,23 @@
2727
import java.util.Map;
2828

2929
/**
30+
* The builder class for a term vector request.
31+
* Returns the term vector (doc frequency, positions, offsets) for a document.
32+
* <p/>
33+
* Note, the {@code index}, {@code type} and {@code id} are
34+
* required.
3035
*/
3136
public class TermVectorRequestBuilder extends ActionRequestBuilder<TermVectorRequest, TermVectorResponse, TermVectorRequestBuilder, Client> {
3237

3338
public TermVectorRequestBuilder(Client client) {
3439
super(client, new TermVectorRequest());
3540
}
3641

42+
/**
43+
* Constructs a new term vector request builder for a document that will be fetch
44+
* from the provided index. Use {@code index}, {@code type} and
45+
* {@code id} to specify the document to load.
46+
*/
3747
public TermVectorRequestBuilder(Client client, String index, String type, String id) {
3848
super(client, new TermVectorRequest(index, type, id));
3949
}
@@ -92,47 +102,81 @@ public TermVectorRequestBuilder setParent(String parent) {
92102
* <tt>_local</tt> to prefer local shards, <tt>_primary</tt> to execute only on primary shards, or
93103
* a custom value, which guarantees that the same order will be used across different requests.
94104
*/
95-
96105
public TermVectorRequestBuilder setPreference(String preference) {
97106
request.preference(preference);
98107
return this;
99108
}
100-
109+
110+
/**
111+
* Sets whether to return the start and stop offsets for each term if they were stored or
112+
* skip offsets.
113+
*/
101114
public TermVectorRequestBuilder setOffsets(boolean offsets) {
102115
request.offsets(offsets);
103116
return this;
104117
}
105118

119+
120+
/**
121+
* Sets whether to return the positions for each term if stored or skip.
122+
*/
106123
public TermVectorRequestBuilder setPositions(boolean positions) {
107124
request.positions(positions);
108125
return this;
109126
}
110127

128+
/**
129+
* Sets whether to return the payloads for each term or skip.
130+
*/
111131
public TermVectorRequestBuilder setPayloads(boolean payloads) {
112132
request.payloads(payloads);
113133
return this;
114134
}
115135

136+
/**
137+
* Sets whether to return the term statistics for each term in the shard or skip.
138+
*/
116139
public TermVectorRequestBuilder setTermStatistics(boolean termStatistics) {
117140
request.termStatistics(termStatistics);
118141
return this;
119142
}
120143

144+
/**
145+
* Sets whether to return the field statistics for each term in the shard or skip.
146+
*/
121147
public TermVectorRequestBuilder setFieldStatistics(boolean fieldStatistics) {
122148
request.fieldStatistics(fieldStatistics);
123149
return this;
124150
}
125151

152+
/**
153+
* Sets whether to use distributed frequencies instead of shard statistics.
154+
*/
155+
public TermVectorRequestBuilder setDfs(boolean dfs) {
156+
request.dfs(dfs);
157+
return this;
158+
}
159+
160+
/**
161+
* Sets whether to return only term vectors for special selected fields. Returns the term
162+
* vectors for all fields if selectedFields == null
163+
*/
126164
public TermVectorRequestBuilder setSelectedFields(String... fields) {
127165
request.selectedFields(fields);
128166
return this;
129167
}
130168

169+
/**
170+
* Sets whether term vectors are generated real-time.
171+
*/
131172
public TermVectorRequestBuilder setRealtime(Boolean realtime) {
132173
request.realtime(realtime);
133174
return this;
134175
}
135176

177+
/**
178+
* Sets the analyzer used at each field when generating term vectors.
179+
*/
136180
public TermVectorRequestBuilder setPerFieldAnalyzer(Map<String, String> perFieldAnalyzer) {
137181
request.perFieldAnalyzer(perFieldAnalyzer);
138182
return this;

src/main/java/org/elasticsearch/action/termvector/TermVectorResponse.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.ElasticsearchIllegalStateException;
3131
import org.elasticsearch.action.ActionResponse;
3232
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
33+
import org.elasticsearch.common.Nullable;
3334
import org.elasticsearch.common.bytes.BytesArray;
3435
import org.elasticsearch.common.bytes.BytesReference;
3536
import org.elasticsearch.common.io.stream.BytesStreamOutput;
@@ -38,6 +39,7 @@
3839
import org.elasticsearch.common.xcontent.ToXContent;
3940
import org.elasticsearch.common.xcontent.XContentBuilder;
4041
import org.elasticsearch.common.xcontent.XContentBuilderString;
42+
import org.elasticsearch.search.dfs.AggregatedDfs;
4143

4244
import java.io.IOException;
4345
import java.util.EnumSet;
@@ -320,10 +322,14 @@ public void setExists(boolean exists) {
320322
}
321323

322324
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
325+
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
326+
}
327+
328+
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
323329
TermVectorWriter tvw = new TermVectorWriter(this);
324330

325331
if (termVectorsByField != null) {
326-
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields);
332+
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs);
327333
}
328334

329335
}

src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java

+34-4
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@
1919
package org.elasticsearch.action.termvector;
2020

2121
import org.apache.lucene.index.*;
22+
import org.apache.lucene.search.CollectionStatistics;
23+
import org.apache.lucene.search.TermStatistics;
2224
import org.apache.lucene.util.BytesRef;
2325
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
26+
import org.elasticsearch.common.Nullable;
2427
import org.elasticsearch.common.bytes.BytesReference;
2528
import org.elasticsearch.common.io.stream.BytesStreamOutput;
29+
import org.elasticsearch.search.dfs.AggregatedDfs;
2630

2731
import java.io.IOException;
2832
import java.util.ArrayList;
@@ -45,7 +49,7 @@ final class TermVectorWriter {
4549
response = termVectorResponse;
4650
}
4751

48-
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
52+
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
4953
int numFieldsWritten = 0;
5054
TermsEnum iterator = null;
5155
DocsAndPositionsEnum docsAndPosEnum = null;
@@ -70,7 +74,11 @@ void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Fl
7074
boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
7175
startField(field, fieldTermVector.size(), positions, offsets, payloads);
7276
if (flags.contains(Flag.FieldStatistics)) {
73-
writeFieldStatistics(topLevelTerms);
77+
if (dfs != null) {
78+
writeFieldStatistics(dfs.fieldStatistics().get(field));
79+
} else {
80+
writeFieldStatistics(topLevelTerms);
81+
}
7482
}
7583
iterator = fieldTermVector.iterator(iterator);
7684
final boolean useDocsAndPos = positions || offsets || payloads;
@@ -81,7 +89,11 @@ void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Fl
8189
boolean foundTerm = topLevelIterator.seekExact(term);
8290
startTerm(term);
8391
if (flags.contains(Flag.TermStatistics)) {
84-
writeTermStatistics(topLevelIterator);
92+
if (dfs != null) {
93+
writeTermStatistics(dfs.termStatistics().get(new Term(field, term.utf8ToString())));
94+
} else {
95+
writeTermStatistics(topLevelIterator);
96+
}
8597
}
8698
if (useDocsAndPos) {
8799
// given we have pos or offsets
@@ -161,7 +173,6 @@ private void writePayload(BytesRef payload) throws IOException {
161173
}
162174

163175
private void writeFreq(int termFreq) throws IOException {
164-
165176
writePotentiallyNegativeVInt(termFreq);
166177
}
167178

@@ -205,7 +216,15 @@ private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException
205216
long ttf = topLevelIterator.totalTermFreq();
206217
assert (ttf >= -1);
207218
writePotentiallyNegativeVLong(ttf);
219+
}
208220

221+
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
222+
int docFreq = (int) termStatistics.docFreq();
223+
assert (docFreq >= -1);
224+
writePotentiallyNegativeVInt(docFreq);
225+
long ttf = termStatistics.totalTermFreq();
226+
assert (ttf >= -1);
227+
writePotentiallyNegativeVLong(ttf);
209228
}
210229

211230
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
@@ -218,7 +237,18 @@ private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
218237
int dc = topLevelTerms.getDocCount();
219238
assert (dc >= -1);
220239
writePotentiallyNegativeVInt(dc);
240+
}
221241

242+
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
243+
long sttf = fieldStats.sumTotalTermFreq();
244+
assert (sttf >= -1);
245+
writePotentiallyNegativeVLong(sttf);
246+
long sdf = fieldStats.sumDocFreq();
247+
assert (sdf >= -1);
248+
writePotentiallyNegativeVLong(sdf);
249+
int dc = (int) fieldStats.docCount();
250+
assert (dc >= -1);
251+
writePotentiallyNegativeVInt(dc);
222252
}
223253

224254
private void writePotentiallyNegativeVInt(int value) throws IOException {

0 commit comments

Comments
 (0)