|
21 | 21 | import org.apache.logging.log4j.LogManager;
|
22 | 22 | import org.apache.logging.log4j.Logger;
|
23 | 23 | import org.apache.lucene.index.DirectoryReader;
|
24 |
| -import org.apache.lucene.index.FilteredTermsEnum; |
25 |
| -import org.apache.lucene.index.LeafReader; |
26 | 24 | import org.apache.lucene.index.LeafReaderContext;
|
27 | 25 | import org.apache.lucene.index.OrdinalMap;
|
| 26 | +import org.apache.lucene.index.SortedSetDocValues; |
28 | 27 | import org.apache.lucene.index.Terms;
|
29 | 28 | import org.apache.lucene.index.TermsEnum;
|
30 | 29 | import org.apache.lucene.util.BytesRef;
|
31 | 30 | import org.elasticsearch.ElasticsearchException;
|
32 | 31 | import org.elasticsearch.index.fielddata.IndexFieldDataCache;
|
33 | 32 | import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
|
34 | 33 | import org.elasticsearch.index.fielddata.LeafOrdinalsFieldData;
|
| 34 | +import org.elasticsearch.index.fielddata.RamAccountingTermsEnum; |
| 35 | +import org.elasticsearch.index.fielddata.ScriptDocValues; |
35 | 36 | import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder;
|
36 | 37 | import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData;
|
37 | 38 | import org.elasticsearch.indices.breaker.CircuitBreakerService;
|
38 | 39 | import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
39 | 40 |
|
40 | 41 | import java.io.IOException;
|
| 42 | +import java.util.function.Function; |
41 | 43 |
|
42 |
| -public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldData<LeafOrdinalsFieldData> |
43 |
| - implements IndexOrdinalsFieldData { |
| 44 | +public abstract class AbstractIndexOrdinalsFieldData implements IndexOrdinalsFieldData { |
44 | 45 | private static final Logger logger = LogManager.getLogger(AbstractBinaryDVLeafFieldData.class);
|
45 | 46 |
|
46 |
| - private final double minFrequency, maxFrequency; |
47 |
| - private final int minSegmentSize; |
| 47 | + private final String fieldName; |
| 48 | + private final ValuesSourceType valuesSourceType; |
| 49 | + private final IndexFieldDataCache cache; |
48 | 50 | protected final CircuitBreakerService breakerService;
|
| 51 | + protected final Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction; |
49 | 52 |
|
50 | 53 | protected AbstractIndexOrdinalsFieldData(
|
51 | 54 | String fieldName,
|
52 | 55 | ValuesSourceType valuesSourceType,
|
53 | 56 | IndexFieldDataCache cache,
|
54 | 57 | CircuitBreakerService breakerService,
|
55 |
| - double minFrequency, |
56 |
| - double maxFrequency, |
57 |
| - int minSegmentSize |
| 58 | + Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction |
58 | 59 | ) {
|
59 |
| - super(fieldName, valuesSourceType, cache); |
| 60 | + this.fieldName = fieldName; |
| 61 | + this.valuesSourceType = valuesSourceType; |
| 62 | + this.cache = cache; |
60 | 63 | this.breakerService = breakerService;
|
61 |
| - this.minFrequency = minFrequency; |
62 |
| - this.maxFrequency = maxFrequency; |
63 |
| - this.minSegmentSize = minSegmentSize; |
| 64 | + this.scriptFunction = scriptFunction; |
| 65 | + } |
| 66 | + |
| 67 | + @Override |
| 68 | + public String getFieldName() { |
| 69 | + return this.fieldName; |
| 70 | + } |
| 71 | + |
| 72 | + @Override |
| 73 | + public ValuesSourceType getValuesSourceType() { |
| 74 | + return valuesSourceType; |
64 | 75 | }
|
65 | 76 |
|
66 | 77 | @Override
|
67 | 78 | public OrdinalMap getOrdinalMap() {
|
68 | 79 | return null;
|
69 | 80 | }
|
70 | 81 |
|
| 82 | + @Override |
| 83 | + public LeafOrdinalsFieldData load(LeafReaderContext context) { |
| 84 | + if (context.reader().getFieldInfos().fieldInfo(fieldName) == null) { |
| 85 | + // Some leaf readers may be wrapped and report different set of fields and use the same cache key. |
| 86 | + // If a field can't be found then it doesn't mean it isn't there, |
| 87 | + // so if a field doesn't exist then we don't cache it and just return an empty field data instance. |
| 88 | + // The next time the field is found, we do cache. |
| 89 | + return AbstractLeafOrdinalsFieldData.empty(); |
| 90 | + } |
| 91 | + |
| 92 | + try { |
| 93 | + return cache.load(context, this); |
| 94 | + } catch (Exception e) { |
| 95 | + if (e instanceof ElasticsearchException) { |
| 96 | + throw (ElasticsearchException) e; |
| 97 | + } else { |
| 98 | + throw new ElasticsearchException(e); |
| 99 | + } |
| 100 | + } |
| 101 | + } |
| 102 | + |
71 | 103 | @Override
|
72 | 104 | public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
|
73 | 105 | IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader);
|
@@ -121,60 +153,49 @@ public IndexOrdinalsFieldData loadGlobalDirect(DirectoryReader indexReader) thro
|
121 | 153 | this,
|
122 | 154 | breakerService,
|
123 | 155 | logger,
|
124 |
| - AbstractLeafOrdinalsFieldData.DEFAULT_SCRIPT_FUNCTION |
| 156 | + scriptFunction |
125 | 157 | );
|
126 | 158 | }
|
127 | 159 |
|
128 |
| - @Override |
129 |
| - protected LeafOrdinalsFieldData empty(int maxDoc) { |
130 |
| - return AbstractLeafOrdinalsFieldData.empty(); |
131 |
| - } |
132 |
| - |
133 |
| - protected TermsEnum filter(Terms terms, TermsEnum iterator, LeafReader reader) throws IOException { |
134 |
| - if (iterator == null) { |
135 |
| - return null; |
136 |
| - } |
137 |
| - int docCount = terms.getDocCount(); |
138 |
| - if (docCount == -1) { |
139 |
| - docCount = reader.maxDoc(); |
140 |
| - } |
141 |
| - if (docCount >= minSegmentSize) { |
142 |
| - final int minFreq = minFrequency > 1.0 |
143 |
| - ? (int) minFrequency |
144 |
| - : (int)(docCount * minFrequency); |
145 |
| - final int maxFreq = maxFrequency > 1.0 |
146 |
| - ? (int) maxFrequency |
147 |
| - : (int)(docCount * maxFrequency); |
148 |
| - if (minFreq > 1 || maxFreq < docCount) { |
149 |
| - iterator = new FrequencyFilter(iterator, minFreq, maxFreq); |
150 |
| - } |
151 |
| - } |
152 |
| - return iterator; |
153 |
| - } |
154 |
| - |
155 | 160 | @Override
|
156 | 161 | public boolean supportsGlobalOrdinalsMapping() {
|
157 | 162 | return false;
|
158 | 163 | }
|
159 | 164 |
|
160 |
| - private static final class FrequencyFilter extends FilteredTermsEnum { |
161 |
| - |
162 |
| - private int minFreq; |
163 |
| - private int maxFreq; |
164 |
| - FrequencyFilter(TermsEnum delegate, int minFreq, int maxFreq) { |
165 |
| - super(delegate, false); |
166 |
| - this.minFreq = minFreq; |
167 |
| - this.maxFreq = maxFreq; |
168 |
| - } |
169 |
| - |
170 |
| - @Override |
171 |
| - protected AcceptStatus accept(BytesRef arg0) throws IOException { |
172 |
| - int docFreq = docFreq(); |
173 |
| - if (docFreq >= minFreq && docFreq <= maxFreq) { |
174 |
| - return AcceptStatus.YES; |
175 |
| - } |
176 |
| - return AcceptStatus.NO; |
177 |
| - } |
| 165 | + /** |
| 166 | + * A {@code PerValueEstimator} is a sub-class that can be used to estimate |
| 167 | + * the memory overhead for loading the data. Each field data |
| 168 | + * implementation should implement its own {@code PerValueEstimator} if it |
| 169 | + * intends to take advantage of the CircuitBreaker. |
| 170 | + * <p> |
| 171 | + * Note that the .beforeLoad(...) and .afterLoad(...) methods must be |
| 172 | + * manually called. |
| 173 | + */ |
| 174 | + public interface PerValueEstimator { |
| 175 | + |
| 176 | + /** |
| 177 | + * @return the number of bytes for the given term |
| 178 | + */ |
| 179 | + long bytesPerValue(BytesRef term); |
| 180 | + |
| 181 | + /** |
| 182 | + * Execute any pre-loading estimations for the terms. May also |
| 183 | + * optionally wrap a {@link TermsEnum} in a |
| 184 | + * {@link RamAccountingTermsEnum} |
| 185 | + * which will estimate the memory on a per-term basis. |
| 186 | + * |
| 187 | + * @param terms terms to be estimated |
| 188 | + * @return A TermsEnum for the given terms |
| 189 | + */ |
| 190 | + TermsEnum beforeLoad(Terms terms) throws IOException; |
| 191 | + |
| 192 | + /** |
| 193 | + * Possibly adjust a circuit breaker after field data has been loaded, |
| 194 | + * now that the actual amount of memory used by the field data is known |
| 195 | + * |
| 196 | + * @param termsEnum terms that were loaded |
| 197 | + * @param actualUsed actual field data memory usage |
| 198 | + */ |
| 199 | + void afterLoad(TermsEnum termsEnum, long actualUsed); |
178 | 200 | }
|
179 |
| - |
180 | 201 | }
|
0 commit comments