@@ -20,9 +20,9 @@
 
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SortedNumericDocValues;
-import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.common.util.LongHash;
+import org.elasticsearch.common.util.SetBackedScalingCuckooFilter;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.Aggregator;
 import org.elasticsearch.search.aggregations.AggregatorFactories;
@@ -34,6 +34,7 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 
@@ -42,111 +43,144 @@
 /**
  * An aggregator that finds "rare" long values (e.g. terms agg that orders ascending)
  */
-public class LongRareTermsAggregator extends AbstractRareTermsAggregator<ValuesSource.Numeric, IncludeExclude.LongFilter, Long> {
-
-    protected LongHash bucketOrds;
-
-    LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format,
-                            SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter,
-                            int maxDocCount, double precision, Map<String, Object> metadata) throws IOException {
-        super(name, factories, aggregationContext, parent, metadata, maxDocCount, precision, format, valuesSource, longFilter);
-        this.bucketOrds = new LongHash(1, aggregationContext.bigArrays());
+public class LongRareTermsAggregator extends AbstractRareTermsAggregator {
+    private final ValuesSource.Numeric valuesSource;
+    private final IncludeExclude.LongFilter filter;
+    private final LongKeyedBucketOrds bucketOrds;
+
+    LongRareTermsAggregator(
+        String name,
+        AggregatorFactories factories,
+        ValuesSource.Numeric valuesSource,
+        DocValueFormat format,
+        SearchContext aggregationContext,
+        Aggregator parent,
+        IncludeExclude.LongFilter filter,
+        int maxDocCount,
+        double precision,
+        boolean collectsFromSingleBucket,
+        Map<String, Object> metadata
+    ) throws IOException {
+        super(
+            name,
+            factories,
+            aggregationContext,
+            parent,
+            metadata,
+            maxDocCount,
+            precision,
+            format,
+            collectsFromSingleBucket
+        );
+        this.valuesSource = valuesSource;
+        this.filter = filter;
+        this.bucketOrds = LongKeyedBucketOrds.build(context.bigArrays(), collectsFromSingleBucket);
     }
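The constructor now keys buckets with `LongKeyedBucketOrds` instead of a plain `LongHash`: ordinals are handed out per distinct `(owningBucketOrd, value)` pair, which is what lets one aggregator instance collect under many owning buckets when `collectsFromSingleBucket` is false. The sketch below is only a conceptual stand-in for that mapping, assuming a `HashMap` in place of the real BigArrays-backed class; the class and method names are illustrative, not code from this change.

```java
import java.util.HashMap;
import java.util.Map;

// Conceptual model of what the bucket-ords structure provides: a dense ordinal per
// distinct (owningBucketOrd, value) pair, with a negative encoding for "already seen".
class ConceptualBucketOrds {
    private final Map<Long, Map<Long, Long>> ords = new HashMap<>();
    private long nextOrd = 0;

    /** Returns a fresh ordinal, or {@code -1 - existingOrd} if the pair was added before. */
    long add(long owningBucketOrd, long value) {
        Map<Long, Long> perOwner = ords.computeIfAbsent(owningBucketOrd, k -> new HashMap<>());
        Long existing = perOwner.get(value);
        if (existing != null) {
            return -1 - existing;
        }
        perOwner.put(value, nextOrd);
        return nextOrd++;
    }

    /** Total number of ordinals handed out across all owning buckets. */
    long size() {
        return nextOrd;
    }
}
```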
 
     protected SortedNumericDocValues getValues(ValuesSource.Numeric valuesSource, LeafReaderContext ctx) throws IOException {
         return valuesSource.longValues(ctx);
     }
 
     @Override
-    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
-                                                final LeafBucketCollector sub) throws IOException {
-        final SortedNumericDocValues values = getValues(valuesSource, ctx);
+    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
+        SortedNumericDocValues values = getValues(valuesSource, ctx);
         return new LeafBucketCollectorBase(sub, values) {
-
             @Override
-            public void collect(int docId, long owningBucketOrdinal) throws IOException {
-                if (values.advanceExact(docId)) {
-                    final int valuesCount = values.docValueCount();
-                    long previous = Long.MAX_VALUE;
-                    for (int i = 0; i < valuesCount; ++i) {
-                        final long val = values.nextValue();
-                        if (previous != val || i == 0) {
-                            if ((includeExclude == null) || (includeExclude.accept(val))) {
-                                doCollect(sub, val, docId);
-                            }
-                            previous = val;
-                        }
+            public void collect(int docId, long owningBucketOrd) throws IOException {
+                if (false == values.advanceExact(docId)) {
+                    return;
+                }
+                int valuesCount = values.docValueCount();
+                long previous = Long.MAX_VALUE;
+                for (int i = 0; i < valuesCount; ++i) {
+                    long val = values.nextValue();
+                    if (i > 0 && previous == val) {
+                        continue;
+                    }
+                    previous = val;
+                    if (filter != null && false == filter.accept(val)) {
+                        continue;
+                    }
+                    long bucketOrdinal = bucketOrds.add(owningBucketOrd, val);
+                    if (bucketOrdinal < 0) { // already seen
+                        bucketOrdinal = -1 - bucketOrdinal;
+                        collectExistingBucket(sub, docId, bucketOrdinal);
+                    } else {
+                        collectBucket(sub, docId, bucketOrdinal);
                     }
                 }
             }
         };
     }
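The collect loop relies on two conventions: `SortedNumericDocValues` returns a document's values in sorted order, so a single `previous` slot is enough to drop consecutive duplicates, and `bucketOrds.add` signals an already-seen key with a negative encoding (`-1 - existingOrdinal`), which is why the result is decoded before `collectExistingBucket`. A minimal standalone sketch of the duplicate skip, with a plain sorted array standing in for the doc values:

```java
// Demonstrates the consecutive-duplicate skip used in collect(): because values arrive
// sorted per document, comparing against the previous value is enough to dedupe.
public class DedupSketch {
    public static void main(String[] args) {
        long[] docValues = {3, 3, 7, 9, 9, 9}; // sorted, as SortedNumericDocValues guarantees
        long previous = Long.MAX_VALUE;
        for (int i = 0; i < docValues.length; i++) {
            long val = docValues[i];
            if (i > 0 && previous == val) {
                continue; // same value as the last iteration: already counted for this doc
            }
            previous = val;
            System.out.println("collect value " + val); // prints 3, 7 and 9 once each
        }
    }
}
```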
 
     @Override
-    long addValueToOrds(Long value) {
-        return bucketOrds.add(value);
-    }
-
-    /**
-     * Merges the ordinals to a minimal set, populates the CuckooFilter and
-     * generates a final set of buckets.
-     *
-     * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise,
-     * the term is added to the filter, and pruned from the ordinal map. If
-     * necessary the ordinal map is merged down to a minimal set to remove deletions
-     */
-    private List<LongRareTerms.Bucket> buildSketch() {
-        long deletionCount = 0;
-        LongHash newBucketOrds = new LongHash(1, context.bigArrays());
-        List<LongRareTerms.Bucket> buckets = new ArrayList<>();
-        try (LongHash oldBucketOrds = bucketOrds) {
-
-            long[] mergeMap = new long[(int) oldBucketOrds.size()];
-            for (int i = 0; i < oldBucketOrds.size(); i++) {
-                long oldKey = oldBucketOrds.get(i);
-                long newBucketOrd = -1;
-
-                long docCount = bucketDocCount(i);
-                // if the key is below threshold, reinsert into the new ords
-                if (docCount <= maxDocCount) {
-                    newBucketOrd = newBucketOrds.add(oldKey);
-                    LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(oldKey, docCount, null, format);
-                    bucket.bucketOrd = newBucketOrd;
-                    buckets.add(bucket);
-                } else {
-                    // Make a note when one of the ords has been deleted
-                    deletionCount += 1;
-                    filter.add(oldKey);
+    public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
+        /*
+         * Collect the list of buckets, populate the filter with terms
+         * that are too frequent, and figure out how to merge sub-buckets.
+         */
+        LongRareTerms.Bucket[][] rarestPerOrd = new LongRareTerms.Bucket[owningBucketOrds.length][];
+        SetBackedScalingCuckooFilter[] filters = new SetBackedScalingCuckooFilter[owningBucketOrds.length];
+        long keepCount = 0;
+        long[] mergeMap = new long[(int) bucketOrds.size()];
+        Arrays.fill(mergeMap, -1);
+        long size = 0;
+        for (int ordIdx = 0; ordIdx < owningBucketOrds.length; ordIdx++) {
+            try (LongHash ordsToCollect = new LongHash(1, context.bigArrays())) {
+                filters[ordIdx] = newFilter();
+                List<LongRareTerms.Bucket> buckets = new ArrayList<>();
+                LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrds[ordIdx]);
+                while (ordsEnum.next()) {
+                    long docCount = bucketDocCount(ordsEnum.ord());
+                    // if the key is below threshold, reinsert into the new ords
+                    if (docCount <= maxDocCount) {
+                        LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(ordsEnum.value(), docCount, null, format);
+                        bucket.bucketOrd = mergeMap[(int) ordsEnum.ord()] = size + ordsToCollect.add(ordsEnum.value());
+                        buckets.add(bucket);
+                        keepCount++;
+                    } else {
+                        filters[ordIdx].add(ordsEnum.value());
+                    }
                 }
-                mergeMap[i] = newBucketOrd;
+                rarestPerOrd[ordIdx] = buckets.toArray(LongRareTerms.Bucket[]::new);
+                size += ordsToCollect.size();
             }
+        }
 
-            // Only merge/delete the ordinals if we have actually deleted one,
-            // to save on some redundant work
-            if (deletionCount > 0) {
-                mergeBuckets(mergeMap, newBucketOrds.size());
-                if (deferringCollector != null) {
-                    deferringCollector.mergeBuckets(mergeMap);
-                }
+        /*
+         * Only merge/delete the ordinals if we have actually deleted one,
+         * to save on some redundant work.
+         */
+        if (keepCount != mergeMap.length) {
+            mergeBuckets(mergeMap, size);
+            if (deferringCollector != null) {
+                deferringCollector.mergeBuckets(mergeMap);
             }
         }
-        bucketOrds = newBucketOrds;
-        return buckets;
-    }
 
-    @Override
-    public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
-        assert owningBucketOrds.length == 1 && owningBucketOrds[0] == 0;
-        List<LongRareTerms.Bucket> buckets = buildSketch();
-        buildSubAggsForBuckets(buckets, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs);
-
-        CollectionUtil.introSort(buckets, ORDER.comparator());
-        return new InternalAggregation[] {new LongRareTerms(name, ORDER, metadata(), format, buckets, maxDocCount, filter)};
+        /*
+         * Now build the results!
+         */
+        buildSubAggsForAllBuckets(rarestPerOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs);
+        InternalAggregation[] result = new InternalAggregation[owningBucketOrds.length];
+        for (int ordIdx = 0; ordIdx < owningBucketOrds.length; ordIdx++) {
+            Arrays.sort(rarestPerOrd[ordIdx], ORDER.comparator());
+            result[ordIdx] = new LongRareTerms(
+                name,
+                ORDER,
+                metadata(),
+                format,
+                Arrays.asList(rarestPerOrd[ordIdx]),
+                maxDocCount,
+                filters[ordIdx]
+            );
+        }
+        return result;
     }
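`buildAggregations` walks every collected term per owning bucket: a term seen in no more than `maxDocCount` documents becomes a result bucket and gets a new dense ordinal recorded in `mergeMap`, while a more frequent term goes into that owning bucket's cuckoo filter and keeps `-1` in `mergeMap`, so `mergeBuckets` can drop it and compact the surviving sub-aggregation buckets. A hedged sketch of just that renumbering step, with plain arrays standing in for `bucketDocCount` and the bucket ords:

```java
import java.util.Arrays;

// Illustrates the mergeMap built above: pruned ordinals stay at -1, surviving ordinals
// are packed into consecutive new ordinals that sub-aggregations are merged down to.
public class MergeMapSketch {
    public static void main(String[] args) {
        long[] docCountPerOrd = {1, 5, 1, 2, 9}; // stand-in for bucketDocCount(ord)
        long maxDocCount = 2;                    // terms seen more often than this are not "rare"

        long[] mergeMap = new long[docCountPerOrd.length];
        Arrays.fill(mergeMap, -1);               // -1 marks ordinals to prune
        long next = 0;
        for (int ord = 0; ord < docCountPerOrd.length; ord++) {
            if (docCountPerOrd[ord] <= maxDocCount) {
                mergeMap[ord] = next++;          // keep: assign the next dense ordinal
            }                                    // else: the term would go into the cuckoo filter
        }
        System.out.println(Arrays.toString(mergeMap)); // [0, -1, 1, 2, -1]
    }
}
```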
 
     @Override
     public InternalAggregation buildEmptyAggregation() {
-        return new LongRareTerms(name, ORDER, metadata(), format, emptyList(), 0, filter);
+        return new LongRareTerms(name, ORDER, metadata(), format, emptyList(), 0, newFilter());
     }
 
     @Override