Skip to content

Commit 7e1a1fe

Browse files
author
Paul Sanwald
authored
auto-interval date histogram - 6.x backport (#32107)
Backport of auto-interval date histogram for 6.x: the material changes in this backport are around checking for an optional soft limit on max number of buckets, and logging a deprecation warning if that bucket limit isn't specified, and the bucket ceiling would have been tripped on a 7.x cluster.
1 parent 0e5e585 commit 7e1a1fe

21 files changed

+3401
-6
lines changed

client/rest-high-level/src/main/java/org/elasticsearch/client/RestHighLevelClient.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,10 @@
8585
import org.elasticsearch.search.aggregations.bucket.geogrid.ParsedGeoHashGrid;
8686
import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder;
8787
import org.elasticsearch.search.aggregations.bucket.global.ParsedGlobal;
88+
import org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregationBuilder;
8889
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
8990
import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder;
91+
import org.elasticsearch.search.aggregations.bucket.histogram.ParsedAutoDateHistogram;
9092
import org.elasticsearch.search.aggregations.bucket.histogram.ParsedDateHistogram;
9193
import org.elasticsearch.search.aggregations.bucket.histogram.ParsedHistogram;
9294
import org.elasticsearch.search.aggregations.bucket.missing.MissingAggregationBuilder;
@@ -1435,6 +1437,7 @@ static List<NamedXContentRegistry.Entry> getDefaultNamedXContents() {
14351437
map.put(GeoCentroidAggregationBuilder.NAME, (p, c) -> ParsedGeoCentroid.fromXContent(p, (String) c));
14361438
map.put(HistogramAggregationBuilder.NAME, (p, c) -> ParsedHistogram.fromXContent(p, (String) c));
14371439
map.put(DateHistogramAggregationBuilder.NAME, (p, c) -> ParsedDateHistogram.fromXContent(p, (String) c));
1440+
map.put(AutoDateHistogramAggregationBuilder.NAME, (p, c) -> ParsedAutoDateHistogram.fromXContent(p, (String) c));
14381441
map.put(StringTerms.NAME, (p, c) -> ParsedStringTerms.fromXContent(p, (String) c));
14391442
map.put(LongTerms.NAME, (p, c) -> ParsedLongTerms.fromXContent(p, (String) c));
14401443
map.put(DoubleTerms.NAME, (p, c) -> ParsedDoubleTerms.fromXContent(p, (String) c));

docs/reference/aggregations/bucket.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ setting named `search.max_buckets`. It is disabled by default (-1) but requests
1919

2020
include::bucket/adjacency-matrix-aggregation.asciidoc[]
2121

22+
include::bucket/autodatehistogram-aggregation.asciidoc[]
23+
2224
include::bucket/children-aggregation.asciidoc[]
2325

2426
include::bucket/composite-aggregation.asciidoc[]
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
[[search-aggregations-bucket-autodatehistogram-aggregation]]
2+
=== Auto-interval Date Histogram Aggregation
3+
4+
A multi-bucket aggregation similar to the <<search-aggregations-bucket-datehistogram-aggregation>> except
5+
instead of providing an interval to use as the width of each bucket, a target number of buckets is provided
6+
indicating the number of buckets needed and the interval of the buckets is automatically chosen to best achieve
7+
that target. The number of buckets returned will always be less than or equal to this target number.
8+
9+
The `buckets` field is optional, and will default to 10 buckets if not specified.
10+
11+
Requesting a target of 10 buckets.
12+
13+
[source,js]
14+
--------------------------------------------------
15+
POST /sales/_search?size=0
16+
{
17+
"aggs" : {
18+
"sales_over_time" : {
19+
"auto_date_histogram" : {
20+
"field" : "date",
21+
"buckets" : 10
22+
}
23+
}
24+
}
25+
}
26+
--------------------------------------------------
27+
// CONSOLE
28+
// TEST[setup:sales]
29+
30+
==== Keys
31+
32+
Internally, a date is represented as a 64 bit number representing a timestamp
33+
in milliseconds-since-the-epoch. These timestamps are returned as the bucket
34+
++key++s. The `key_as_string` is the same timestamp converted to a formatted
35+
date string using the format specified with the `format` parameter:
36+
37+
TIP: If no `format` is specified, then it will use the first date
38+
<<mapping-date-format,format>> specified in the field mapping.
39+
40+
[source,js]
41+
--------------------------------------------------
42+
POST /sales/_search?size=0
43+
{
44+
"aggs" : {
45+
"sales_over_time" : {
46+
"auto_date_histogram" : {
47+
"field" : "date",
48+
"buckets" : 5,
49+
"format" : "yyyy-MM-dd" <1>
50+
}
51+
}
52+
}
53+
}
54+
--------------------------------------------------
55+
// CONSOLE
56+
// TEST[setup:sales]
57+
58+
<1> Supports expressive date <<date-format-pattern,format pattern>>
59+
60+
Response:
61+
62+
[source,js]
63+
--------------------------------------------------
64+
{
65+
...
66+
"aggregations": {
67+
"sales_over_time": {
68+
"buckets": [
69+
{
70+
"key_as_string": "2015-01-01",
71+
"key": 1420070400000,
72+
"doc_count": 3
73+
},
74+
{
75+
"key_as_string": "2015-02-01",
76+
"key": 1422748800000,
77+
"doc_count": 2
78+
},
79+
{
80+
"key_as_string": "2015-03-01",
81+
"key": 1425168000000,
82+
"doc_count": 2
83+
}
84+
]
85+
}
86+
}
87+
}
88+
--------------------------------------------------
89+
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
90+
91+
==== Intervals
92+
93+
The interval of the returned buckets is selected based on the data collected by the
94+
aggregation so that the number of buckets returned is less than or equal to the number
95+
requested. The possible intervals returned are:
96+
97+
[horizontal]
98+
seconds:: In multiples of 1, 5, 10 and 30
99+
minutes:: In multiples of 1, 5, 10 and 30
100+
hours:: In multiples of 1, 3 and 12
101+
days:: In multiples of 1, and 7
102+
months:: In multiples of 1, and 3
103+
years:: In multiples of 1, 5, 10, 20, 50 and 100
104+
105+
In the worst case, where the number of daily buckets is too many for the requested
106+
number of buckets, the number of buckets returned will be 1/7th of the number of
107+
buckets requested.
108+
109+
==== Time Zone
110+
111+
Date-times are stored in Elasticsearch in UTC. By default, all bucketing and
112+
rounding is also done in UTC. The `time_zone` parameter can be used to indicate
113+
that bucketing should use a different time zone.
114+
115+
Time zones may either be specified as an ISO 8601 UTC offset (e.g. `+01:00` or
116+
`-08:00`) or as a timezone id, an identifier used in the TZ database like
117+
`America/Los_Angeles`.
118+
119+
Consider the following example:
120+
121+
[source,js]
122+
---------------------------------
123+
PUT my_index/log/1?refresh
124+
{
125+
"date": "2015-10-01T00:30:00Z"
126+
}
127+
128+
PUT my_index/log/2?refresh
129+
{
130+
"date": "2015-10-01T01:30:00Z"
131+
}
132+
133+
PUT my_index/log/3?refresh
134+
{
135+
"date": "2015-10-01T02:30:00Z"
136+
}
137+
138+
GET my_index/_search?size=0
139+
{
140+
"aggs": {
141+
"by_day": {
142+
"auto_date_histogram": {
143+
"field": "date",
144+
"buckets" : 3
145+
}
146+
}
147+
}
148+
}
149+
---------------------------------
150+
// CONSOLE
151+
152+
If no time zone is specified, UTC is used. Three 1-hour buckets are returned
153+
starting at midnight UTC on 1 October 2015:
154+
155+
[source,js]
156+
---------------------------------
157+
{
158+
...
159+
"aggregations": {
160+
"by_day": {
161+
"buckets": [
162+
{
163+
"key_as_string": "2015-10-01T00:00:00.000Z",
164+
"key": 1443657600000,
165+
"doc_count": 1
166+
},
167+
{
168+
"key_as_string": "2015-10-01T01:00:00.000Z",
169+
"key": 1443661200000,
170+
"doc_count": 1
171+
},
172+
{
173+
"key_as_string": "2015-10-01T02:00:00.000Z",
174+
"key": 1443664800000,
175+
"doc_count": 1
176+
}
177+
]
178+
}
179+
}
180+
}
181+
---------------------------------
182+
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
183+
184+
If a `time_zone` of `-01:00` is specified, then midnight starts at one hour before
185+
midnight UTC:
186+
187+
[source,js]
188+
---------------------------------
189+
GET my_index/_search?size=0
190+
{
191+
"aggs": {
192+
"by_day": {
193+
"auto_date_histogram": {
194+
"field": "date",
195+
"buckets" : 3,
196+
"time_zone": "-01:00"
197+
}
198+
}
199+
}
200+
}
201+
---------------------------------
202+
// CONSOLE
203+
// TEST[continued]
204+
205+
206+
Now three 1-hour buckets are still returned but the first bucket starts at
207+
11:00pm on 30 September 2015 since that is the local time for the bucket in
208+
the specified time zone.
209+
210+
[source,js]
211+
---------------------------------
212+
{
213+
...
214+
"aggregations": {
215+
"by_day": {
216+
"buckets": [
217+
{
218+
"key_as_string": "2015-09-30T23:00:00.000-01:00", <1>
219+
"key": 1443657600000,
220+
"doc_count": 1
221+
},
222+
{
223+
"key_as_string": "2015-10-01T00:00:00.000-01:00",
224+
"key": 1443661200000,
225+
"doc_count": 1
226+
},
227+
{
228+
"key_as_string": "2015-10-01T01:00:00.000-01:00",
229+
"key": 1443664800000,
230+
"doc_count": 1
231+
}
232+
]
233+
}
234+
}
235+
}
236+
---------------------------------
237+
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
238+
239+
<1> The `key_as_string` value represents the start of each hourly bucket
in the specified time zone.
241+
242+
WARNING: When using time zones that follow DST (daylight savings time) changes,
243+
buckets close to the moment when those changes happen can have slightly different
244+
sizes than neighbouring buckets.
245+
For example, consider a DST start in the `CET` time zone: on 27 March 2016 at 2am,
246+
clocks were turned forward 1 hour to 3am local time. If the result of the aggregation
247+
was daily buckets, the bucket covering that day will only hold data for 23 hours
248+
instead of the usual 24 hours for other buckets. The same is true for shorter intervals
249+
like e.g. 12h. Here, we will have only a 11h bucket on the morning of 27 March when the
250+
DST shift happens.
251+
252+
==== Scripts
253+
254+
Like with the normal <<search-aggregations-bucket-datehistogram-aggregation, `date_histogram`>>, both document level
255+
scripts and value level scripts are supported. This aggregation does not however, support the `min_doc_count`,
256+
`extended_bounds` and `order` parameters.
257+
258+
==== Missing value
259+
260+
The `missing` parameter defines how documents that are missing a value should be treated.
261+
By default they will be ignored but it is also possible to treat them as if they
262+
had a value.
263+
264+
[source,js]
265+
--------------------------------------------------
266+
POST /sales/_search?size=0
267+
{
268+
"aggs" : {
269+
"sale_date" : {
270+
"auto_date_histogram" : {
271+
"field" : "date",
272+
"buckets": 10,
273+
"missing": "2000/01/01" <1>
274+
}
275+
}
276+
}
277+
}
278+
--------------------------------------------------
279+
// CONSOLE
280+
// TEST[setup:sales]
281+
282+
<1> Documents without a value in the `date` field will fall into the same bucket as documents that have the value `2000-01-01`.
283+

server/src/main/java/org/elasticsearch/search/SearchModule.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,10 @@
109109
import org.elasticsearch.search.aggregations.bucket.geogrid.InternalGeoHashGrid;
110110
import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder;
111111
import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal;
112+
import org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregationBuilder;
112113
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
113114
import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder;
115+
import org.elasticsearch.search.aggregations.bucket.histogram.InternalAutoDateHistogram;
114116
import org.elasticsearch.search.aggregations.bucket.histogram.InternalDateHistogram;
115117
import org.elasticsearch.search.aggregations.bucket.histogram.InternalHistogram;
116118
import org.elasticsearch.search.aggregations.bucket.missing.InternalMissing;
@@ -400,6 +402,8 @@ private void registerAggregations(List<SearchPlugin> plugins) {
400402
HistogramAggregationBuilder::parse).addResultReader(InternalHistogram::new));
401403
registerAggregation(new AggregationSpec(DateHistogramAggregationBuilder.NAME, DateHistogramAggregationBuilder::new,
402404
DateHistogramAggregationBuilder::parse).addResultReader(InternalDateHistogram::new));
405+
registerAggregation(new AggregationSpec(AutoDateHistogramAggregationBuilder.NAME, AutoDateHistogramAggregationBuilder::new,
406+
AutoDateHistogramAggregationBuilder::parse).addResultReader(InternalAutoDateHistogram::new));
403407
registerAggregation(new AggregationSpec(GeoDistanceAggregationBuilder.NAME, GeoDistanceAggregationBuilder::new,
404408
GeoDistanceAggregationBuilder::parse).addResultReader(InternalGeoDistance::new));
405409
registerAggregation(new AggregationSpec(GeoGridAggregationBuilder.NAME, GeoGridAggregationBuilder::new,

server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,19 @@ public final void collectExistingBucket(LeafBucketCollector subCollector, int do
8484
subCollector.collect(doc, bucketOrd);
8585
}
8686

87+
public final void mergeBuckets(long[] mergeMap, long newNumBuckets) {
88+
try (IntArray oldDocCounts = docCounts) {
89+
docCounts = bigArrays.newIntArray(newNumBuckets, true);
90+
docCounts.fill(0, newNumBuckets, 0);
91+
for (int i = 0; i < oldDocCounts.size(); i++) {
92+
int docCount = oldDocCounts.get(i);
93+
if (docCount != 0) {
94+
docCounts.increment(mergeMap[i], docCount);
95+
}
96+
}
97+
}
98+
}
99+
87100
public IntArray getDocCounts() {
88101
return docCounts;
89102
}

0 commit comments

Comments
 (0)