Skip to content

Commit 75a7daa

Browse files
committed
SQL: use calendar interval of 1y instead of fixed interval for grouping by YEAR and HISTOGRAMs (#47558)
(cherry picked from commit 55f5463)
1 parent 54c2aec commit 75a7daa

File tree

9 files changed

+180
-130
lines changed

9 files changed

+180
-130
lines changed

docs/reference/sql/functions/grouping.asciidoc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@ the multiple of a day. E.g.: for `HISTOGRAM(CAST(birth_date AS DATE), INTERVAL '
8686
actually used will be `INTERVAL '2' DAY`. If the interval specified is less than 1 day, e.g.:
8787
`HISTOGRAM(CAST(birth_date AS DATE), INTERVAL '20' HOUR)` then the interval used will be `INTERVAL '1' DAY`.
8888

89+
[IMPORTANT]
90+
All intervals specified for a date/time HISTOGRAM will use a <<search-aggregations-bucket-datehistogram-aggregation,fixed interval>>
91+
in their `date_histogram` aggregation definition, with the notable exception of `INTERVAL '1' YEAR` where a calendar interval is used.
92+
A calendar interval was chosen because it gives a more intuitive result for YEAR groupings. Calendar intervals consider a one year
93+
bucket as the one starting on January 1st of that specific year, whereas a fixed interval one-year bucket considers one year as a number
94+
of milliseconds (for example, `31536000000ms` corresponding to 365 days, 24 hours per day, 60 minutes per hour etc.). With fixed intervals,
95+
the day of February 5th, 2019 for example, belongs to a bucket that starts on December 20th, 2018 and {es} (and implicitly {es-sql}) would
96+
have returned the year 2018 for a date that's actually in 2019. With a calendar interval this behavior is more intuitive, with the day of
97+
February 5th, 2019 actually belonging to the 2019 year bucket.
98+
8999
[IMPORTANT]
90100
Histogram in SQL cannot be applied on **TIME** type.
91101
E.g.: `HISTOGRAM(CAST(birth_date AS TIME), INTERVAL '10' MINUTES)` is currently not supported.

x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec

Lines changed: 51 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -273,47 +273,46 @@ histogramDateTime
273273
schema::h:ts|c:l
274274
SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp GROUP BY h;
275275

276-
h | c
277-
--------------------+---------------
278-
null |10
279-
1951-04-11T00:00:00Z|1
280-
1952-04-05T00:00:00Z|10
281-
1953-03-31T00:00:00Z|10
282-
1954-03-26T00:00:00Z|7
283-
1955-03-21T00:00:00Z|4
284-
1956-03-15T00:00:00Z|4
285-
1957-03-10T00:00:00Z|6
286-
1958-03-05T00:00:00Z|6
287-
1959-02-28T00:00:00Z|9
288-
1960-02-23T00:00:00Z|7
289-
1961-02-17T00:00:00Z|8
290-
1962-02-12T00:00:00Z|6
291-
1963-02-07T00:00:00Z|7
292-
1964-02-02T00:00:00Z|5
293-
276+
h | c
277+
------------------------+---------------
278+
null |10
279+
1952-01-01T00:00:00.000Z|8
280+
1953-01-01T00:00:00.000Z|11
281+
1954-01-01T00:00:00.000Z|8
282+
1955-01-01T00:00:00.000Z|4
283+
1956-01-01T00:00:00.000Z|5
284+
1957-01-01T00:00:00.000Z|4
285+
1958-01-01T00:00:00.000Z|7
286+
1959-01-01T00:00:00.000Z|9
287+
1960-01-01T00:00:00.000Z|8
288+
1961-01-01T00:00:00.000Z|8
289+
1962-01-01T00:00:00.000Z|6
290+
1963-01-01T00:00:00.000Z|7
291+
1964-01-01T00:00:00.000Z|4
292+
1965-01-01T00:00:00.000Z|1
294293
;
295294

296295
histogramDateTimeWithCountAndOrder
297296
schema::h:ts|c:l
298297
SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
299298

300-
h | c
301-
--------------------+---------------
302-
1964-02-02T00:00:00Z|5
303-
1963-02-07T00:00:00Z|7
304-
1962-02-12T00:00:00Z|6
305-
1961-02-17T00:00:00Z|8
306-
1960-02-23T00:00:00Z|7
307-
1959-02-28T00:00:00Z|9
308-
1958-03-05T00:00:00Z|6
309-
1957-03-10T00:00:00Z|6
310-
1956-03-15T00:00:00Z|4
311-
1955-03-21T00:00:00Z|4
312-
1954-03-26T00:00:00Z|7
313-
1953-03-31T00:00:00Z|10
314-
1952-04-05T00:00:00Z|10
315-
1951-04-11T00:00:00Z|1
316-
null |10
299+
h | c
300+
------------------------+---------------
301+
1965-01-01T00:00:00.000Z|1
302+
1964-01-01T00:00:00.000Z|4
303+
1963-01-01T00:00:00.000Z|7
304+
1962-01-01T00:00:00.000Z|6
305+
1961-01-01T00:00:00.000Z|8
306+
1960-01-01T00:00:00.000Z|8
307+
1959-01-01T00:00:00.000Z|9
308+
1958-01-01T00:00:00.000Z|7
309+
1957-01-01T00:00:00.000Z|4
310+
1956-01-01T00:00:00.000Z|5
311+
1955-01-01T00:00:00.000Z|4
312+
1954-01-01T00:00:00.000Z|8
313+
1953-01-01T00:00:00.000Z|11
314+
1952-01-01T00:00:00.000Z|8
315+
null |10
317316
;
318317

319318
histogramDateTimeWithMonthOnTop
@@ -369,23 +368,23 @@ histogramGroupByWithoutAlias
369368
schema::h:ts|c:l
370369
SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp GROUP BY HISTOGRAM(birth_date, INTERVAL 1 YEAR) ORDER BY h DESC;
371370

372-
h | c
373-
--------------------+---------------
374-
1964-02-02T00:00:00Z|5
375-
1963-02-07T00:00:00Z|7
376-
1962-02-12T00:00:00Z|6
377-
1961-02-17T00:00:00Z|8
378-
1960-02-23T00:00:00Z|7
379-
1959-02-28T00:00:00Z|9
380-
1958-03-05T00:00:00Z|6
381-
1957-03-10T00:00:00Z|6
382-
1956-03-15T00:00:00Z|4
383-
1955-03-21T00:00:00Z|4
384-
1954-03-26T00:00:00Z|7
385-
1953-03-31T00:00:00Z|10
386-
1952-04-05T00:00:00Z|10
387-
1951-04-11T00:00:00Z|1
388-
null |10
371+
h | c
372+
------------------------+---------------
373+
1965-01-01T00:00:00.000Z|1
374+
1964-01-01T00:00:00.000Z|4
375+
1963-01-01T00:00:00.000Z|7
376+
1962-01-01T00:00:00.000Z|6
377+
1961-01-01T00:00:00.000Z|8
378+
1960-01-01T00:00:00.000Z|8
379+
1959-01-01T00:00:00.000Z|9
380+
1958-01-01T00:00:00.000Z|7
381+
1957-01-01T00:00:00.000Z|4
382+
1956-01-01T00:00:00.000Z|5
383+
1955-01-01T00:00:00.000Z|4
384+
1954-01-01T00:00:00.000Z|8
385+
1953-01-01T00:00:00.000Z|11
386+
1952-01-01T00:00:00.000Z|8
387+
null |10
389388
;
390389

391390
countAll

x-pack/plugin/sql/qa/src/main/resources/docs/docs.csv-spec

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -811,23 +811,23 @@ schema::h:ts|c:l
811811
SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) AS c FROM emp GROUP BY h;
812812

813813

814-
h | c
815-
--------------------+---------------
816-
null |10
817-
1951-04-11T00:00:00Z|1
818-
1952-04-05T00:00:00Z|10
819-
1953-03-31T00:00:00Z|10
820-
1954-03-26T00:00:00Z|7
821-
1955-03-21T00:00:00Z|4
822-
1956-03-15T00:00:00Z|4
823-
1957-03-10T00:00:00Z|6
824-
1958-03-05T00:00:00Z|6
825-
1959-02-28T00:00:00Z|9
826-
1960-02-23T00:00:00Z|7
827-
1961-02-17T00:00:00Z|8
828-
1962-02-12T00:00:00Z|6
829-
1963-02-07T00:00:00Z|7
830-
1964-02-02T00:00:00Z|5
814+
h | c
815+
------------------------+---------------
816+
null |10
817+
1952-01-01T00:00:00.000Z|8
818+
1953-01-01T00:00:00.000Z|11
819+
1954-01-01T00:00:00.000Z|8
820+
1955-01-01T00:00:00.000Z|4
821+
1956-01-01T00:00:00.000Z|5
822+
1957-01-01T00:00:00.000Z|4
823+
1958-01-01T00:00:00.000Z|7
824+
1959-01-01T00:00:00.000Z|9
825+
1960-01-01T00:00:00.000Z|8
826+
1961-01-01T00:00:00.000Z|8
827+
1962-01-01T00:00:00.000Z|6
828+
1963-01-01T00:00:00.000Z|7
829+
1964-01-01T00:00:00.000Z|4
830+
1965-01-01T00:00:00.000Z|1
831831

832832
// end::histogramDateTime
833833
;

x-pack/plugin/sql/qa/src/main/resources/math.csv-spec

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -101,31 +101,31 @@ SELECT MIN(salary) mi, MAX(salary) ma, YEAR(hire_date) year, ROUND(AVG(languages
101101

102102
mi:i | ma:i | year:i |ROUND(AVG(languages), 1):d|TRUNCATE(AVG(languages), 1):d| COUNT(*):l
103103
---------------+---------------+---------------+--------------------------+-----------------------------+---------------
104-
25324 |70011 |1986 |3.0 |3.0 |15
105-
25945 |73578 |1987 |2.9 |2.8 |9
106-
25976 |74970 |1988 |3.0 |3.0 |13
107-
31120 |71165 |1989 |3.1 |3.0 |12
108-
30404 |58715 |1992 |3.0 |3.0 |3
109-
35742 |67492 |1993 |2.8 |2.7 |4
110-
45656 |45656 |1995 |3.0 |3.0 |1
104+
25324 |70011 |1987 |3.0 |3.0 |15
105+
25945 |73578 |1988 |2.9 |2.8 |9
106+
25976 |74970 |1989 |3.0 |3.0 |13
107+
31120 |71165 |1990 |3.1 |3.0 |12
108+
30404 |58715 |1993 |3.0 |3.0 |3
109+
35742 |67492 |1994 |2.8 |2.7 |4
110+
45656 |45656 |1996 |3.0 |3.0 |1
111111
;
112112

113113
minMaxRoundWithHavingRound
114114
SELECT MIN(salary) mi, MAX(salary) ma, YEAR(hire_date) year, ROUND(AVG(languages),1), COUNT(*) FROM test_emp GROUP BY YEAR(hire_date) HAVING ROUND(AVG(languages), 1) > 2.5 ORDER BY YEAR(hire_date);
115115

116116
mi:i | ma:i | year:i |ROUND(AVG(languages),1):d| COUNT(*):l
117117
---------------+---------------+---------------+-------------------------+---------------
118-
26436 |74999 |1984 |3.1 |11
119-
31897 |61805 |1985 |3.5 |11
120-
25324 |70011 |1986 |3.0 |15
121-
25945 |73578 |1987 |2.9 |9
122-
25976 |74970 |1988 |3.0 |13
123-
31120 |71165 |1989 |3.1 |12
124-
32568 |65030 |1990 |3.3 |6
125-
27215 |60781 |1991 |4.1 |8
126-
30404 |58715 |1992 |3.0 |3
127-
35742 |67492 |1993 |2.8 |4
128-
45656 |45656 |1995 |3.0 |1
118+
26436 |74999 |1985 |3.1 |11
119+
31897 |61805 |1986 |3.5 |11
120+
25324 |70011 |1987 |3.0 |15
121+
25945 |73578 |1988 |2.9 |9
122+
25976 |74970 |1989 |3.0 |13
123+
31120 |71165 |1990 |3.1 |12
124+
32568 |65030 |1991 |3.3 |6
125+
27215 |60781 |1992 |4.1 |8
126+
30404 |58715 |1993 |3.0 |3
127+
35742 |67492 |1994 |2.8 |4
128+
45656 |45656 |1996 |3.0 |1
129129
;
130130

131131
groupByAndOrderByTruncateWithPositiveParameter

x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/function/scalar/datetime/DateTimeHistogramFunction.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,13 @@ public abstract class DateTimeHistogramFunction extends DateTimeFunction {
2222
}
2323

2424
/**
25-
* used for aggregration (date histogram)
25+
* used for aggregation (date histogram)
2626
*/
27-
public abstract long interval();
27+
public long fixedInterval() {
28+
return -1;
29+
}
30+
31+
public String calendarInterval() {
32+
return null;
33+
}
2834
}

x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/function/scalar/datetime/Year.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,20 @@
55
*/
66
package org.elasticsearch.xpack.sql.expression.function.scalar.datetime;
77

8+
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
89
import org.elasticsearch.xpack.sql.expression.Expression;
910
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeProcessor.DateTimeExtractor;
10-
import org.elasticsearch.xpack.sql.tree.Source;
1111
import org.elasticsearch.xpack.sql.tree.NodeInfo.NodeCtor2;
12+
import org.elasticsearch.xpack.sql.tree.Source;
1213

1314
import java.time.ZoneId;
14-
import java.util.concurrent.TimeUnit;
1515

1616
/**
1717
* Extract the year from a datetime.
1818
*/
1919
public class Year extends DateTimeHistogramFunction {
20-
21-
private static long YEAR_IN_MILLIS = TimeUnit.DAYS.toMillis(1) * 365L;
20+
21+
public static String YEAR_INTERVAL = DateHistogramInterval.YEAR.toString();
2222

2323
public Year(Source source, Expression field, ZoneId zoneId) {
2424
super(source, field, zoneId, DateTimeExtractor.YEAR);
@@ -45,7 +45,7 @@ public Expression orderBy() {
4545
}
4646

4747
@Override
48-
public long interval() {
49-
return YEAR_IN_MILLIS;
48+
public String calendarInterval() {
49+
return YEAR_INTERVAL;
5050
}
5151
}

x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/planner/QueryTranslator.java

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,11 @@
4141
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
4242
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeFunction;
4343
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeHistogramFunction;
44+
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.Year;
4445
import org.elasticsearch.xpack.sql.expression.function.scalar.geo.GeoShape;
4546
import org.elasticsearch.xpack.sql.expression.function.scalar.geo.StDistance;
4647
import org.elasticsearch.xpack.sql.expression.gen.script.ScriptTemplate;
48+
import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
4749
import org.elasticsearch.xpack.sql.expression.literal.Intervals;
4850
import org.elasticsearch.xpack.sql.expression.predicate.Range;
4951
import org.elasticsearch.xpack.sql.expression.predicate.fulltext.MatchQueryPredicate;
@@ -109,6 +111,7 @@
109111
import org.elasticsearch.xpack.sql.util.ReflectionUtils;
110112

111113
import java.time.OffsetTime;
114+
import java.time.Period;
112115
import java.time.ZonedDateTime;
113116
import java.util.Arrays;
114117
import java.util.LinkedHashMap;
@@ -279,7 +282,11 @@ static GroupingContext groupBy(List<? extends Expression> groupings) {
279282
// dates are handled differently because of date histograms
280283
if (exp instanceof DateTimeHistogramFunction) {
281284
DateTimeHistogramFunction dthf = (DateTimeHistogramFunction) exp;
282-
key = new GroupByDateHistogram(aggId, nameOf(exp), dthf.interval(), dthf.zoneId());
285+
if (dthf.calendarInterval() != null) {
286+
key = new GroupByDateHistogram(aggId, nameOf(exp), dthf.calendarInterval(), dthf.zoneId());
287+
} else {
288+
key = new GroupByDateHistogram(aggId, nameOf(exp), dthf.fixedInterval(), dthf.zoneId());
289+
}
283290
}
284291
// all other scalar functions become a script
285292
else if (exp instanceof ScalarFunction) {
@@ -294,19 +301,33 @@ else if (exp instanceof GroupingFunction) {
294301

295302
// date histogram
296303
if (h.dataType().isDateBased()) {
297-
long intervalAsMillis = Intervals.inMillis(h.interval());
298-
299-
// When the histogram in SQL is applied on DATE type instead of DATETIME, the interval
300-
// specified is truncated to the multiple of a day. If the interval specified is less
301-
// than 1 day, then the interval used will be `INTERVAL '1' DAY`.
302-
if (h.dataType() == DATE) {
303-
intervalAsMillis = DateUtils.minDayInterval(intervalAsMillis);
304-
}
305-
306-
if (field instanceof FieldAttribute) {
307-
key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId());
308-
} else if (field instanceof Function) {
309-
key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId());
304+
Object value = h.interval().value();
305+
if (value instanceof IntervalYearMonth
306+
&& ((IntervalYearMonth) value).interval().equals(Period.of(1, 0, 0))) {
307+
String calendarInterval = Year.YEAR_INTERVAL;
308+
309+
// When the histogram is `INTERVAL '1' YEAR`, the interval used in the ES date_histogram will be
310+
// a calendar_interval with value "1y". All other intervals will be fixed_intervals expressed in ms.
311+
if (field instanceof FieldAttribute) {
312+
key = new GroupByDateHistogram(aggId, nameOf(field), calendarInterval, h.zoneId());
313+
} else if (field instanceof Function) {
314+
key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), calendarInterval, h.zoneId());
315+
}
316+
} else {
317+
long intervalAsMillis = Intervals.inMillis(h.interval());
318+
319+
// When the histogram in SQL is applied on DATE type instead of DATETIME, the interval
320+
// specified is truncated to the multiple of a day. If the interval specified is less
321+
// than 1 day, then the interval used will be `INTERVAL '1' DAY`.
322+
if (h.dataType() == DATE) {
323+
intervalAsMillis = DateUtils.minDayInterval(intervalAsMillis);
324+
}
325+
326+
if (field instanceof FieldAttribute) {
327+
key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId());
328+
} else if (field instanceof Function) {
329+
key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId());
330+
}
310331
}
311332
}
312333
// numeric histogram

0 commit comments

Comments
 (0)