Skip to content

Commit 1be7ba1

Browse files
committed
SQL: Implement FIRST/LAST aggregate functions
FIRST and LAST can be used with one argument and work similarly to MIN and MAX but they are implemented using a Top Hits aggregation and therefore can also operate on keyword fields. When a second argument is provided then they return the first/last value of the first arg when its values are ordered ascending/descending (respectively) by the values of the second argument. Currently because of the usage of a Top Hits aggregation FIRST and LAST cannot be used in the HAVING clause of a GROUP BY query to filter on the results of the aggregation. Closes: elastic#35639
1 parent 9ceb218 commit 1be7ba1

File tree

29 files changed

+937
-48
lines changed

29 files changed

+937
-48
lines changed

docs/reference/sql/functions/aggs.asciidoc

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,129 @@ Returns the total number of _distinct non-null_ values in input values.
113113
include-tagged::{sql-specs}/docs.csv-spec[aggCountDistinct]
114114
--------------------------------------------------
115115

116+
[[sql-functions-aggs-first]]
117+
===== `FIRST/FIRST_VALUE`
118+
119+
.Synopsis:
120+
[source, sql]
121+
--------------------------------------------------
122+
FIRST(field_name<1>, sort_by_field_name<2>)
123+
--------------------------------------------------
124+
125+
*Input*:
126+
127+
<1> a field name
128+
<2> a field name; optional
129+
130+
*Output*: same type as the input
131+
132+
.Description:
133+
134+
When only one argument is provided it returns the first **non-NULL** value across input values in the field
135+
`field_name`. It will return **NULL** only if all values in `field_name` are null. When a second argument
136+
is provided then it returns the first **non-NULL** value across input values in the field `field_name` ordered
137+
ascending by the **non-NULL** values of `sort_by_field_name`. E.g.:
138+
139+
[cols="<,<"]
140+
|===
141+
s|a|b
142+
| 100 | 1
143+
| 200 | 1
144+
| 1 | 2
145+
| 2 | 2
146+
| 10 | null
147+
| 20 | null
148+
149+
|===
150+
151+
[source, sql]
152+
-------------------------
153+
SELECT FIRST(a, b) FROM t
154+
-------------------------
155+
156+
will result in:
157+
[cols="<"]
158+
|===
159+
s|FIRST(a, b)
160+
| 100
161+
|===
162+
163+
164+
["source","sql",subs="attributes,macros"]
165+
-----------------------------------------------------------
166+
include-tagged::{sql-specs}/docs.csv-spec[firstWithOneArg]
167+
-----------------------------------------------------------
168+
169+
["source","sql",subs="attributes,macros"]
170+
-----------------------------------------------------------
171+
include-tagged::{sql-specs}/docs.csv-spec[firstWithTwoArgs]
172+
-----------------------------------------------------------
173+
174+
[NOTE]
175+
`FIRST` cannot be used to create a filter in a `HAVING` clause of a `GROUP BY` query.
176+
177+
[[sql-functions-aggs-last]]
178+
===== `LAST/LAST_VALUE`
179+
180+
.Synopsis:
181+
[source, sql]
182+
--------------------------------------------------
183+
LAST(field_name<1>, sort_by_field_name<2>)
184+
--------------------------------------------------
185+
186+
*Input*:
187+
188+
<1> a field name
189+
<2> a field name; optional
190+
191+
*Output*: same type as the input
192+
193+
.Description:
194+
195+
It's the inverse of <<sql-functions-aggs-first>>. When only one argument is provided it returns the
196+
last **non-NULL** value across input values in the field `field_name`. It will return **NULL** only if
197+
all values in `field_name` are null. When a second argument is provided then it returns the last
198+
**non-NULL** value across input values in the field `field_name` ordered descending by the **non-NULL**
199+
values of `sort_by_field_name`. E.g.:
200+
201+
[cols="<,<"]
202+
|===
203+
s|a|b
204+
| 10 | 1
205+
| 20 | 1
206+
| 1 | 2
207+
| 2 | 2
208+
| 100 | null
209+
| 200 | null
210+
|===
211+
212+
[source, sql]
213+
------------------------
214+
SELECT LAST(a, b) FROM t
215+
------------------------
216+
217+
will result in:
218+
[cols="<"]
219+
|===
220+
s|LAST(a, b)
221+
| 2
222+
|===
223+
224+
225+
["source","sql",subs="attributes,macros"]
226+
-----------------------------------------------------------
227+
include-tagged::{sql-specs}/docs.csv-spec[lastWithOneArg]
228+
-----------------------------------------------------------
229+
230+
["source","sql",subs="attributes,macros"]
231+
-----------------------------------------------------------
232+
include-tagged::{sql-specs}/docs.csv-spec[lastWithTwoArgs]
233+
-----------------------------------------------------------
234+
235+
[NOTE]
236+
`LAST` cannot be used to create a filter in a `HAVING` clause of a `GROUP BY` query.
237+
238+
116239
[[sql-functions-aggs-max]]
117240
===== `MAX`
118241

@@ -161,6 +284,9 @@ Returns the minimum value across input values in the field `field_name`.
161284
include-tagged::{sql-specs}/docs.csv-spec[aggMin]
162285
--------------------------------------------------
163286

287+
[NOTE]
288+
`MIN` on a field of type <<text, `text`>> or <<keyword, `keyword`>> is translated into <<sql-functions-aggs-first>>.
289+
164290
[[sql-functions-aggs-sum]]
165291
===== `SUM`
166292

x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/cli/ShowTestCase.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ public void testShowFunctions() throws IOException {
3131
assertThat(readLine(), containsString(HEADER_SEPARATOR));
3232
assertThat(readLine(), RegexMatcher.matches("\\s*AVG\\s*\\|\\s*AGGREGATE\\s*"));
3333
assertThat(readLine(), RegexMatcher.matches("\\s*COUNT\\s*\\|\\s*AGGREGATE\\s*"));
34+
assertThat(readLine(), RegexMatcher.matches("\\s*FIRST\\s*\\|\\s*AGGREGATE\\s*"));
35+
assertThat(readLine(), RegexMatcher.matches("\\s*FIRST_VALUE\\s*\\|\\s*AGGREGATE\\s*"));
36+
assertThat(readLine(), RegexMatcher.matches("\\s*LAST\\s*\\|\\s*AGGREGATE\\s*"));
37+
assertThat(readLine(), RegexMatcher.matches("\\s*LAST_VALUE\\s*\\|\\s*AGGREGATE\\s*"));
3438
assertThat(readLine(), RegexMatcher.matches("\\s*MAX\\s*\\|\\s*AGGREGATE\\s*"));
3539
assertThat(readLine(), RegexMatcher.matches("\\s*MIN\\s*\\|\\s*AGGREGATE\\s*"));
3640
String line = readLine();
@@ -58,6 +62,8 @@ public void testShowFunctions() throws IOException {
5862
public void testShowFunctionsLikePrefix() throws IOException {
5963
assertThat(command("SHOW FUNCTIONS LIKE 'L%'"), RegexMatcher.matches("\\s*name\\s*\\|\\s*type\\s*"));
6064
assertThat(readLine(), containsString(HEADER_SEPARATOR));
65+
assertThat(readLine(), RegexMatcher.matches("\\s*LAST\\s*\\|\\s*AGGREGATE\\s*"));
66+
assertThat(readLine(), RegexMatcher.matches("\\s*LAST_VALUE\\s*\\|\\s*AGGREGATE\\s*"));
6167
assertThat(readLine(), RegexMatcher.matches("\\s*LEAST\\s*\\|\\s*CONDITIONAL\\s*"));
6268
assertThat(readLine(), RegexMatcher.matches("\\s*LOG\\s*\\|\\s*SCALAR\\s*"));
6369
assertThat(readLine(), RegexMatcher.matches("\\s*LOG10\\s*\\|\\s*SCALAR\\s*"));

x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,3 +373,36 @@ SELECT COUNT(ALL last_name)=COUNT(ALL first_name) AS areEqual, COUNT(ALL first_n
373373
---------------+---------------+---------------
374374
false |90 |100
375375
;
376+
377+
topHitsWithOneArgAndGroupBy
378+
schema::gender:s|first:s|last:s
379+
SELECT gender, FIRST(first_name) as first, LAST(first_name) as last FROM test_emp GROUP BY gender ORDER BY gender;
380+
381+
gender | first | last
382+
---------------+---------------+------------
383+
null | Berni | Patricio
384+
F | Alejandro | Xinglin
385+
M | Amabile | Zvonko
386+
;
387+
388+
topHitsWithTwoArgsAndGroupBy
389+
schema::gender:s|first:s|last:s
390+
SELECT gender, FIRST(first_name, birth_date) as first, LAST(first_name, birth_date) as last FROM test_emp GROUP BY gender ORDER BY gender;
391+
392+
gender | first | last
393+
---------------+---------------+-----------------
394+
null | Lillian | Eberhardt
395+
F | Sumant | Valdiodio
396+
M | Remzi | Hilari
397+
;
398+
399+
topHitsOnDatetime
400+
schema::gender:s|first:i|last:i
401+
SELECT gender, month(first(birth_date, languages)) first, month(last(birth_date, languages)) last FROM test_emp GROUP BY gender ORDER BY gender;
402+
403+
gender | first | last
404+
---------------+---------------+---------------
405+
null | 1 | 10
406+
F | 4 | 6
407+
M | 1 | 4
408+
;

x-pack/plugin/sql/qa/src/main/resources/command.csv-spec

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@ SHOW FUNCTIONS;
88

99
name:s | type:s
1010
AVG |AGGREGATE
11-
COUNT |AGGREGATE
12-
MAX |AGGREGATE
11+
COUNT |AGGREGATE
12+
FIRST |AGGREGATE
13+
FIRST_VALUE |AGGREGATE
14+
LAST |AGGREGATE
15+
LAST_VALUE |AGGREGATE
16+
MAX |AGGREGATE
1317
MIN |AGGREGATE
1418
SUM |AGGREGATE
1519
KURTOSIS |AGGREGATE

x-pack/plugin/sql/qa/src/main/resources/docs.csv-spec

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,11 @@ SHOW FUNCTIONS;
185185
name | type
186186
-----------------+---------------
187187
AVG |AGGREGATE
188-
COUNT |AGGREGATE
188+
COUNT |AGGREGATE
189+
FIRST |AGGREGATE
190+
FIRST_VALUE |AGGREGATE
191+
LAST |AGGREGATE
192+
LAST_VALUE |AGGREGATE
189193
MAX |AGGREGATE
190194
MIN |AGGREGATE
191195
SUM |AGGREGATE
@@ -699,6 +703,8 @@ SELECT MIN(salary) AS min, MAX(salary) AS max FROM emp HAVING min > 25000;
699703
// end::groupByHavingImplicitNoMatch
700704
//;
701705

706+
707+
702708
///////////////////////////////
703709
//
704710
// Grouping
@@ -998,6 +1004,55 @@ SELECT COUNT(DISTINCT hire_date) unique_hires, COUNT(hire_date) AS hires FROM em
9981004
// end::aggCountDistinct
9991005
;
10001006

1007+
firstWithOneArg
1008+
schema::FIRST(first_name):s
1009+
// tag::firstWithOneArg
1010+
SELECT FIRST(first_name) FROM emp;
1011+
1012+
FIRST(first_name)
1013+
-----------------
1014+
Alejandro
1015+
1016+
// end::firstWithOneArg
1017+
;
1018+
1019+
firstWithTwoArgs
1020+
schema::FIRST(first_name, birth_date):s
1021+
// tag::firstWithTwoArgs
1022+
SELECT FIRST(first_name, birth_date) FROM emp;
1023+
1024+
FIRST(first_name, birth_date)
1025+
-----------------------------
1026+
Remzi
1027+
1028+
// end::firstWithTwoArgs
1029+
;
1030+
1031+
lastWithOneArg
1032+
schema::LAST(first_name):s
1033+
// tag::lastWithOneArg
1034+
SELECT LAST(first_name) FROM emp;
1035+
1036+
LAST(first_name)
1037+
----------------
1038+
Zvonko
1039+
1040+
// end::lastWithOneArg
1041+
;
1042+
1043+
1044+
lastWithTwoArgs
1045+
schema::LAST(first_name, birth_date):s
1046+
// tag::lastWithTwoArgs
1047+
SELECT LAST(first_name, birth_date) FROM emp;
1048+
1049+
LAST(first_name, birth_date)
1050+
----------------------------
1051+
Hilari
1052+
1053+
// end::lastWithTwoArgs
1054+
;
1055+
10011056
aggMax
10021057
// tag::aggMax
10031058
SELECT MAX(salary) AS max FROM emp;

x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/analysis/analyzer/Verifier.java

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.elasticsearch.xpack.sql.expression.function.Functions;
2020
import org.elasticsearch.xpack.sql.expression.function.Score;
2121
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
22+
import org.elasticsearch.xpack.sql.expression.function.aggregate.TopHits;
2223
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
2324
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
2425
import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
@@ -366,16 +367,26 @@ private static boolean checkGroupByHaving(LogicalPlan p, Set<Failure> localFailu
366367
if (f.child() instanceof Aggregate) {
367368
Aggregate a = (Aggregate) f.child();
368369

369-
Map<Expression, Node<?>> missing = new LinkedHashMap<>();
370+
Set<Expression> missing = new LinkedHashSet<>();
371+
Set<Expression> unsupported = new LinkedHashSet<>();
370372
Expression condition = f.condition();
371373
// variation of checkGroupMatch customized for HAVING, which requires just aggregations
372-
condition.collectFirstChildren(c -> checkGroupByHavingHasOnlyAggs(c, condition, missing, functions));
374+
condition.collectFirstChildren(c -> checkGroupByHavingHasOnlyAggs(c, missing, unsupported, functions));
373375

374376
if (!missing.isEmpty()) {
375377
String plural = missing.size() > 1 ? "s" : StringUtils.EMPTY;
376378
localFailures.add(
377379
fail(condition, "Cannot use HAVING filter on non-aggregate" + plural + " %s; use WHERE instead",
378-
Expressions.names(missing.keySet())));
380+
Expressions.names(missing)));
381+
groupingFailures.add(a);
382+
return false;
383+
}
384+
385+
if (!unsupported.isEmpty()) {
386+
String plural = unsupported.size() > 1 ? "s" : StringUtils.EMPTY;
387+
localFailures.add(
388+
fail(condition, "HAVING filter is unsupported for function" + plural + " %s",
389+
Expressions.names(unsupported)));
379390
groupingFailures.add(a);
380391
return false;
381392
}
@@ -385,8 +396,8 @@ private static boolean checkGroupByHaving(LogicalPlan p, Set<Failure> localFailu
385396
}
386397

387398

388-
private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Node<?> source,
389-
Map<Expression, Node<?>> missing, Map<String, Function> functions) {
399+
private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Set<Expression> missing,
400+
Set<Expression> unsupported, Map<String, Function> functions) {
390401

391402
// resolve FunctionAttribute to backing functions
392403
if (e instanceof FunctionAttribute) {
@@ -407,13 +418,17 @@ private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Node<?> sourc
407418

408419
// unwrap function to find the base
409420
for (Expression arg : sf.arguments()) {
410-
arg.collectFirstChildren(c -> checkGroupByHavingHasOnlyAggs(c, source, missing, functions));
421+
arg.collectFirstChildren(c -> checkGroupByHavingHasOnlyAggs(c, missing, unsupported, functions));
411422
}
412423
return true;
413424

414425
} else if (e instanceof Score) {
415-
// Score can't be used for having
416-
missing.put(e, source);
426+
// Score can't be used in having
427+
unsupported.add(e);
428+
return true;
429+
} else if (e instanceof TopHits) {
430+
// First and last cannot be used in having
431+
unsupported.add(e);
417432
return true;
418433
}
419434

@@ -428,7 +443,7 @@ private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Node<?> sourc
428443

429444
// left without leaves which have to match; that's a failure since everything should be based on an agg
430445
if (e instanceof Attribute) {
431-
missing.put(e, source);
446+
missing.add(e);
432447
return true;
433448
}
434449

x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/execution/search/Querier.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.elasticsearch.xpack.sql.execution.search.extractor.FieldHitExtractor;
3333
import org.elasticsearch.xpack.sql.execution.search.extractor.HitExtractor;
3434
import org.elasticsearch.xpack.sql.execution.search.extractor.MetricAggExtractor;
35+
import org.elasticsearch.xpack.sql.execution.search.extractor.TopHitsAggExtractor;
3536
import org.elasticsearch.xpack.sql.expression.gen.pipeline.AggExtractorInput;
3637
import org.elasticsearch.xpack.sql.expression.gen.pipeline.AggPathInput;
3738
import org.elasticsearch.xpack.sql.expression.gen.pipeline.HitExtractorInput;
@@ -45,6 +46,7 @@
4546
import org.elasticsearch.xpack.sql.querydsl.container.QueryContainer;
4647
import org.elasticsearch.xpack.sql.querydsl.container.ScriptFieldRef;
4748
import org.elasticsearch.xpack.sql.querydsl.container.SearchHitFieldRef;
49+
import org.elasticsearch.xpack.sql.querydsl.container.TopHitsAggRef;
4850
import org.elasticsearch.xpack.sql.session.Configuration;
4951
import org.elasticsearch.xpack.sql.session.Rows;
5052
import org.elasticsearch.xpack.sql.session.SchemaRowSet;
@@ -276,6 +278,11 @@ private BucketExtractor createExtractor(FieldExtraction ref, BucketExtractor tot
276278
return new MetricAggExtractor(r.name(), r.property(), r.innerKey());
277279
}
278280

281+
if (ref instanceof TopHitsAggRef) {
282+
TopHitsAggRef r = (TopHitsAggRef) ref;
283+
return new TopHitsAggExtractor(r.name(), r.fieldDataType());
284+
}
285+
279286
if (ref == GlobalCountRef.INSTANCE) {
280287
return totalCount;
281288
}

0 commit comments

Comments
 (0)