Skip to content

SQL: Add PIVOT support #46489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 23, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 26 additions & 57 deletions x-pack/plugin/sql/qa/src/main/resources/pivot.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ SELECT * FROM (SELECT languages, gender, salary FROM test_emp) PIVOT (AVG(salary
2 |50684.4
;

averageWithTwoValuesAndOrder
averageWithTwoValuesAndOrderDesc
schema::languages:bt|'M':d|'F':d
SELECT * FROM (SELECT languages, gender, salary FROM test_emp) PIVOT (AVG(salary) FOR gender IN ('M', 'F')) ORDER BY languages DESC;

Expand All @@ -152,6 +152,30 @@ SELECT * FROM (SELECT languages, gender, salary FROM test_emp) PIVOT (AVG(salary
null |48396.28571428572|62140.666666666664
;

averageWithTwoValuesAndOrderDesc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AndLimit?

schema::languages:bt|'M':d|'F':d
SELECT * FROM (SELECT languages, gender, salary FROM test_emp) PIVOT (AVG(salary) FOR gender IN ('M', 'F')) ORDER BY languages DESC LIMIT 2;

languages | 'M' | 'F'
---------------+-----------------+------------------
5 |39052.875 |46705.555555555555
4 |47058.90909090909|49291.5
;

averageWithTwoValuesAndOrderAsc
schema::languages:bt|'M':d|'F':d
SELECT * FROM (SELECT languages, gender, salary FROM test_emp) PIVOT (AVG(salary) FOR gender IN ('M', 'F')) ORDER BY languages ASC;

languages | 'M' | 'F'
---------------+-----------------+------------------
null |48396.28571428572|62140.666666666664
1 |49767.22222222222|47073.25
2 |44103.90909090909|50684.4
3 |51741.90909090909|53660.0
4 |47058.90909090909|49291.5
5 |39052.875 |46705.555555555555
;


sumWithoutSubquery
schema::birth_date:ts|emp_no:i|first_name:s|gender:s|hire_date:ts|last_name:s|1:i|2:i
Expand Down Expand Up @@ -179,59 +203,4 @@ null |31070.0
3 |26830.0
4 |24646.0
5 |23353.0
;

averageWithScalarOverAggregateAndFoldedValue
schema::status:s|client_ip:s
SELECT status, client_ip FROM logs
PIVOT (MAX(bytes_out) FOR "@timestamp" IN (CAST('2017-11-10T20:35:55Z' AS DATETIME), CAST('2017-11-10T21:15:40Z' AS DATETIME)));

status | client_ip
---------------+---------------
Error |null
Error |10.0.0.107
Error |10.0.0.130
Error |10.0.0.147
Error |10.0.1.166
Error |10.0.1.177
OK |null
OK |10.0.0.105
OK |10.0.0.109
OK |10.0.0.113
OK |10.0.0.118
OK |10.0.0.128
OK |10.0.0.129
OK |10.0.0.134
OK |10.0.0.144
OK |10.0.0.145
OK |10.0.1.1
OK |10.0.1.7
OK |10.0.1.8
OK |10.0.1.9
OK |10.0.1.10
OK |10.0.1.11
OK |10.0.1.12
OK |10.0.1.13
OK |10.0.1.122
OK |10.0.1.166
OK |10.0.1.199
OK |10.0.1.200
OK |10.0.1.201
OK |10.0.1.202
OK |10.0.1.203
OK |10.0.1.204
OK |10.0.1.205
OK |10.0.1.206
OK |10.0.1.207
OK |10.0.1.208
OK |10.0.1.222
;

averageWithScalarOverAggregateAndFoldedValueAndColumn
schema::status:s|client_ip:s
SELECT status, client_ip FROM logs
PIVOT (MAX(bytes_out) FOR "@timestamp" IN (CAST('2017-11-10T20:35:55Z' AS DATETIME), CAST('2017-11-10T21:15:40Z' AS DATETIME)));
status | client_ip
---------------+---------------
Error |null
Error |10.0.0.107
;
Original file line number Diff line number Diff line change
Expand Up @@ -1273,20 +1273,20 @@ public static class CleanAliases extends AnalyzeRule<LogicalPlan> {
protected LogicalPlan rule(LogicalPlan plan) {
if (plan instanceof Project) {
Project p = (Project) plan;
return new Project(p.source(), p.child(), cleanSecondaryAliases(p.projections()));
return new Project(p.source(), p.child(), cleanChildrenAliases(p.projections()));
}

if (plan instanceof Aggregate) {
Aggregate a = (Aggregate) plan;
// aliases inside GROUP BY are irellevant so remove all of them
// however aggregations are important (ultimately a projection)
return new Aggregate(a.source(), a.child(), cleanAllAliases(a.groupings()), cleanSecondaryAliases(a.aggregates()));
return new Aggregate(a.source(), a.child(), cleanAllAliases(a.groupings()), cleanChildrenAliases(a.aggregates()));
}

if (plan instanceof Pivot) {
Pivot p = (Pivot) plan;
return new Pivot(p.source(), p.child(), trimAliases(p.column()), cleanSecondaryAliases(p.values()),
cleanSecondaryAliases(p.aggregates()));
return new Pivot(p.source(), p.child(), trimAliases(p.column()), cleanChildrenAliases(p.values()),
cleanChildrenAliases(p.aggregates()));
}

return plan.transformExpressionsOnly(e -> {
Expand All @@ -1297,7 +1297,7 @@ protected LogicalPlan rule(LogicalPlan plan) {
});
}

private List<NamedExpression> cleanSecondaryAliases(List<? extends NamedExpression> args) {
private List<NamedExpression> cleanChildrenAliases(List<? extends NamedExpression> args) {
List<NamedExpression> cleaned = new ArrayList<>(args.size());
for (NamedExpression ne : args) {
cleaned.add((NamedExpression) trimNonTopLevelAliases(ne));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.elasticsearch.xpack.sql.expression.Expression;
import org.elasticsearch.xpack.sql.expression.Expressions;
import org.elasticsearch.xpack.sql.expression.FieldAttribute;
import org.elasticsearch.xpack.sql.expression.Literal;
import org.elasticsearch.xpack.sql.expression.NamedExpression;
import org.elasticsearch.xpack.sql.expression.UnresolvedAttribute;
import org.elasticsearch.xpack.sql.expression.function.Function;
Expand Down Expand Up @@ -482,15 +483,27 @@ private static boolean onlyExactFields(List<Expression> expressions, Set<Failure
expressions.forEach(e -> e.forEachUp(c -> {
EsField.Exact exact = c.getExactInfo();
if (exact.hasExact() == false) {
localFailures.add(fail(c, format(null, "Field [{}] of data type [{}] cannot be used for grouping; {}",
c.sourceText(), c.dataType().typeName, exact.errorMsg())));
localFailures.add(fail(c, "Field [{}] of data type [{}] cannot be used for grouping; {}", c.sourceText(),
c.dataType().typeName, exact.errorMsg()));
onlyExact.set(Boolean.FALSE);
}
}, FieldAttribute.class));

return onlyExact.get();
}

private static boolean onlyRawFields(Iterable<? extends Expression> expressions, Set<Failure> localFailures) {
Holder<Boolean> onlyExact = new Holder<>(Boolean.TRUE);

expressions.forEach(e -> e.forEachDown(c -> {
if (c instanceof Function || c instanceof FunctionAttribute) {
localFailures.add(fail(c, "No functions allowed (yet); encountered [{}]", c.sourceText()));
onlyExact.set(Boolean.FALSE);
}
}));
return onlyExact.get();
}

private static boolean checkGroupByTime(LogicalPlan p, Set<Failure> localFailures) {
if (p instanceof Aggregate) {
Aggregate a = (Aggregate) p;
Expand Down Expand Up @@ -767,7 +780,8 @@ private static void checkForGeoFunctionsOnDocValues(LogicalPlan p, Set<Failure>
private static void checkPivot(LogicalPlan p, Set<Failure> localFailures) {
p.forEachDown(pv -> {
// check only exact fields are used inside PIVOTing
if (onlyExactFields(combine(pv.groupingSet(), pv.column()), localFailures) == false) {
if (onlyExactFields(combine(pv.groupingSet(), pv.column()), localFailures) == false
|| onlyRawFields(pv.groupingSet(), localFailures) == false) {
// if that is not the case, no need to do further validation since the declaration is fundamentally wrong
return;
}
Expand All @@ -777,9 +791,12 @@ private static void checkPivot(LogicalPlan p, Set<Failure> localFailures) {
for (NamedExpression v : pv.values()) {
// check all values are foldable
Expression ex = v instanceof Alias ? ((Alias) v).child() : v;
if (ex.foldable() == false) {
if (ex instanceof Literal == false) {
localFailures.add(fail(v, "Non-literal [{}] found inside PIVOT values", v.name()));
}
else if (ex.foldable() && ex.fold() == null) {
localFailures.add(fail(v, "Null not allowed as a PIVOT value", v.name()));
}
// and that their type is compatible with that of the column
else if (DataTypes.areTypesCompatible(colType, v.dataType()) == false) {
localFailures.add(fail(v, "Literal [{}] of type [{}] does not match type [{}] of PIVOT column [{}]", v.name(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,13 @@ protected LogicalPlan rule(Pivot plan) {
// TODO: this should be removed when refactoring NamedExpression
else if (namedExpression instanceof Literal) {
rawValues.add(namedExpression);
} else {
}
// TODO: NamedExpression refactoring should remove this
else if (namedExpression.foldable()) {
rawValues.add(Literal.of(namedExpression.name(), namedExpression));
}
// TOOD: same as above
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo -> TODO

else {
UnresolvedAttribute attr = new UnresolvedAttribute(namedExpression.source(), namedExpression.name(), null,
"Unexpected alias");
return new Pivot(plan.source(), plan.child(), plan.column(), singletonList(attr), plan.aggregates());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ private static Query translateQuery(BinaryComparison bc) {
// Since RangeQueryBuilder can handle date as String as well, we'll format it as String and provide the format as well.
value = formatter.format((ZonedDateTime) value);
} else {
formatter = DateFormatter.forPattern(TIME_FORMAT);
formatter = DateFormatter.forPattern(TIME_FORMAT);
value = formatter.format((OffsetTime) value);
}
format = formatter.pattern();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,21 @@ public void testPivotValueNotFoldable() {
error("SELECT * FROM (SELECT int, keyword, bool FROM test) " + "PIVOT(AVG(int) FOR keyword IN ('bla', bool))"));
}

public void testPivotWithFunctionInput() {
assertEquals("1:37: No functions allowed (yet); encountered [YEAR(date)]",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No functions allowed due to namedExpression issues.

error("SELECT * FROM (SELECT int, keyword, YEAR(date) FROM test) " + "PIVOT(AVG(int) FOR keyword IN ('bla'))"));
}

public void testPivotWithFoldableFunctionInValues() {
assertEquals("1:85: Non-literal [UCASE('bla')] found inside PIVOT values",
error("SELECT * FROM (SELECT int, keyword, bool FROM test) " + "PIVOT(AVG(int) FOR keyword IN ( UCASE('bla') ))"));
}

public void testPivotWithNull() {
assertEquals("1:85: Null not allowed as a PIVOT value",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@astefan Added test for forbidding null - it looks like it doesn't work since In doesn't allow it and it just removes it. Which kinda makes sense (null means the thing is missing).
This could be improved by either handling nulls as a separate filter or, preferably inside In directly.
@matriv thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When null is inside the value list of IN it will always yield NULL as a result which in the case of IN used as a filter in a normal WHERE clause means false and therefore it's eliminated. e.g.:

postgres=# select null in (null);
 ?column?
----------

(1 row)

If for pivot this case should yield true It should be treated differently but in my opinion not inside IN but somewhere else (e.g.: QueryFolder where we have the context information of IN used in a PIVOT query).

error("SELECT * FROM (SELECT int, keyword, bool FROM test) " + "PIVOT(AVG(int) FOR keyword IN ( null ))"));
}

public void testPivotValuesHaveDifferentTypeThanColumn() {
assertEquals("1:81: Literal ['bla'] of type [keyword] does not match type [boolean] of PIVOT column [bool]",
error("SELECT * FROM (SELECT int, keyword, bool FROM test) " + "PIVOT(AVG(int) FOR bool IN ('bla'))"));
Expand Down