Skip to content

Commit ee325fa

Browse files
committed
[8.18] ESQL: ESQL: Fix ReplaceMissingFieldsWithNull (elastic#125764) (elastic#126166)
* Revert changes to Layout.java The change in 80125a4 is a quick fix and allows breaking an invariant of Layout. Revert that. * Simplify ReplaceMissingFieldWithNull When encountering projections, it tries to do the job of field extraction for missing fields by injecting an Eval that creates a literal null with the same name id as the field attribute for the missing field. This is wrong: 1. We only insert an Eval in case that a Project relies on the missing attribute. There could be other plan nodes that rely on the missing attribute. 2. Even for Projects, we only insert an Eval in case we squarely project for the field - in case of aliases (e.g. from RENAME), we do nothing. 3. In case of multiple Projects that use this attribute, we create multiple attributes with the original field attribute's id, causing a wrong Layout. This triggered elastic#121754. * Revive logic for EsRelation instead of Project * Update LocalLogicalPlanOptimizerTests * Update test expectations * Do not prune attributes from EsRelation This can lead to empty output, which leads to the EsRelation being replaced by a LocalRelation with 0 rows. * Add tests + capability * Add comments * [CI] Auto commit changes from spotless * Update docs/changelog/125764.yaml --------- Co-authored-by: elasticsearchmachine <[email protected]> (cherry picked from commit 96ca13a) # Conflicts: # x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec # x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java # x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java # x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java # x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java # x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalPhysicalPlanOptimizerTests.java
1 parent aee9aa4 commit ee325fa

File tree

8 files changed

+152
-58
lines changed

8 files changed

+152
-58
lines changed

docs/changelog/125764.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
pr: 125764
2+
summary: Fix `ReplaceMissingFieldsWithNull`
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 126036
7+
- 121754
8+
- 126030

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java

+10
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
import org.elasticsearch.xpack.esql.parser.QueryParam;
6565
import org.elasticsearch.xpack.esql.plan.logical.Enrich;
6666
import org.elasticsearch.xpack.esql.plan.logical.EsRelation;
67+
import org.elasticsearch.xpack.esql.plan.logical.Limit;
6768
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
6869
import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation;
6970
import org.elasticsearch.xpack.esql.plan.logical.local.LocalSupplier;
@@ -132,6 +133,7 @@
132133
import static org.elasticsearch.xpack.esql.parser.ParserUtils.ParamClassification.PATTERN;
133134
import static org.elasticsearch.xpack.esql.parser.ParserUtils.ParamClassification.VALUE;
134135
import static org.hamcrest.Matchers.instanceOf;
136+
import static org.junit.Assert.assertEquals;
135137
import static org.junit.Assert.assertNotNull;
136138
import static org.junit.Assert.assertNull;
137139

@@ -393,6 +395,14 @@ public static <T> T as(Object node, Class<T> type) {
393395
return type.cast(node);
394396
}
395397

398+
public static Limit asLimit(Object node, Integer limitLiteral) {
399+
Limit limit = as(node, Limit.class);
400+
if (limitLiteral != null) {
401+
assertEquals(as(limit.limit(), Literal.class).value(), limitLiteral);
402+
}
403+
return limit;
404+
}
405+
396406
public static Map<String, EsField> loadMapping(String name) {
397407
return LoadMapping.loadMapping(name);
398408
}

x-pack/plugin/esql/qa/testFixtures/src/main/resources/enrich.csv-spec

+18
Original file line numberDiff line numberDiff line change
@@ -644,3 +644,21 @@ FROM airports
644644
abbrev:k | city_name:k | city_location:geo_point | country:k | location:geo_point | name:text | region:text | boundary_wkt_length:i
645645
IDR | Indore | POINT(75.8472 22.7167) | India | POINT(75.8092915005895 22.727749187571) | Devi Ahilyabai Holkar Int'l | Indore City | 231
646646
;
647+
648+
// Regression test for https://github.com/elastic/elasticsearch/issues/126030
649+
// We had wrong layouts from ReplaceMissingFieldsWithNull in case of indices that had relevant fields for the query,
650+
// but were **missing the field we enrich on**.
651+
fieldsInOtherIndicesBug
652+
required_capability: enrich_load
653+
required_capability: fix_replace_missing_field_with_null_duplicate_name_id_in_layout
654+
655+
from *
656+
| keep author.keyword, book_no, scalerank, street, bytes_in, @timestamp, abbrev, city_location, distance, description, birth_date, language_code, intersects, client_ip, event_duration, version
657+
| enrich languages_policy on author.keyword
658+
| sort book_no
659+
| limit 1
660+
;
661+
662+
author.keyword:keyword|book_no:keyword|scalerank:integer|street:keyword|bytes_in:ul|@timestamp:unsupported|abbrev:keyword|city_location:geo_point|distance:double|description:unsupported|birth_date:date|language_code:integer|intersects:boolean|client_ip:unsupported|event_duration:long|version:version|language_name:keyword
663+
Fyodor Dostoevsky |1211 |null |null |null |null |null |null |null |null |null |null |null |null |null |null |null
664+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,12 @@ public enum Cap {
524524
/**
525525
* Fix for regex folding with case-insensitive pattern https://github.com/elastic/elasticsearch/issues/118371
526526
*/
527-
FIXED_REGEX_FOLD;
527+
FIXED_REGEX_FOLD,
528+
529+
/**
530+
* Avoid duplicated channels with the same name id when executing ESQL queries.
531+
*/
532+
FIX_REPLACE_MISSING_FIELD_WITH_NULL_DUPLICATE_NAME_ID_IN_LAYOUT;
528533

529534
private final boolean enabled;
530535

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java

+45-38
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99

1010
import org.elasticsearch.common.util.Maps;
1111
import org.elasticsearch.xpack.esql.core.expression.Alias;
12+
import org.elasticsearch.xpack.esql.core.expression.Attribute;
1213
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
1314
import org.elasticsearch.xpack.esql.core.expression.Literal;
1415
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
1516
import org.elasticsearch.xpack.esql.core.type.DataType;
1617
import org.elasticsearch.xpack.esql.optimizer.LocalLogicalOptimizerContext;
17-
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
1818
import org.elasticsearch.xpack.esql.plan.logical.EsRelation;
1919
import org.elasticsearch.xpack.esql.plan.logical.Eval;
2020
import org.elasticsearch.xpack.esql.plan.logical.Filter;
@@ -23,13 +23,12 @@
2323
import org.elasticsearch.xpack.esql.plan.logical.Project;
2424
import org.elasticsearch.xpack.esql.plan.logical.RegexExtract;
2525
import org.elasticsearch.xpack.esql.plan.logical.TopN;
26-
import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation;
2726
import org.elasticsearch.xpack.esql.rule.ParameterizedRule;
28-
import org.elasticsearch.xpack.esql.stats.SearchStats;
2927

3028
import java.util.ArrayList;
3129
import java.util.List;
3230
import java.util.Map;
31+
import java.util.function.Predicate;
3332

3433
/**
3534
* Look for any fields used in the plan that are missing locally and replace them with null.
@@ -39,60 +38,68 @@ public class ReplaceMissingFieldWithNull extends ParameterizedRule<LogicalPlan,
3938

4039
@Override
4140
public LogicalPlan apply(LogicalPlan plan, LocalLogicalOptimizerContext localLogicalOptimizerContext) {
42-
return plan.transformUp(p -> missingToNull(p, localLogicalOptimizerContext.searchStats()));
43-
}
44-
45-
private LogicalPlan missingToNull(LogicalPlan plan, SearchStats stats) {
46-
if (plan instanceof EsRelation || plan instanceof LocalRelation) {
47-
return plan;
48-
}
41+
// Do not use the attribute name, this can deviate from the field name for union types; use fieldName() instead.
42+
Predicate<FieldAttribute> shouldBeRetained = f -> localLogicalOptimizerContext.searchStats().exists(f.fieldName());
4943

50-
if (plan instanceof Aggregate a) {
51-
// don't do anything (for now)
52-
return a;
53-
}
54-
// keep the aliased name
55-
else if (plan instanceof Project project) {
56-
var projections = project.projections();
57-
List<NamedExpression> newProjections = new ArrayList<>(projections.size());
58-
Map<DataType, Alias> nullLiteral = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size());
44+
return plan.transformUp(p -> missingToNull(p, shouldBeRetained));
45+
}
5946

60-
for (NamedExpression projection : projections) {
61-
// Do not use the attribute name, this can deviate from the field name for union types.
62-
if (projection instanceof FieldAttribute f && stats.exists(f.fieldName()) == false) {
47+
private LogicalPlan missingToNull(LogicalPlan plan, Predicate<FieldAttribute> shouldBeRetained) {
48+
if (plan instanceof EsRelation relation) {
49+
// Remove missing fields from the EsRelation because this is not where we will obtain them from; replace them by an Eval right
50+
// after, instead. This allows us to safely re-use the attribute ids of the corresponding FieldAttributes.
51+
// This means that an EsRelation[field1, field2, field3] where field1 and field 3 are missing will be replaced by
52+
// Project[field1, field2, field3] <- keeps the ordering intact
53+
// \_Eval[field1 = null, field3 = null]
54+
// \_EsRelation[field2]
55+
List<Attribute> relationOutput = relation.output();
56+
Map<DataType, Alias> nullLiterals = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size());
57+
List<NamedExpression> newProjections = new ArrayList<>(relationOutput.size());
58+
for (int i = 0, size = relationOutput.size(); i < size; i++) {
59+
Attribute attr = relationOutput.get(i);
60+
NamedExpression projection;
61+
if (attr instanceof FieldAttribute f && (shouldBeRetained.test(f) == false)) {
6362
DataType dt = f.dataType();
64-
Alias nullAlias = nullLiteral.get(f.dataType());
63+
Alias nullAlias = nullLiterals.get(dt);
6564
// save the first field as null (per datatype)
6665
if (nullAlias == null) {
66+
// Keep the same id so downstream query plans don't need updating
67+
// NOTE: THIS IS BRITTLE AND CAN LEAD TO BUGS.
68+
// In case some optimizer rule or so inserts a plan node that requires the field BEFORE the Eval that we're adding
69+
// on top of the EsRelation, this can trigger a field extraction in the physical optimizer phase, causing wrong
70+
// layouts due to a duplicate name id.
71+
// If someone reaches here AGAIN when debugging e.g. ClassCastExceptions NPEs from wrong layouts, we should probably
72+
// give up on this approach and instead insert EvalExecs in InsertFieldExtraction.
6773
Alias alias = new Alias(f.source(), f.name(), Literal.of(f, null), f.id());
68-
nullLiteral.put(dt, alias);
74+
nullLiterals.put(dt, alias);
6975
projection = alias.toAttribute();
7076
}
71-
// otherwise point to it
77+
// otherwise point to it since this avoids creating field copies
7278
else {
73-
// since avoids creating field copies
7479
projection = new Alias(f.source(), f.name(), nullAlias.toAttribute(), f.id());
7580
}
81+
} else {
82+
projection = attr;
7683
}
77-
7884
newProjections.add(projection);
7985
}
80-
// add the first found field as null
81-
if (nullLiteral.size() > 0) {
82-
plan = new Eval(project.source(), project.child(), new ArrayList<>(nullLiteral.values()));
83-
plan = new Project(project.source(), plan, newProjections);
86+
87+
if (nullLiterals.size() == 0) {
88+
return plan;
8489
}
85-
} else if (plan instanceof Eval
90+
91+
Eval eval = new Eval(plan.source(), relation, new ArrayList<>(nullLiterals.values()));
92+
// This projection is redundant if there's another projection downstream (and no commands depend on the order until we hit it).
93+
return new Project(plan.source(), eval, newProjections);
94+
}
95+
96+
if (plan instanceof Eval
8697
|| plan instanceof Filter
8798
|| plan instanceof OrderBy
8899
|| plan instanceof RegexExtract
89100
|| plan instanceof TopN) {
90-
plan = plan.transformExpressionsOnlyUp(
91-
FieldAttribute.class,
92-
// Do not use the attribute name, this can deviate from the field name for union types.
93-
f -> stats.exists(f.fieldName()) ? f : Literal.of(f, null)
94-
);
95-
}
101+
return plan.transformExpressionsOnlyUp(FieldAttribute.class, f -> shouldBeRetained.test(f) ? f : Literal.of(f, null));
102+
}
96103

97104
return plan;
98105
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java

+4-5
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,11 @@ public Layout build() {
109109
for (ChannelSet set : channels) {
110110
int channel = numberOfChannels++;
111111
for (NameId id : set.nameIds) {
112+
// Duplicate name ids would mean that have 2 channels that are declared under the same id. That makes no sense - which
113+
// channel should subsequent operators use, then, when they want to refer to this id?
114+
assert (layout.containsKey(id) == false) : "Duplicate name ids are not allowed in layouts";
112115
ChannelAndType next = new ChannelAndType(channel, set.type);
113-
ChannelAndType prev = layout.put(id, next);
114-
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238
115-
// if (prev != null) {
116-
// throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
117-
// }
116+
layout.put(id, next);
118117
}
119118
}
120119
return new DefaultLayout(Collections.unmodifiableMap(layout), numberOfChannels);

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java

+58-11
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import org.elasticsearch.xpack.esql.core.expression.Expressions;
2222
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
2323
import org.elasticsearch.xpack.esql.core.expression.Literal;
24-
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
2524
import org.elasticsearch.xpack.esql.core.expression.predicate.logical.And;
2625
import org.elasticsearch.xpack.esql.core.expression.predicate.nulls.IsNotNull;
2726
import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
@@ -65,6 +64,7 @@
6564
import static org.elasticsearch.xpack.esql.EsqlTestUtils.THREE;
6665
import static org.elasticsearch.xpack.esql.EsqlTestUtils.TWO;
6766
import static org.elasticsearch.xpack.esql.EsqlTestUtils.as;
67+
import static org.elasticsearch.xpack.esql.EsqlTestUtils.asLimit;
6868
import static org.elasticsearch.xpack.esql.EsqlTestUtils.getFieldAttribute;
6969
import static org.elasticsearch.xpack.esql.EsqlTestUtils.greaterThanOf;
7070
import static org.elasticsearch.xpack.esql.EsqlTestUtils.loadMapping;
@@ -73,9 +73,9 @@
7373
import static org.elasticsearch.xpack.esql.EsqlTestUtils.withDefaultLimitWarning;
7474
import static org.elasticsearch.xpack.esql.core.tree.Source.EMPTY;
7575
import static org.hamcrest.Matchers.contains;
76-
import static org.hamcrest.Matchers.equalTo;
7776
import static org.hamcrest.Matchers.hasSize;
7877
import static org.hamcrest.Matchers.is;
78+
import static org.hamcrest.Matchers.not;
7979
import static org.hamcrest.Matchers.nullValue;
8080

8181
//@TestLogging(value = "org.elasticsearch.xpack.esql:TRACE", reason = "debug")
@@ -139,7 +139,7 @@ public void testMissingFieldInFilterString() {
139139

140140
/**
141141
* Expects
142-
* Project[[last_name{r}#6]]
142+
* Project[[last_name{f}#6]]
143143
* \_Eval[[null[KEYWORD] AS last_name]]
144144
* \_Limit[10000[INTEGER]]
145145
* \_EsRelation[test][_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gen..]
@@ -156,7 +156,7 @@ public void testMissingFieldInProject() {
156156
var project = as(localPlan, Project.class);
157157
var projections = project.projections();
158158
assertThat(Expressions.names(projections), contains("last_name"));
159-
as(projections.get(0), ReferenceAttribute.class);
159+
as(projections.get(0), FieldAttribute.class);
160160
var eval = as(project.child(), Eval.class);
161161
assertThat(Expressions.names(eval.fields()), contains("last_name"));
162162
var alias = as(eval.fields().get(0), Alias.class);
@@ -166,6 +166,7 @@ public void testMissingFieldInProject() {
166166

167167
var limit = as(eval.child(), Limit.class);
168168
var source = as(limit.child(), EsRelation.class);
169+
assertThat(Expressions.names(source.output()), not(contains("last_name")));
169170
}
170171

171172
/**
@@ -190,14 +191,18 @@ public void testMissingFieldInSort() {
190191

191192
var limit = as(project.child(), Limit.class);
192193
var source = as(limit.child(), EsRelation.class);
194+
assertThat(Expressions.names(source.output()), not(contains("last_name")));
193195
}
194196

195197
/**
196198
* Expects
197-
* EsqlProject[[first_name{f}#9, last_name{r}#18]]
198-
* \_MvExpand[last_name{f}#12,last_name{r}#18,1000]
199-
* \_Limit[1000[INTEGER]]
200-
* \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..]
199+
* EsqlProject[[first_name{f}#7, last_name{r}#16]]
200+
* \_MvExpand[last_name{f}#10,last_name{r}#16,1000]
201+
* \_Project[[_meta_field{f}#12, emp_no{f}#6, first_name{f}#7, gender{f}#8, job{f}#13, job.raw{f}#14, languages{f}#9, last_
202+
* name{r}#10, long_noidx{f}#15, salary{f}#11]]
203+
* \_Eval[[null[KEYWORD] AS last_name]]
204+
* \_Limit[1000[INTEGER]]
205+
* \_EsRelation[test][_meta_field{f}#12, emp_no{f}#6, first_name{f}#7, ge..]
201206
*/
202207
public void testMissingFieldInMvExpand() {
203208
var plan = plan("""
@@ -209,14 +214,23 @@ public void testMissingFieldInMvExpand() {
209214
var testStats = statsForMissingField("last_name");
210215
var localPlan = localPlan(plan, testStats);
211216

217+
// It'd be much better if this project was pushed down past the MvExpand, because MvExpand's cost scales with the number of
218+
// involved attributes/columns.
212219
var project = as(localPlan, EsqlProject.class);
213220
var projections = project.projections();
214221
assertThat(Expressions.names(projections), contains("first_name", "last_name"));
215222

216223
var mvExpand = as(project.child(), MvExpand.class);
217-
assertThat(mvExpand.limit(), equalTo(1000));
218-
var limit2 = as(mvExpand.child(), Limit.class);
219-
as(limit2.child(), EsRelation.class);
224+
assertEquals(1000, (int) mvExpand.limit());
225+
var project2 = as(mvExpand.child(), Project.class);
226+
var eval = as(project2.child(), Eval.class);
227+
assertEquals(eval.fields().size(), 1);
228+
var lastName = eval.fields().get(0);
229+
assertEquals(lastName.name(), "last_name");
230+
assertEquals(lastName.child(), new Literal(EMPTY, null, DataType.KEYWORD));
231+
var limit = asLimit(eval.child(), 1000);
232+
var relation = as(limit.child(), EsRelation.class);
233+
assertThat(Expressions.names(relation.output()), not(contains("last_name")));
220234
}
221235

222236
public static class MockFieldAttributeCommand extends UnaryPlan {
@@ -278,6 +292,39 @@ public void testMissingFieldInNewCommand() {
278292
),
279293
testStats
280294
);
295+
296+
var plan = plan("""
297+
from test
298+
""");
299+
var initialRelation = plan.collectLeaves().get(0);
300+
FieldAttribute lastName = null;
301+
for (Attribute attr : initialRelation.output()) {
302+
if (attr.name().equals("last_name")) {
303+
lastName = (FieldAttribute) attr;
304+
}
305+
}
306+
307+
// Expects
308+
// MockFieldAttributeCommand[last_name{f}#7]
309+
// \_Project[[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gender{f}#5, hire_date{f}#10, job{f}#11, job.raw{f}#12, langu
310+
// ages{f}#6, last_name{r}#7, long_noidx{f}#13, salary{f}#8]]
311+
// \_Eval[[null[KEYWORD] AS last_name]]
312+
// \_Limit[1000[INTEGER],false]
313+
// \_EsRelation[test][_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..]
314+
LogicalPlan localPlan = localPlan(new MockFieldAttributeCommand(EMPTY, plan, lastName), testStats);
315+
316+
var mockCommand = as(localPlan, MockFieldAttributeCommand.class);
317+
var project = as(mockCommand.child(), Project.class);
318+
var eval = as(project.child(), Eval.class);
319+
var limit = asLimit(eval.child(), 1000);
320+
var relation = as(limit.child(), EsRelation.class);
321+
322+
assertThat(Expressions.names(eval.fields()), contains("last_name"));
323+
var literal = as(eval.fields().get(0), Alias.class);
324+
assertEquals(literal.child(), new Literal(EMPTY, null, DataType.KEYWORD));
325+
assertThat(Expressions.names(relation.output()), not(contains("last_name")));
326+
327+
assertEquals(Expressions.names(initialRelation.output()), Expressions.names(project.output()));
281328
}
282329

283330
/**

0 commit comments

Comments
 (0)