Skip to content

Commit 4edb2e7

Browse files
[7.x][ML] Add optional source filtering during data frame reindexing (#49690) (#49718)
This adds a `_source` setting under the `source` setting of a data frame analytics config. The new `_source` is reusing the structure of a `FetchSourceContext` like `analyzed_fields` does. Specifying includes and excludes for source allows selecting which fields will get reindexed and will be available in the destination index. Closes #49531 Backport of #49690
1 parent 813b49a commit 4edb2e7

File tree

28 files changed

+525
-123
lines changed

28 files changed

+525
-123
lines changed

client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSource.java

+26-4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.elasticsearch.common.xcontent.ToXContentObject;
2727
import org.elasticsearch.common.xcontent.XContentBuilder;
2828
import org.elasticsearch.common.xcontent.XContentParser;
29+
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
2930

3031
import java.io.IOException;
3132
import java.util.Arrays;
@@ -44,20 +45,27 @@ public static Builder builder() {
4445

4546
private static final ParseField INDEX = new ParseField("index");
4647
private static final ParseField QUERY = new ParseField("query");
48+
public static final ParseField _SOURCE = new ParseField("_source");
4749

4850
private static ObjectParser<Builder, Void> PARSER = new ObjectParser<>("data_frame_analytics_source", true, Builder::new);
4951

5052
static {
5153
PARSER.declareStringArray(Builder::setIndex, INDEX);
5254
PARSER.declareObject(Builder::setQueryConfig, (p, c) -> QueryConfig.fromXContent(p), QUERY);
55+
PARSER.declareField(Builder::setSourceFiltering,
56+
(p, c) -> FetchSourceContext.fromXContent(p),
57+
_SOURCE,
58+
ObjectParser.ValueType.OBJECT_ARRAY_BOOLEAN_OR_STRING);
5359
}
5460

5561
private final String[] index;
5662
private final QueryConfig queryConfig;
63+
private final FetchSourceContext sourceFiltering;
5764

58-
private DataFrameAnalyticsSource(String[] index, @Nullable QueryConfig queryConfig) {
65+
private DataFrameAnalyticsSource(String[] index, @Nullable QueryConfig queryConfig, @Nullable FetchSourceContext sourceFiltering) {
5966
this.index = Objects.requireNonNull(index);
6067
this.queryConfig = queryConfig;
68+
this.sourceFiltering = sourceFiltering;
6169
}
6270

6371
public String[] getIndex() {
@@ -68,13 +76,20 @@ public QueryConfig getQueryConfig() {
6876
return queryConfig;
6977
}
7078

79+
public FetchSourceContext getSourceFiltering() {
80+
return sourceFiltering;
81+
}
82+
7183
@Override
7284
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
7385
builder.startObject();
7486
builder.field(INDEX.getPreferredName(), index);
7587
if (queryConfig != null) {
7688
builder.field(QUERY.getPreferredName(), queryConfig.getQuery());
7789
}
90+
if (sourceFiltering != null) {
91+
builder.field(_SOURCE.getPreferredName(), sourceFiltering);
92+
}
7893
builder.endObject();
7994
return builder;
8095
}
@@ -86,12 +101,13 @@ public boolean equals(Object o) {
86101

87102
DataFrameAnalyticsSource other = (DataFrameAnalyticsSource) o;
88103
return Arrays.equals(index, other.index)
89-
&& Objects.equals(queryConfig, other.queryConfig);
104+
&& Objects.equals(queryConfig, other.queryConfig)
105+
&& Objects.equals(sourceFiltering, other.sourceFiltering);
90106
}
91107

92108
@Override
93109
public int hashCode() {
94-
return Objects.hash(Arrays.asList(index), queryConfig);
110+
return Objects.hash(Arrays.asList(index), queryConfig, sourceFiltering);
95111
}
96112

97113
@Override
@@ -103,6 +119,7 @@ public static class Builder {
103119

104120
private String[] index;
105121
private QueryConfig queryConfig;
122+
private FetchSourceContext sourceFiltering;
106123

107124
private Builder() {}
108125

@@ -121,8 +138,13 @@ public Builder setQueryConfig(QueryConfig queryConfig) {
121138
return this;
122139
}
123140

141+
public Builder setSourceFiltering(FetchSourceContext sourceFiltering) {
142+
this.sourceFiltering = sourceFiltering;
143+
return this;
144+
}
145+
124146
public DataFrameAnalyticsSource build() {
125-
return new DataFrameAnalyticsSource(index, queryConfig);
147+
return new DataFrameAnalyticsSource(index, queryConfig, sourceFiltering);
126148
}
127149
}
128150
}

client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java

+3
Original file line numberDiff line numberDiff line change
@@ -2939,6 +2939,9 @@ public void testPutDataFrameAnalytics() throws Exception {
29392939
DataFrameAnalyticsSource sourceConfig = DataFrameAnalyticsSource.builder() // <1>
29402940
.setIndex("put-test-source-index") // <2>
29412941
.setQueryConfig(queryConfig) // <3>
2942+
.setSourceFiltering(new FetchSourceContext(true,
2943+
new String[] { "included_field_1", "included_field_2" },
2944+
new String[] { "excluded_field" })) // <4>
29422945
.build();
29432946
// end::put-data-frame-analytics-source-config
29442947

client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSourceTests.java

+9
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
2424
import org.elasticsearch.common.xcontent.XContentParser;
2525
import org.elasticsearch.search.SearchModule;
26+
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
2627
import org.elasticsearch.test.AbstractXContentTestCase;
2728

2829
import java.io.IOException;
@@ -35,9 +36,17 @@
3536
public class DataFrameAnalyticsSourceTests extends AbstractXContentTestCase<DataFrameAnalyticsSource> {
3637

3738
public static DataFrameAnalyticsSource randomSourceConfig() {
39+
FetchSourceContext sourceFiltering = null;
40+
if (randomBoolean()) {
41+
sourceFiltering = new FetchSourceContext(true,
42+
generateRandomStringArray(10, 10, false, false),
43+
generateRandomStringArray(10, 10, false, false));
44+
}
45+
3846
return DataFrameAnalyticsSource.builder()
3947
.setIndex(generateRandomStringArray(10, 10, false, false))
4048
.setQueryConfig(randomBoolean() ? null : randomQueryConfig())
49+
.setSourceFiltering(sourceFiltering)
4150
.build();
4251
}
4352

docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ include-tagged::{doc-tests-file}[{api}-source-config]
5252
<1> Constructing a new DataFrameAnalyticsSource
5353
<2> The source index
5454
<3> The query from which to gather the data. If query is not set, a `match_all` query is used by default.
55+
<4> Source filtering to select which fields will exist in the destination index.
5556

5657
===== QueryConfig
5758

docs/reference/ml/df-analytics/apis/dfanalyticsresources.asciidoc

+23-9
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,18 @@
1616
<<dfanalytics-types>>.
1717

1818
`analyzed_fields`::
19-
(object) You can specify both `includes` and/or `excludes` patterns. If
20-
`analyzed_fields` is not set, only the relevant fields will be included. For
21-
example, all the numeric fields for {oldetection}. For the supported field
22-
types, see <<ml-put-dfanalytics-supported-fields>>.
19+
(Optional, object) Specify `includes` and/or `excludes` patterns to select
20+
which fields will be included in the analysis. If `analyzed_fields` is not set,
21+
only the relevant fields will be included. For example, all the numeric fields
22+
for {oldetection}. For the supported field types, see <<ml-put-dfanalytics-supported-fields>>.
23+
Also see the <<explain-dfanalytics>> which helps understand field selection.
2324

2425
`includes`:::
25-
(array) An array of strings that defines the fields that will be included in
26+
(Optional, array) An array of strings that defines the fields that will be included in
2627
the analysis.
2728

2829
`excludes`:::
29-
(array) An array of strings that defines the fields that will be excluded
30+
(Optional, array) An array of strings that defines the fields that will be excluded
3031
from the analysis.
3132

3233

@@ -81,8 +82,8 @@ PUT _ml/data_frame/analytics/loganalytics
8182
that setting. For more information, see <<ml-settings>>.
8283

8384
`source`::
84-
(object) The source configuration consisting an `index` and optionally a
85-
`query` object.
85+
(object) The configuration of how to source the analysis data. It requires an `index`.
86+
Optionally, `query` and `_source` may be specified.
8687

8788
`index`:::
8889
(Required, string or array) Index or indices on which to perform the
@@ -96,6 +97,19 @@ PUT _ml/data_frame/analytics/loganalytics
9697
as this object is passed verbatim to {es}. By default, this property has
9798
the following value: `{"match_all": {}}`.
9899

100+
`_source`:::
101+
(Optional, object) Specify `includes` and/or `excludes` patterns to select
102+
which fields will be present in the destination. Fields that are excluded
103+
cannot be included in the analysis.
104+
105+
`includes`::::
106+
(array) An array of strings that defines the fields that will be included in
107+
the destination.
108+
109+
`excludes`::::
110+
(array) An array of strings that defines the fields that will be excluded
111+
from the destination.
112+
99113
[[dfanalytics-types]]
100114
==== Analysis objects
101115

@@ -277,4 +291,4 @@ improvement. If you override any parameters, then the optimization will
277291
calculate the value of the remaining parameters accordingly and use the value
278292
you provided for the overridden parameter. The number of rounds are reduced
279293
respectively. The validation error is estimated in each round by using 4-fold
280-
cross validation.
294+
cross validation.

docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc

+33-20
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,13 @@ single number. For example, in case of age ranges, you can model the values as
101101
<<dfanalytics-types>>.
102102

103103
`analyzed_fields`::
104-
(Optional, object) You can specify both `includes` and/or `excludes` patterns.
105-
If `analyzed_fields` is not set, only the relevant fields will be included.
106-
For example, all the numeric fields for {oldetection}. For the supported field
107-
types, see <<ml-put-dfanalytics-supported-fields>>. If you specify fields –
108-
either in `includes` or in `excludes` – that have a data type that is not
109-
supported, an error occurs.
110-
104+
(Optional, object) Specify `includes` and/or `excludes` patterns to select
105+
which fields will be included in the analysis. If `analyzed_fields` is not set,
106+
only the relevant fields will be included. For example, all the numeric fields
107+
for {oldetection}. For the supported field types, see <<ml-put-dfanalytics-supported-fields>>.
108+
Also see the <<explain-dfanalytics>> which helps understand
109+
field selection.
110+
111111
`includes`:::
112112
(Optional, array) An array of strings that defines the fields that will be
113113
included in the analysis.
@@ -142,20 +142,33 @@ single number. For example, in case of age ranges, you can model the values as
142142
that setting. For more information, see <<ml-settings>>.
143143

144144
`source`::
145-
(Required, object) The source configuration, consisting of `index` and
146-
optionally a `query`.
145+
(object) The configuration of how to source the analysis data. It requires an `index`.
146+
Optionally, `query` and `_source` may be specified.
147147

148-
`index`:::
149-
(Required, string or array) Index or indices on which to perform the
150-
analysis. It can be a single index or index pattern as well as an array of
151-
indices or patterns.
152-
153-
`query`:::
154-
(Optional, object) The {es} query domain-specific language
155-
(<<query-dsl,DSL>>). This value corresponds to the query object in an {es}
156-
search POST body. All the options that are supported by {es} can be used,
157-
as this object is passed verbatim to {es}. By default, this property has
158-
the following value: `{"match_all": {}}`.
148+
`index`:::
149+
(Required, string or array) Index or indices on which to perform the
150+
analysis. It can be a single index or index pattern as well as an array of
151+
indices or patterns.
152+
153+
`query`:::
154+
(Optional, object) The {es} query domain-specific language
155+
(<<query-dsl,DSL>>). This value corresponds to the query object in an {es}
156+
search POST body. All the options that are supported by {es} can be used,
157+
as this object is passed verbatim to {es}. By default, this property has
158+
the following value: `{"match_all": {}}`.
159+
160+
`_source`:::
161+
(Optional, object) Specify `includes` and/or `excludes` patterns to select
162+
which fields will be present in the destination. Fields that are excluded
163+
cannot be included in the analysis.
164+
165+
`includes`::::
166+
(array) An array of strings that defines the fields that will be included in
167+
the destination.
168+
169+
`excludes`::::
170+
(array) An array of strings that defines the fields that will be excluded
171+
from the destination.
159172

160173
`allow_lazy_start`::
161174
(Optional, boolean) Whether this job should be allowed to start when there

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/PutDataFrameAnalyticsAction.java

+20
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import org.elasticsearch.action.ActionRequestValidationException;
99
import org.elasticsearch.action.ActionResponse;
1010
import org.elasticsearch.action.ActionType;
11+
import org.elasticsearch.action.ValidateActions;
1112
import org.elasticsearch.action.support.master.AcknowledgedRequest;
1213
import org.elasticsearch.action.support.master.MasterNodeOperationRequestBuilder;
1314
import org.elasticsearch.client.ElasticsearchClient;
@@ -18,6 +19,7 @@
1819
import org.elasticsearch.common.xcontent.XContentBuilder;
1920
import org.elasticsearch.common.xcontent.XContentParser;
2021
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
22+
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsSource;
2123
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
2224

2325
import java.io.IOException;
@@ -87,6 +89,24 @@ public DataFrameAnalyticsConfig getConfig() {
8789

8890
@Override
8991
public ActionRequestValidationException validate() {
92+
ActionRequestValidationException error = null;
93+
error = checkNoIncludedAnalyzedFieldsAreExcludedBySourceFiltering(config, error);
94+
return error;
95+
}
96+
97+
private ActionRequestValidationException checkNoIncludedAnalyzedFieldsAreExcludedBySourceFiltering(
98+
DataFrameAnalyticsConfig config, ActionRequestValidationException error) {
99+
if (config.getAnalyzedFields() == null) {
100+
return null;
101+
}
102+
for (String analyzedInclude : config.getAnalyzedFields().includes()) {
103+
if (config.getSource().isFieldExcluded(analyzedInclude)) {
104+
return ValidateActions.addValidationError("field [" + analyzedInclude + "] is included in ["
105+
+ DataFrameAnalyticsConfig.ANALYZED_FIELDS.getPreferredName() + "] but not in ["
106+
+ DataFrameAnalyticsConfig.SOURCE.getPreferredName() + "."
107+
+ DataFrameAnalyticsSource._SOURCE.getPreferredName() + "]", error);
108+
}
109+
}
90110
return null;
91111
}
92112

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/DataFrameAnalyticsConfig.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ private static DataFrameAnalysis parseAnalysis(XContentParser parser, boolean ig
127127
private final Version version;
128128
private final boolean allowLazyStart;
129129

130-
public DataFrameAnalyticsConfig(String id, String description, DataFrameAnalyticsSource source, DataFrameAnalyticsDest dest,
130+
private DataFrameAnalyticsConfig(String id, String description, DataFrameAnalyticsSource source, DataFrameAnalyticsDest dest,
131131
DataFrameAnalysis analysis, Map<String, String> headers, ByteSizeValue modelMemoryLimit,
132132
FetchSourceContext analyzedFields, Instant createTime, Version version, boolean allowLazyStart) {
133133
this.id = ExceptionsHelper.requireNonNull(id, ID);

0 commit comments

Comments
 (0)