Skip to content

Commit dd66fae

Browse files
authored
[ML] Use query in cardinality check (#49939)
When checking the cardinality of a field, the query should be take into account. The user might know about some bad data in their index and want to filter down to the target_field values they care about.
1 parent 4415f1a commit dd66fae

File tree

3 files changed

+35
-4
lines changed

3 files changed

+35
-4
lines changed

x-pack/plugin/ml/qa/native-multi-node-tests/src/test/java/org/elasticsearch/xpack/ml/integration/ClassificationIT.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import org.elasticsearch.action.index.IndexRequest;
1616
import org.elasticsearch.action.search.SearchResponse;
1717
import org.elasticsearch.action.support.WriteRequest;
18+
import org.elasticsearch.index.query.QueryBuilder;
19+
import org.elasticsearch.index.query.QueryBuilders;
1820
import org.elasticsearch.search.SearchHit;
1921
import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction;
2022
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
@@ -228,7 +230,7 @@ public void testWithOnlyTrainingRowsAndTrainingPercentIsFifty_DependentVariableI
228230
assertEvaluation(BOOLEAN_FIELD, BOOLEAN_FIELD_VALUES, "ml.boolean-field_prediction");
229231
}
230232

231-
public void testDependentVariableCardinalityTooHighError() {
233+
public void testDependentVariableCardinalityTooHighError() throws Exception {
232234
initialize("cardinality_too_high");
233235
indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
234236
// Index one more document with a class different than the two already used.
@@ -246,6 +248,27 @@ public void testDependentVariableCardinalityTooHighError() {
246248
assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [2] distinct values but there were at least [3]"));
247249
}
248250

251+
public void testDependentVariableCardinalityTooHighButWithQueryMakesItWithinRange() throws Exception {
252+
initialize("cardinality_too_high_with_query");
253+
indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
254+
// Index one more document with a class different than the two already used.
255+
client().execute(IndexAction.INSTANCE, new IndexRequest(sourceIndex)
256+
.source(KEYWORD_FIELD, "fox")
257+
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE))
258+
.actionGet();
259+
QueryBuilder query = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(KEYWORD_FIELD, KEYWORD_FIELD_VALUES));
260+
261+
DataFrameAnalyticsConfig config = buildAnalytics(jobId, sourceIndex, destIndex, null, new Classification(KEYWORD_FIELD), query);
262+
registerAnalytics(config);
263+
putAnalytics(config);
264+
265+
// Should not throw
266+
startAnalytics(jobId);
267+
waitUntilAnalyticsIsStopped(jobId);
268+
269+
assertProgress(jobId, 100, 100, 100, 100);
270+
}
271+
249272
private void initialize(String jobId) {
250273
this.jobId = jobId;
251274
this.sourceIndex = jobId + "_source_index";

x-pack/plugin/ml/qa/native-multi-node-tests/src/test/java/org/elasticsearch/xpack/ml/integration/MlNativeDataFrameAnalyticsIntegTestCase.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.common.Nullable;
1717
import org.elasticsearch.common.Strings;
1818
import org.elasticsearch.common.unit.TimeValue;
19+
import org.elasticsearch.index.query.QueryBuilder;
1920
import org.elasticsearch.index.query.QueryBuilders;
2021
import org.elasticsearch.search.sort.SortOrder;
2122
import org.elasticsearch.xpack.core.ml.action.DeleteDataFrameAnalyticsAction;
@@ -37,6 +38,7 @@
3738
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
3839
import org.elasticsearch.xpack.core.ml.notifications.AuditorField;
3940
import org.elasticsearch.xpack.core.ml.utils.PhaseProgress;
41+
import org.elasticsearch.xpack.core.ml.utils.QueryProvider;
4042
import org.elasticsearch.xpack.ml.dataframe.DataFrameAnalyticsTask;
4143
import org.hamcrest.Matcher;
4244
import org.hamcrest.Matchers;
@@ -161,10 +163,16 @@ protected EvaluateDataFrameAction.Response evaluateDataFrame(String index, Evalu
161163
}
162164

163165
protected static DataFrameAnalyticsConfig buildAnalytics(String id, String sourceIndex, String destIndex,
164-
@Nullable String resultsField, DataFrameAnalysis analysis) {
166+
@Nullable String resultsField, DataFrameAnalysis analysis) throws Exception {
167+
return buildAnalytics(id, sourceIndex, destIndex, resultsField, analysis, QueryBuilders.matchAllQuery());
168+
}
169+
170+
protected static DataFrameAnalyticsConfig buildAnalytics(String id, String sourceIndex, String destIndex,
171+
@Nullable String resultsField, DataFrameAnalysis analysis,
172+
QueryBuilder queryBuilder) throws Exception {
165173
return new DataFrameAnalyticsConfig.Builder()
166174
.setId(id)
167-
.setSource(new DataFrameAnalyticsSource(new String[] { sourceIndex }, null, null))
175+
.setSource(new DataFrameAnalyticsSource(new String[] { sourceIndex }, QueryProvider.fromParsedQuery(queryBuilder), null))
168176
.setDest(new DataFrameAnalyticsDest(destIndex, resultsField))
169177
.setAnalysis(analysis)
170178
.build();

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ private void getCardinalitiesForFieldsWithLimit(String[] index, DataFrameAnalyti
109109
listener::onFailure
110110
);
111111

112-
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().size(0);
112+
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().size(0).query(config.getSource().getParsedQuery());
113113
for (Map.Entry<String, Long> entry : fieldCardinalityLimits.entrySet()) {
114114
String fieldName = entry.getKey();
115115
Long limit = entry.getValue();

0 commit comments

Comments
 (0)