From dbc57327a87774f189f45f66c687afcc3b1403d9 Mon Sep 17 00:00:00 2001 From: Dimitris Athanasiou Date: Fri, 2 Apr 2021 13:42:56 +0300 Subject: [PATCH] [ML] Exclude nested fields in data frame analytics Previously, the destination index was sorted which meant it could not have `nested` fields. Since this has changed, `nested` fields may be present. These were handled incorrectly as the _explain API would report that they can be included in the analysis while that is not the case. This commit fixes this issue by detecting `nested` fields and children of those `nested` fields and excluding them from the analysis. A `nested` field may contain multiple inner fields. To avoid the noise in the API response, we collapse them into a single entry with the path to the top level nested field. --- .../extractor/ExtractedFieldsDetector.java | 72 ++++++++++-- .../xpack/ml/extractor/ExtractedFields.java | 9 +- .../ExtractedFieldsDetectorTests.java | 111 +++++++++++++++--- 3 files changed, 164 insertions(+), 28 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java index ef1dcc9b86599..2762737c78d1d 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java @@ -47,6 +47,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; import java.util.Set; import java.util.TreeSet; import java.util.stream.Collectors; @@ -65,6 +66,7 @@ public class ExtractedFieldsDetector { private final int docValueFieldsLimit; private final FieldCapabilitiesResponse fieldCapabilitiesResponse; private final Map cardinalitiesForFieldsWithConstraints; + private final List topNestedFieldPrefixes; ExtractedFieldsDetector(DataFrameAnalyticsConfig config, int docValueFieldsLimit, @@ -74,6 +76,26 @@ public class ExtractedFieldsDetector { this.docValueFieldsLimit = docValueFieldsLimit; this.fieldCapabilitiesResponse = Objects.requireNonNull(fieldCapabilitiesResponse); this.cardinalitiesForFieldsWithConstraints = Objects.requireNonNull(cardinalitiesForFieldsWithConstraints); + this.topNestedFieldPrefixes = findTopNestedFieldPrefixes(fieldCapabilitiesResponse); + } + + private List findTopNestedFieldPrefixes(FieldCapabilitiesResponse fieldCapabilitiesResponse) { + List sortedNestedFieldPrefixes = fieldCapabilitiesResponse.get().keySet().stream() + .filter(field -> isNested(getMappingTypes(field))) + .map(field -> field + ".") + .sorted() + .collect(Collectors.toList()); + Iterator iterator = sortedNestedFieldPrefixes.iterator(); + String previousNestedFieldPrefix = null; + while (iterator.hasNext()) { + String nestedFieldPrefix = iterator.next(); + if (previousNestedFieldPrefix != null && nestedFieldPrefix.startsWith(previousNestedFieldPrefix)) { + iterator.remove(); + } else { + previousNestedFieldPrefix = nestedFieldPrefix; + } + } + return Collections.unmodifiableList(sortedNestedFieldPrefixes); } public Tuple> detect() { @@ -139,7 +161,14 @@ private void validateFieldsRequireForProcessors(Set processorFields) { } removeObjects(fieldsForProcessor); if (fieldsForProcessor.size() < processorFields.size()) { - throw ExceptionsHelper.badRequestException("fields for feature_processors must not be objects"); + throw ExceptionsHelper.badRequestException("fields for feature_processors must not be objects or nested"); + } + for (String field : fieldsForProcessor) { + Optional matchingNestedFieldPattern = findMatchingNestedFieldPattern(field); + if (matchingNestedFieldPattern.isPresent()) { + throw ExceptionsHelper.badRequestException("nested fields [{}] cannot be used in a feature_processor", + matchingNestedFieldPattern.get()); + } } Collection errorFields = new ArrayList<>(); for (String fieldName : fieldsForProcessor) { @@ -190,7 +219,7 @@ private void removeObjects(Set fields) { while (fieldsIterator.hasNext()) { String field = fieldsIterator.next(); Set types = getMappingTypes(field); - if (isObject(types)) { + if (isObject(types) || isNested(types)) { fieldsIterator.remove(); } } @@ -210,6 +239,11 @@ private void addExcludedField(String field, String reason, Set f fieldSelection.add(FieldSelection.excluded(field, getMappingTypes(field), reason)); } + private void addExcludedNestedPattern(String pattern, Set fieldSelection) { + fieldSelection.add(FieldSelection.excluded( + pattern, Collections.singleton(ObjectMapper.NESTED_CONTENT_TYPE), "nested fields are not supported")); + } + private Set getMappingTypes(String field) { Map fieldCaps = fieldCapabilitiesResponse.getField(field); return fieldCaps == null ? Collections.emptySet() : fieldCaps.keySet(); @@ -223,6 +257,11 @@ private void removeFieldsWithIncompatibleTypes(Set fields, Set matchingNestedFieldPattern = findMatchingNestedFieldPattern(field); + if (matchingNestedFieldPattern.isPresent()) { + addExcludedNestedPattern(matchingNestedFieldPattern.get(), fieldSelection); + fieldsIterator.remove(); + } } } @@ -257,6 +296,10 @@ private Set getSupportedTypes() { return supportedTypes; } + private Optional findMatchingNestedFieldPattern(String field) { + return topNestedFieldPrefixes.stream().filter(prefix -> field.startsWith(prefix)).map(prefix -> prefix + "*").findFirst(); + } + private void includeAndExcludeFields(Set fields, Set fieldSelection) { FetchSourceContext analyzedFields = config.getAnalyzedFields(); if (analyzedFields == null) { @@ -294,10 +337,10 @@ private void includeAndExcludeFields(Set fields, Set fie private void checkIncludesExcludesAreNotObjects(FetchSourceContext analyzedFields) { List objectFields = Stream.concat(Arrays.stream(analyzedFields.includes()), Arrays.stream(analyzedFields.excludes())) - .filter(field -> isObject(getMappingTypes(field))) + .filter(field -> isObject(getMappingTypes(field)) || isNested(getMappingTypes(field))) .collect(Collectors.toList()); if (objectFields.isEmpty() == false) { - throw ExceptionsHelper.badRequestException("{} must not include or exclude object fields: {}", + throw ExceptionsHelper.badRequestException("{} must not include or exclude object or nested fields: {}", DataFrameAnalyticsConfig.ANALYZED_FIELDS.getPreferredName(), objectFields); } } @@ -317,10 +360,15 @@ private void applyIncludesExcludes(Set fields, Set includes, Set } } else { fieldsIterator.remove(); - if (hasCompatibleType(field)) { - addExcludedField(field, "field not in includes list", fieldSelection); - } else { + if (hasCompatibleType(field) == false) { addExcludedField(field, "unsupported type; supported types are " + getSupportedTypes(), fieldSelection); + } else { + Optional matchingNestedFieldPattern = findMatchingNestedFieldPattern(field); + if (matchingNestedFieldPattern.isPresent()) { + addExcludedNestedPattern(matchingNestedFieldPattern.get(), fieldSelection); + } else { + addExcludedField(field, "field not in includes list", fieldSelection); + } } } } @@ -337,6 +385,10 @@ private void checkFieldsHaveCompatibleTypes(Set fields) { throw ExceptionsHelper.badRequestException("field [{}] has unsupported type {}. Supported types are {}.", field, fieldCaps.keySet(), getSupportedTypes()); } + Optional matchingNestedFieldPattern = findMatchingNestedFieldPattern(field); + if (matchingNestedFieldPattern.isPresent()) { + throw ExceptionsHelper.badRequestException("nested fields [{}] are not supported", matchingNestedFieldPattern.get()); + } } } @@ -601,7 +653,11 @@ private static boolean isBoolean(Set types) { return types.size() == 1 && types.contains(BooleanFieldMapper.CONTENT_TYPE); } - private boolean isObject(Set types) { + private static boolean isObject(Set types) { return types.size() == 1 && types.contains(ObjectMapper.CONTENT_TYPE); } + + private static boolean isNested(Set types) { + return types.size() == 1 && types.contains(ObjectMapper.NESTED_CONTENT_TYPE); + } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/extractor/ExtractedFields.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/extractor/ExtractedFields.java index 955245106f71a..d9365850ffdfa 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/extractor/ExtractedFields.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/extractor/ExtractedFields.java @@ -205,13 +205,16 @@ private boolean isMultiField(String field, String parent) { return false; } Map parentFieldCaps = fieldsCapabilities.getField(parent); - if (parentFieldCaps == null || (parentFieldCaps.size() == 1 && parentFieldCaps.containsKey("object"))) { - // We check if the parent is an object which is indicated by field caps containing an "object" entry. - // If an object, it's not a multi field + if (parentFieldCaps == null || (parentFieldCaps.size() == 1 && isNestedOrObject(parentFieldCaps))) { + // We check if the parent is an object or nested field. If so, it's not a multi field. return false; } return true; } + + private static boolean isNestedOrObject(Map fieldCaps) { + return fieldCaps.containsKey("object") || fieldCaps.containsKey("nested"); + } } /** diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java index 2b89ee84b8857..aef82e0b5ac3c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java @@ -922,6 +922,33 @@ public void testDetect_GivenObjectFields() { assertThat(allFields.get(0).getName(), equalTo("float_field")); } + public void testDetect_GivenNestedFields() { + FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() + .addAggregatableField("float_field", "float") + .addNonAggregatableField("nested_field_1", "nested") + .addAggregatableField("nested_field_1.a", "float") + .addAggregatableField("nested_field_1.b", "float") + .addNonAggregatableField("nested_field_1.inner_nested", "nested") + .addAggregatableField("nested_field_1.inner_nested.z", "float") + .addNonAggregatableField("nested_field_2", "nested") + .addAggregatableField("nested_field_2.c", "float") + .build(); + + ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( + buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); + Tuple> fieldExtraction = extractedFieldsDetector.detect(); + + List allFields = fieldExtraction.v1().getAllFields(); + assertThat(allFields, hasSize(1)); + assertThat(allFields.get(0).getName(), equalTo("float_field")); + + assertFieldSelectionContains(fieldExtraction.v2(), + FieldSelection.included("float_field", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL), + FieldSelection.excluded("nested_field_1.*", Collections.singleton("nested"), "nested fields are not supported"), + FieldSelection.excluded("nested_field_2.*", Collections.singleton("nested"), "nested fields are not supported") + ); + } + public void testDetect_GivenAnalyzedFieldIncludesObjectField() { FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() .addAggregatableField("float_field", "float") @@ -933,7 +960,21 @@ public void testDetect_GivenAnalyzedFieldIncludesObjectField() { buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object fields: [object_field]")); + assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object or nested fields: [object_field]")); + } + + public void testDetect_GivenAnalyzedFieldIncludesNestedField() { + FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() + .addAggregatableField("float_field", "float") + .addNonAggregatableField("nested_field", "nested").build(); + + analyzedFields = new FetchSourceContext(true, new String[] { "float_field", "nested_field" }, null); + + ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( + buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); + ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + + assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object or nested fields: [nested_field]")); } private static FieldCapabilitiesResponse simpleFieldResponse() { @@ -959,7 +1000,21 @@ public void testDetect_GivenAnalyzedFieldExcludesObjectField() { buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object fields: [object_field]")); + assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object or nested fields: [object_field]")); + } + + public void testDetect_GivenAnalyzedFieldExcludesNestedField() { + FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() + .addAggregatableField("float_field", "float") + .addNonAggregatableField("nested_field", "nested").build(); + + analyzedFields = new FetchSourceContext(true, null, new String[]{"nested_field"}); + + ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( + buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); + ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + + assertThat(e.getMessage(), equalTo("analyzed_fields must not include or exclude object or nested fields: [nested_field]")); } public void testDetect_givenFeatureProcessorsFailures_ResultsField() { @@ -970,8 +1025,7 @@ public void testDetect_givenFeatureProcessorsFailures_ResultsField() { fieldCapabilities, Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("fields contained in results field [ml] cannot be used in a feature_processor")); + assertThat(ex.getMessage(), equalTo("fields contained in results field [ml] cannot be used in a feature_processor")); } public void testDetect_givenFeatureProcessorsFailures_Objects() { @@ -982,8 +1036,36 @@ public void testDetect_givenFeatureProcessorsFailures_Objects() { fieldCapabilities, Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("fields for feature_processors must not be objects")); + assertThat(ex.getMessage(), equalTo("fields for feature_processors must not be objects or nested")); + } + + public void testDetect_givenFeatureProcessorsFailures_Nested() { + FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() + .addAggregatableField("some_float", "float") + .addNonAggregatableField("nested_field", "nested") + .build(); + ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( + buildRegressionConfig("some_float", Arrays.asList(buildPreProcessor("nested_field", "foo"))), + 100, + fieldCapabilities, + Collections.emptyMap()); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + assertThat(ex.getMessage(), equalTo("fields for feature_processors must not be objects or nested")); + } + + public void testDetect_givenFeatureProcessorsFailures_ChildOfNested() { + FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder() + .addAggregatableField("some_float", "float") + .addNonAggregatableField("nested_field", "nested") + .addAggregatableField("nested_field.inner_float", "float") + .build(); + ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( + buildRegressionConfig("some_float", Arrays.asList(buildPreProcessor("nested_field.inner_float", "foo"))), + 100, + fieldCapabilities, + Collections.emptyMap()); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + assertThat(ex.getMessage(), equalTo("nested fields [nested_field.*] cannot be used in a feature_processor")); } public void testDetect_givenFeatureProcessorsFailures_ReservedFields() { @@ -1018,8 +1100,7 @@ public void testDetect_givenFeatureProcessorsFailures_UsingRequiredField() { fieldCapabilities, Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("required analysis fields [field_31] cannot be used in a feature_processor")); + assertThat(ex.getMessage(), equalTo("required analysis fields [field_31] cannot be used in a feature_processor")); } public void testDetect_givenFeatureProcessorsFailures_BadSourceFiltering() { @@ -1032,8 +1113,7 @@ public void testDetect_givenFeatureProcessorsFailures_BadSourceFiltering() { Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("fields [field_11] required by field_processors are not included in source filtering.")); + assertThat(ex.getMessage(), equalTo("fields [field_11] required by field_processors are not included in source filtering.")); } public void testDetect_givenFeatureProcessorsFailures_MissingAnalyzedField() { @@ -1046,8 +1126,7 @@ public void testDetect_givenFeatureProcessorsFailures_MissingAnalyzedField() { Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("fields [field_11] required by field_processors are not included in the analyzed_fields")); + assertThat(ex.getMessage(), equalTo("fields [field_11] required by field_processors are not included in the analyzed_fields.")); } public void testDetect_givenFeatureProcessorsFailures_RequiredMultiFields() { @@ -1103,8 +1182,7 @@ public void testDetect_givenFeatureProcessorsFailures_DuplicateOutputFields() { Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString("feature_processors must define unique output field names; duplicate fields [foo]")); + assertThat(ex.getMessage(), equalTo("feature_processors must define unique output field names; duplicate fields [foo]")); } public void testDetect_withFeatureProcessors() { @@ -1207,9 +1285,8 @@ public void testDetect_givenFeatureProcessorsFailures_DuplicateOutputFieldsWithU Collections.emptyMap()); ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(ex.getMessage(), - containsString( - "feature_processors output fields must not include non-processed analysis fields; duplicate fields [field_21]")); + assertThat(ex.getMessage(), + equalTo("feature_processors output fields must not include non-processed analysis fields; duplicate fields [field_21]")); } private static class MockFieldCapsResponseBuilder {