Skip to content

[ML] Additional outlier detection parameters #47600

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

package org.elasticsearch.client.ml.dataframe;

import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.ObjectParser;
Expand Down Expand Up @@ -48,6 +47,9 @@ public static Builder builder() {
static final ParseField N_NEIGHBORS = new ParseField("n_neighbors");
static final ParseField METHOD = new ParseField("method");
public static final ParseField FEATURE_INFLUENCE_THRESHOLD = new ParseField("feature_influence_threshold");
static final ParseField COMPUTE_FEATURE_INFLUENCE = new ParseField("compute_feature_influence");
static final ParseField OUTLIER_FRACTION = new ParseField("outlier_fraction");
static final ParseField STANDARDIZATION_ENABLED = new ParseField("standardization_enabled");

private static ObjectParser<Builder, Void> PARSER = new ObjectParser<>(NAME.getPreferredName(), true, Builder::new);

Expand All @@ -60,22 +62,49 @@ public static Builder builder() {
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
}, METHOD, ObjectParser.ValueType.STRING);
PARSER.declareDouble(Builder::setFeatureInfluenceThreshold, FEATURE_INFLUENCE_THRESHOLD);
PARSER.declareBoolean(Builder::setComputeFeatureInfluence, COMPUTE_FEATURE_INFLUENCE);
PARSER.declareDouble(Builder::setOutlierFraction, OUTLIER_FRACTION);
PARSER.declareBoolean(Builder::setStandardizationEnabled, STANDARDIZATION_ENABLED);
}

/**
* The number of neighbors. Leave unspecified for dynamic detection.
*/
private final Integer nNeighbors;

/**
* The method. Leave unspecified for a dynamic mixture of methods.
*/
private final Method method;

/**
* The min outlier score required to calculate feature influence. Defaults to 0.1.
*/
private final Double featureInfluenceThreshold;

/**
* Constructs the outlier detection configuration
* @param nNeighbors The number of neighbors. Leave unspecified for dynamic detection.
* @param method The method. Leave unspecified for a dynamic mixture of methods.
* @param featureInfluenceThreshold The min outlier score required to calculate feature influence. Defaults to 0.1.
* Whether to compute feature influence or not. Defaults to true.
*/
private OutlierDetection(@Nullable Integer nNeighbors, @Nullable Method method, @Nullable Double featureInfluenceThreshold) {
private final Boolean computeFeatureInfluence;

/**
* The proportion of data assumed to be outlying prior to outlier detection. Defaults to 0.05.
*/
private final Double outlierFraction;

/**
* Whether to standardize feature values before computing outlier scores. Defaults to true.
*/
private final Boolean standardizationEnabled;

/**
 * Constructs the outlier detection configuration. Instances are created through the {@link Builder};
 * any argument may be {@code null}, meaning the parameter was left unspecified.
 *
 * @param nNeighbors The number of neighbors. Leave unspecified for dynamic detection.
 * @param method The method. Leave unspecified for a dynamic mixture of methods.
 * @param featureInfluenceThreshold The min outlier score required to calculate feature influence.
 * @param computeFeatureInfluence Whether to compute feature influence or not.
 * @param outlierFraction The proportion of data assumed to be outlying prior to outlier detection.
 * @param standardizationEnabled Whether to perform standardization.
 */
private OutlierDetection(
        Integer nNeighbors,
        Method method,
        Double featureInfluenceThreshold,
        Boolean computeFeatureInfluence,
        Double outlierFraction,
        Boolean standardizationEnabled) {
    // Neighborhood / algorithm selection.
    this.method = method;
    this.nNeighbors = nNeighbors;

    // Feature influence settings.
    this.computeFeatureInfluence = computeFeatureInfluence;
    this.featureInfluenceThreshold = featureInfluenceThreshold;

    // Prior assumptions and preprocessing.
    this.outlierFraction = outlierFraction;
    this.standardizationEnabled = standardizationEnabled;
}

@Override
Expand All @@ -95,6 +124,18 @@ public Double getFeatureInfluenceThreshold() {
return featureInfluenceThreshold;
}

/**
 * Returns whether feature influence should be computed.
 *
 * @return the configured flag, or {@code null} when it was left unspecified
 */
public Boolean getComputeFeatureInfluence() {
    return this.computeFeatureInfluence;
}

/**
 * Returns the proportion of data assumed to be outlying prior to outlier detection.
 *
 * @return the configured fraction, or {@code null} when it was left unspecified
 */
public Double getOutlierFraction() {
    return this.outlierFraction;
}

/**
 * Returns whether standardization is performed.
 *
 * @return the configured flag, or {@code null} when it was left unspecified
 */
public Boolean getStandardizationEnabled() {
    return this.standardizationEnabled;
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
Expand All @@ -107,6 +148,15 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
if (featureInfluenceThreshold != null) {
builder.field(FEATURE_INFLUENCE_THRESHOLD.getPreferredName(), featureInfluenceThreshold);
}
if (computeFeatureInfluence != null) {
builder.field(COMPUTE_FEATURE_INFLUENCE.getPreferredName(), computeFeatureInfluence);
}
if (outlierFraction != null) {
builder.field(OUTLIER_FRACTION.getPreferredName(), outlierFraction);
}
if (standardizationEnabled != null) {
builder.field(STANDARDIZATION_ENABLED.getPreferredName(), standardizationEnabled);
}
builder.endObject();
return builder;
}
Expand All @@ -119,12 +169,16 @@ public boolean equals(Object o) {
OutlierDetection other = (OutlierDetection) o;
return Objects.equals(nNeighbors, other.nNeighbors)
&& Objects.equals(method, other.method)
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold);
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold)
&& Objects.equals(computeFeatureInfluence, other.computeFeatureInfluence)
&& Objects.equals(outlierFraction, other.outlierFraction)
&& Objects.equals(standardizationEnabled, other.standardizationEnabled);
}

/**
 * Computes the hash code from all six configuration parameters so that it stays
 * consistent with {@code equals}, which compares the same six fields.
 */
@Override
public int hashCode() {
    // The earlier three-field return was removed: it preceded this statement, making it unreachable,
    // and ignored computeFeatureInfluence, outlierFraction and standardizationEnabled.
    return Objects.hash(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
        standardizationEnabled);
}

@Override
Expand All @@ -150,6 +204,9 @@ public static class Builder {
private Integer nNeighbors;
private Method method;
private Double featureInfluenceThreshold;
private Boolean computeFeatureInfluence;
private Double outlierFraction;
private Boolean standardizationEnabled;

private Builder() {}

Expand All @@ -168,8 +225,24 @@ public Builder setFeatureInfluenceThreshold(Double featureInfluenceThreshold) {
return this;
}

/**
* Sets whether feature influence should be computed.
*
* @param computeFeatureInfluence the flag to use; {@code null} leaves the parameter unspecified
* @return this builder, to allow call chaining
*/
public Builder setComputeFeatureInfluence(Boolean computeFeatureInfluence) {
this.computeFeatureInfluence = computeFeatureInfluence;
return this;
}

/**
* Sets the proportion of data assumed to be outlying prior to outlier detection.
*
* @param outlierFraction the fraction to use; {@code null} leaves the parameter unspecified
* @return this builder, to allow call chaining
*/
public Builder setOutlierFraction(Double outlierFraction) {
this.outlierFraction = outlierFraction;
return this;
}

/**
* Sets whether standardization should be performed.
*
* @param standardizationEnabled the flag to use; {@code null} leaves the parameter unspecified
* @return this builder, to allow call chaining
*/
public Builder setStandardizationEnabled(Boolean standardizationEnabled) {
this.standardizationEnabled = standardizationEnabled;
return this;
}

/**
 * Builds the {@link OutlierDetection} configuration from the values set on this builder.
 * Parameters that were never set are passed through as {@code null}.
 *
 * @return a new {@link OutlierDetection} instance
 */
public OutlierDetection build() {
    // The earlier three-argument constructor call was removed: it matches no constructor of
    // OutlierDetection and made the six-argument call below unreachable.
    return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
        standardizationEnabled);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1246,7 +1246,10 @@ public void testPutDataFrameAnalyticsConfig_GivenOutlierDetectionAnalysis() thro
assertThat(createdConfig.getSource().getQueryConfig(), equalTo(new QueryConfig(new MatchAllQueryBuilder()))); // default value
assertThat(createdConfig.getDest().getIndex(), equalTo(config.getDest().getIndex()));
assertThat(createdConfig.getDest().getResultsField(), equalTo("ml")); // default value
assertThat(createdConfig.getAnalysis(), equalTo(config.getAnalysis()));
assertThat(createdConfig.getAnalysis(), equalTo(OutlierDetection.builder()
.setComputeFeatureInfluence(true)
.setOutlierFraction(0.05)
.setStandardizationEnabled(true).build()));
assertThat(createdConfig.getAnalyzedFields(), equalTo(config.getAnalyzedFields()));
assertThat(createdConfig.getModelMemoryLimit(), equalTo(ByteSizeValue.parseBytesSizeValue("1gb", ""))); // default value
assertThat(createdConfig.getDescription(), equalTo("some description"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2932,6 +2932,10 @@ public void testPutDataFrameAnalytics() throws Exception {
DataFrameAnalysis outlierDetectionCustomized = OutlierDetection.builder() // <1>
.setMethod(OutlierDetection.Method.DISTANCE_KNN) // <2>
.setNNeighbors(5) // <3>
.setFeatureInfluenceThreshold(0.1) // <4>
.setComputeFeatureInfluence(true) // <5>
.setOutlierFraction(0.05) // <6>
.setStandardizationEnabled(true) // <7>
.build();
// end::put-data-frame-analytics-outlier-detection-customized

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;

public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetection> {

Expand All @@ -34,6 +35,9 @@ public static OutlierDetection randomOutlierDetection() {
.setNNeighbors(randomBoolean() ? null : randomIntBetween(1, 20))
.setMethod(randomBoolean() ? null : randomFrom(OutlierDetection.Method.values()))
.setFeatureInfluenceThreshold(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
.setComputeFeatureInfluence(randomBoolean() ? null : randomBoolean())
.setOutlierFraction(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
.setStandardizationEnabled(randomBoolean() ? null : randomBoolean())
.build();
}

Expand All @@ -57,6 +61,9 @@ public void testGetParams_GivenDefaults() {
assertNull(outlierDetection.getNNeighbors());
assertNull(outlierDetection.getMethod());
assertNull(outlierDetection.getFeatureInfluenceThreshold());
assertNull(outlierDetection.getComputeFeatureInfluence());
assertNull(outlierDetection.getOutlierFraction());
assertNull(outlierDetection.getStandardizationEnabled());
}

public void testGetParams_GivenExplicitValues() {
Expand All @@ -65,9 +72,15 @@ public void testGetParams_GivenExplicitValues() {
.setNNeighbors(42)
.setMethod(OutlierDetection.Method.LDOF)
.setFeatureInfluenceThreshold(0.5)
.setComputeFeatureInfluence(true)
.setOutlierFraction(0.42)
.setStandardizationEnabled(false)
.build();
assertThat(outlierDetection.getNNeighbors(), equalTo(42));
assertThat(outlierDetection.getMethod(), equalTo(OutlierDetection.Method.LDOF));
assertThat(outlierDetection.getFeatureInfluenceThreshold(), closeTo(0.5, 1E-9));
assertThat(outlierDetection.getComputeFeatureInfluence(), is(true));
assertThat(outlierDetection.getOutlierFraction(), closeTo(0.42, 1E-9));
assertThat(outlierDetection.getStandardizationEnabled(), is(false));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ include-tagged::{doc-tests-file}[{api}-outlier-detection-customized]
<1> Constructing a new OutlierDetection object
<2> The method used to perform the analysis
<3> Number of neighbors taken into account during analysis
<4> The minimum `outlier_score` required to compute feature influence
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: could the functionality of the compute_feature_influence setting be achieved by setting min_outlier_score to a very high number?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, one can set feature_influence_threshold to 1 to achieve the same as setting compute_feature_influence to false.

<5> Whether to compute feature influence
<6> The proportion of the data set that is assumed to be outlying prior to outlier detection
<7> Whether to apply standardization to feature values

===== Regression

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ An `outlier_detection` configuration object has the following properties:
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real outliers
and 95% are inliers.

`standardize_columns`::
`standardization_enabled`::
(boolean) If `true`, then the following operation is performed on the columns
before computing outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to
`true`. For more information, see
Expand Down
9 changes: 8 additions & 1 deletion docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ PUT _ml/data_frame/analytics/loganalytics
},
"analysis": {
"outlier_detection": {
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}
}
Expand All @@ -165,7 +168,11 @@ The API returns the following result:
"results_field": "ml"
},
"analysis": {
"outlier_detection": {}
"outlier_detection": {
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
},
"model_memory_limit": "1gb",
"create_time" : 1562265491319,
Expand Down
Loading