Skip to content

Commit e99435a

Browse files
[ML] Additional outlier detection parameters (#47600)
Adds the following parameters to `outlier_detection`:
- `compute_feature_influence` (boolean): whether to compute feature influence scores
- `outlier_fraction` (double): the proportion of the data set assumed to be outlying prior to running outlier detection
- `standardization_enabled` (boolean): whether to apply standardization to the feature values
1 parent 924b298 commit e99435a

File tree

16 files changed

+562
-68
lines changed

16 files changed

+562
-68
lines changed

client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/OutlierDetection.java

+82-9
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
package org.elasticsearch.client.ml.dataframe;
2121

22-
import org.elasticsearch.common.Nullable;
2322
import org.elasticsearch.common.ParseField;
2423
import org.elasticsearch.common.Strings;
2524
import org.elasticsearch.common.xcontent.ObjectParser;
@@ -48,6 +47,9 @@ public static Builder builder() {
4847
static final ParseField N_NEIGHBORS = new ParseField("n_neighbors");
4948
static final ParseField METHOD = new ParseField("method");
5049
public static final ParseField FEATURE_INFLUENCE_THRESHOLD = new ParseField("feature_influence_threshold");
50+
static final ParseField COMPUTE_FEATURE_INFLUENCE = new ParseField("compute_feature_influence");
51+
static final ParseField OUTLIER_FRACTION = new ParseField("outlier_fraction");
52+
static final ParseField STANDARDIZATION_ENABLED = new ParseField("standardization_enabled");
5153

5254
private static ObjectParser<Builder, Void> PARSER = new ObjectParser<>(NAME.getPreferredName(), true, Builder::new);
5355

@@ -60,22 +62,49 @@ public static Builder builder() {
6062
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
6163
}, METHOD, ObjectParser.ValueType.STRING);
6264
PARSER.declareDouble(Builder::setFeatureInfluenceThreshold, FEATURE_INFLUENCE_THRESHOLD);
65+
PARSER.declareBoolean(Builder::setComputeFeatureInfluence, COMPUTE_FEATURE_INFLUENCE);
66+
PARSER.declareDouble(Builder::setOutlierFraction, OUTLIER_FRACTION);
67+
PARSER.declareBoolean(Builder::setStandardizationEnabled, STANDARDIZATION_ENABLED);
6368
}
6469

70+
/**
71+
* The number of neighbors. Leave unspecified for dynamic detection.
72+
*/
6573
private final Integer nNeighbors;
74+
75+
/**
76+
* The method. Leave unspecified for a dynamic mixture of methods.
77+
*/
6678
private final Method method;
79+
80+
/**
81+
* The minimum outlier score required to calculate feature influence. Defaults to 0.1.
82+
*/
6783
private final Double featureInfluenceThreshold;
6884

6985
/**
70-
* Constructs the outlier detection configuration
71-
* @param nNeighbors The number of neighbors. Leave unspecified for dynamic detection.
72-
* @param method The method. Leave unspecified for a dynamic mixture of methods.
73-
* @param featureInfluenceThreshold The min outlier score required to calculate feature influence. Defaults to 0.1.
86+
* Whether to compute feature influence. Defaults to true.
7487
*/
75-
private OutlierDetection(@Nullable Integer nNeighbors, @Nullable Method method, @Nullable Double featureInfluenceThreshold) {
88+
private final Boolean computeFeatureInfluence;
89+
90+
/**
91+
* The proportion of data assumed to be outlying prior to outlier detection. Defaults to 0.05.
92+
*/
93+
private final Double outlierFraction;
94+
95+
/**
96+
* Whether to apply standardization to the feature values before computing outlier scores. Defaults to true.
97+
*/
98+
private final Boolean standardizationEnabled;
99+
100+
private OutlierDetection(Integer nNeighbors, Method method, Double featureInfluenceThreshold, Boolean computeFeatureInfluence,
101+
Double outlierFraction, Boolean standardizationEnabled) {
76102
this.nNeighbors = nNeighbors;
77103
this.method = method;
78104
this.featureInfluenceThreshold = featureInfluenceThreshold;
105+
this.computeFeatureInfluence = computeFeatureInfluence;
106+
this.outlierFraction = outlierFraction;
107+
this.standardizationEnabled = standardizationEnabled;
79108
}
80109

81110
@Override
@@ -95,6 +124,18 @@ public Double getFeatureInfluenceThreshold() {
95124
return featureInfluenceThreshold;
96125
}
97126

127+
public Boolean getComputeFeatureInfluence() {
128+
return computeFeatureInfluence;
129+
}
130+
131+
public Double getOutlierFraction() {
132+
return outlierFraction;
133+
}
134+
135+
public Boolean getStandardizationEnabled() {
136+
return standardizationEnabled;
137+
}
138+
98139
@Override
99140
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
100141
builder.startObject();
@@ -107,6 +148,15 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
107148
if (featureInfluenceThreshold != null) {
108149
builder.field(FEATURE_INFLUENCE_THRESHOLD.getPreferredName(), featureInfluenceThreshold);
109150
}
151+
if (computeFeatureInfluence != null) {
152+
builder.field(COMPUTE_FEATURE_INFLUENCE.getPreferredName(), computeFeatureInfluence);
153+
}
154+
if (outlierFraction != null) {
155+
builder.field(OUTLIER_FRACTION.getPreferredName(), outlierFraction);
156+
}
157+
if (standardizationEnabled != null) {
158+
builder.field(STANDARDIZATION_ENABLED.getPreferredName(), standardizationEnabled);
159+
}
110160
builder.endObject();
111161
return builder;
112162
}
@@ -119,12 +169,16 @@ public boolean equals(Object o) {
119169
OutlierDetection other = (OutlierDetection) o;
120170
return Objects.equals(nNeighbors, other.nNeighbors)
121171
&& Objects.equals(method, other.method)
122-
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold);
172+
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold)
173+
&& Objects.equals(computeFeatureInfluence, other.computeFeatureInfluence)
174+
&& Objects.equals(outlierFraction, other.outlierFraction)
175+
&& Objects.equals(standardizationEnabled, other.standardizationEnabled);
123176
}
124177

125178
@Override
126179
public int hashCode() {
127-
return Objects.hash(nNeighbors, method, featureInfluenceThreshold);
180+
return Objects.hash(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
181+
standardizationEnabled);
128182
}
129183

130184
@Override
@@ -150,6 +204,9 @@ public static class Builder {
150204
private Integer nNeighbors;
151205
private Method method;
152206
private Double featureInfluenceThreshold;
207+
private Boolean computeFeatureInfluence;
208+
private Double outlierFraction;
209+
private Boolean standardizationEnabled;
153210

154211
private Builder() {}
155212

@@ -168,8 +225,24 @@ public Builder setFeatureInfluenceThreshold(Double featureInfluenceThreshold) {
168225
return this;
169226
}
170227

228+
public Builder setComputeFeatureInfluence(Boolean computeFeatureInfluence) {
229+
this.computeFeatureInfluence = computeFeatureInfluence;
230+
return this;
231+
}
232+
233+
public Builder setOutlierFraction(Double outlierFraction) {
234+
this.outlierFraction = outlierFraction;
235+
return this;
236+
}
237+
238+
public Builder setStandardizationEnabled(Boolean standardizationEnabled) {
239+
this.standardizationEnabled = standardizationEnabled;
240+
return this;
241+
}
242+
171243
public OutlierDetection build() {
172-
return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold);
244+
return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
245+
standardizationEnabled);
173246
}
174247
}
175248
}

client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -1246,7 +1246,10 @@ public void testPutDataFrameAnalyticsConfig_GivenOutlierDetectionAnalysis() thro
12461246
assertThat(createdConfig.getSource().getQueryConfig(), equalTo(new QueryConfig(new MatchAllQueryBuilder()))); // default value
12471247
assertThat(createdConfig.getDest().getIndex(), equalTo(config.getDest().getIndex()));
12481248
assertThat(createdConfig.getDest().getResultsField(), equalTo("ml")); // default value
1249-
assertThat(createdConfig.getAnalysis(), equalTo(config.getAnalysis()));
1249+
assertThat(createdConfig.getAnalysis(), equalTo(OutlierDetection.builder()
1250+
.setComputeFeatureInfluence(true)
1251+
.setOutlierFraction(0.05)
1252+
.setStandardizationEnabled(true).build()));
12501253
assertThat(createdConfig.getAnalyzedFields(), equalTo(config.getAnalyzedFields()));
12511254
assertThat(createdConfig.getModelMemoryLimit(), equalTo(ByteSizeValue.parseBytesSizeValue("1gb", ""))); // default value
12521255
assertThat(createdConfig.getDescription(), equalTo("some description"));

client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java

+4
Original file line numberDiff line numberDiff line change
@@ -2932,6 +2932,10 @@ public void testPutDataFrameAnalytics() throws Exception {
29322932
DataFrameAnalysis outlierDetectionCustomized = OutlierDetection.builder() // <1>
29332933
.setMethod(OutlierDetection.Method.DISTANCE_KNN) // <2>
29342934
.setNNeighbors(5) // <3>
2935+
.setFeatureInfluenceThreshold(0.1) // <4>
2936+
.setComputeFeatureInfluence(true) // <5>
2937+
.setOutlierFraction(0.05) // <6>
2938+
.setStandardizationEnabled(true) // <7>
29352939
.build();
29362940
// end::put-data-frame-analytics-outlier-detection-customized
29372941

client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/OutlierDetectionTests.java

+13
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import static org.hamcrest.Matchers.closeTo;
2828
import static org.hamcrest.Matchers.equalTo;
29+
import static org.hamcrest.Matchers.is;
2930

3031
public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetection> {
3132

@@ -34,6 +35,9 @@ public static OutlierDetection randomOutlierDetection() {
3435
.setNNeighbors(randomBoolean() ? null : randomIntBetween(1, 20))
3536
.setMethod(randomBoolean() ? null : randomFrom(OutlierDetection.Method.values()))
3637
.setFeatureInfluenceThreshold(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
38+
.setComputeFeatureInfluence(randomBoolean() ? null : randomBoolean())
39+
.setOutlierFraction(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
40+
.setStandardizationEnabled(randomBoolean() ? null : randomBoolean())
3741
.build();
3842
}
3943

@@ -57,6 +61,9 @@ public void testGetParams_GivenDefaults() {
5761
assertNull(outlierDetection.getNNeighbors());
5862
assertNull(outlierDetection.getMethod());
5963
assertNull(outlierDetection.getFeatureInfluenceThreshold());
64+
assertNull(outlierDetection.getComputeFeatureInfluence());
65+
assertNull(outlierDetection.getOutlierFraction());
66+
assertNull(outlierDetection.getStandardizationEnabled());
6067
}
6168

6269
public void testGetParams_GivenExplicitValues() {
@@ -65,9 +72,15 @@ public void testGetParams_GivenExplicitValues() {
6572
.setNNeighbors(42)
6673
.setMethod(OutlierDetection.Method.LDOF)
6774
.setFeatureInfluenceThreshold(0.5)
75+
.setComputeFeatureInfluence(true)
76+
.setOutlierFraction(0.42)
77+
.setStandardizationEnabled(false)
6878
.build();
6979
assertThat(outlierDetection.getNNeighbors(), equalTo(42));
7080
assertThat(outlierDetection.getMethod(), equalTo(OutlierDetection.Method.LDOF));
7181
assertThat(outlierDetection.getFeatureInfluenceThreshold(), closeTo(0.5, 1E-9));
82+
assertThat(outlierDetection.getComputeFeatureInfluence(), is(true));
83+
assertThat(outlierDetection.getOutlierFraction(), closeTo(0.42, 1E-9));
84+
assertThat(outlierDetection.getStandardizationEnabled(), is(false));
7285
}
7386
}

docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc

+4
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ include-tagged::{doc-tests-file}[{api}-outlier-detection-customized]
9696
<1> Constructing a new OutlierDetection object
9797
<2> The method used to perform the analysis
9898
<3> Number of neighbors taken into account during analysis
99+
<4> The min `outlier_score` required to compute feature influence
100+
<5> Whether to compute feature influence
101+
<6> The proportion of the data set that is assumed to be outlying prior to outlier detection
102+
<7> Whether to apply standardization to feature values
99103

100104
===== Regression
101105

docs/reference/ml/df-analytics/apis/dfanalyticsresources.asciidoc

+1-1
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ An `outlier_detection` configuration object has the following properties:
134134
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real outliers
135135
and 95% are inliers.
136136

137-
`standardize_columns`::
137+
`standardization_enabled`::
138138
(boolean) If `true`, then the following operation is performed on the columns
139139
before computing outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to
140140
`true`. For more information, see

docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc

+8-1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ PUT _ml/data_frame/analytics/loganalytics
139139
},
140140
"analysis": {
141141
"outlier_detection": {
142+
"compute_feature_influence": true,
143+
"outlier_fraction": 0.05,
144+
"standardization_enabled": true
142145
}
143146
}
144147
}
@@ -164,7 +167,11 @@ The API returns the following result:
164167
"results_field": "ml"
165168
},
166169
"analysis": {
167-
"outlier_detection": {}
170+
"outlier_detection": {
171+
"compute_feature_influence": true,
172+
"outlier_fraction": 0.05,
173+
"standardization_enabled": true
174+
}
168175
},
169176
"model_memory_limit": "1gb",
170177
"create_time" : 1562265491319,

0 commit comments

Comments
 (0)