Skip to content

Commit 181ee3a

Browse files
authored
[ML] specifying missing_field_value value and using it instead of empty_string (#53108) (#53165)
For analytics, we need a consistent way of indicating when a value is missing. Inheriting from anomaly detection, analysis sent `""` when a field was missing. This works fine with numbers, but the underlying analytics process actually treats `""` as a category in categorical values. Consequently, you end up with this situation in the resulting model:

```
{
  "frequency_encoding" : {
    "field" : "RainToday",
    "feature_name" : "RainToday_frequency",
    "frequency_map" : {
      "" : 0.009844409027270245,
      "No" : 0.6472019970785184,
      "Yes" : 0.6472019970785184
    }
  }
}
```

For inference this is a problem, because inference treats missing values as `null` and thus does not include them in the infer call against the model. This PR takes advantage of our new `missing_field_value` option and supplies `\0` as the value.
1 parent 48707ec commit 181ee3a

File tree

4 files changed

+8
-7
lines changed

4 files changed

+8
-7
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public class DataFrameDataExtractor {
5252
private static final Logger LOGGER = LogManager.getLogger(DataFrameDataExtractor.class);
5353
private static final TimeValue SCROLL_TIMEOUT = new TimeValue(30, TimeUnit.MINUTES);
5454

55-
private static final String EMPTY_STRING = "";
55+
public static final String NULL_VALUE = "\0";
5656

5757
private final Client client;
5858
private final DataFrameDataExtractorContext context;
@@ -189,7 +189,7 @@ private Row createRow(SearchHit hit) {
189189
} else {
190190
if (values.length == 0 && context.includeRowsWithMissingValues) {
191191
// if values is empty then it means it's a missing value
192-
extractedValues[i] = EMPTY_STRING;
192+
extractedValues[i] = NULL_VALUE;
193193
} else {
194194
// we are here if we have a missing value but the analysis does not support those
195195
// or the value type is not supported (e.g. arrays, etc.)

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;
77

88
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
9+
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;
910

1011
import java.util.List;
1112
import java.util.Random;
@@ -18,8 +19,6 @@
1819
*/
1920
class DatasetSplittingCustomProcessor implements CustomProcessor {
2021

21-
private static final String EMPTY = "";
22-
2322
private final int dependentVariableIndex;
2423
private final double trainingPercent;
2524
private final Random random;
@@ -47,7 +46,7 @@ public void process(String[] row) {
4746
// Let's make sure we have at least one training row
4847
isFirstRow = false;
4948
} else if (isRandomlyExcludedFromTraining()) {
50-
row[dependentVariableIndex] = EMPTY;
49+
row[dependentVariableIndex] = DataFrameDataExtractor.NULL_VALUE;
5150
}
5251
}
5352
}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,8 @@ public void testMissingValues_GivenShouldInclude() throws IOException {
377377
assertThat(rows.get().size(), equalTo(3));
378378

379379
assertThat(rows.get().get(0).getValues(), equalTo(new String[] {"11", "21"}));
380-
assertThat(rows.get().get(1).getValues(), equalTo(new String[] {"", "22"}));
380+
assertThat(rows.get().get(1).getValues()[0], equalTo(DataFrameDataExtractor.NULL_VALUE));
381+
assertThat(rows.get().get(1).getValues()[1], equalTo("22"));
381382
assertThat(rows.get().get(2).getValues(), equalTo(new String[] {"13", "23"}));
382383

383384
assertThat(rows.get().get(0).shouldSkip(), is(false));

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;
77

88
import org.elasticsearch.test.ESTestCase;
9+
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;
910
import org.junit.Before;
1011

1112
import java.util.ArrayList;
@@ -98,7 +99,7 @@ public void testProcess_GivenRowsWithDependentVariableValue_AndTrainingPercentIs
9899
assertThat(processedRow[fieldIndex], equalTo(row[fieldIndex]));
99100
}
100101
}
101-
if (processedRow[dependentVariableIndex].length() > 0) {
102+
if (DataFrameDataExtractor.NULL_VALUE.equals(processedRow[dependentVariableIndex]) == false) {
102103
assertThat(processedRow[dependentVariableIndex], equalTo(row[dependentVariableIndex]));
103104
trainingRows++;
104105
}

0 commit comments

Comments
 (0)