Skip to content

Commit 9abec24

Browse files
authored
[ML] Distinguish missing and empty categorical values (#1034)
1 parent 1cf9de8 commit 9abec24

19 files changed

+668
-300
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ necessary. This will improve the allocation of data frame analyses to cluster no
5757
* Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in
5858
the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
5959
* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].)
60+
* Distinguish between empty and missing categorical fields in classification and regression
61+
model training. (See {ml-pull}1034[#1034].)
6062

6163
=== Bug Fixes
6264

include/api/CDataFrameAnalysisSpecification.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
6666
static const std::string THREADS;
6767
static const std::string TEMPORARY_DIRECTORY;
6868
static const std::string RESULTS_FIELD;
69+
static const std::string MISSING_FIELD_VALUE;
6970
static const std::string CATEGORICAL_FIELD_NAMES;
7071
static const std::string DISK_USAGE_ALLOWED;
7172
static const std::string ANALYSIS;
@@ -203,6 +204,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
203204
std::string m_ResultsField;
204205
std::string m_JobId;
205206
std::string m_AnalysisName;
207+
std::string m_MissingFieldValue;
206208
TStrVec m_CategoricalFieldNames;
207209
bool m_DiskUsageAllowed;
208210
// TODO Sparse table support

include/api/CDataFrameAnalysisSpecificationJsonWriter.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
3434
std::size_t numberThreads,
3535
const std::string& temporaryDirectory,
3636
const std::string& resultsField,
37+
const std::string& missingString,
3738
const TStrVec& categoricalFields,
3839
bool diskUsageAllowed,
3940
const std::string& analysisName,
@@ -48,6 +49,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
4849
std::size_t numberThreads,
4950
const std::string& temporaryDirectory,
5051
const std::string& resultsField,
52+
const std::string& missingString,
5153
const TStrVec& categoricalFields,
5254
bool diskUsageAllowed,
5355
const std::string& analysisName,
@@ -56,10 +58,11 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
5658

5759
//! Returns a string with the data frame analysis specification in JSON format.
5860
static std::string jsonString(const std::string& jobId,
59-
size_t rows,
60-
size_t cols,
61-
size_t memoryLimit,
62-
size_t numberThreads,
61+
std::size_t rows,
62+
std::size_t cols,
63+
std::size_t memoryLimit,
64+
std::size_t numberThreads,
65+
const std::string& missingString,
6366
const TStrVec& categoricalFields,
6467
bool diskUsageAllowed,
6568
const std::string& tempDir,

include/core/CDataFrame.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ class CORE_EXPORT CDataFrame final {
238238
//! The maximum number of distinct categories of a categorical field we can faithfully represent.
239239
static const std::size_t MAX_CATEGORICAL_CARDINALITY;
240240

241+
//! The default string used to indicate that a value is missing.
242+
static const std::string DEFAULT_MISSING_STRING;
243+
241244
public:
242245
//! \param[in] inMainMemory True if the data frame is stored in main memory.
243246
//! \param[in] numberColumns The number of columns in the data frame.
@@ -443,6 +446,9 @@ class CORE_EXPORT CDataFrame final {
443446
//! Write the column names.
444447
void columnNames(TStrVec columnNames);
445448

449+
//! Write the string which indicates that a value is missing.
450+
void missingString(std::string missing);
451+
446452
//! Write for which columns an empty string implies the value is missing.
447453
void emptyIsMissing(TBoolVec emptyIsMissing);
448454

@@ -577,7 +583,12 @@ class CORE_EXPORT CDataFrame final {
577583
//! A lookup for the integer value of categories.
578584
TStrSizeUMapVec m_CategoricalColumnValueLookup;
579585

586+
//! The string which indicates that a category is missing.
587+
std::string m_MissingString;
588+
580589
//! Indicator vector for treating empty strings as missing values.
590+
// TODO Remove once Java passes the correct value for the missing target
591+
// for classification.
581592
TBoolVec m_EmptyIsMissing;
582593

583594
//! Indicator vector of the columns which contain categorical values.

include/test/CDataFrameAnalysisSpecificationFactory.h

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
#include <test/ImportExport.h>
1616

17+
#include <boost/optional.hpp>
18+
1719
#include <cstddef>
1820
#include <memory>
1921
#include <string>
@@ -32,37 +34,77 @@ class TEST_EXPORT CDataFrameAnalysisSpecificationFactory {
3234
using TSpecificationUPtr = std::unique_ptr<api::CDataFrameAnalysisSpecification>;
3335

3436
public:
37+
CDataFrameAnalysisSpecificationFactory();
38+
3539
static const std::string& classification();
3640
static const std::string& regression();
3741

38-
static TSpecificationUPtr outlierSpec(std::size_t rows = 110,
39-
std::size_t cols = 5,
40-
std::size_t memoryLimit = 100000,
41-
const std::string& method = "",
42-
std::size_t numberNeighbours = 0,
43-
bool computeFeatureInfluence = false,
44-
bool diskUsageAllowed = true);
45-
46-
static TSpecificationUPtr
47-
predictionSpec(const std::string& analysis,
48-
const std::string& dependentVariable,
49-
std::size_t rows = 100,
50-
std::size_t cols = 5,
51-
std::size_t memoryLimit = 7000000,
52-
std::size_t numberRoundsPerHyperparameter = 0,
53-
std::size_t bayesianOptimisationRestarts = 0,
54-
const TStrVec& categoricalFieldNames = TStrVec{},
55-
double alpha = -1.0,
56-
double lambda = -1.0,
57-
double gamma = -1.0,
58-
double softTreeDepthLimit = -1.0,
59-
double softTreeDepthTolerance = -1.0,
60-
double eta = -1.0,
61-
std::size_t maximumNumberTrees = 0,
62-
double featureBagFraction = -1.0,
63-
size_t topShapValues = 0,
64-
TPersisterSupplier* persisterSupplier = nullptr,
65-
TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr);
42+
// Shared
43+
CDataFrameAnalysisSpecificationFactory& rows(std::size_t rows);
44+
CDataFrameAnalysisSpecificationFactory& columns(std::size_t columns);
45+
CDataFrameAnalysisSpecificationFactory& memoryLimit(std::size_t memoryLimit);
46+
CDataFrameAnalysisSpecificationFactory& missingString(const std::string& missing);
47+
CDataFrameAnalysisSpecificationFactory& diskUsageAllowed(bool disk);
48+
49+
// Outliers
50+
CDataFrameAnalysisSpecificationFactory& outlierMethod(std::string method);
51+
CDataFrameAnalysisSpecificationFactory& outlierNumberNeighbours(std::size_t number);
52+
CDataFrameAnalysisSpecificationFactory& outlierComputeInfluence(bool compute);
53+
54+
// Prediction
55+
CDataFrameAnalysisSpecificationFactory&
56+
predicitionNumberRoundsPerHyperparameter(std::size_t rounds);
57+
CDataFrameAnalysisSpecificationFactory&
58+
predictionBayesianOptimisationRestarts(std::size_t restarts);
59+
CDataFrameAnalysisSpecificationFactory&
60+
predictionCategoricalFieldNames(const TStrVec& categorical);
61+
CDataFrameAnalysisSpecificationFactory& predictionAlpha(double alpha);
62+
CDataFrameAnalysisSpecificationFactory& predictionLambda(double lambda);
63+
CDataFrameAnalysisSpecificationFactory& predictionGamma(double gamma);
64+
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthLimit(double limit);
65+
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthTolerance(double tolerance);
66+
CDataFrameAnalysisSpecificationFactory& predictionEta(double eta);
67+
CDataFrameAnalysisSpecificationFactory& predictionMaximumNumberTrees(std::size_t number);
68+
CDataFrameAnalysisSpecificationFactory& predictionFeatureBagFraction(double fraction);
69+
CDataFrameAnalysisSpecificationFactory& predictionNumberTopShapValues(std::size_t number);
70+
CDataFrameAnalysisSpecificationFactory&
71+
predictionPersisterSupplier(TPersisterSupplier* persisterSupplier);
72+
CDataFrameAnalysisSpecificationFactory&
73+
predictionRestoreSearcherSupplier(TRestoreSearcherSupplier* restoreSearcherSupplier);
74+
75+
TSpecificationUPtr outlierSpec() const;
76+
TSpecificationUPtr predictionSpec(const std::string& analysis,
77+
const std::string& dependentVariable) const;
78+
79+
private:
80+
using TOptionalSize = boost::optional<std::size_t>;
81+
82+
private:
83+
// Shared
84+
TOptionalSize m_Rows;
85+
TOptionalSize m_Columns;
86+
TOptionalSize m_MemoryLimit;
87+
std::string m_MissingString;
88+
bool m_DiskUsageAllowed = true;
89+
// Outliers
90+
std::string m_Method;
91+
std::size_t m_NumberNeighbours = 0;
92+
bool m_ComputeFeatureInfluence = false;
93+
// Prediction
94+
std::size_t m_NumberRoundsPerHyperparameter = 0;
95+
std::size_t m_BayesianOptimisationRestarts = 0;
96+
TStrVec m_CategoricalFieldNames;
97+
double m_Alpha = -1.0;
98+
double m_Lambda = -1.0;
99+
double m_Gamma = -1.0;
100+
double m_SoftTreeDepthLimit = -1.0;
101+
double m_SoftTreeDepthTolerance = -1.0;
102+
double m_Eta = -1.0;
103+
std::size_t m_MaximumNumberTrees = 0;
104+
double m_FeatureBagFraction = -1.0;
105+
std::size_t m_NumberTopShapValues = 0;
106+
TPersisterSupplier* m_PersisterSupplier = nullptr;
107+
TRestoreSearcherSupplier* m_RestoreSearcherSupplier = nullptr;
66108
};
67109
}
68110
}

lib/api/CDataFrameAnalysisSpecification.cc

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <core/CDataFrame.h>
1010
#include <core/CLogger.h>
11+
#include <core/CStringUtils.h>
1112

1213
#include <api/CDataFrameAnalysisConfigReader.h>
1314
#include <api/CDataFrameOutliersRunner.h>
@@ -28,18 +29,19 @@ namespace ml {
2829
namespace api {
2930

3031
// These must be consistent with Java names.
31-
const std::string CDataFrameAnalysisSpecification::JOB_ID("job_id");
32-
const std::string CDataFrameAnalysisSpecification::ROWS("rows");
33-
const std::string CDataFrameAnalysisSpecification::COLS("cols");
34-
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT("memory_limit");
35-
const std::string CDataFrameAnalysisSpecification::THREADS("threads");
36-
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY("temp_dir");
37-
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD("results_field");
32+
const std::string CDataFrameAnalysisSpecification::JOB_ID{"job_id"};
33+
const std::string CDataFrameAnalysisSpecification::ROWS{"rows"};
34+
const std::string CDataFrameAnalysisSpecification::COLS{"cols"};
35+
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT{"memory_limit"};
36+
const std::string CDataFrameAnalysisSpecification::THREADS{"threads"};
37+
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY{"temp_dir"};
38+
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD{"results_field"};
39+
const std::string CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE{"missing_field_value"};
3840
const std::string CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES{"categorical_fields"};
39-
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED("disk_usage_allowed");
40-
const std::string CDataFrameAnalysisSpecification::ANALYSIS("analysis");
41-
const std::string CDataFrameAnalysisSpecification::NAME("name");
42-
const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters");
41+
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED{"disk_usage_allowed"};
42+
const std::string CDataFrameAnalysisSpecification::ANALYSIS{"analysis"};
43+
const std::string CDataFrameAnalysisSpecification::NAME{"name"};
44+
const std::string CDataFrameAnalysisSpecification::PARAMETERS{"parameters"};
4345

4446
namespace {
4547
using TBoolVec = std::vector<bool>;
@@ -75,6 +77,8 @@ const CDataFrameAnalysisConfigReader CONFIG_READER{[] {
7577
CDataFrameAnalysisConfigReader::E_OptionalParameter);
7678
theReader.addParameter(CDataFrameAnalysisSpecification::RESULTS_FIELD,
7779
CDataFrameAnalysisConfigReader::E_OptionalParameter);
80+
theReader.addParameter(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE,
81+
CDataFrameAnalysisConfigReader::E_OptionalParameter);
7882
theReader.addParameter(CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES,
7983
CDataFrameAnalysisConfigReader::E_OptionalParameter);
8084
theReader.addParameter(CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED,
@@ -131,12 +135,20 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification(
131135
m_TemporaryDirectory = parameters[TEMPORARY_DIRECTORY].fallback(std::string{});
132136
m_JobId = parameters[JOB_ID].fallback(std::string{});
133137
m_ResultsField = parameters[RESULTS_FIELD].fallback(DEFAULT_RESULT_FIELD);
138+
m_MissingFieldValue = parameters[MISSING_FIELD_VALUE].fallback(
139+
core::CDataFrame::DEFAULT_MISSING_STRING);
134140
m_CategoricalFieldNames = parameters[CATEGORICAL_FIELD_NAMES].fallback(TStrVec{});
135141
m_DiskUsageAllowed = parameters[DISK_USAGE_ALLOWED].fallback(DEFAULT_DISK_USAGE_ALLOWED);
136142

143+
double missing;
144+
if (m_MissingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING &&
145+
core::CStringUtils::stringToTypeSilent(m_MissingFieldValue, missing)) {
146+
HANDLE_FATAL(<< "Input error: you can't use a number (" << missing
147+
<< ") to denote a missing field value.")
148+
}
137149
if (m_DiskUsageAllowed && m_TemporaryDirectory.empty()) {
138150
HANDLE_FATAL(<< "Input error: temporary directory path should be explicitly set if disk"
139-
" usage is allowed! Please report this problem.");
151+
" usage is allowed! Please report this problem.")
140152
}
141153

142154
auto jsonAnalysis = parameters[ANALYSIS].jsonObject();
@@ -189,6 +201,7 @@ CDataFrameAnalysisSpecification::makeDataFrame() {
189201
? core::makeMainStorageDataFrame(m_NumberColumns)
190202
: core::makeDiskStorageDataFrame(m_TemporaryDirectory,
191203
m_NumberColumns, m_NumberRows);
204+
result.first->missingString(m_MissingFieldValue);
192205
result.first->reserve(m_NumberThreads, m_NumberColumns + this->numberExtraColumns());
193206

194207
return result;

lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#include <api/CDataFrameAnalysisSpecificationJsonWriter.h>
88

9+
#include <core/CDataFrame.h>
10+
911
#include <api/CDataFrameAnalysisSpecification.h>
1012

1113
#include <iostream>
@@ -20,6 +22,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
2022
std::size_t numberThreads,
2123
const std::string& temporaryDirectory,
2224
const std::string& resultsField,
25+
const std::string& missingFieldValue,
2326
const TStrVec& categoricalFields,
2427
bool diskUsageAllowed,
2528
const std::string& analysisName,
@@ -34,8 +37,8 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
3437
}
3538
}
3639
write(jobId, rows, cols, memoryLimit, numberThreads, temporaryDirectory,
37-
resultsField, categoricalFields, diskUsageAllowed, analysisName,
38-
analysisParametersDoc, writer);
40+
resultsField, missingFieldValue, categoricalFields, diskUsageAllowed,
41+
analysisName, analysisParametersDoc, writer);
3942
}
4043

4144
void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
@@ -45,6 +48,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
4548
std::size_t numberThreads,
4649
const std::string& temporaryDirectory,
4750
const std::string& resultsField,
51+
const std::string& missingFieldValue,
4852
const TStrVec& categoricalFields,
4953
bool diskUsageAllowed,
5054
const std::string& analysisName,
@@ -73,6 +77,11 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
7377
writer.Key(CDataFrameAnalysisSpecification::RESULTS_FIELD);
7478
writer.String(resultsField);
7579

80+
if (missingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING) {
81+
writer.Key(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE);
82+
writer.String(missingFieldValue);
83+
}
84+
7685
rapidjson::Value array(rapidjson::kArrayType);
7786
for (const auto& field : categoricalFields) {
7887
array.PushBack(rapidjson::Value(rapidjson::StringRef(field)),
@@ -105,24 +114,26 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
105114
writer.Flush();
106115
}
107116

108-
std::string
109-
CDataFrameAnalysisSpecificationJsonWriter::jsonString(const std::string& jobId,
110-
size_t rows,
111-
size_t cols,
112-
size_t memoryLimit,
113-
size_t numberThreads,
114-
const TStrVec& categoricalFields,
115-
bool diskUsageAllowed,
116-
const std::string& tempDir,
117-
const std::string& resultField,
118-
const std::string& analysisName,
119-
const std::string& analysisParameters) {
117+
std::string CDataFrameAnalysisSpecificationJsonWriter::jsonString(
118+
const std::string& jobId,
119+
std::size_t rows,
120+
std::size_t cols,
121+
std::size_t memoryLimit,
122+
std::size_t numberThreads,
123+
const std::string& missingFieldValue,
124+
const TStrVec& categoricalFields,
125+
bool diskUsageAllowed,
126+
const std::string& tempDir,
127+
const std::string& resultField,
128+
const std::string& analysisName,
129+
const std::string& analysisParameters) {
120130
rapidjson::StringBuffer stringBuffer;
121131
TRapidJsonLineWriter writer;
122132
writer.Reset(stringBuffer);
123133

124134
write(jobId, rows, cols, memoryLimit, numberThreads, tempDir, resultField,
125-
categoricalFields, diskUsageAllowed, analysisName, analysisParameters, writer);
135+
missingFieldValue, categoricalFields, diskUsageAllowed, analysisName,
136+
analysisParameters, writer);
126137

127138
return stringBuffer.GetString();
128139
}

0 commit comments

Comments
 (0)