@@ -44,6 +44,7 @@ using TDoubleVecVec = std::vector<TDoubleVec>;
44
44
using TSizeVec = std::vector<std::size_t >;
45
45
using TStrVec = std::vector<std::string>;
46
46
using TRowItr = core::CDataFrame::TRowItr;
47
+ using TRowRef = core::CDataFrame::TRowRef;
47
48
using TPoint = maths::CDenseVector<maths::CFloatStorage>;
48
49
using TPointVec = std::vector<TPoint>;
49
50
using TDataFrameUPtr = std::unique_ptr<core::CDataFrame>;
@@ -144,20 +145,21 @@ auto outlierSpec(std::size_t rows = 110,
144
145
return std::make_unique<api::CDataFrameAnalysisSpecification>(spec);
145
146
}
146
147
147
- auto regressionSpec (std::string dependentVariable,
148
- std::size_t rows = 100 ,
149
- std::size_t cols = 5 ,
150
- std::size_t memoryLimit = 3000000 ,
151
- std::size_t numberRoundsPerHyperparameter = 0 ,
152
- std::size_t bayesianOptimisationRestarts = 0 ,
153
- const TStrVec& categoricalFieldNames = TStrVec{},
154
- double lambda = -1.0 ,
155
- double gamma = -1.0 ,
156
- double eta = -1.0 ,
157
- std::size_t maximumNumberTrees = 0 ,
158
- double featureBagFraction = -1.0 ,
159
- CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr ,
160
- CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr ) {
148
+ auto analysisSpec (std::string analysis,
149
+ std::string dependentVariable,
150
+ std::size_t rows = 100 ,
151
+ std::size_t cols = 5 ,
152
+ std::size_t memoryLimit = 3000000 ,
153
+ std::size_t numberRoundsPerHyperparameter = 0 ,
154
+ std::size_t bayesianOptimisationRestarts = 0 ,
155
+ const TStrVec& categoricalFieldNames = TStrVec{},
156
+ double lambda = -1.0 ,
157
+ double gamma = -1.0 ,
158
+ double eta = -1.0 ,
159
+ std::size_t maximumNumberTrees = 0 ,
160
+ double featureBagFraction = -1.0 ,
161
+ CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr ,
162
+ CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr ) {
161
163
162
164
std::string parameters = " {\n\" dependent_variable\" : \" " + dependentVariable + " \" " ;
163
165
if (lambda >= 0.0 ) {
@@ -189,7 +191,7 @@ auto regressionSpec(std::string dependentVariable,
189
191
190
192
std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString (
191
193
" testJob" , rows, cols, memoryLimit, 1 , categoricalFieldNames, true ,
192
- test::CTestTmpDir::tmpDir (), " ml" , " regression " , parameters)};
194
+ test::CTestTmpDir::tmpDir (), " ml" , analysis , parameters)};
193
195
194
196
LOG_TRACE (<< " spec =\n " << spec);
195
197
@@ -614,7 +616,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() {
614
616
615
617
TStrVec fieldNames{" c1" , " c2" , " c3" , " c4" , " c5" , " ." , " ." };
616
618
TStrVec fieldValues{" " , " " , " " , " " , " " , " 0" , " " };
617
- api::CDataFrameAnalyzer analyzer{regressionSpec ( " c5" ), outputWriterFactory};
619
+ api::CDataFrameAnalyzer analyzer{analysisSpec ( " regression " , " c5" ), outputWriterFactory};
618
620
addRegressionTestData (fieldNames, fieldValues, analyzer, expectedPredictions);
619
621
620
622
core::CStopWatch watch{true };
@@ -676,8 +678,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() {
676
678
};
677
679
678
680
api::CDataFrameAnalyzer analyzer{
679
- regressionSpec ( " c5" , 100 , 5 , 3000000 , 0 , 0 , {}, lambda, gamma , eta ,
680
- maximumNumberTrees, featureBagFraction),
681
+ analysisSpec ( " regression " , " c5" , 100 , 5 , 3000000 , 0 , 0 , {}, lambda,
682
+ gamma , eta, maximumNumberTrees, featureBagFraction),
681
683
outputWriterFactory};
682
684
683
685
TDoubleVec expectedPredictions;
@@ -728,8 +730,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu
728
730
729
731
auto target = [](double feature) { return 10.0 * feature; };
730
732
731
- api::CDataFrameAnalyzer analyzer{regressionSpec ( " target " , 50 , 2 , 2000000 ),
732
- outputWriterFactory};
733
+ api::CDataFrameAnalyzer analyzer{
734
+ analysisSpec ( " regression " , " target " , 50 , 2 , 2000000 ), outputWriterFactory};
733
735
734
736
TDoubleVec feature;
735
737
rng.generateUniformSamples (1.0 , 3.0 , 50 , feature);
@@ -946,7 +948,8 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
946
948
947
949
{
948
950
api::CDataFrameAnalyzer analyzer{
949
- regressionSpec (" x5" , 1000 , 5 , 8000000 , 0 , 0 , {" x1" , " x2" }), outputWriterFactory};
951
+ analysisSpec (" regression" , " x5" , 1000 , 5 , 8000000 , 0 , 0 , {" x1" , " x2" }),
952
+ outputWriterFactory};
950
953
951
954
TStrVec x[]{{" x11" , " x12" , " x13" , " x14" , " x15" },
952
955
{" x21" , " x22" , " x23" , " x24" , " x25" , " x26" , " x27" }};
@@ -972,7 +975,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
972
975
passed &= (expected[1 ] == (*row)[1 ]);
973
976
if (wasPassed && passed == false ) {
974
977
LOG_ERROR (<< " expected " << core::CContainerPrinter::print (expected)
975
- << " got [" << (*row)[0 ] << " , " << (*row)[1 ] << " ]" );
978
+ << " , got [" << (*row)[0 ] << " , " << (*row)[1 ] << " ]" );
976
979
}
977
980
}
978
981
});
@@ -984,9 +987,9 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
984
987
{
985
988
std::size_t rows{api::CDataFrameAnalyzer::MAX_CATEGORICAL_CARDINALITY + 3 };
986
989
987
- api::CDataFrameAnalyzer analyzer{
988
- regressionSpec ( " x5 " , rows, 5 , 8000000000 , 0 , 0 , {" x1" }, 0 , 0 , 0 , 0 , 0 ),
989
- outputWriterFactory};
990
+ api::CDataFrameAnalyzer analyzer{analysisSpec ( " regression " , " x5 " , rows, 5 , 8000000000 ,
991
+ 0 , 0 , {" x1" }, 0 , 0 , 0 , 0 , 0 ),
992
+ outputWriterFactory};
990
993
991
994
TStrVec fieldNames{" x1" , " x2" , " x3" , " x4" , " x5" , " ." , " ." };
992
995
TStrVec fieldValues{" " , " " , " " , " " , " " , " " , " " };
@@ -1009,7 +1012,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
1009
1012
bool wasPassed{passed};
1010
1013
passed &= (expected == (*row)[0 ]);
1011
1014
if (wasPassed && passed == false ) {
1012
- LOG_ERROR (<< " expected " << expected << " got " << (*row)[0 ]);
1015
+ LOG_ERROR (<< " expected " << expected << " , got " << (*row)[0 ]);
1013
1016
}
1014
1017
}
1015
1018
});
@@ -1018,6 +1021,66 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
1018
1021
}
1019
1022
}
1020
1023
1024
+ void CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing () {
1025
+
1026
+ auto eq = [](double expected) {
1027
+ return [expected](double actual) { return expected == actual; };
1028
+ };
1029
+
1030
+ auto nan = []() { return [](double actual) { return isnan (actual); }; };
1031
+
1032
+ auto assertRow = [&](const std::size_t row_i,
1033
+ const std::vector<std::function<bool (double )>>& matchers,
1034
+ const TRowRef& row) {
1035
+ CPPUNIT_ASSERT_EQUAL_MESSAGE (" row " + std::to_string (row_i),
1036
+ matchers.size (), row.numberColumns ());
1037
+ for (std::size_t i = 0 ; i < row.numberColumns (); ++i) {
1038
+ CPPUNIT_ASSERT_MESSAGE (" row " + std::to_string (row_i) +
1039
+ " , column " + std::to_string (i),
1040
+ matchers[i](row[i]));
1041
+ }
1042
+ };
1043
+
1044
+ std::stringstream output;
1045
+ auto outputWriterFactory = [&output]() {
1046
+ return std::make_unique<core::CJsonOutputStreamWrapper>(output);
1047
+ };
1048
+
1049
+ api::CDataFrameAnalyzer analyzer{analysisSpec (" classification" , " x5" , 1000 , 5 ,
1050
+ 8000000 , 0 , 0 , {" x1" , " x2" , " x5" }),
1051
+ outputWriterFactory};
1052
+
1053
+ TStrVec fieldNames{" x1" , " x2" , " x3" , " x4" , " x5" , " ." , " ." };
1054
+ analyzer.handleRecord (fieldNames, {" x11" , " x21" , " 0" , " 0" , " x51" , " 0" , " " });
1055
+ analyzer.handleRecord (fieldNames, {" x12" , " x22" , " 1" , " 1" , " x52" , " 1" , " " });
1056
+ analyzer.handleRecord (fieldNames, {" " , " x23" , " 2" , " 2" , " x51" , " 2" , " " });
1057
+ analyzer.handleRecord (fieldNames, {" x14" , " x24" , " 3" , " 3" , " " , " 3" , " " });
1058
+ analyzer.handleRecord (fieldNames, {" x15" , " x25" , " 4" , " 4" , " x51" , " 4" , " " });
1059
+ analyzer.handleRecord (fieldNames, {" x11" , " x26" , " 5" , " 5" , " x52" , " 5" , " " });
1060
+ analyzer.handleRecord (fieldNames, {" x12" , " " , " 6" , " 6" , " " , " 6" , " " });
1061
+ analyzer.handleRecord (fieldNames, {" x13" , " x21" , " 7" , " 7" , " " , " 7" , " " });
1062
+ analyzer.handleRecord (fieldNames, {" x14" , " x22" , " 8" , " 8" , " x51" , " 8" , " " });
1063
+ analyzer.handleRecord (fieldNames, {" " , " x23" , " 9" , " 9" , " x52" , " 9" , " " });
1064
+ analyzer.receivedAllRows ();
1065
+
1066
+ const core::CDataFrame& frame{analyzer.dataFrame ()};
1067
+ frame.readRows (1 , [&](TRowItr beginRows, TRowItr endRows) {
1068
+ std::vector<TRowRef> rows;
1069
+ std::copy (beginRows, endRows, std::back_inserter (rows));
1070
+ CPPUNIT_ASSERT_EQUAL (10UL , rows.size ());
1071
+ assertRow (0 , {eq (0.0 ), eq (0.0 ), eq (0.0 ), eq (0.0 ), eq (0.0 )}, rows[0 ]);
1072
+ assertRow (1 , {eq (1.0 ), eq (1.0 ), eq (1.0 ), eq (1.0 ), eq (1.0 )}, rows[1 ]);
1073
+ assertRow (2 , {eq (2.0 ), eq (2.0 ), eq (2.0 ), eq (2.0 ), eq (0.0 )}, rows[2 ]);
1074
+ assertRow (3 , {eq (3.0 ), eq (3.0 ), eq (3.0 ), eq (3.0 ), nan () }, rows[3 ]);
1075
+ assertRow (4 , {eq (4.0 ), eq (4.0 ), eq (4.0 ), eq (4.0 ), eq (0.0 )}, rows[4 ]);
1076
+ assertRow (5 , {eq (0.0 ), eq (5.0 ), eq (5.0 ), eq (5.0 ), eq (1.0 )}, rows[5 ]);
1077
+ assertRow (6 , {eq (1.0 ), eq (6.0 ), eq (6.0 ), eq (6.0 ), nan () }, rows[6 ]);
1078
+ assertRow (7 , {eq (5.0 ), eq (0.0 ), eq (7.0 ), eq (7.0 ), nan () }, rows[7 ]);
1079
+ assertRow (8 , {eq (3.0 ), eq (1.0 ), eq (8.0 ), eq (8.0 ), eq (0.0 )}, rows[8 ]);
1080
+ assertRow (9 , {eq (2.0 ), eq (2.0 ), eq (9.0 ), eq (9.0 ), eq (1.0 )}, rows[9 ]);
1081
+ });
1082
+ }
1083
+
1021
1084
CppUnit::Test* CDataFrameAnalyzerTest::suite () {
1022
1085
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite (" CDataFrameAnalyzerTest" );
1023
1086
@@ -1058,6 +1121,9 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() {
1058
1121
suiteOfTests->addTest (new CppUnit::TestCaller<CDataFrameAnalyzerTest>(
1059
1122
" CDataFrameAnalyzerTest::testCategoricalFields" ,
1060
1123
&CDataFrameAnalyzerTest::testCategoricalFields));
1124
+ suiteOfTests->addTest (new CppUnit::TestCaller<CDataFrameAnalyzerTest>(
1125
+ " CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing" ,
1126
+ &CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing));
1061
1127
1062
1128
return suiteOfTests;
1063
1129
}
@@ -1137,9 +1203,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti
1137
1203
// compute expected tree
1138
1204
1139
1205
api::CDataFrameAnalyzer analyzer{
1140
- regressionSpec ( " c5" , numberExamples, 5 , 15000000 ,
1141
- numberRoundsPerHyperparameter, 12 , {}, lambda, gamma , eta,
1142
- maximumNumberTrees, featureBagFraction, &persisterSupplier),
1206
+ analysisSpec ( " regression " , " c5" , numberExamples, 5 , 15000000 ,
1207
+ numberRoundsPerHyperparameter, 12 , {}, lambda, gamma , eta,
1208
+ maximumNumberTrees, featureBagFraction, &persisterSupplier),
1143
1209
outputWriterFactory};
1144
1210
std::size_t dependentVariable (
1145
1211
std::find (fieldNames.begin (), fieldNames.end (), " c5" ) - fieldNames.begin ());
@@ -1161,9 +1227,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti
1161
1227
};
1162
1228
1163
1229
api::CDataFrameAnalyzer analyzerToRestore{
1164
- regressionSpec ( " c5" , numberExamples, 5 , 15000000 , numberRoundsPerHyperparameter,
1165
- 12 , {}, lambda, gamma , eta, maximumNumberTrees, featureBagFraction,
1166
- &persisterSupplier, &restoreSearcherSupplier),
1230
+ analysisSpec ( " regression " , " c5" , numberExamples, 5 , 15000000 , numberRoundsPerHyperparameter,
1231
+ 12 , {}, lambda, gamma , eta, maximumNumberTrees, featureBagFraction,
1232
+ &persisterSupplier, &restoreSearcherSupplier),
1167
1233
outputWriterFactory};
1168
1234
1169
1235
passDataToAnalyzer (fieldNames, fieldValues, analyzerToRestore, weights, values);
0 commit comments