Skip to content

Commit d3bec86

Browse files
committed
Add unit tests
1 parent 56248b1 commit d3bec86

File tree

4 files changed

+137
-32
lines changed

4 files changed

+137
-32
lines changed

lib/api/unittest/CDataFrameAnalysisRunnerTest.cc

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -189,6 +189,36 @@ void CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000() {
189189
testEstimateMemoryUsage(1000, "450kB", "143kB", 0);
190190
}
191191

192+
void testColumnsForWhichEmptyIsMissing(const std::string& analysis,
193+
bool expected_dependentVariableEmptyAsMissing) {
194+
using TBoolVec = std::vector<bool>;
195+
using TStrVec = std::vector<std::string>;
196+
197+
std::string parameters{"{\"dependent_variable\": \"label\"}"};
198+
std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString(
199+
"testJob", 10000, 5, 100000000, 1, {}, true,
200+
test::CTestTmpDir::tmpDir(), "", analysis, parameters)};
201+
api::CDataFrameAnalysisSpecification spec{jsonSpec};
202+
203+
TStrVec fieldNames{"feature_1", "feature_2", "feature_3", "label"};
204+
TBoolVec emptyAsMissing{spec.columnsForWhichEmptyIsMissing(fieldNames)};
205+
206+
CPPUNIT_ASSERT_EQUAL(fieldNames.size(), emptyAsMissing.size());
207+
CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[0]));
208+
CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[1]));
209+
CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[2]));
210+
CPPUNIT_ASSERT_EQUAL(expected_dependentVariableEmptyAsMissing,
211+
bool(emptyAsMissing[3]));
212+
}
213+
214+
// For a "classification" analysis the dependent variable column is expected
// to be reported as treating empty values as missing.
void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification() {
    const std::string analysisName{"classification"};
    const bool dependentVariableEmptyIsMissing{true};
    testColumnsForWhichEmptyIsMissing(analysisName, dependentVariableEmptyIsMissing);
}
217+
218+
// For a "regression" analysis the dependent variable column is expected NOT
// to be reported as treating empty values as missing.
void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression() {
    const std::string analysisName{"regression"};
    const bool dependentVariableEmptyIsMissing{false};
    testColumnsForWhichEmptyIsMissing(analysisName, dependentVariableEmptyIsMissing);
}
221+
192222
CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() {
193223
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalysisRunnerTest");
194224

@@ -213,6 +243,12 @@ CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() {
213243
suiteOfTests->addTest(new CppUnit::TestCaller<CDataFrameAnalysisRunnerTest>(
214244
"CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000",
215245
&CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000));
246+
suiteOfTests->addTest(new CppUnit::TestCaller<CDataFrameAnalysisRunnerTest>(
247+
"CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification",
248+
&CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification));
249+
suiteOfTests->addTest(new CppUnit::TestCaller<CDataFrameAnalysisRunnerTest>(
250+
"CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression",
251+
&CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression));
216252

217253
return suiteOfTests;
218254
}

lib/api/unittest/CDataFrameAnalysisRunnerTest.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,8 @@ class CDataFrameAnalysisRunnerTest : public CppUnit::TestFixture {
1818
void testEstimateMemoryUsage_10();
1919
void testEstimateMemoryUsage_100();
2020
void testEstimateMemoryUsage_1000();
21+
void testColumnsForWhichEmptyIsMissing_Classification();
22+
void testColumnsForWhichEmptyIsMissing_Regression();
2123

2224
static CppUnit::Test* suite();
2325

lib/api/unittest/CDataFrameAnalyzerTest.cc

Lines changed: 98 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ using TDoubleVecVec = std::vector<TDoubleVec>;
4444
using TSizeVec = std::vector<std::size_t>;
4545
using TStrVec = std::vector<std::string>;
4646
using TRowItr = core::CDataFrame::TRowItr;
47+
using TRowRef = core::CDataFrame::TRowRef;
4748
using TPoint = maths::CDenseVector<maths::CFloatStorage>;
4849
using TPointVec = std::vector<TPoint>;
4950
using TDataFrameUPtr = std::unique_ptr<core::CDataFrame>;
@@ -144,20 +145,21 @@ auto outlierSpec(std::size_t rows = 110,
144145
return std::make_unique<api::CDataFrameAnalysisSpecification>(spec);
145146
}
146147

147-
auto regressionSpec(std::string dependentVariable,
148-
std::size_t rows = 100,
149-
std::size_t cols = 5,
150-
std::size_t memoryLimit = 3000000,
151-
std::size_t numberRoundsPerHyperparameter = 0,
152-
std::size_t bayesianOptimisationRestarts = 0,
153-
const TStrVec& categoricalFieldNames = TStrVec{},
154-
double lambda = -1.0,
155-
double gamma = -1.0,
156-
double eta = -1.0,
157-
std::size_t maximumNumberTrees = 0,
158-
double featureBagFraction = -1.0,
159-
CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr,
160-
CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) {
148+
auto analysisSpec(std::string analysis,
149+
std::string dependentVariable,
150+
std::size_t rows = 100,
151+
std::size_t cols = 5,
152+
std::size_t memoryLimit = 3000000,
153+
std::size_t numberRoundsPerHyperparameter = 0,
154+
std::size_t bayesianOptimisationRestarts = 0,
155+
const TStrVec& categoricalFieldNames = TStrVec{},
156+
double lambda = -1.0,
157+
double gamma = -1.0,
158+
double eta = -1.0,
159+
std::size_t maximumNumberTrees = 0,
160+
double featureBagFraction = -1.0,
161+
CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr,
162+
CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) {
161163

162164
std::string parameters = "{\n\"dependent_variable\": \"" + dependentVariable + "\"";
163165
if (lambda >= 0.0) {
@@ -189,7 +191,7 @@ auto regressionSpec(std::string dependentVariable,
189191

190192
std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString(
191193
"testJob", rows, cols, memoryLimit, 1, categoricalFieldNames, true,
192-
test::CTestTmpDir::tmpDir(), "ml", "regression", parameters)};
194+
test::CTestTmpDir::tmpDir(), "ml", analysis, parameters)};
193195

194196
LOG_TRACE(<< "spec =\n" << spec);
195197

@@ -614,7 +616,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() {
614616

615617
TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."};
616618
TStrVec fieldValues{"", "", "", "", "", "0", ""};
617-
api::CDataFrameAnalyzer analyzer{regressionSpec("c5"), outputWriterFactory};
619+
api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "c5"), outputWriterFactory};
618620
addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions);
619621

620622
core::CStopWatch watch{true};
@@ -676,8 +678,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() {
676678
};
677679

678680
api::CDataFrameAnalyzer analyzer{
679-
regressionSpec("c5", 100, 5, 3000000, 0, 0, {}, lambda, gamma, eta,
680-
maximumNumberTrees, featureBagFraction),
681+
analysisSpec("regression", "c5", 100, 5, 3000000, 0, 0, {}, lambda,
682+
gamma, eta, maximumNumberTrees, featureBagFraction),
681683
outputWriterFactory};
682684

683685
TDoubleVec expectedPredictions;
@@ -728,8 +730,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu
728730

729731
auto target = [](double feature) { return 10.0 * feature; };
730732

731-
api::CDataFrameAnalyzer analyzer{regressionSpec("target", 50, 2, 2000000),
732-
outputWriterFactory};
733+
api::CDataFrameAnalyzer analyzer{
734+
analysisSpec("regression", "target", 50, 2, 2000000), outputWriterFactory};
733735

734736
TDoubleVec feature;
735737
rng.generateUniformSamples(1.0, 3.0, 50, feature);
@@ -946,7 +948,8 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
946948

947949
{
948950
api::CDataFrameAnalyzer analyzer{
949-
regressionSpec("x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), outputWriterFactory};
951+
analysisSpec("regression", "x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}),
952+
outputWriterFactory};
950953

951954
TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"},
952955
{"x21", "x22", "x23", "x24", "x25", "x26", "x27"}};
@@ -972,7 +975,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
972975
passed &= (expected[1] == (*row)[1]);
973976
if (wasPassed && passed == false) {
974977
LOG_ERROR(<< "expected " << core::CContainerPrinter::print(expected)
975-
<< "got [" << (*row)[0] << ", " << (*row)[1] << "]");
978+
<< ", got [" << (*row)[0] << ", " << (*row)[1] << "]");
976979
}
977980
}
978981
});
@@ -984,9 +987,9 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
984987
{
985988
std::size_t rows{api::CDataFrameAnalyzer::MAX_CATEGORICAL_CARDINALITY + 3};
986989

987-
api::CDataFrameAnalyzer analyzer{
988-
regressionSpec("x5", rows, 5, 8000000000, 0, 0, {"x1"}, 0, 0, 0, 0, 0),
989-
outputWriterFactory};
990+
api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "x5", rows, 5, 8000000000,
991+
0, 0, {"x1"}, 0, 0, 0, 0, 0),
992+
outputWriterFactory};
990993

991994
TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."};
992995
TStrVec fieldValues{"", "", "", "", "", "", ""};
@@ -1009,7 +1012,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
10091012
bool wasPassed{passed};
10101013
passed &= (expected == (*row)[0]);
10111014
if (wasPassed && passed == false) {
1012-
LOG_ERROR(<< "expected " << expected << " got " << (*row)[0]);
1015+
LOG_ERROR(<< "expected " << expected << ", got " << (*row)[0]);
10131016
}
10141017
}
10151018
});
@@ -1018,6 +1021,66 @@ void CDataFrameAnalyzerTest::testCategoricalFields() {
10181021
}
10191022
}
10201023

1024+
void CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing() {
1025+
1026+
auto eq = [](double expected) {
1027+
return [expected](double actual) { return expected == actual; };
1028+
};
1029+
1030+
auto nan = []() { return [](double actual) { return isnan(actual); }; };
1031+
1032+
auto assertRow = [&](const std::size_t row_i,
1033+
const std::vector<std::function<bool(double)>>& matchers,
1034+
const TRowRef& row) {
1035+
CPPUNIT_ASSERT_EQUAL_MESSAGE("row " + std::to_string(row_i),
1036+
matchers.size(), row.numberColumns());
1037+
for (std::size_t i = 0; i < row.numberColumns(); ++i) {
1038+
CPPUNIT_ASSERT_MESSAGE("row " + std::to_string(row_i) +
1039+
", column " + std::to_string(i),
1040+
matchers[i](row[i]));
1041+
}
1042+
};
1043+
1044+
std::stringstream output;
1045+
auto outputWriterFactory = [&output]() {
1046+
return std::make_unique<core::CJsonOutputStreamWrapper>(output);
1047+
};
1048+
1049+
api::CDataFrameAnalyzer analyzer{analysisSpec("classification", "x5", 1000, 5,
1050+
8000000, 0, 0, {"x1", "x2", "x5"}),
1051+
outputWriterFactory};
1052+
1053+
TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."};
1054+
analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""});
1055+
analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""});
1056+
analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""});
1057+
analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""});
1058+
analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""});
1059+
analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""});
1060+
analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""});
1061+
analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""});
1062+
analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""});
1063+
analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""});
1064+
analyzer.receivedAllRows();
1065+
1066+
const core::CDataFrame& frame{analyzer.dataFrame()};
1067+
frame.readRows(1, [&](TRowItr beginRows, TRowItr endRows) {
1068+
std::vector<TRowRef> rows;
1069+
std::copy(beginRows, endRows, std::back_inserter(rows));
1070+
CPPUNIT_ASSERT_EQUAL(10UL, rows.size());
1071+
assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]);
1072+
assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]);
1073+
assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]);
1074+
assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), nan() }, rows[3]);
1075+
assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]);
1076+
assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]);
1077+
assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), nan() }, rows[6]);
1078+
assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), nan() }, rows[7]);
1079+
assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]);
1080+
assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]);
1081+
});
1082+
}
1083+
10211084
CppUnit::Test* CDataFrameAnalyzerTest::suite() {
10221085
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalyzerTest");
10231086

@@ -1058,6 +1121,9 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() {
10581121
suiteOfTests->addTest(new CppUnit::TestCaller<CDataFrameAnalyzerTest>(
10591122
"CDataFrameAnalyzerTest::testCategoricalFields",
10601123
&CDataFrameAnalyzerTest::testCategoricalFields));
1124+
suiteOfTests->addTest(new CppUnit::TestCaller<CDataFrameAnalyzerTest>(
1125+
"CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing",
1126+
&CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing));
10611127

10621128
return suiteOfTests;
10631129
}
@@ -1137,9 +1203,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti
11371203
// compute expected tree
11381204

11391205
api::CDataFrameAnalyzer analyzer{
1140-
regressionSpec("c5", numberExamples, 5, 15000000,
1141-
numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta,
1142-
maximumNumberTrees, featureBagFraction, &persisterSupplier),
1206+
analysisSpec("regression", "c5", numberExamples, 5, 15000000,
1207+
numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta,
1208+
maximumNumberTrees, featureBagFraction, &persisterSupplier),
11431209
outputWriterFactory};
11441210
std::size_t dependentVariable(
11451211
std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin());
@@ -1161,9 +1227,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti
11611227
};
11621228

11631229
api::CDataFrameAnalyzer analyzerToRestore{
1164-
regressionSpec("c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter,
1165-
12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction,
1166-
&persisterSupplier, &restoreSearcherSupplier),
1230+
analysisSpec("regression", "c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter,
1231+
12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction,
1232+
&persisterSupplier, &restoreSearcherSupplier),
11671233
outputWriterFactory};
11681234

11691235
passDataToAnalyzer(fieldNames, fieldValues, analyzerToRestore, weights, values);

lib/api/unittest/CDataFrameAnalyzerTest.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture {
3939
void testErrors();
4040
void testRoundTripDocHashes();
4141
void testCategoricalFields();
42+
void testCategoricalFields_EmptyAsMissing();
4243

4344
static CppUnit::Test* suite();
4445

0 commit comments

Comments
 (0)