From 61cf0332af24d528df9dd30a0cdf13c3f74ee921 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 6 Aug 2019 18:29:52 +0100 Subject: [PATCH 1/3] Allow configuration of prediction column name and better default --- include/api/CDataFrameBoostedTreeRunner.h | 3 ++- lib/api/CDataFrameBoostedTreeRunner.cc | 24 ++++++++++++---------- lib/api/unittest/CDataFrameAnalyzerTest.cc | 22 ++++++++++---------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h index 18122cfd46..27c7dce2a2 100644 --- a/include/api/CDataFrameBoostedTreeRunner.h +++ b/include/api/CDataFrameBoostedTreeRunner.h @@ -54,7 +54,8 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu private: // Note custom config is written directly to the factory object. - std::string m_DependentVariable; + std::string m_DependentVariableName; + std::string m_PredictionName; TBoostedTreeFactoryUPtr m_BoostedTreeFactory; TBoostedTreeUPtr m_BoostedTree; }; diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 212808d540..7114154a0e 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -23,7 +23,8 @@ namespace ml { namespace api { namespace { // Configuration -const std::string DEPENDENT_VARIABLE{"dependent_variable"}; +const std::string DEPENDENT_VARIABLE_NAME{"dependent_variable"}; +const std::string PREDICTION_COLUMN_NAME{"prediction_column_name"}; const std::string LAMBDA{"lambda"}; const std::string GAMMA{"gamma"}; const std::string ETA{"eta"}; @@ -32,8 +33,10 @@ const std::string FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { CDataFrameAnalysisConfigReader theReader; - theReader.addParameter(DEPENDENT_VARIABLE, + theReader.addParameter(DEPENDENT_VARIABLE_NAME, CDataFrameAnalysisConfigReader::E_RequiredParameter); + theReader.addParameter(PREDICTION_COLUMN_NAME, + CDataFrameAnalysisConfigReader::E_OptionalParameter); // TODO objective function, support train and predict. theReader.addParameter(LAMBDA, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(GAMMA, CDataFrameAnalysisConfigReader::E_OptionalParameter); @@ -44,9 +47,6 @@ const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { CDataFrameAnalysisConfigReader::E_OptionalParameter); return theReader; }()}; - -// Output -const std::string PREDICTION{"prediction"}; } CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec, @@ -55,7 +55,10 @@ CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysi auto parameters = PARAMETER_READER.read(jsonParameters); - m_DependentVariable = parameters[DEPENDENT_VARIABLE].as(); + m_DependentVariableName = parameters[DEPENDENT_VARIABLE_NAME].as(); + + m_PredictionName = parameters[PREDICTION_COLUMN_NAME].fallback( + m_DependentVariableName + "_prediction"); std::size_t maximumNumberTrees{ parameters[MAXIMUM_NUMBER_TREES].fallback(std::size_t{0})}; @@ -117,7 +120,7 @@ void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, HANDLE_FATAL(<< "Internal error: boosted tree object missing. Please report this error."); } else { writer.StartObject(); - writer.Key(PREDICTION); + writer.Key(m_PredictionName); writer.Double(row[m_BoostedTree->columnHoldingPrediction(row.numberColumns())]); writer.EndObject(); } @@ -126,12 +129,11 @@ void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, core::CDataFrame& frame) { auto dependentVariableColumn = - std::find(featureNames.begin(), featureNames.end(), m_DependentVariable); + std::find(featureNames.begin(), featureNames.end(), m_DependentVariableName); if (dependentVariableColumn == featureNames.end()) { HANDLE_FATAL(<< "Input error: supplied variable to predict '" - << m_DependentVariable << "' is missing from training data " - << core::CContainerPrinter::print(featureNames) - << ". Please report this problem."); + << m_DependentVariableName << "' is missing from training" + << " data " << core::CContainerPrinter::print(featureNames)); } else { m_BoostedTree = m_BoostedTreeFactory->buildFor( frame, dependentVariableColumn - featureNames.begin()); diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index d7db0219d4..e66b05c751 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -117,7 +117,7 @@ regressionSpec(std::string dependentVariable, rows, 5, memoryLimit, 1, categoricalFieldNames, true, test::CTestTmpDir::tmpDir(), "ml", "regression", parameters)}; - LOG_DEBUG(<< "spec =\n" << spec); + LOG_TRACE(<< "spec =\n" << spec); return std::make_unique(spec); } @@ -294,7 +294,7 @@ void CDataFrameAnalyzerTest::testWithoutControlMessages() { analyzer.run(); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str()); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -340,7 +340,7 @@ void CDataFrameAnalyzerTest::testRunOutlierDetection() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str()); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -394,7 +394,7 @@ void CDataFrameAnalyzerTest::testRunOutlierDetectionPartitioned() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -441,7 +441,7 @@ void CDataFrameAnalyzerTest::testRunOutlierFeatureInfluences() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedFeatureInfluence = expectedFeatureInfluences.begin(); @@ -492,7 +492,7 @@ void CDataFrameAnalyzerTest::testRunOutlierDetectionWithParams() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -530,7 +530,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedPrediction = expectedPredictions.begin(); @@ -540,7 +540,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { CPPUNIT_ASSERT(expectedPrediction != expectedPredictions.end()); CPPUNIT_ASSERT_DOUBLES_EQUAL( *expectedPrediction, - result["row_results"]["results"]["ml"]["prediction"].GetDouble(), + result["row_results"]["results"]["ml"]["c5_prediction"].GetDouble(), 1e-4 * std::fabs(*expectedPrediction)); ++expectedPrediction; CPPUNIT_ASSERT(result.HasMember("progress_percent") == false); @@ -584,7 +584,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str()); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedPrediction = expectedPredictions.begin(); @@ -594,7 +594,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { CPPUNIT_ASSERT(expectedPrediction != expectedPredictions.end()); CPPUNIT_ASSERT_DOUBLES_EQUAL( *expectedPrediction, - result["row_results"]["results"]["ml"]["prediction"].GetDouble(), + result["row_results"]["results"]["ml"]["c5_prediction"].GetDouble(), 1e-4 * std::fabs(*expectedPrediction)); ++expectedPrediction; CPPUNIT_ASSERT(result.HasMember("progress_percent") == false); @@ -748,7 +748,7 @@ void CDataFrameAnalyzerTest::testRoundTripDocHashes() { {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str().c_str())); + rapidjson::ParseResult ok(results.Parse(output.str()); CPPUNIT_ASSERT(static_cast(ok) == true); int expectedHash{0}; From 9ebb67cde9ed7328db009a8277d9ebbce42517b8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Aug 2019 11:49:24 +0100 Subject: [PATCH 2/3] Test fix --- lib/api/unittest/CDataFrameAnalyzerTest.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index e66b05c751..8af7326fda 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -294,7 +294,7 @@ void CDataFrameAnalyzerTest::testWithoutControlMessages() { analyzer.run(); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str()); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -340,7 +340,7 @@ void CDataFrameAnalyzerTest::testRunOutlierDetection() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str()); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedScore = expectedScores.begin(); @@ -584,7 +584,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str()); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); auto expectedPrediction = expectedPredictions.begin(); @@ -748,7 +748,7 @@ void CDataFrameAnalyzerTest::testRoundTripDocHashes() { {"", "", "", "", "", "", "$"}); rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str()); + rapidjson::ParseResult ok(results.Parse(output.str())); CPPUNIT_ASSERT(static_cast(ok) == true); int expectedHash{0}; From 4a924768058c9c92625345135c4d18beb5ad0000 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Aug 2019 13:57:12 +0100 Subject: [PATCH 3/3] Rename per review --- include/api/CDataFrameBoostedTreeRunner.h | 4 ++-- lib/api/CDataFrameBoostedTreeRunner.cc | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h index 27c7dce2a2..848d2b67bc 100644 --- a/include/api/CDataFrameBoostedTreeRunner.h +++ b/include/api/CDataFrameBoostedTreeRunner.h @@ -54,8 +54,8 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu private: // Note custom config is written directly to the factory object. - std::string m_DependentVariableName; - std::string m_PredictionName; + std::string m_DependentVariableFieldName; + std::string m_PredictionFieldName; TBoostedTreeFactoryUPtr m_BoostedTreeFactory; TBoostedTreeUPtr m_BoostedTree; }; diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 7114154a0e..7c00e30ecb 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -24,7 +24,7 @@ namespace api { namespace { // Configuration const std::string DEPENDENT_VARIABLE_NAME{"dependent_variable"}; -const std::string PREDICTION_COLUMN_NAME{"prediction_column_name"}; +const std::string PREDICTION_FIELD_NAME{"prediction_field_name"}; const std::string LAMBDA{"lambda"}; const std::string GAMMA{"gamma"}; const std::string ETA{"eta"}; @@ -35,7 +35,7 @@ const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { CDataFrameAnalysisConfigReader theReader; theReader.addParameter(DEPENDENT_VARIABLE_NAME, CDataFrameAnalysisConfigReader::E_RequiredParameter); - theReader.addParameter(PREDICTION_COLUMN_NAME, + theReader.addParameter(PREDICTION_FIELD_NAME, CDataFrameAnalysisConfigReader::E_OptionalParameter); // TODO objective function, support train and predict. theReader.addParameter(LAMBDA, CDataFrameAnalysisConfigReader::E_OptionalParameter); @@ -55,10 +55,10 @@ CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysi auto parameters = PARAMETER_READER.read(jsonParameters); - m_DependentVariableName = parameters[DEPENDENT_VARIABLE_NAME].as(); + m_DependentVariableFieldName = parameters[DEPENDENT_VARIABLE_NAME].as(); - m_PredictionName = parameters[PREDICTION_COLUMN_NAME].fallback( - m_DependentVariableName + "_prediction"); + m_PredictionFieldName = parameters[PREDICTION_FIELD_NAME].fallback( + m_DependentVariableFieldName + "_prediction"); std::size_t maximumNumberTrees{ parameters[MAXIMUM_NUMBER_TREES].fallback(std::size_t{0})}; @@ -120,7 +120,7 @@ void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, HANDLE_FATAL(<< "Internal error: boosted tree object missing. Please report this error."); } else { writer.StartObject(); - writer.Key(m_PredictionName); + writer.Key(m_PredictionFieldName); writer.Double(row[m_BoostedTree->columnHoldingPrediction(row.numberColumns())]); writer.EndObject(); } @@ -128,11 +128,11 @@ void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, core::CDataFrame& frame) { - auto dependentVariableColumn = - std::find(featureNames.begin(), featureNames.end(), m_DependentVariableName); + auto dependentVariableColumn = std::find( + featureNames.begin(), featureNames.end(), m_DependentVariableFieldName); if (dependentVariableColumn == featureNames.end()) { HANDLE_FATAL(<< "Input error: supplied variable to predict '" - << m_DependentVariableName << "' is missing from training" + << m_DependentVariableFieldName << "' is missing from training" << " data " << core::CContainerPrinter::print(featureNames)); } else { m_BoostedTree = m_BoostedTreeFactory->buildFor(