diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index e547965981..4c6d5fff28 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -43,6 +43,7 @@ boosted tree training. Hard depth based regularization is often the strategy of
choice to prevent overfitting for XGBoost. By smoothing we can make better tradeoffs.
Also, the parameters of the penalty function are more suited to optimising with our
Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
+* Binomial logistic regression targeting cross entropy. (See {ml-pull}713[#713].)
* Improvements to count and sum anomaly detection for sparse data. This primarily
aims to improve handling of data which are predictably present: detecting when they
are unexpectedly missing. (See {ml-pull}721[#721].)
diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h
index c460d4a99b..f528dd30cd 100644
--- a/include/maths/CBoostedTree.h
+++ b/include/maths/CBoostedTree.h
@@ -13,10 +13,13 @@
 #include
 #include

+#include
 #include
 #include
 #include
+#include
+#include

 namespace ml {
 namespace core {
@@ -29,18 +32,29 @@ class CEncodedDataFrameRowRef;

 namespace boosted_tree_detail {
 class MATHS_EXPORT CArgMinLossImpl {
 public:
+    CArgMinLossImpl(double lambda);
     virtual ~CArgMinLossImpl() = default;

     virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
+    virtual bool nextPass() = 0;
     virtual void add(double prediction, double actual) = 0;
     virtual void merge(const CArgMinLossImpl& other) = 0;
     virtual double value() const = 0;
+
+protected:
+    double lambda() const;
+
+private:
+    double m_Lambda;
 };

-//! \brief Finds the value to add to a set of predictions which minimises the MSE.
+//! \brief Finds the value to add to a set of predictions which minimises the
+//! regularized MSE w.r.t. the actual values.
 class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
 public:
+    CArgMinMseImpl(double lambda);
     std::unique_ptr<CArgMinLossImpl> clone() const override;
+    bool nextPass() override;
     void add(double prediction, double actual) override;
     void merge(const CArgMinLossImpl& other) override;
     double value() const override;
@@ -51,6 +65,46 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
 private:
     TMeanAccumulator m_MeanError;
 };
+
+//! \brief Finds the value to add to a set of predicted log-odds which minimises
+//! the regularized cross entropy loss w.r.t. the actual categories.
+class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
+public:
+    CArgMinLogisticImpl(double lambda);
+    std::unique_ptr<CArgMinLossImpl> clone() const override;
+    bool nextPass() override;
+    void add(double prediction, double actual) override;
+    void merge(const CArgMinLossImpl& other) override;
+    double value() const override;
+
+private:
+    using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
+    using TSizeVector = CVectorNx1<std::size_t, 2>;
+    using TSizeVectorVec = std::vector<TSizeVector>;
+
+private:
+    std::size_t bucket(double prediction) const {
+        double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
+        return std::min(static_cast<std::size_t>(bucket),
+                        m_BucketCategoryCounts.size() - 1);
+    }
+
+    double bucketCentre(std::size_t bucket) const {
+        return m_PredictionMinMax.min() +
+               (static_cast<double>(bucket) + 0.5) * this->bucketWidth();
+    }
+
+    double bucketWidth() const {
+        return m_PredictionMinMax.range() /
+               static_cast<double>(m_BucketCategoryCounts.size());
+    }
+
+private:
+    std::size_t m_CurrentPass = 0;
+    TMinMaxAccumulator m_PredictionMinMax;
+    TSizeVector m_CategoryCounts;
+    TSizeVectorVec m_BucketCategoryCounts;
+};
 }

 namespace boosted_tree {
@@ -64,6 +118,11 @@ class MATHS_EXPORT CArgMinLoss {
     CArgMinLoss& operator=(const CArgMinLoss& other);
     CArgMinLoss& operator=(CArgMinLoss&& other) = default;

+    //! Start another pass over the predictions and actuals.
+    //!
+    //! \return True if we need to perform another pass to compute value().
+    bool nextPass() const;
+
     //! Update with a point prediction and actual value.
     void add(double prediction, double actual);

@@ -94,6 +153,8 @@ class MATHS_EXPORT CArgMinLoss {
 class MATHS_EXPORT CLoss {
 public:
     virtual ~CLoss() = default;
+    //! Clone the loss.
+    virtual std::unique_ptr<CLoss> clone() const = 0;
     //! The value of the loss function.
     virtual double value(double prediction, double actual) const = 0;
     //! The slope of the loss function.
@@ -103,7 +164,7 @@
     //! Returns true if the loss curvature is constant.
     virtual bool isCurvatureConstant() const = 0;
     //! Get an object which computes the leaf value that minimises loss.
-    virtual CArgMinLoss minimizer() const = 0;
+    virtual CArgMinLoss minimizer(double lambda) const = 0;
     //! Get the name of the loss function
     virtual const std::string& name() const = 0;

@@ -114,11 +175,34 @@
 //! \brief The MSE loss function.
 class MATHS_EXPORT CMse final : public CLoss {
 public:
+    std::unique_ptr<CLoss> clone() const override;
     double value(double prediction, double actual) const override;
     double gradient(double prediction, double actual) const override;
     double curvature(double prediction, double actual) const override;
     bool isCurvatureConstant() const override;
-    CArgMinLoss minimizer() const override;
+    CArgMinLoss minimizer(double lambda) const override;
     const std::string& name() const override;

 public:
     static const std::string NAME;
 };
+
+//! \brief Implements loss for binomial logistic regression.
+//!
+//! DESCRIPTION:\n
+//! This targets the cross entropy loss using the tree to predict class log-odds:
+//! <pre>
+//!   \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
+//! </pre>
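+//! with derivatives \f$l_i'(p) = S(p) - a_i\f$ and \f$l_i''(p) = S(p)(1 - S(p))\f$,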
+//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
+//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
+class MATHS_EXPORT CLogistic final : public CLoss {
+public:
+    std::unique_ptr<CLoss> clone() const override;
+    double value(double prediction, double actual) const override;
+    double gradient(double prediction, double actual) const override;
+    double curvature(double prediction, double actual) const override;
+    bool isCurvatureConstant() const override;
+    CArgMinLoss minimizer(double lambda) const override;
+    const std::string& name() const override;
+
+public:
+    static const std::string NAME;
+};
@@ -248,6 +332,7 @@ class MATHS_EXPORT CBoostedTreeNode final {
 //! proposed by Reshef for this purpose. See CDataFrameCategoryEncoder for more details.
 class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
 public:
+    using TStrVec = std::vector<std::string>;
     using TRowRef = core::CDataFrame::TRowRef;
     using TLossFunctionUPtr = std::unique_ptr<boosted_tree::CLoss>;
     using TDataFramePtr = core::CDataFrame*;
@@ -285,6 +370,16 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
     //! Get the model produced by training if it has been run.
     const TNodeVecVec& trainedModel() const;

+    //! The name of the object holding the best hyperparameters in the state document.
+    static const std::string& bestHyperparametersName();
+
+    //! The name of the object holding the best regularisation hyperparameters in the
+    //! state document.
+    static const std::string& bestRegularizationHyperparametersName();
+
+    //! A list of the names of the best individual hyperparameters in the state document.
+    static TStrVec bestHyperparameterNames();
+
     //! Persist by passing information to \p inserter.
     void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h
index 01582d8d10..5f4a76f3b3 100644
--- a/include/maths/CBoostedTreeFactory.h
+++ b/include/maths/CBoostedTreeFactory.h
@@ -177,6 +177,8 @@ class MATHS_EXPORT CBoostedTreeFactory final {
     TOptionalDouble m_MinimumFrequencyToOneHotEncode;
     TOptionalSize m_BayesianOptimisationRestarts;
     bool m_Restored = false;
+    std::size_t m_NumberThreads;
+    TLossFunctionUPtr m_Loss;
     TBoostedTreeImplUPtr m_TreeImpl;
     TVector m_LogDepthPenaltyMultiplierSearchInterval;
     TVector m_LogTreeSizePenaltyMultiplierSearchInterval;
diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h
index 3b5bcad000..7e25af7293 100644
--- a/include/maths/CBoostedTreeImpl.h
+++ b/include/maths/CBoostedTreeImpl.h
@@ -48,6 +48,7 @@ inline std::size_t predictionColumn(std::size_t numberColumns) {
 class MATHS_EXPORT CBoostedTreeImpl final {
 public:
     using TDoubleVec = std::vector<double>;
+    using TStrVec = std::vector<std::string>;
     using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
     using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
     using TBayesinOptimizationUPtr = std::unique_ptr<CBayesianOptimisation>;
@@ -101,6 +102,16 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     //! frame with \p numberRows rows and \p numberColumns columns will use.
     std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberColumns) const;

+    //! The name of the object holding the best hyperparameters in the state document.
+    static const std::string& bestHyperparametersName();
+
+    //! The name of the object holding the best regularisation hyperparameters in the
+    //! state document.
+    static const std::string& bestRegularizationHyperparametersName();
+
+    //! A list of the names of the best individual hyperparameters in the state document.
+    static TStrVec bestHyperparameterNames();
+
     //! Persist by passing information to \p inserter.
     void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

diff --git a/include/maths/CTools.h b/include/maths/CTools.h
index 079a9304c6..d51f760564 100644
--- a/include/maths/CTools.h
+++ b/include/maths/CTools.h
@@ -678,7 +678,8 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
     //! \param[in] width The step width.
     //! \param[in] x0 The centre of the step.
     //! \param[in] sign Determines whether it's a step up or down.
-    static double logisticFunction(double x, double width, double x0 = 0.0, double sign = 1.0) {
+    static double
+    logisticFunction(double x, double width = 1.0, double x0 = 0.0, double sign = 1.0) {
         return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
     }

diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc
index b87139108b..7dedee6e4b 100644
--- a/lib/api/unittest/CDataFrameAnalyzerTest.cc
+++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc
@@ -49,6 +49,10 @@
 using TRowRef = core::CDataFrame::TRowRef;
 using TPoint = maths::CDenseVector;
 using TPointVec = std::vector<TPoint>;
 using TDataFrameUPtr = std::unique_ptr<core::CDataFrame>;
+using TDataAdderUPtr = std::unique_ptr<core::CDataAdder>;
+using TPersisterSupplier = std::function<TDataAdderUPtr()>;
+using TDataSearcherUPtr = std::unique_ptr<core::CDataSearcher>;
+using TRestoreSearcherSupplier = std::function<TDataSearcherUPtr()>;

 class CTestDataSearcher : public core::CDataSearcher {
 public:
@@ -58,6 +62,7 @@ class CTestDataSearcher : public core::CDataSearcher {
     virtual TIStreamP search(size_t /*currentDocNum*/, size_t /*limit*/) {
         std::istringstream* intermediateStateStream{
             static_cast<std::istringstream*>(m_Stream.get())};
+        // Discard first line, which contains the state id.
intermediateStateStream->ignore(256, '\n'); std::string intermediateState; std::getline(*intermediateStateStream, intermediateState); @@ -68,26 +73,8 @@ class CTestDataSearcher : public core::CDataSearcher { TIStreamP m_Stream; }; -class CTestDataAdder : public core::CDataAdder { -public: - CTestDataAdder() : m_Stream(new std::ostringstream) {} - - virtual TOStreamP addStreamed(const std::string& /*index*/, const std::string& /*id*/) { - return m_Stream; - } - - virtual bool streamComplete(TOStreamP& /*strm*/, bool /*force*/) { - return true; - } - - TOStreamP getStream() { return m_Stream; } - -private: - TOStreamP m_Stream; -}; - -std::vector streamToStringVector(std::stringstream&& tokenStream) { - std::vector results; +TStrVec splitOnNull(std::stringstream&& tokenStream) { + TStrVec results; std::string token; while (std::getline(tokenStream, token, '\0')) { results.push_back(token); @@ -108,6 +95,16 @@ rapidjson::Document treeToJsonDocument(const maths::CBoostedTree& tree) { return results; } +auto restoreTree(std::string persistedState, TDataFrameUPtr& frame, std::size_t dependentVariable) { + CTestDataSearcher dataSearcher(persistedState); + auto decompressor = std::make_unique(dataSearcher); + decompressor->setStateRestoreSearch(api::ML_STATE_INDEX, + api::getRegressionStateId("testJob")); + auto stream = decompressor->search(1, 1); + return maths::CBoostedTreeFactory::constructFromString(*stream).buildFor( + *frame, dependentVariable); +} + auto outlierSpec(std::size_t rows = 110, std::size_t memoryLimit = 100000, std::string method = "", @@ -146,29 +143,43 @@ auto outlierSpec(std::size_t rows = 110, return std::make_unique(spec); } -auto analysisSpec(std::string analysis, - std::string dependentVariable, - std::size_t rows = 100, - std::size_t cols = 5, - std::size_t memoryLimit = 3000000, - std::size_t numberRoundsPerHyperparameter = 0, - std::size_t bayesianOptimisationRestarts = 0, - const TStrVec& categoricalFieldNames = TStrVec{}, - double lambda = -1.0, - double gamma = -1.0, - double eta = -1.0, - std::size_t maximumNumberTrees = 0, - double featureBagFraction = -1.0, - CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr, - CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { +auto predictionSpec(std::string analysis, + std::string dependentVariable, + std::size_t rows = 100, + std::size_t cols = 5, + std::size_t memoryLimit = 3000000, + std::size_t numberRoundsPerHyperparameter = 0, + std::size_t bayesianOptimisationRestarts = 0, + const TStrVec& categoricalFieldNames = TStrVec{}, + double alpha = -1.0, + double lambda = -1.0, + double gamma = -1.0, + double softTreeDepthLimit = -1.0, + double softTreeDepthTolerance = -1.0, + double eta = -1.0, + std::size_t maximumNumberTrees = 0, + double featureBagFraction = -1.0, + TPersisterSupplier* persisterSupplier = nullptr, + TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { std::string parameters = "{\n\"dependent_variable\": \"" + dependentVariable + "\""; + if (alpha >= 0.0) { + parameters += ",\n\"alpha\": " + core::CStringUtils::typeToString(alpha); + } if (lambda >= 0.0) { parameters += ",\n\"lambda\": " + core::CStringUtils::typeToString(lambda); } if (gamma >= 0.0) { parameters += ",\n\"gamma\": " + core::CStringUtils::typeToString(gamma); } + if (softTreeDepthLimit >= 0.0) { + parameters += ",\n\"soft_tree_depth_limit\": " + + core::CStringUtils::typeToString(softTreeDepthLimit); + } + if (softTreeDepthTolerance >= 0.0) { + parameters += 
",\n\"soft_tree_depth_tolerance\": " + + core::CStringUtils::typeToString(softTreeDepthTolerance); + } if (eta > 0.0) { parameters += ",\n\"eta\": " + core::CStringUtils::typeToString(eta); } @@ -277,11 +288,11 @@ void addOutlierTestData(TStrVec fieldNames, }); } -TDataFrameUPtr passDataToAnalyzer(const TStrVec& fieldNames, - TStrVec& fieldValues, - api::CDataFrameAnalyzer& analyzer, - const TDoubleVec& weights, - const TDoubleVec& values) { +TDataFrameUPtr passLinearRegressionDataToAnalyzer(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& values) { auto f = [](const TDoubleVec& weights_, const TPoint& regressors) { double result{0.0}; @@ -320,8 +331,11 @@ void addRegressionTestData(const TStrVec& fieldNames, api::CDataFrameAnalyzer& analyzer, TDoubleVec& expectedPredictions, std::size_t numberExamples = 100, + double alpha = -1.0, double lambda = -1.0, double gamma = -1.0, + double softTreeDepthLimit = -1.0, + double softTreeDepthTolerance = -1.0, double eta = 0.0, std::size_t maximumNumberTrees = 0, double featureBagFraction = 0.0) { @@ -332,16 +346,26 @@ void addRegressionTestData(const TStrVec& fieldNames, TDoubleVec values; rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, values); - auto frame = passDataToAnalyzer(fieldNames, fieldValues, analyzer, weights, values); + auto frame = passLinearRegressionDataToAnalyzer(fieldNames, fieldValues, + analyzer, weights, values); maths::CBoostedTreeFactory treeFactory{maths::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique())}; + if (alpha >= 0.0) { + treeFactory.depthPenaltyMultiplier(alpha); + } if (lambda >= 0.0) { treeFactory.leafWeightPenaltyMultiplier(lambda); } if (gamma >= 0.0) { treeFactory.treeSizePenaltyMultiplier(gamma); } + if (softTreeDepthLimit >= 0.0) { + treeFactory.softTreeDepthLimit(softTreeDepthLimit); + } + if (softTreeDepthTolerance >= 0.0) { + treeFactory.softTreeDepthTolerance(softTreeDepthTolerance); + } if (eta > 0.0) { treeFactory.eta(eta); } @@ -364,6 +388,90 @@ void addRegressionTestData(const TStrVec& fieldNames, } }); } + +template +void testOneRunOfBoostedTreeTrainingWithStateRecovery(F makeSpec, std::size_t iterationToRestartFrom) { + + std::stringstream outputStream; + auto outputWriterFactory = [&outputStream]() { + return std::make_unique(outputStream); + }; + + std::size_t numberExamples{200}; + TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + TDoubleVec weights{0.1, 2.0, 0.4, -0.5}; + TDoubleVec values; + test::CRandomNumbers rng; + rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, values); + + auto persistenceStream = std::make_shared(); + TPersisterSupplier persisterSupplier = [&persistenceStream]() -> TDataAdderUPtr { + return std::make_unique(persistenceStream); + }; + + // Compute expected tree. 
+ + api::CDataFrameAnalyzer analyzer{ + makeSpec("c5", numberExamples, persisterSupplier), outputWriterFactory}; + std::size_t dependentVariable( + std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin()); + + auto frame = passLinearRegressionDataToAnalyzer(fieldNames, fieldValues, + analyzer, weights, values); + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + TStrVec persistedStates{ + splitOnNull(std::stringstream{std::move(persistenceStream->str())})}; + auto expectedTree = restoreTree(std::move(persistedStates.back()), frame, dependentVariable); + + // Compute actual tree. + + persistenceStream->str(""); + + std::istringstream intermediateStateStream{persistedStates[iterationToRestartFrom]}; + TRestoreSearcherSupplier restoreSearcherSupplier = [&intermediateStateStream]() -> TDataSearcherUPtr { + return std::make_unique(intermediateStateStream.str()); + }; + + api::CDataFrameAnalyzer restoredAnalyzer{ + makeSpec("c5", numberExamples, persisterSupplier), outputWriterFactory}; + + passLinearRegressionDataToAnalyzer(fieldNames, fieldValues, + restoredAnalyzer, weights, values); + restoredAnalyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + persistedStates = splitOnNull(std::stringstream{std::move(persistenceStream->str())}); + auto actualTree = restoreTree(std::move(persistedStates.back()), frame, dependentVariable); + + // Compare hyperparameters. + + rapidjson::Document expectedResults{treeToJsonDocument(*expectedTree)}; + const auto& expectedHyperparameters = + expectedResults[maths::CBoostedTree::bestHyperparametersName()]; + const auto& expectedRegularizationHyperparameters = + expectedHyperparameters[maths::CBoostedTree::bestRegularizationHyperparametersName()]; + + rapidjson::Document actualResults{treeToJsonDocument(*actualTree)}; + const auto& actualHyperparameters = + actualResults[maths::CBoostedTree::bestHyperparametersName()]; + const auto& actualRegularizationHyperparameters = + actualHyperparameters[maths::CBoostedTree::bestRegularizationHyperparametersName()]; + + for (const auto& key : maths::CBoostedTree::bestHyperparameterNames()) { + if (expectedHyperparameters.HasMember(key)) { + double expected{std::stod(expectedHyperparameters[key].GetString())}; + double actual{std::stod(actualHyperparameters[key].GetString())}; + CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); + } else if (expectedRegularizationHyperparameters.HasMember(key)) { + double expected{std::stod(expectedRegularizationHyperparameters[key].GetString())}; + double actual{std::stod(actualRegularizationHyperparameters[key].GetString())}; + CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); + } else { + CPPUNIT_FAIL("Missing " + key); + } + } +} } void CDataFrameAnalyzerTest::testWithoutControlMessages() { @@ -617,7 +725,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; - api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "c5"), outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{predictionSpec("regression", "c5"), outputWriterFactory}; addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions); core::CStopWatch watch{true}; @@ -667,8 +775,11 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { // Test the regression hyperparameter settings are correctly propagated to the // analysis runner. 
+ double alpha{2.0}; double lambda{1.0}; double gamma{10.0}; + double softTreeDepthLimit{3.0}; + double softTreeDepthTolerance{0.1}; double eta{0.9}; std::size_t maximumNumberTrees{1}; double featureBagFraction{0.3}; @@ -679,16 +790,18 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { }; api::CDataFrameAnalyzer analyzer{ - analysisSpec("regression", "c5", 100, 5, 3000000, 0, 0, {}, lambda, - gamma, eta, maximumNumberTrees, featureBagFraction), + predictionSpec("regression", "c5", 100, 5, 3000000, 0, 0, {}, alpha, + lambda, gamma, softTreeDepthLimit, softTreeDepthTolerance, + eta, maximumNumberTrees, featureBagFraction), outputWriterFactory}; TDoubleVec expectedPredictions; TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; - addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions, 100, - lambda, gamma, eta, maximumNumberTrees, featureBagFraction); + addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions, + 100, alpha, lambda, gamma, softTreeDepthLimit, softTreeDepthTolerance, + eta, maximumNumberTrees, featureBagFraction); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; @@ -732,7 +845,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu auto target = [](double feature) { return 10.0 * feature; }; api::CDataFrameAnalyzer analyzer{ - analysisSpec("regression", "target", 50, 2, 2000000), outputWriterFactory}; + predictionSpec("regression", "target", 50, 2, 2000000), outputWriterFactory}; TDoubleVec feature; rng.generateUniformSamples(1.0, 3.0, 50, feature); @@ -776,6 +889,62 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu CPPUNIT_ASSERT_EQUAL(std::size_t{50}, numberResults); } +void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecovery() { + + struct SHyperparameters { + SHyperparameters(double alpha = 2.0, double lambda = 1.0, double gamma = 10.0) + : s_Alpha{alpha}, s_Lambda{lambda}, s_Gamma{gamma} {} + + std::size_t numberUnset() const { + return (s_Alpha < 0.0 ? 1 : 0) + (s_Lambda < 0.0 ? 1 : 0) + + (s_Gamma < 0.0 ? 1 : 0); + } + + double s_Alpha; + double s_Lambda; + double s_Gamma; + double s_SoftTreeDepthLimit = 3.0; + double s_SoftTreeDepthTolerance = 0.15; + double s_Eta = 0.9; + std::size_t s_MaximumNumberTrees = 2; + double s_FeatureBagFraction = 0.3; + }; + + std::size_t numberRoundsPerHyperparameter{3}; + + TSizeVec intermediateIterations; + std::size_t finalIteration{0}; + + test::CRandomNumbers rng; + + // TODO re-enable case that all parameters are set. 
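+    // Each case below unsets one more multiplier (alpha, then lambda, then
+    // gamma), so the number of hyperparameters to search increases from one
+    // to three.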
+ for (const auto& params : + {/*SHyperparameters{},*/ SHyperparameters{-1.0}, + SHyperparameters{-1.0, -1.0}, SHyperparameters{-1.0, -1.0, -1.0}}) { + + LOG_DEBUG(<< "Number parameters to search = " << params.numberUnset()); + + auto makeSpec = [&](const std::string& dependentVariable, std::size_t numberExamples, + TPersisterSupplier persisterSupplier) { + return predictionSpec("regression", dependentVariable, numberExamples, + 5, 15000000, numberRoundsPerHyperparameter, + 12, {}, params.s_Alpha, params.s_Lambda, + params.s_Gamma, params.s_SoftTreeDepthLimit, + params.s_SoftTreeDepthTolerance, params.s_Eta, + params.s_MaximumNumberTrees, + params.s_FeatureBagFraction, &persisterSupplier); + }; + + finalIteration = params.numberUnset() * numberRoundsPerHyperparameter - 1; + rng.generateUniformSamples(0, finalIteration - 1, 3, intermediateIterations); + + for (auto intermediateIteration : intermediateIterations) { + LOG_DEBUG(<< "restart from " << intermediateIteration); + testOneRunOfBoostedTreeTrainingWithStateRecovery(makeSpec, intermediateIteration); + } + } +} + void CDataFrameAnalyzerTest::testFlushMessage() { // Test that white space is just ignored. @@ -949,7 +1118,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { api::CDataFrameAnalyzer analyzer{ - analysisSpec("regression", "x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), + predictionSpec("regression", "x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -988,9 +1157,9 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { std::size_t rows{api::CDataFrameAnalyzer::MAX_CATEGORICAL_CARDINALITY + 3}; - api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "x5", rows, 5, 8000000000, - 0, 0, {"x1"}, 0, 0, 0, 0, 0), - outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{ + predictionSpec("regression", "x5", rows, 5, 8000000000, 0, 0, {"x1"}), + outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "", ""}; @@ -1051,8 +1220,8 @@ void CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing() { return std::make_unique(output); }; - api::CDataFrameAnalyzer analyzer{analysisSpec("classification", "x5", 1000, 5, - 8000000, 0, 0, {"x1", "x2", "x5"}), + api::CDataFrameAnalyzer analyzer{predictionSpec("classification", "x5", 1000, 5, + 8000000, 0, 0, {"x1", "x2", "x5"}), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; @@ -1132,155 +1301,3 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() { return suiteOfTests; } - -void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecovery() { - - // no hyperparameter search - double lambda{1.0}; - double gamma{10.0}; - double eta{0.9}; - std::size_t maximumNumberTrees{2}; - double featureBagFraction{0.3}; - std::size_t numberRoundsPerHyperparameter{5}; - - TSizeVec intermediateIterations; - std::size_t finalIteration{0}; - - test::CRandomNumbers rng; - - // TODO reactivate this test case - // LOG_DEBUG(<< "No hyperparameters to search") - // testRunBoostedTreeTrainingWithStateRecoverySubroutine( - // lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - // numberRoundsPerHyperparameter, 0, finalIteration); - - LOG_DEBUG(<< "One hyperparameter to search"); - lambda = -1.0; - gamma = 10.0; - finalIteration = 1 * numberRoundsPerHyperparameter - 1; - rng.generateUniformSamples(1, finalIteration - 1, 3, intermediateIterations); - for (auto intermediateIteration : intermediateIterations) { - LOG_DEBUG(<< 
"restart from " << intermediateIteration); - testRunBoostedTreeTrainingWithStateRecoverySubroutine( - lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - numberRoundsPerHyperparameter, intermediateIteration); - } - - LOG_DEBUG(<< "Two hyperparameters to search"); - lambda = -1.0; - gamma = -1.0; - finalIteration = 2 * numberRoundsPerHyperparameter - 1; - rng.generateUniformSamples(finalIteration / 2, finalIteration - 1, 3, intermediateIterations); - for (auto intermediateIteration : intermediateIterations) { - LOG_DEBUG(<< "restart from " << intermediateIteration); - testRunBoostedTreeTrainingWithStateRecoverySubroutine( - lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - numberRoundsPerHyperparameter, intermediateIteration); - } -} - -void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubroutine( - double lambda, - double gamma, - double eta, - std::size_t maximumNumberTrees, - double featureBagFraction, - std::size_t numberRoundsPerHyperparameter, - std::size_t iterationToRestartFrom) const { - std::stringstream outputStream; - auto outputWriterFactory = [&outputStream]() { - return std::make_unique(outputStream); - }; - - std::size_t numberExamples{200}; - TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - TDoubleVec weights{0.1, 2.0, 0.4, -0.5}; - TDoubleVec values; - test::CRandomNumbers rng; - rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, values); - - auto persistenceStream{std::make_shared()}; - TPersisterSupplier persisterSupplier = [&persistenceStream]() -> TDataAdderUPtr { - return std::make_unique(persistenceStream); - }; - - // compute expected tree - - api::CDataFrameAnalyzer analyzer{ - analysisSpec("regression", "c5", numberExamples, 5, 15000000, - numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, - maximumNumberTrees, featureBagFraction, &persisterSupplier), - outputWriterFactory}; - std::size_t dependentVariable( - std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin()); - - auto frame{passDataToAnalyzer(fieldNames, fieldValues, analyzer, weights, values)}; - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - TStrVec persistedStatesString{ - streamToStringVector(std::stringstream(persistenceStream->str()))}; - - auto expectedTree{this->getFinalTree(persistedStatesString, frame, dependentVariable)}; - - // Compute actual tree - persistenceStream->str(""); - - std::istringstream intermediateStateStream{persistedStatesString[iterationToRestartFrom]}; - TRestoreSearcherSupplier restoreSearcherSupplier = [&intermediateStateStream]() -> TDataSearcherUPtr { - return std::make_unique(intermediateStateStream.str()); - }; - - api::CDataFrameAnalyzer analyzerToRestore{ - analysisSpec("regression", "c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter, - 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - &persisterSupplier, &restoreSearcherSupplier), - outputWriterFactory}; - - passDataToAnalyzer(fieldNames, fieldValues, analyzerToRestore, weights, values); - analyzerToRestore.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - persistedStatesString = - streamToStringVector(std::stringstream(persistenceStream->str())); - auto actualTree{this->getFinalTree(persistedStatesString, frame, dependentVariable)}; - - // compare hyperparameter - - // TODO avoid implicit dependency on state names - - rapidjson::Document expectedResults{treeToJsonDocument(*expectedTree)}; - const 
auto& expectedHyperparameters = expectedResults["best_hyperparameters"]; - const auto& expectedRegularizationHyperparameters = - expectedHyperparameters["hyperparam_regularization"]; - - rapidjson::Document actualResults{treeToJsonDocument(*actualTree)}; - const auto& actualHyperparameters = actualResults["best_hyperparameters"]; - const auto& actualRegularizationHyperparameters = - actualHyperparameters["hyperparam_regularization"]; - - for (const auto& key : {"hyperparam_eta", "hyperparam_eta_growth_rate_per_tree", - "hyperparam_feature_bag_fraction"}) { - double expected{std::stod(expectedHyperparameters[key].GetString())}; - double actual{std::stod(actualHyperparameters[key].GetString())}; - CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); - } - for (const auto& key : {"regularization_tree_size_penalty_multiplier", - "regularization_leaf_weight_penalty_multiplier"}) { - double expected{std::stod(expectedRegularizationHyperparameters[key].GetString())}; - double actual{std::stod(actualRegularizationHyperparameters[key].GetString())}; - CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); - } -} - -maths::CBoostedTreeFactory::TBoostedTreeUPtr -CDataFrameAnalyzerTest::getFinalTree(const TStrVec& persistedStates, - std::unique_ptr& frame, - std::size_t dependentVariable) const { - CTestDataSearcher dataSearcher(persistedStates.back()); - auto decompressor{std::make_unique(dataSearcher)}; - decompressor->setStateRestoreSearch(api::ML_STATE_INDEX, - api::getRegressionStateId("testJob")); - auto stream{decompressor->search(1, 1)}; - return maths::CBoostedTreeFactory::constructFromString(*stream).buildFor( - *frame, dependentVariable); -} diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.h b/lib/api/unittest/CDataFrameAnalyzerTest.h index bbf1ecaf49..52ddb254a6 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.h +++ b/lib/api/unittest/CDataFrameAnalyzerTest.h @@ -7,24 +7,9 @@ #ifndef INCLUDED_CDataFrameAnalyzerTest_h #define INCLUDED_CDataFrameAnalyzerTest_h -#include -#include - -#include -#include - -#include - #include class CDataFrameAnalyzerTest : public CppUnit::TestFixture { -public: - using TDataAdderUPtr = std::unique_ptr; - using TPersisterSupplier = std::function; - using TDataSearcherUPtr = std::unique_ptr; - using TRestoreSearcherSupplier = std::function; - using TDataFrameUPtr = std::unique_ptr; - public: void testWithoutControlMessages(); void testRunOutlierDetection(); @@ -42,25 +27,6 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture { void testCategoricalFieldsEmptyAsMissing(); static CppUnit::Test* suite(); - -private: - using TDoubleVec = std::vector; - using TStrVec = std::vector; - -private: - void testRunBoostedTreeTrainingWithStateRecoverySubroutine( - double lambda, - double gamma, - double eta, - std::size_t maximumNumberTrees, - double featureBagFraction, - std::size_t numberRoundsPerHyperparameter, - std::size_t iterationToRestartFrom) const; - - ml::maths::CBoostedTreeFactory::TBoostedTreeUPtr - getFinalTree(const TStrVec& persistedStates, - TDataFrameUPtr& frame, - std::size_t dependentVariable) const; }; #endif // INCLUDED_CDataFrameAnalyzerTest_h diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 7c9f1c4528..b1fbe2a74b 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -10,6 +10,11 @@ #include #include +#include +#include +#include + +#include #include #include @@ -27,10 +32,24 @@ const std::string SPLIT_VALUE_TAG{"split_value"}; namespace boosted_tree_detail { 
+CArgMinLossImpl::CArgMinLossImpl(double lambda) : m_Lambda{lambda} {
+}
+
+double CArgMinLossImpl::lambda() const {
+    return m_Lambda;
+}
+
+CArgMinMseImpl::CArgMinMseImpl(double lambda) : CArgMinLossImpl{lambda} {
+}
+
 std::unique_ptr<CArgMinLossImpl> CArgMinMseImpl::clone() const {
     return std::make_unique<CArgMinMseImpl>(*this);
 }

+bool CArgMinMseImpl::nextPass() {
+    return false;
+}
+
 void CArgMinMseImpl::add(double prediction, double actual) {
     m_MeanError.add(actual - prediction);
 }
@@ -43,7 +62,136 @@ void CArgMinMseImpl::merge(const CArgMinLossImpl& other) {
 }

 double CArgMinMseImpl::value() const {
-    return CBasicStatistics::mean(m_MeanError);
+
+    // We are searching for the value x which minimises
+    //
+    //    x^* = argmin_x{ sum_i{(a_i - (p_i + x))^2} + lambda * x^2 }
+    //
+    // This is convex so there is one minimum where the derivative w.r.t. x is zero
+    // and x^* = 1 / (n + lambda) sum_i{ a_i - p_i }. Denoting the mean prediction
+    // error m = 1/n sum_i{ a_i - p_i } we have x^* = n / (n + lambda) m.
+
+    double count{CBasicStatistics::count(m_MeanError)};
+    return count == 0.0
+               ? 0.0
+               : count / (count + this->lambda()) * CBasicStatistics::mean(m_MeanError);
+}
+
+CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda)
+    : CArgMinLossImpl{lambda}, m_CategoryCounts{0},
+      m_BucketCategoryCounts(128, TSizeVector{0}) {
+}
+
+std::unique_ptr<CArgMinLossImpl> CArgMinLogisticImpl::clone() const {
+    return std::make_unique<CArgMinLogisticImpl>(*this);
+}
+
+bool CArgMinLogisticImpl::nextPass() {
+    m_CurrentPass += this->bucketWidth() > 0.0 ? 1 : 2;
+    return m_CurrentPass < 2;
+}
+
+void CArgMinLogisticImpl::add(double prediction, double actual) {
+    switch (m_CurrentPass) {
+    case 0: {
+        m_PredictionMinMax.add(prediction);
+        ++m_CategoryCounts(static_cast<std::size_t>(actual));
+        break;
+    }
+    case 1: {
+        auto& count = m_BucketCategoryCounts[this->bucket(prediction)];
+        ++count(static_cast<std::size_t>(actual));
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) {
+    const auto* logistic = dynamic_cast<const CArgMinLogisticImpl*>(&other);
+    if (logistic != nullptr) {
+        switch (m_CurrentPass) {
+        case 0:
+            m_PredictionMinMax += logistic->m_PredictionMinMax;
+            m_CategoryCounts += logistic->m_CategoryCounts;
+            break;
+        case 1:
+            for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) {
+                m_BucketCategoryCounts[i] += logistic->m_BucketCategoryCounts[i];
+            }
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+double CArgMinLogisticImpl::value() const {
+
+    std::function<double(double)> objective;
+    double minWeight;
+    double maxWeight;
+
+    // This is true if and only if all the predictions were identical. In this
+    // case we only need one pass over the data and can compute the optimal
+    // value from the counts of the two categories.
+    if (this->bucketWidth() == 0.0) {
+        objective = [this](double weight) {
+            double p{CTools::logisticFunction(weight)};
+            std::size_t c0{m_CategoryCounts(0)};
+            std::size_t c1{m_CategoryCounts(1)};
+            return this->lambda() * CTools::pow2(weight) -
+                   static_cast<double>(c0) * CTools::fastLog(1.0 - p) -
+                   static_cast<double>(c1) * CTools::fastLog(p);
+        };
+
+        // Weight shrinkage means the optimal weight will be somewhere
+        // between the logit of the empirical probability and zero.
+        std::size_t c0{m_CategoryCounts(0) + 1};
+        std::size_t c1{m_CategoryCounts(1) + 1};
+        double empiricalProbabilityC1{static_cast<double>(c1) /
                                      static_cast<double>(c0 + c1)};
+        double empiricalLogOddsC1{
+            std::log(empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))};
+        minWeight = empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0;
+        maxWeight = empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1;
+
+    } else {
+        objective = [this](double weight) {
+            double loss{0.0};
+            for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) {
+                double bucketPrediction{this->bucketCentre(i)};
+                double p{CTools::logisticFunction(bucketPrediction + weight)};
+                std::size_t c0{m_BucketCategoryCounts[i](0)};
+                std::size_t c1{m_BucketCategoryCounts[i](1)};
+                loss -= static_cast<double>(c0) * CTools::fastLog(1.0 - p) +
+                        static_cast<double>(c1) * CTools::fastLog(p);
+            }
+            return loss + this->lambda() * CTools::pow2(weight);
+        };
+
+        // Choose a weight interval in which all probabilities vary from close to
+        // zero to close to one. In particular, the idea is to minimize the leaf
+        // weight on an interval [a, b] where if we add "a" the log-odds for all
+        // rows <= -5, i.e. max prediction + a = -5, and if we add "b" the log-odds
+        // for all rows >= 5, i.e. min prediction + b = 5.
+        minWeight = -m_PredictionMinMax.max() - 5.0;
+        maxWeight = -m_PredictionMinMax.min() + 5.0;
+    }
+
+    if (minWeight == maxWeight) {
+        return minWeight;
+    }
+
+    double minimum;
+    double objectiveAtMinimum;
+    std::size_t maxIterations{10};
+    CSolvers::minimize(minWeight, maxWeight, objective(minWeight), objective(maxWeight),
+                       objective, 1e-3, maxIterations, minimum, objectiveAtMinimum);
+    LOG_TRACE(<< "minimum = " << minimum << " objective(minimum) = " << objectiveAtMinimum);
+
+    return minimum;
 }
 }

@@ -61,6 +209,10 @@ CArgMinLoss& CArgMinLoss::operator=(const CArgMinLoss& other) {
     return *this;
 }

+bool CArgMinLoss::nextPass() const {
+    return m_Impl->nextPass();
+}
+
 void CArgMinLoss::add(double prediction, double actual) {
     return m_Impl->add(prediction, actual);
 }
@@ -80,6 +232,10 @@ CArgMinLoss CLoss::makeMinimizer(const boosted_tree_detail::CArgMinLossImpl& impl) const {
     return {impl};
 }

+std::unique_ptr<CLoss> CMse::clone() const {
+    return std::make_unique<CMse>(*this);
+}
+
 double CMse::value(double prediction, double actual) const {
     return CTools::pow2(prediction - actual);
 }
@@ -96,8 +252,8 @@ bool CMse::isCurvatureConstant() const {
     return true;
 }

-CArgMinLoss CMse::minimizer() const {
-    return this->makeMinimizer(CArgMinMseImpl{});
+CArgMinLoss CMse::minimizer(double lambda) const {
+    return this->makeMinimizer(CArgMinMseImpl{lambda});
 }

 const std::string& CMse::name() const {
@@ -105,6 +261,41 @@
 }

 const std::string CMse::NAME{"mse"};
+
+std::unique_ptr<CLoss> CLogistic::clone() const {
+    return std::make_unique<CLogistic>(*this);
+}
+
+double CLogistic::value(double prediction, double actual) const {
+    // Cross entropy
+    prediction = CTools::logisticFunction(prediction);
+    return -((1.0 - actual) * CTools::fastLog(1.0 - prediction) +
+             actual * CTools::fastLog(prediction));
+}
+
+double CLogistic::gradient(double prediction, double actual) const {
+    prediction = CTools::logisticFunction(prediction);
+    return prediction - actual;
+}
+
+double CLogistic::curvature(double prediction, double /*actual*/) const {
+    prediction = CTools::logisticFunction(prediction);
+    return prediction * (1.0 - prediction);
+}
+
+bool CLogistic::isCurvatureConstant() const {
+    return false;
+}
+
+CArgMinLoss CLogistic::minimizer(double lambda) const {
+    return this->makeMinimizer(CArgMinLogisticImpl{lambda});
+}
+
+const std::string& CLogistic::name() const {
+    return NAME;
+}
+
+const std::string CLogistic::NAME{"logistic"};
 }

 std::size_t CBoostedTreeNode::leafIndex(const CEncodedDataFrameRowRef& row,
@@ -291,6 +482,22 @@ const CBoostedTree::TNodeVecVec& CBoostedTree::trainedModel() const {
     return m_Impl->trainedModel();
 }
+const std::string& CBoostedTree::bestHyperparametersName() { + return CBoostedTreeImpl::bestHyperparametersName(); +} + +const std::string& CBoostedTree::bestRegularizationHyperparametersName() { + return CBoostedTreeImpl::bestRegularizationHyperparametersName(); +} + +CBoostedTree::TStrVec CBoostedTree::bestHyperparameterNames() { + return CBoostedTreeImpl::bestHyperparameterNames(); +} + +bool CBoostedTree::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { + return m_Impl->acceptRestoreTraverser(traverser); +} + void CBoostedTree::acceptPersistInserter(core::CStatePersistInserter& inserter) const { m_Impl->acceptPersistInserter(inserter); } diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index e4b7e2e7cf..55639ad458 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -80,9 +80,11 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari } } - // TODO can only use factory to create one object since this is moved. This seems trappy. + auto treeImpl = std::make_unique( + m_NumberThreads, m_Loss != nullptr ? m_Loss->clone() : nullptr); + std::swap(m_TreeImpl, treeImpl); return TBoostedTreeUPtr{new CBoostedTree{frame, m_RecordProgress, m_RecordMemoryUsage, - m_RecordTrainingState, std::move(m_TreeImpl)}}; + m_RecordTrainingState, std::move(treeImpl)}}; } std::size_t CBoostedTreeFactory::numberHyperparameterTuningRounds() const { @@ -534,7 +536,6 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, double testLoss{m_TreeImpl->meanLoss(frame, testRowMask, forest)}; leastSquaresQuadraticTestLoss.add(static_cast(i) * stepSize, testLoss); testLosses[i] = testLoss; - m_TreeImpl->m_TrainingProgress.increment(); } LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses)); @@ -616,8 +617,10 @@ CBoostedTreeFactory CBoostedTreeFactory::constructFromString(std::istream& jsonS } CBoostedTreeFactory::CBoostedTreeFactory(bool restored, std::size_t numberThreads, TLossFunctionUPtr loss) - : m_Restored{restored}, m_TreeImpl{std::make_unique(numberThreads, - std::move(loss))}, + : m_Restored{restored}, m_NumberThreads{numberThreads}, m_Loss{loss != nullptr + ? loss->clone() + : nullptr}, + m_TreeImpl{std::make_unique(numberThreads, std::move(loss))}, m_LogDepthPenaltyMultiplierSearchInterval{0.0}, m_LogTreeSizePenaltyMultiplierSearchInterval{0.0}, m_LogLeafWeightPenaltyMultiplierSearchInterval{0.0} { } @@ -793,12 +796,16 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring() { // This comprises: // - The cost of category encoding and feature selection which we count as // one unit, + // - One unit for estimating the expected gain and sum curvature per node, // - INITIAL_REGULARIZER_SEARCH_ITERATIONS units per regularization parameter // which isn't user defined, // - The main optimisation loop which costs number folds units per iteration, // - The cost of the final train which we count as number folds units. 
-    std::size_t totalNumberSteps{1};
+    std::size_t totalNumberSteps{2};
+    if (m_TreeImpl->m_RegularizationOverride.depthPenaltyMultiplier() == boost::none) {
+        totalNumberSteps += INITIAL_REGULARIZER_SEARCH_ITERATIONS;
+    }
     if (m_TreeImpl->m_RegularizationOverride.treeSizePenaltyMultiplier() == boost::none) {
         totalNumberSteps += INITIAL_REGULARIZER_SEARCH_ITERATIONS;
     }
diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc
index 692aff017c..31f8d4072c 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -433,7 +433,6 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame,
         lossMoments.add(loss);
         LOG_TRACE(<< "fold = " << i << " forest size = " << forest.size()
                   << " test set loss = " << loss);
-        m_TrainingProgress.increment();
     }
     LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments)
               << ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments)));
@@ -521,6 +520,8 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame,

     LOG_TRACE(<< "Trained one forest");

+    m_TrainingProgress.increment();
+
     return forest;
 }

@@ -733,27 +734,39 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& frame,

     using TArgMinLossVec = std::vector<boosted_tree::CArgMinLoss>;

-    auto result = frame.readRows(
-        m_NumberThreads, 0, frame.numberRows(),
-        core::bindRetrievableState(
-            [&](TArgMinLossVec& leafValues, TRowItr beginRows, TRowItr endRows) {
-                for (auto itr = beginRows; itr != endRows; ++itr) {
-                    const TRowRef& row{*itr};
-                    double prediction{readPrediction(row)};
-                    double actual{readActual(row, m_DependentVariable)};
-                    leafValues[root(tree).leafIndex(m_Encoder->encode(row), tree)]
-                        .add(prediction, actual);
-                }
-            },
-            TArgMinLossVec(tree.size(), m_Loss->minimizer())),
-        &trainingRowMask);
+    TArgMinLossVec leafValues(
+        tree.size(), m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier()));
+    auto nextPass = [&] {
+        bool done{true};
+        for (const auto& value : leafValues) {
+            done &= (value.nextPass() == false);
+        }
+        return done == false;
+    };

-    auto leafValues = std::move(result.first[0].s_FunctionState);
-    for (std::size_t i = 1; i < result.first.size(); ++i) {
-        for (std::size_t j = 0; j < leafValues.size(); ++j) {
-            leafValues[j].merge(result.first[i].s_FunctionState[j]);
+    do {
+        auto result = frame.readRows(
+            m_NumberThreads, 0, frame.numberRows(),
+            core::bindRetrievableState(
+                [&](TArgMinLossVec& leafValues_, TRowItr beginRows, TRowItr endRows) {
+                    for (auto itr = beginRows; itr != endRows; ++itr) {
+                        const TRowRef& row{*itr};
+                        double prediction{readPrediction(row)};
+                        double actual{readActual(row, m_DependentVariable)};
+                        leafValues_[root(tree).leafIndex(m_Encoder->encode(row), tree)]
+                            .add(prediction, actual);
+                    }
+                },
+                std::move(leafValues)),
+            &trainingRowMask);
+
+        leafValues = std::move(result.first[0].s_FunctionState);
+        for (std::size_t i = 1; i < result.first.size(); ++i) {
+            for (std::size_t j = 0; j < leafValues.size(); ++j) {
+                leafValues[j].merge(result.first[i].s_FunctionState[j]);
+            }
         }
-    }
+    } while (nextPass());

     for (std::size_t i = 0; i < tree.size(); ++i) {
         tree[i].value(eta * leafValues[i].value());
@@ -1010,6 +1023,25 @@ const std::string HYPERPARAM_FEATURE_BAG_FRACTION_TAG{"hyperparam_feature_bag_fraction"};
 const std::string HYPERPARAM_REGULARIZATION_TAG{"hyperparam_regularization"};
 }

+const std::string& CBoostedTreeImpl::bestHyperparametersName() {
+    return BEST_HYPERPARAMETERS_TAG;
+}
+
+const std::string& CBoostedTreeImpl::bestRegularizationHyperparametersName() {
+    return HYPERPARAM_REGULARIZATION_TAG;
+}
+
+CBoostedTreeImpl::TStrVec CBoostedTreeImpl::bestHyperparameterNames() {
+    return {HYPERPARAM_ETA_TAG,
+            HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG,
+            HYPERPARAM_FEATURE_BAG_FRACTION_TAG,
+            REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG,
+            REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG,
+            REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG,
+            REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG,
+            REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG};
+}
+
 template<typename T>
 void CBoostedTreeImpl::CRegularization<T>::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
     core::CPersistUtils::persist(REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG,
diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc
index f45d735cfd..ace6d8c4eb 100644
--- a/lib/maths/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/unittest/CBoostedTreeTest.cc
@@ -14,6 +14,8 @@
 #include
 #include
 #include
+#include
+#include

 #include
 #include
@@ -71,13 +73,14 @@ template<typename F>
 void fillDataFrame(std::size_t trainRows,
                    std::size_t testRows,
                    std::size_t cols,
+                   const TBoolVec& categoricalColumns,
                    const TDoubleVecVec& regressors,
                    const TDoubleVec& noise,
                    const F& target,
                    core::CDataFrame& frame) {

     std::size_t rows{trainRows + testRows};
-    frame.categoricalColumns(TBoolVec(cols, false));
+    frame.categoricalColumns(categoricalColumns);
     for (std::size_t i = 0; i < rows; ++i) {
         frame.writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
             for (std::size_t j = 0; j < cols - 1; ++j, ++column) {
@@ -96,6 +99,18 @@ void fillDataFrame(std::size_t trainRows,
         }
     });
 }

+template<typename F>
+void fillDataFrame(std::size_t trainRows,
+                   std::size_t testRows,
+                   std::size_t cols,
+                   const TDoubleVecVec& regressors,
+                   const TDoubleVec& noise,
+                   const F& target,
+                   core::CDataFrame& frame) {
+    fillDataFrame(trainRows, testRows, cols, TBoolVec(cols, false), regressors,
+                  noise, target, frame);
+}
+
 template<typename F>
 auto predictAndComputeEvaluationMetrics(const F& generateFunction,
                                         test::CRandomNumbers& rng,
@@ -212,12 +227,13 @@ void CBoostedTreeTest::testPiecewiseConstant() {
         // Unbiased...
         CPPUNIT_ASSERT_DOUBLES_EQUAL(
             0.0, modelBias[i][0],
-            7.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
+            4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         // Good R^2...
-        CPPUNIT_ASSERT(modelRSquared[i][0] > 0.95);
+        CPPUNIT_ASSERT(modelRSquared[i][0] > 0.96);

         meanModelRSquared.add(modelRSquared[i][0]);
     }
+    LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
     CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanModelRSquared) > 0.97);
 }

@@ -267,7 +283,7 @@ void CBoostedTreeTest::testLinear() {
         // Unbiased...
         CPPUNIT_ASSERT_DOUBLES_EQUAL(
             0.0, modelBias[i][0],
-            5.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
+            4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         // Good R^2...
         CPPUNIT_ASSERT(modelRSquared[i][0] > 0.97);
@@ -334,14 +350,14 @@ void CBoostedTreeTest::testNonLinear() {
         // Unbiased...
         CPPUNIT_ASSERT_DOUBLES_EQUAL(
             0.0, modelBias[i][0],
-            8.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
+            4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         // Good R^2...
-        CPPUNIT_ASSERT(modelRSquared[i][0] > 0.95);
+        CPPUNIT_ASSERT(modelRSquared[i][0] > 0.96);

         meanModelRSquared.add(modelRSquared[i][0]);
     }
     LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
-    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanModelRSquared) > 0.96);
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanModelRSquared) > 0.97);
 }

 void CBoostedTreeTest::testThreading() {

@@ -802,13 +818,233 @@ void CBoostedTreeTest::testDepthBasedRegularization() {
         TMeanAccumulator meanDepth;
         for (const auto& tree : regression->trainedModel()) {
             CPPUNIT_ASSERT(maxDepth(tree, tree[0], 0) <= static_cast<std::size_t>(targetDepth));
-            meanDepth.add(maxDepth(tree, tree[0], 0));
+            meanDepth.add(static_cast<double>(maxDepth(tree, tree[0], 0)));
         }
         LOG_DEBUG(<< "mean depth = " << maths::CBasicStatistics::mean(meanDepth));
         CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanDepth) > targetDepth - 1.0);
     }
 }

+void CBoostedTreeTest::testLogisticMinimizer() {
+
+    // Test that we get a good approximation of the additive term for the log-odds
+    // which minimises the cross entropy objective.
+
+    using maths::boosted_tree_detail::CArgMinLogisticImpl;
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec labels;
+    TDoubleVec weights;
+
+    // All predictions equal and zero.
+    {
+        CArgMinLogisticImpl argmin{0.0};
+        argmin.add(0.0, 0.0);
+        argmin.add(0.0, 1.0);
+        argmin.add(0.0, 1.0);
+        argmin.add(0.0, 0.0);
+        argmin.nextPass();
+        CPPUNIT_ASSERT_EQUAL(0.0, argmin.value());
+    }
+    // All predictions are equal.
+    {
+        rng.generateUniformSamples(0.0, 1.0, 1000, labels);
+        for (auto& label : labels) {
+            label = std::floor(label + 0.3);
+        }
+        weights.resize(labels.size(), 0.0);
+
+        CArgMinLogisticImpl argmin{0.0};
+        std::size_t numberPasses{0};
+        std::size_t counts[2]{0, 0};
+
+        do {
+            ++numberPasses;
+            for (std::size_t i = 0; i < labels.size(); ++i) {
+                argmin.add(weights[i], labels[i]);
+                ++counts[static_cast<std::size_t>(labels[i])];
+            }
+        } while (argmin.nextPass());
+
+        double p{static_cast<double>(counts[1]) / 1000.0};
+        double expected{std::log(p / (1.0 - p))};
+        double actual{argmin.value()};
+
+        CPPUNIT_ASSERT_EQUAL(std::size_t{1}, numberPasses);
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 0.01 * std::fabs(expected));
+    }
+
+    for (auto lambda : {0.0, 10.0}) {
+
+        LOG_DEBUG(<< "lambda = " << lambda);
+
+        // The true objective.
+        auto objective = [&](double weight) {
+            double loss{0.0};
+            for (std::size_t i = 0; i < labels.size(); ++i) {
+                double p{maths::CTools::logisticFunction(weights[i] + weight)};
+                loss -= (1.0 - labels[i]) * maths::CTools::fastLog(1.0 - p) +
+                        labels[i] * maths::CTools::fastLog(p);
+            }
+            return loss + lambda * maths::CTools::pow2(weight);
+        };
+
+        // This loop is fuzzing the predicted log-odds and testing we get consistently
+        // good estimates of the true minimizer.
+        for (std::size_t t = 0; t < 10; ++t) {
+
+            double min{std::numeric_limits<double>::max()};
+            double max{-min};
+
+            rng.generateUniformSamples(0.0, 1.0, 1000, labels);
+            for (auto& label : labels) {
+                label = std::floor(label + 0.5);
+            }
+            weights.clear();
+            for (const auto& label : labels) {
+                TDoubleVec weight;
+                rng.generateNormalSamples(label, 2.0, 1, weight);
+                weights.push_back(weight[0]);
+                min = std::min(min, weight[0]);
+                max = std::max(max, weight[0]);
+            }
+
+            double expected;
+            double objectiveAtExpected;
+            std::size_t maxIterations{20};
+            maths::CSolvers::minimize(-max, -min, objective(-max), objective(-min),
+                                      objective, 1e-3, maxIterations, expected,
+                                      objectiveAtExpected);
+            LOG_DEBUG(<< "expected = " << expected
+                      << " objective at expected = " << objectiveAtExpected);
+
+            CArgMinLogisticImpl argmin{lambda};
+            CArgMinLogisticImpl argminPartition[2]{{lambda}, {lambda}};
+            auto nextPass = [&] {
+                bool done{argmin.nextPass() == false};
+                done &= (argminPartition[0].nextPass() == false);
+                done &= (argminPartition[1].nextPass() == false);
+                return done == false;
+            };
+
+            do {
+                for (std::size_t i = 0; i < labels.size() / 2; ++i) {
+                    argmin.add(weights[i], labels[i]);
+                    argminPartition[0].add(weights[i], labels[i]);
+                }
+                for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) {
+                    argmin.add(weights[i], labels[i]);
+                    argminPartition[1].add(weights[i], labels[i]);
+                }
+                argminPartition[0].merge(argminPartition[1]);
+                argminPartition[1] = argminPartition[0];
+            } while (nextPass());
+
+            double actual{argmin.value()};
+            double actualPartition{argminPartition[0].value()};
+            LOG_DEBUG(<< "actual = " << actual
+                      << " objective at actual = " << objective(actual));
+
+            // We should be within 1% for the value and 0.001% for the objective
+            // at the value.
+            CPPUNIT_ASSERT_EQUAL(actual, actualPartition);
+            CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 0.01 * std::fabs(expected));
+            CPPUNIT_ASSERT_DOUBLES_EQUAL(objectiveAtExpected, objective(actual),
+                                         1e-5 * objectiveAtExpected);
+        }
+    }
+}
+
+void CBoostedTreeTest::testLogisticRegression() {
+
+    // The idea of this test is to create a random linear relationship between
+    // the feature values and the log-odds of each class, i.e.
+    //
+    //   log-odds(class_1) = sum_i{ w_i * x_i }
+    //
+    // where w is some fixed weight vector and x_i denotes the i'th feature value
+    // of an example. We try to recover this relationship in logistic regression
+    // by observing the actual labels. We want to test that we've roughly correctly
+    // estimated the log-odds. However, we target the cross entropy so the errors
+    // in our estimates p_hat_i should be measured in terms of the cross entropy
+    // -sum_i{ p_i * log(p_hat_i) } where p_i = logistic(sum_j{ w_j * x_ij } + n_i)
+    // and n_i is the noise.
+
+    test::CRandomNumbers rng;
+
+    std::size_t trainRows{1000};
+    std::size_t rows{1200};
+    std::size_t cols{4};
+    std::size_t capacity{600};
+
+    TMeanAccumulator meanExcessCrossEntropy;
+    for (std::size_t test = 0; test < 3; ++test) {
+        TDoubleVec weights;
+        rng.generateUniformSamples(-2.0, 2.0, cols - 1, weights);
+        TDoubleVec noise;
+        rng.generateNormalSamples(0.0, 1.0, rows, noise);
+        TDoubleVec uniform01;
+        rng.generateUniformSamples(0.0, 1.0, rows, uniform01);
+
+        auto probability = [&](const TRowRef& row) {
+            double x{0.0};
+            for (std::size_t i = 0; i < cols - 1; ++i) {
+                x += weights[i] * row[i];
+            }
+            return maths::CTools::logisticFunction(x + noise[row.index()]);
+        };
+
+        auto target = [&](const TRowRef& row) {
+            return uniform01[row.index()] < probability(row) ? 1.0 : 0.0;
+        };
+
+        TDoubleVecVec x(cols - 1);
+        for (std::size_t i = 0; i < cols - 1; ++i) {
+            rng.generateUniformSamples(0.0, 4.0, rows, x[i]);
+        }
+
+        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
+
+        fillDataFrame(trainRows, rows - trainRows, cols, {false, false, false, true},
+                      x, TDoubleVec(rows, 0.0), target, *frame);
+
+        auto regression = maths::CBoostedTreeFactory::constructFromParameters(
+                              1, std::make_unique<maths::boosted_tree::CLogistic>())
+                              .buildFor(*frame, cols - 1);
+
+        regression->train();
+        regression->predict();
+
+        double actualCrossEntropy{0.0};
+        double minimumCrossEntropy{0.0};
+        frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) {
+            for (auto row = beginRows; row != endRows; ++row) {
+                if (row->index() >= trainRows) {
+                    std::size_t index{
+                        regression->columnHoldingPrediction(row->numberColumns())};
+                    actualCrossEntropy -=
+                        probability(*row) *
+                        std::log(maths::CTools::logisticFunction((*row)[index]));
+                    minimumCrossEntropy -= probability(*row) *
+                                           std::log(probability(*row));
+                }
+            }
+        });
+        LOG_DEBUG(<< "actual cross entropy = " << actualCrossEntropy
+                  << ", minimum cross entropy = " << minimumCrossEntropy);
+
+        // We should be within 40% of the minimum possible cross entropy.
+        CPPUNIT_ASSERT(actualCrossEntropy < 1.4 * minimumCrossEntropy);
+        meanExcessCrossEntropy.add(actualCrossEntropy / minimumCrossEntropy);
+    }
+
+    LOG_DEBUG(<< "mean excess cross entropy = "
+              << maths::CBasicStatistics::mean(meanExcessCrossEntropy));
+
+    // We should be within 25% of the minimum possible cross entropy on average.
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanExcessCrossEntropy) < 1.25);
+}
+
 void CBoostedTreeTest::testEstimateMemoryUsedByTrain() {

     // Test estimation of the memory used training a model.
@@ -1122,6 +1358,10 @@ CppUnit::Test* CBoostedTreeTest::suite() {
     suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
         "CBoostedTreeTest::testDepthBasedRegularization",
        &CBoostedTreeTest::testDepthBasedRegularization));
+    suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
+        "CBoostedTreeTest::testLogisticMinimizer", &CBoostedTreeTest::testLogisticMinimizer));
+    suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
+        "CBoostedTreeTest::testLogisticRegression", &CBoostedTreeTest::testLogisticRegression));
     suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
         "CBoostedTreeTest::testEstimateMemoryUsedByTrain",
         &CBoostedTreeTest::testEstimateMemoryUsedByTrain));
diff --git a/lib/maths/unittest/CBoostedTreeTest.h b/lib/maths/unittest/CBoostedTreeTest.h
index 6c098bdc0b..7abe8bfa18 100644
--- a/lib/maths/unittest/CBoostedTreeTest.h
+++ b/lib/maths/unittest/CBoostedTreeTest.h
@@ -22,6 +22,8 @@ class CBoostedTreeTest : public CppUnit::TestFixture {
     void testSingleSplit();
     void testTranslationInvariance();
     void testDepthBasedRegularization();
+    void testLogisticMinimizer();
+    void testLogisticRegression();
     void testEstimateMemoryUsedByTrain();
     void testProgressMonitoring();
     void testMissingData();
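Two minimal, self-contained sketches follow. They are illustrative only: all names are local to the sketches and a plain grid search stands in for CSolvers::minimize. The first mirrors the two-pass scheme of CArgMinLogisticImpl: pass 0 records the prediction range, pass 1 accumulates per-bucket category counts, and the leaf weight is then found by minimising the bucketed cross entropy plus lambda times the squared weight over [-max - 5, -min + 5].

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

int main() {
    // Toy inputs: predicted log-odds and 0/1 labels.
    std::vector<double> predictions{-1.0, -0.5, 0.0, 0.5, 1.0, 1.5};
    std::vector<double> actuals{0.0, 0.0, 1.0, 0.0, 1.0, 1.0};
    double lambda{0.1};
    std::size_t numberBuckets{128};

    // Pass 0: find the prediction range (m_PredictionMinMax in the diff).
    auto minmax = std::minmax_element(predictions.begin(), predictions.end());
    double min{*minmax.first};
    double max{*minmax.second};
    double bucketWidth{(max - min) / static_cast<double>(numberBuckets)};

    // Pass 1: per-bucket category counts (m_BucketCategoryCounts in the diff).
    std::vector<std::size_t> counts[2]{std::vector<std::size_t>(numberBuckets, 0),
                                       std::vector<std::size_t>(numberBuckets, 0)};
    for (std::size_t i = 0; i < predictions.size(); ++i) {
        std::size_t bucket{std::min(
            static_cast<std::size_t>((predictions[i] - min) / bucketWidth),
            numberBuckets - 1)};
        ++counts[actuals[i] > 0.5 ? 1 : 0][bucket];
    }

    // The bucketed regularized cross entropy objective.
    auto objective = [&](double weight) {
        double loss{lambda * weight * weight};
        for (std::size_t i = 0; i < numberBuckets; ++i) {
            double centre{min + (static_cast<double>(i) + 0.5) * bucketWidth};
            double p{1.0 / (1.0 + std::exp(-(centre + weight)))};
            loss -= static_cast<double>(counts[0][i]) * std::log(1.0 - p) +
                    static_cast<double>(counts[1][i]) * std::log(p);
        }
        return loss;
    };

    // Grid search over [-max - 5, -min + 5] for simplicity.
    double bestWeight{0.0};
    double bestLoss{std::numeric_limits<double>::max()};
    for (double w = -max - 5.0; w <= -min + 5.0; w += 0.01) {
        double loss{objective(w)};
        if (loss < bestLoss) {
            bestLoss = loss;
            bestWeight = w;
        }
    }
    std::cout << "weight = " << bestWeight << ", loss = " << bestLoss << "\n";
}

The second checks the closed forms used by CLogistic::gradient and CLogistic::curvature, namely S(p) - a and S(p)(1 - S(p)), against central finite differences of the cross entropy.

#include <cmath>
#include <iostream>

// Cross entropy of a 0/1 label a given predicted log-odds p.
double value(double p, double a) {
    double s{1.0 / (1.0 + std::exp(-p))};
    return -((1.0 - a) * std::log(1.0 - s) + a * std::log(s));
}

int main() {
    double eps{1e-4};
    for (double p : {-2.0, 0.0, 1.5}) {
        for (double a : {0.0, 1.0}) {
            double s{1.0 / (1.0 + std::exp(-p))};
            // Analytic forms from the diff.
            double gradient{s - a};
            double curvature{s * (1.0 - s)};
            // Central finite difference estimates.
            double dNumeric{(value(p + eps, a) - value(p - eps, a)) / (2.0 * eps)};
            double d2Numeric{(value(p + eps, a) - 2.0 * value(p, a) + value(p - eps, a)) /
                             (eps * eps)};
            std::cout << "p = " << p << ", a = " << a
                      << ": gradient error = " << std::fabs(gradient - dNumeric)
                      << ", curvature error = " << std::fabs(curvature - d2Numeric) << "\n";
        }
    }
}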