From b742e506c28529753e7b298b19b52e86ad84bc01 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 13:32:25 +0100 Subject: [PATCH 01/23] Improve hyperparameter optimisation initialisation --- include/core/CLoopProgress.h | 27 ++- include/maths/CBoostedTreeFactory.h | 30 ++- include/maths/CBoostedTreeImpl.h | 198 +++++++++++----- lib/api/CDataFrameBoostedTreeRunner.cc | 3 +- lib/core/CLoopProgress.cc | 57 ++++- lib/core/unittest/CLoopProgressTest.cc | 50 +++- lib/core/unittest/CLoopProgressTest.h | 1 + lib/maths/CBoostedTreeFactory.cc | 306 +++++++++++++++++++------ lib/maths/CBoostedTreeImpl.cc | 249 ++++++++++---------- lib/maths/unittest/CBoostedTreeTest.cc | 2 +- 10 files changed, 663 insertions(+), 260 deletions(-) diff --git a/include/core/CLoopProgress.h b/include/core/CLoopProgress.h index a9a4d70c25..d37de8c3bb 100644 --- a/include/core/CLoopProgress.h +++ b/include/core/CLoopProgress.h @@ -14,6 +14,8 @@ namespace ml { namespace core { +class CStatePersistInserter; +class CStateRestoreTraverser; //! \brief Manages recording the progress of a loop. //! @@ -46,14 +48,35 @@ class CORE_EXPORT CLoopProgress { using TProgressCallback = std::function; public: + CLoopProgress(); template - CLoopProgress(ITR begin, ITR end, const TProgressCallback& recordProgress, double scale = 1.0) + CLoopProgress(ITR begin, ITR end, const TProgressCallback& recordProgress = noop, double scale = 1.0) : CLoopProgress(std::distance(begin, end), recordProgress, scale) {} - CLoopProgress(std::size_t size, const TProgressCallback& recordProgress, double scale = 1.0); + CLoopProgress(std::size_t size, + const TProgressCallback& recordProgress = noop, + double scale = 1.0); + + //! Attach a new progress monitor callback. + void attach(const TProgressCallback& recordProgress); //! Increment the progress by \p i. void increment(std::size_t i = 1); + //! Resume progress monitoring which was restored. + void resumeRestored(); + + //! Get a checksum for this object. + std::uint64_t checksum() const; + + //! Persist by passing information to \p inserter. + void acceptPersistInserter(CStatePersistInserter& inserter) const; + + //! Populate the object from serialized data. + bool acceptRestoreTraverser(CStateRestoreTraverser& traverser); + +private: + static void noop(double); + private: std::size_t m_Size; std::size_t m_Steps; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 11fcbffd58..3dd579a4a8 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -93,10 +94,14 @@ class MATHS_EXPORT CBoostedTreeFactory final { TBoostedTreeUPtr buildFor(core::CDataFrame& frame, std::size_t dependentVariable); private: + using TDoubleDoublePr = std::pair; using TOptionalDouble = boost::optional; using TOptionalSize = boost::optional; + using TVector = CVectorNx1; + using TOptionalVector = boost::optional; using TPackedBitVectorVec = std::vector; using TBoostedTreeImplUPtr = std::unique_ptr; + using TScaleRegularization = std::function; private: static const double MINIMUM_ETA; @@ -121,10 +126,22 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Initialize the regressors sample distribution. bool initializeFeatureSampleDistribution() const; - //! Read overrides for hyperparameters and if necessary estimate the initial - //! values for \f$\lambda\f$ and \f$\gamma\f$ which match the gain from an - //! overfit tree. 
- void initializeHyperparameters(core::CDataFrame& frame) const; + //! Set the initial values for the various hyperparameters. + void initializeHyperparameters(core::CDataFrame& frame); + + //! Estimate a good central value for the regularisation hyperparameters + //! search bounding box. + void initializeUnsetRegularizationHyperparameters(core::CDataFrame& frame); + + //! Estimate the reduction in gain from a split and the total curvature of + //! the loss function at a split. + TDoubleDoublePr estimateTreeGainAndCurvature(core::CDataFrame& frame, + const core::CPackedBitVector& trainingRowMask) const; + + //! Get the regularizer value at the point the model starts to overfit. + TOptionalVector candidateRegularizerSearchInterval(core::CDataFrame& frame, + core::CPackedBitVector trainingRowMask, + TScaleRegularization scale) const; //! Initialize the state for hyperparameter optimisation. void initializeHyperparameterOptimisation() const; @@ -132,6 +149,9 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Get the number of hyperparameter tuning rounds to use. std::size_t numberHyperparameterTuningRounds() const; + //! Setup monitoring for training progress. + void setupTrainingProgressMonitoring(); + static void noopRecordProgress(double); static void noopRecordMemoryUsage(std::int64_t); static void noopRecordTrainingState(CDataFrameRegressionModel::TPersistFunc); @@ -140,6 +160,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalDouble m_MinimumFrequencyToOneHotEncode; TOptionalSize m_BayesianOptimisationRestarts; TBoostedTreeImplUPtr m_TreeImpl; + TVector m_GammaSearchInterval; + TVector m_LambdaSearchInterval; TProgressCallback m_RecordProgress = noopRecordProgress; TMemoryUsageCallback m_RecordMemoryUsage = noopRecordMemoryUsage; TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 1f4793ef95..d5d8201b47 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -103,14 +103,14 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t memoryUsage() const; private: - using TDoubleDoublePrVec = std::vector>; + using TSizeDoublePr = std::pair; + using TDoubleDoublePr = std::pair; + using TDoubleDoublePrVec = std::vector; using TOptionalDouble = boost::optional; using TOptionalSize = boost::optional; - using TVector = CDenseVector; using TDoubleVecVec = std::vector; using TSizeVec = std::vector; - using TSizeDoublePr = std::pair; - using TDoubleDoubleDoubleTr = std::tuple; + using TVector = CDenseVector; using TRowItr = core::CDataFrame::TRowItr; using TPackedBitVectorVec = std::vector; using TDataFrameCategoryEncoderUPtr = std::unique_ptr; @@ -120,14 +120,73 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TNodeVec = std::vector; using TNodeVecVec = std::vector; + //! \brief Holds the parameters associated with the different types of regulariser + //! terms available. + template + class CRegularization final { + public: + //! Set the multiplier of the tree size regularizer. + CRegularization& gamma(double gamma) { + m_Gamma = gamma; + return *this; + } + + //! Set the multiplier of the square leaf weight regularizer. + CRegularization& lambda(double lambda) { + m_Lambda = lambda; + return *this; + } + + //! Count the number of parameters which have their default values. + std::size_t countNotSet() const { + return (m_Gamma == T{} ? 1 : 0) + (m_Lambda == T{} ? 1 : 0); + } + + //! Multiplier of the tree size regularizer. 
+ T gamma() const { return m_Gamma; } + + //! Multiplier of the square leaf weight regularizer. + T lambda() const { return m_Lambda; } + + //! Get description of the regularization parameters. + std::string print() const { + return "(gamma = " + toString(m_Gamma) + + ", lambda = " + toString(m_Lambda) + ")"; + } + + //! Persist by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + + //! Populate the object from serialized data. + bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + + private: + static std::string toString(double x) { return std::to_string(x); } + static std::string toString(TOptionalDouble x) { + return x != boost::none ? toString(*x) : "null"; + } + + private: + T m_Gamma = T{}; + T m_Lambda = T{}; + }; + + using TRegularization = CRegularization; + using TRegularizationOverride = CRegularization; + //! \brief The algorithm parameters we'll directly optimise to improve test error. struct SHyperparameters { - double s_Lambda; - double s_Gamma; + //! The regularisation parameters. + TRegularization s_Regularization; + + //! Shrinkage. double s_Eta; + + //! Rate of growth of shrinkage in the training loop. double s_EtaGrowthRatePerTree; + + //! The fraction of features we use per bag. double s_FeatureBagFraction; - TDoubleVec s_FeatureSampleProbabilities; //! Persist by passing information to \p inserter. void acceptPersistInserter(core::CStatePersistInserter& inserter) const; @@ -182,16 +241,26 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Set the node value to \p value. void value(double value) { m_NodeValue = value; } + //! Get the gain of the split. + double gain() const { return m_Gain; } + + //! Get the total curvature at the rows below this node. + double curvature() const { return m_Curvature; } + //! Split this node and add its child nodes to \p tree. 
std::pair split(std::size_t splitFeature, double splitValue, bool assignMissingToLeft, + double gain, + double curvature, TNodeVec& tree) { m_SplitFeature = splitFeature; m_SplitValue = splitValue; m_AssignMissingToLeft = assignMissingToLeft; m_LeftChild = static_cast(tree.size()); m_RightChild = static_cast(tree.size() + 1); + m_Gain = gain; + m_Curvature = curvature; tree.resize(tree.size() + 2); return {m_LeftChild, m_RightChild}; } @@ -211,8 +280,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { core::bindRetrievableState( [&](auto& state, TRowItr beginRows, TRowItr endRows) { core::CPackedBitVector& leftRowMask{std::get<0>(state)}; - std::size_t& leftCount{std::get<1>(state)}; - std::size_t& rightCount{std::get<2>(state)}; + std::size_t& leftChildNumberRows{std::get<1>(state)}; + std::size_t& rightChildNumberRows{std::get<2>(state)}; for (auto row = beginRows; row != endRows; ++row) { std::size_t index{row->index()}; double value{encoder.encode(*row)[m_SplitFeature]}; @@ -221,9 +290,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { (missing == false && value < m_SplitValue)) { leftRowMask.extend(false, index - leftRowMask.size()); leftRowMask.extend(true); - ++leftCount; + ++leftChildNumberRows; } else { - ++rightCount; + ++rightChildNumberRows; } } }, @@ -237,13 +306,14 @@ class MATHS_EXPORT CBoostedTreeImpl final { } core::CPackedBitVector leftRowMask; - std::size_t leftCount; - std::size_t rightCount; - std::tie(leftRowMask, leftCount, rightCount) = std::move(masks[0].s_FunctionState); + std::size_t leftChildNumberRows; + std::size_t rightChildNumberRows; + std::tie(leftRowMask, leftChildNumberRows, rightChildNumberRows) = + std::move(masks[0].s_FunctionState); for (std::size_t i = 1; i < masks.size(); ++i) { leftRowMask |= std::get<0>(masks[i].s_FunctionState); - leftCount += std::get<1>(masks[i].s_FunctionState); - rightCount += std::get<2>(masks[i].s_FunctionState); + leftChildNumberRows += std::get<1>(masks[i].s_FunctionState); + rightChildNumberRows += std::get<2>(masks[i].s_FunctionState); } LOG_TRACE(<< "# rows in left node = " << leftRowMask.manhattan()); LOG_TRACE(<< "left row mask = " << leftRowMask); @@ -254,7 +324,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { LOG_TRACE(<< "left row mask = " << rightRowMask); return std::make_tuple(std::move(leftRowMask), std::move(rightRowMask), - leftCount < rightCount); + leftChildNumberRows < rightChildNumberRows); } //! Get a human readable description of this tree. @@ -290,6 +360,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::int32_t m_LeftChild = -1; std::int32_t m_RightChild = -1; double m_NodeValue = 0.0; + double m_Gain = 0.0; + double m_Curvature = 0.0; }; //! 
\brief Maintains a collection of statistics about a leaf of the regression @@ -305,12 +377,13 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, - double lambda, - double gamma, + const TRegularization& regularization, const TDoubleVecVec& candidateSplits, + std::size_t depth, TSizeVec featureBag, core::CPackedBitVector rowMask) - : m_Id{id}, m_Lambda{lambda}, m_Gamma{gamma}, m_CandidateSplits{candidateSplits}, + : m_Id{id}, m_Regularization{regularization}, + m_CandidateSplits{candidateSplits}, m_Depth{depth}, m_FeatureBag{std::move(featureBag)}, m_RowMask{std::move(rowMask)} { std::sort(m_FeatureBag.begin(), m_FeatureBag.end()); @@ -320,13 +393,13 @@ class MATHS_EXPORT CBoostedTreeImpl final { this->computeAggregateLossDerivatives(numberThreads, frame, encoder); } - //! This should only called by split but is public so it's accessible to make_shared. + //! This should only called by split but is public so it's accessible to std::make_shared. CLeafNodeStatistics(std::size_t id, const CLeafNodeStatistics& parent, const CLeafNodeStatistics& sibling, core::CPackedBitVector rowMask) - : m_Id{id}, m_Lambda{sibling.m_Lambda}, m_Gamma{sibling.m_Gamma}, - m_CandidateSplits{sibling.m_CandidateSplits}, + : m_Id{id}, m_Regularization{sibling.m_Regularization}, + m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { LOG_TRACE(<< "row mask = " << m_RowMask); @@ -363,10 +436,10 @@ class MATHS_EXPORT CBoostedTreeImpl final { CLeafNodeStatistics(const CLeafNodeStatistics&) = delete; - CLeafNodeStatistics& operator=(const CLeafNodeStatistics&) = delete; - CLeafNodeStatistics(CLeafNodeStatistics&&) = default; + CLeafNodeStatistics& operator=(const CLeafNodeStatistics&) = delete; + CLeafNodeStatistics& operator=(CLeafNodeStatistics&&) = default; //! Apply the split defined by (\p leftChildRowMask, \p rightChildRowMask). @@ -375,8 +448,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, - double lambda, - double gamma, + const TRegularization& regularization, const TDoubleVecVec& candidateSplits, TSizeVec featureBag, core::CPackedBitVector leftChildRowMask, @@ -385,8 +457,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { if (leftChildHasFewerRows) { auto leftChild = std::make_shared( - leftChildId, numberThreads, frame, encoder, lambda, gamma, candidateSplits, - std::move(featureBag), std::move(leftChildRowMask)); + leftChildId, numberThreads, frame, encoder, regularization, + candidateSplits, m_Depth + 1, std::move(featureBag), + std::move(leftChildRowMask)); auto rightChild = std::make_shared( rightChildId, *this, *leftChild, std::move(rightChildRowMask)); @@ -394,8 +467,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { } auto rightChild = std::make_shared( - rightChildId, numberThreads, frame, encoder, lambda, gamma, - candidateSplits, std::move(featureBag), std::move(rightChildRowMask)); + rightChildId, numberThreads, frame, encoder, regularization, candidateSplits, + m_Depth + 1, std::move(featureBag), std::move(rightChildRowMask)); auto leftChild = std::make_shared( leftChildId, *this, *rightChild, std::move(leftChildRowMask)); @@ -410,6 +483,10 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Get the gain in loss of the best split of this leaf. 
double gain() const { return this->bestSplitStatistics().s_Gain; } + double curvature() const { + return this->bestSplitStatistics().s_Curvature; + } + //! Get the best (feature, feature value) split. TSizeDoublePr bestSplit() const { const auto& split = this->bestSplitStatistics(); @@ -460,20 +537,20 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t curvatureSize{gradientsSize}; std::size_t missingGradientsSize{(numberCols - 1) * sizeof(double)}; std::size_t missingCurvatureSize{missingGradientsSize}; - return featureBagSize + rowMaskSize + gradientsSize + + return sizeof(CLeafNodeStatistics) + featureBagSize + rowMaskSize + gradientsSize + curvatureSize + missingGradientsSize + missingCurvatureSize; } private: //! \brief Statistics relating to a split of the node. struct SSplitStatistics : private boost::less_than_comparable { - SSplitStatistics(double gain, std::size_t feature, double splitAt, bool assignMissingToLeft) - : s_Gain{gain}, s_Feature{feature}, s_SplitAt{splitAt}, + SSplitStatistics(double gain, double curvature, std::size_t feature, double splitAt, bool assignMissingToLeft) + : s_Gain{gain}, s_Curvature{curvature}, s_Feature{feature}, s_SplitAt{splitAt}, s_AssignMissingToLeft{assignMissingToLeft} {} bool operator<(const SSplitStatistics& rhs) const { return COrderings::lexicographical_compare( - s_Gain, s_Feature, rhs.s_Gain, rhs.s_Feature); + s_Gain, s_Curvature, s_Feature, rhs.s_Gain, rhs.s_Curvature, rhs.s_Feature); } std::string print() const { @@ -484,6 +561,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { } double s_Gain; + double s_Curvature; std::size_t s_Feature; double s_SplitAt; bool s_AssignMissingToLeft; @@ -571,10 +649,11 @@ class MATHS_EXPORT CBoostedTreeImpl final { SSplitStatistics computeBestSplitStatistics() const { - static const std::size_t ASSIGN_MISSING_TO_LEFT{0}; - static const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; + // We have two possible regularisation terms we'll use: + // 1. Tree size: gamma * "node count" + // 2. 
Sum square weights: lambda * sum{"leaf weight" ^ 2)} - SSplitStatistics result{-INF, m_FeatureBag.size(), INF, true}; + SSplitStatistics result{-INF, 0.0, m_FeatureBag.size(), INF, true}; for (auto i : m_FeatureBag) { double g{std::accumulate(m_Gradients[i].begin(), m_Gradients[i].end(), 0.0) + @@ -595,14 +674,17 @@ class MATHS_EXPORT CBoostedTreeImpl final { gl[ASSIGN_MISSING_TO_RIGHT] += m_Gradients[i][j]; hl[ASSIGN_MISSING_TO_RIGHT] += m_Curvatures[i][j]; - double gain[]{CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / - (hl[ASSIGN_MISSING_TO_LEFT] + m_Lambda) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / - (h - hl[ASSIGN_MISSING_TO_LEFT] + m_Lambda), - CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / - (hl[ASSIGN_MISSING_TO_RIGHT] + m_Lambda) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / - (h - hl[ASSIGN_MISSING_TO_RIGHT] + m_Lambda)}; + double gain[]{ + CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / + (hl[ASSIGN_MISSING_TO_LEFT] + m_Regularization.lambda()) + + CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / + (h - hl[ASSIGN_MISSING_TO_LEFT] + + m_Regularization.lambda()), + CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / + (hl[ASSIGN_MISSING_TO_RIGHT] + m_Regularization.lambda()) + + CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / + (h - hl[ASSIGN_MISSING_TO_RIGHT] + + m_Regularization.lambda())}; if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; @@ -616,9 +698,11 @@ class MATHS_EXPORT CBoostedTreeImpl final { } } - double gain{0.5 * (maximumGain - CTools::pow2(g) / (h + m_Lambda)) - m_Gamma}; + double gain{0.5 * (maximumGain - + CTools::pow2(g) / (h + m_Regularization.lambda())) - + m_Regularization.gamma()}; - SSplitStatistics candidate{gain, i, splitAt, assignMissingToLeft}; + SSplitStatistics candidate{gain, h, i, splitAt, assignMissingToLeft}; LOG_TRACE(<< "candidate split: " << candidate.print()); if (candidate > result) { @@ -633,9 +717,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { private: std::size_t m_Id; - double m_Lambda; - double m_Gamma; + const TRegularization& m_Regularization; const TDoubleVecVec& m_CandidateSplits; + std::size_t m_Depth; TSizeVec m_FeatureBag; core::CPackedBitVector m_RowMask; TDoubleVecVec m_Gradients; @@ -662,9 +746,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Compute the sum loss for the predictions from \p frame and the leaf //! count and squared weight sum from \p forest. - TDoubleDoubleDoubleTr regularisedLoss(const core::CDataFrame& frame, - const core::CPackedBitVector& trainingRowMask, - const TNodeVecVec& forest) const; + TDoubleDoublePr gainAndCurvatureAtPercentile(double percentile, + const TNodeVecVec& forest) const; //! Train the forest and compute loss moments on each fold. 
TMeanVarAccumulator crossValidateForest(core::CDataFrame& frame, @@ -761,20 +844,18 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t m_NumberThreads; std::size_t m_DependentVariable = std::numeric_limits::max(); CBoostedTree::TLossFunctionUPtr m_Loss; - TOptionalDouble m_LambdaOverride; - TOptionalDouble m_GammaOverride; + TRegularizationOverride m_RegularizationOverride; TOptionalDouble m_EtaOverride; TOptionalSize m_MaximumNumberTreesOverride; TOptionalDouble m_FeatureBagFractionOverride; - double m_Lambda = 0.0; - double m_Gamma = 0.0; + TRegularization m_Regularization; double m_Eta = 0.1; double m_EtaGrowthRatePerTree = 1.05; std::size_t m_NumberFolds = 4; std::size_t m_MaximumNumberTrees = 20; std::size_t m_MaximumAttemptsToAddTree = 3; std::size_t m_NumberSplitsPerFeature = 75; - std::size_t m_MaximumOptimisationRoundsPerHyperparameter = 5; + std::size_t m_MaximumOptimisationRoundsPerHyperparameter = 3; std::size_t m_RowsPerFeature = 50; double m_FeatureBagFraction = 0.5; double m_MaximumTreeSizeMultiplier = 1.0; @@ -790,6 +871,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { TBayesinOptimizationUPtr m_BayesianOptimization; std::size_t m_NumberRounds = 1; std::size_t m_CurrentRound = 0; + mutable core::CLoopProgress m_TrainingProgress; friend class CBoostedTreeFactory; }; diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 41ad7cb549..0023c99d04 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -191,11 +191,10 @@ void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, auto restoreSearcher{this->spec().restoreSearcher()}; bool treeRestored{false}; if (restoreSearcher != nullptr) { - treeRestored = restoreBoostedTree(frame, restoreSearcher); + treeRestored = this->restoreBoostedTree(frame, restoreSearcher); } if (treeRestored == false) { - m_BoostedTree = m_BoostedTreeFactory->buildFor( frame, dependentVariableColumn - featureNames.begin()); } diff --git a/lib/core/CLoopProgress.cc b/lib/core/CLoopProgress.cc index 4a167402ec..608892f754 100644 --- a/lib/core/CLoopProgress.cc +++ b/lib/core/CLoopProgress.cc @@ -6,12 +6,28 @@ #include +#include +#include +#include +#include + #include +#include namespace ml { namespace core { namespace { -const std::size_t STEPS{16}; +const std::size_t STEPS{32}; +const std::string LOOP_SIZE_TAG{"loop_size_tag"}; +const std::string PROGRESS_STEPS_TAG{"progress_steps_tag"}; +const std::string CURRENT_STEP_PROGRESS_TAG{"current_step_progress_tag"}; +const std::string LOOP_POS_TAG{"loop_pos_tag"}; +const std::hash stringHasher; +} + +CLoopProgress::CLoopProgress() + : m_Size{std::numeric_limits::max()}, m_Steps{1}, + m_StepProgress{1.0}, m_RecordProgress{noop} { } CLoopProgress::CLoopProgress(std::size_t size, const TProgressCallback& recordProgress, double scale) @@ -19,6 +35,10 @@ CLoopProgress::CLoopProgress(std::size_t size, const TProgressCallback& recordPr m_StepProgress{scale / static_cast(m_Steps)}, m_RecordProgress{recordProgress} { } +void CLoopProgress::attach(const TProgressCallback& recordProgress) { + m_RecordProgress = recordProgress; +} + void CLoopProgress::increment(std::size_t i) { m_Pos += i; @@ -30,5 +50,40 @@ void CLoopProgress::increment(std::size_t i) { m_LastProgress += stride; } } + +void CLoopProgress::resumeRestored() { + this->increment(0); +} + +std::uint64_t CLoopProgress::checksum() const { + std::uint64_t seed{core::CHashing::hashCombine( + static_cast(m_Size), static_cast(m_Steps))}; + seed = 
core::CHashing::hashCombine( + seed, stringHasher(core::CStringUtils::typeToStringPrecise( + m_StepProgress, core::CIEEE754::E_DoublePrecision))); + return core::CHashing::hashCombine(seed, static_cast(m_Pos)); +} + +void CLoopProgress::acceptPersistInserter(CStatePersistInserter& inserter) const { + inserter.insertValue(LOOP_SIZE_TAG, m_Size); + inserter.insertValue(PROGRESS_STEPS_TAG, m_Steps); + inserter.insertValue(CURRENT_STEP_PROGRESS_TAG, m_StepProgress, + core::CIEEE754::E_DoublePrecision); + inserter.insertValue(LOOP_POS_TAG, m_Pos); +} + +bool CLoopProgress::acceptRestoreTraverser(CStateRestoreTraverser& traverser) { + do { + const std::string& name{traverser.name()}; + RESTORE_BUILT_IN(LOOP_SIZE_TAG, m_Size) + RESTORE_BUILT_IN(PROGRESS_STEPS_TAG, m_Steps) + RESTORE_BUILT_IN(CURRENT_STEP_PROGRESS_TAG, m_StepProgress) + RESTORE_BUILT_IN(LOOP_POS_TAG, m_Pos) + } while (traverser.next()); + return true; +} + +void CLoopProgress::noop(double) { +} } } diff --git a/lib/core/unittest/CLoopProgressTest.cc b/lib/core/unittest/CLoopProgressTest.cc index ab790e4379..bdcae9fb76 100644 --- a/lib/core/unittest/CLoopProgressTest.cc +++ b/lib/core/unittest/CLoopProgressTest.cc @@ -6,11 +6,16 @@ #include "CLoopProgressTest.h" +#include +#include #include #include #include +#include +#include + using namespace ml; using TSizeVec = std::vector; @@ -75,7 +80,7 @@ void CLoopProgressTest::testRandom() { core::CLoopProgress loopProgress{size[0], recordProgress}; for (std::size_t i = 0; i < size[0]; ++i, loopProgress.increment()) { - CPPUNIT_ASSERT_EQUAL(static_cast(16 * i / size[0]) / 16.0, progress); + CPPUNIT_ASSERT_EQUAL(static_cast(32 * i / size[0]) / 32.0, progress); } CPPUNIT_ASSERT_EQUAL(1.0, progress); @@ -86,7 +91,7 @@ void CLoopProgressTest::testRandom() { for (std::size_t t = 0; t < 100; ++t) { TSizeVec size; - rng.generateUniformSamples(30, 100, 1, size); + rng.generateUniformSamples(33, 100, 1, size); if (t % 10 == 0) { LOG_DEBUG(<< "Loop length = " << size[0]); @@ -96,7 +101,7 @@ void CLoopProgressTest::testRandom() { core::CLoopProgress loopProgress{size[0], recordProgress}; for (std::size_t i = 0; i < size[0]; i += 20, loopProgress.increment(20)) { - CPPUNIT_ASSERT_EQUAL(static_cast(16 * i / size[0]) / 16.0, progress); + CPPUNIT_ASSERT_EQUAL(static_cast(32 * i / size[0]) / 32.0, progress); } CPPUNIT_ASSERT_EQUAL(1.0, progress); @@ -134,6 +139,43 @@ void CLoopProgressTest::testScaled() { } } +void CLoopProgressTest::testSerialization() { + + double progress{0.0}; + auto recordProgress = [&progress](double p) { progress += p; }; + + core::CLoopProgress loopProgress{50, recordProgress}; + for (std::size_t i = 0; i < 20; ++i) { + loopProgress.increment(); + } + + std::stringstream persistStream; + { + core::CJsonStatePersistInserter inserter(persistStream); + loopProgress.acceptPersistInserter(inserter); + } + + LOG_DEBUG(<< "state = " << persistStream.str()); + + core::CJsonStateRestoreTraverser traverser(persistStream); + core::CLoopProgress restoredLoopProgress; + restoredLoopProgress.acceptRestoreTraverser(traverser); + + double restoredProgress{0.0}; + auto restoredRecordProgress = [&restoredProgress](double p) { + restoredProgress += p; + }; + restoredLoopProgress.attach(restoredRecordProgress); + restoredLoopProgress.resumeRestored(); + + CPPUNIT_ASSERT_EQUAL(loopProgress.checksum(), restoredLoopProgress.checksum()); + for (std::size_t i = 20; i < 50; ++i) { + loopProgress.increment(); + restoredLoopProgress.increment(); + CPPUNIT_ASSERT_EQUAL(progress, restoredProgress); + } +} + 
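+// A note on resuming: the progress callback is not part of the persisted state,
+// so a restored CLoopProgress stays silent until a monitor is re-attached. A
+// minimal sketch of the intended resume sequence (hypothetical caller code,
+// mirroring the test above):
+//
+//   core::CLoopProgress progress;
+//   progress.acceptRestoreTraverser(traverser); // recovers size, steps and position
+//   progress.attach([](double p) { /* forward p to a progress monitor */ });
+//   progress.resumeRestored();                  // i.e. increment(0): re-emits the
+//                                               // whole steps already completed
+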
CppUnit::Test* CLoopProgressTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CLoopProgressTest"); @@ -143,6 +185,8 @@ CppUnit::Test* CLoopProgressTest::suite() { "CLoopProgressTest::testRandom", &CLoopProgressTest::testRandom)); suiteOfTests->addTest(new CppUnit::TestCaller( "CLoopProgressTest::testScaled", &CLoopProgressTest::testScaled)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CLoopProgressTest::testSerialization", &CLoopProgressTest::testSerialization)); return suiteOfTests; } diff --git a/lib/core/unittest/CLoopProgressTest.h b/lib/core/unittest/CLoopProgressTest.h index 20048a490d..34244f6dc7 100644 --- a/lib/core/unittest/CLoopProgressTest.h +++ b/lib/core/unittest/CLoopProgressTest.h @@ -14,6 +14,7 @@ class CLoopProgressTest : public CppUnit::TestFixture { void testShort(); void testRandom(); void testScaled(); + void testSerialization(); static CppUnit::Test* suite(); }; diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index e6f46f1b7f..ebb76ef141 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -11,8 +11,12 @@ #include #include #include +#include +#include #include +#include + namespace ml { namespace maths { using namespace boosted_tree_detail; @@ -21,6 +25,10 @@ using TSizeVec = std::vector; using TRowItr = core::CDataFrame::TRowItr; namespace { +const std::size_t MIN_REGULARIZER_INDEX{0}; +const std::size_t BEST_REGULARIZER_INDEX{1}; +const std::size_t MAX_REGULARIZER_INDEX{2}; +const std::size_t INITIAL_REGULARIZER_SEARCH_ITERATIONS{8}; const double MIN_REGULARIZER_SCALE{0.1}; const double MAX_REGULARIZER_SCALE{10.0}; const double MIN_ETA_SCALE{0.3}; @@ -37,6 +45,8 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari m_TreeImpl->m_DependentVariable = dependentVariable; + this->setupTrainingProgressMonitoring(); + this->initializeMissingFeatureMasks(frame); std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks) = this->crossValidationRowMasks(); @@ -78,13 +88,13 @@ void CBoostedTreeFactory::initializeHyperparameterOptimisation() const { // less than p_1, this translates to using log parameter values. 
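+    //
+    // For example (endpoints illustrative only): a lambda interval [0.001, 1.0]
+    // searched uniformly in the raw parameter would place half of the probe
+    // points within a factor of two of the upper endpoint, whereas searching
+    // log(lambda) over [log(0.001), log(1.0)] spreads them evenly across all
+    // three decades. The round trip is simply
+    //
+    //   boundingBox.emplace_back(std::log(1e-3), std::log(1.0)); // probe in log space
+    //   double lambda{std::exp(parameters(i))};                  // map a probe back
+    //
+    // which is the pattern used below and in selectNextHyperparameters.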
CBayesianOptimisation::TDoubleDoublePrVec boundingBox; - if (m_TreeImpl->m_LambdaOverride == boost::none) { - boundingBox.emplace_back(std::log(MIN_REGULARIZER_SCALE * m_TreeImpl->m_Lambda), - std::log(MAX_REGULARIZER_SCALE * m_TreeImpl->m_Lambda)); + if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + boundingBox.emplace_back(std::log(m_LambdaSearchInterval(MIN_REGULARIZER_INDEX)), + std::log(m_LambdaSearchInterval(MAX_REGULARIZER_INDEX))); } - if (m_TreeImpl->m_GammaOverride == boost::none) { - boundingBox.emplace_back(std::log(MIN_REGULARIZER_SCALE * m_TreeImpl->m_Gamma), - std::log(MAX_REGULARIZER_SCALE * m_TreeImpl->m_Gamma)); + if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + boundingBox.emplace_back(std::log(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)), + std::log(m_GammaSearchInterval(MAX_REGULARIZER_INDEX))); } if (m_TreeImpl->m_EtaOverride == boost::none) { double rate{m_TreeImpl->m_EtaGrowthRatePerTree - 1.0}; @@ -175,6 +185,7 @@ void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFra .minimumFrequencyToOneHotEncode(m_MinimumFrequencyToOneHotEncode) .rowMask(m_TreeImpl->allTrainingRowsMask()) .columnMask(std::move(regressors))); + m_TreeImpl->m_TrainingProgress.increment(1); } void CBoostedTreeFactory::determineFeatureDataTypes(const core::CDataFrame& frame) const { @@ -214,11 +225,9 @@ bool CBoostedTreeFactory::initializeFeatureSampleDistribution() const { return false; } -void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) const { +void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) { - m_TreeImpl->m_Lambda = m_TreeImpl->m_LambdaOverride.value_or(0.0); - m_TreeImpl->m_Gamma = m_TreeImpl->m_GammaOverride.value_or(0.0); - if (m_TreeImpl->m_EtaOverride) { + if (m_TreeImpl->m_EtaOverride != boost::none) { m_TreeImpl->m_Eta = *(m_TreeImpl->m_EtaOverride); } else { // Eta is the learning rate. There is a lot of empirical evidence that @@ -238,69 +247,25 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) con frame.numberColumns() - 4))); m_TreeImpl->m_EtaGrowthRatePerTree = 1.0 + m_TreeImpl->m_Eta / 2.0; } - if (m_TreeImpl->m_MaximumNumberTreesOverride) { + + if (m_TreeImpl->m_MaximumNumberTreesOverride != boost::none) { m_TreeImpl->m_MaximumNumberTrees = *(m_TreeImpl->m_MaximumNumberTreesOverride); } else { // This needs to be tied to the learn rate to avoid bias. m_TreeImpl->m_MaximumNumberTrees = static_cast(2.0 / m_TreeImpl->m_Eta + 0.5); } - if (m_TreeImpl->m_FeatureBagFractionOverride) { + + if (m_TreeImpl->m_FeatureBagFractionOverride != boost::none) { m_TreeImpl->m_FeatureBagFraction = *(m_TreeImpl->m_FeatureBagFractionOverride); } - if (m_TreeImpl->m_LambdaOverride && m_TreeImpl->m_GammaOverride) { - // Fall through. 
-    } else {
-        core::CPackedBitVector trainingRowMask{m_TreeImpl->allTrainingRowsMask()};
-
-        auto tree = m_TreeImpl->initializePredictionsAndLossDerivatives(frame, trainingRowMask);
-
-        double L[2];
-        double T[2];
-        double W[2];
-
-        std::tie(L[0], T[0], W[0]) =
-            m_TreeImpl->regularisedLoss(frame, trainingRowMask, {std::move(tree)});
-        LOG_TRACE(<< "loss = " << L[0] << ", # leaves = " << T[0]
-                  << ", sum square weights = " << W[0]);
-
-        double eta{1.0};
-        std::size_t maximumNumberOfTrees{1};
-        std::swap(eta, m_TreeImpl->m_Eta);
-        std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees);
-        auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordMemoryUsage);
-        std::swap(eta, m_TreeImpl->m_Eta);
-        std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees);
-
-        std::tie(L[1], T[1], W[1]) =
-            m_TreeImpl->regularisedLoss(frame, trainingRowMask, forest);
-        LOG_TRACE(<< "loss = " << L[1] << ", # leaves = " << T[1]
-                  << ", sum square weights = " << W[1]);
-
-        // If we can't improve the loss with no regularisation on the train set
-        // we're not going to be able to make much headway! In this case we just
-        // force the regularisation parameters to zero and don't try to optimise
-        // them.
-        double scale{static_cast<double>(m_TreeImpl->m_NumberFolds - 1) /
-                     static_cast<double>(m_TreeImpl->m_NumberFolds)};
-        double lambda{m_TreeImpl->m_Eta * scale *
-                      (L[0] <= L[1] ? 0.0 : (L[0] - L[1]) / (W[1] - W[0]))};
-        double gamma{m_TreeImpl->m_Eta * scale *
-                     (L[0] <= L[1] ? 0.0 : (L[0] - L[1]) / (T[1] - T[0]))};
-
-        if (lambda == 0.0) {
-            m_TreeImpl->m_LambdaOverride = lambda;
-        } else if (m_TreeImpl->m_LambdaOverride == boost::none) {
-            m_TreeImpl->m_Lambda = m_TreeImpl->m_GammaOverride ? lambda : 0.5 * lambda;
-        }
-        if (gamma == 0.0) {
-            m_TreeImpl->m_GammaOverride = gamma;
-        } else if (m_TreeImpl->m_GammaOverride == boost::none) {
-            m_TreeImpl->m_Gamma = m_TreeImpl->m_LambdaOverride ? gamma : 0.5 * gamma;
-        }
-        LOG_TRACE(<< "lambda(initial) = " << m_TreeImpl->m_Lambda
-                  << " gamma(initial) = " << m_TreeImpl->m_Gamma);
+    if (m_TreeImpl->m_RegularizationOverride.countNotSet() > 0) {
+        this->initializeUnsetRegularizationHyperparameters(frame);
     }
 
     m_TreeImpl->m_MaximumTreeSizeMultiplier = MAIN_TRAINING_LOOP_TREE_SIZE_MULTIPLIER;
@@ -315,6 +280,188 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) con
     }
 }
 
+void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDataFrame& frame) {
+
+    // The strategy here is to:
+    // 1) Get percentile estimates of the gain in, and the total curvature of,
+    //    the loss function at splits in a single tree,
+    // 2) Use these to upper bound the size of gamma and lambda, that is, find
+    //    values for which we expect to underfit the data,
+    // 3) Decrease each regularizer and look for the turning point in the test
+    //    loss, i.e. the point at which the transition to overfitting occurs.
+    // We'll search intervals in the vicinity of these values in the hyperparameter
+    // optimisation loop.
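+    //
+    // Concretely (the numbers here are illustrative, not from this change): if
+    // the 75th percentile split gain of the probe tree is 2.0, then gamma = 2.0
+    // cancels the gain of a typical split, so almost nothing splits and the
+    // model underfits. The line search below then shrinks the scale from 1
+    // towards 1/1024 and tracks the test loss; the scale at the loss minimum
+    // seeds the search interval handed to the Bayesian optimisation loop.
+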
+ + core::CPackedBitVector allTrainingRowsMask{m_TreeImpl->allTrainingRowsMask()}; + + double gainPerNode; + double totalCurvaturePerNode; + std::tie(gainPerNode, totalCurvaturePerNode) = + this->estimateTreeGainAndCurvature(frame, allTrainingRowsMask); + + if (gainPerNode > 0.0 && m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + + TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + fallbackInterval *= m_TreeImpl->m_Eta; + auto interval = this->candidateRegularizerSearchInterval( + frame, allTrainingRowsMask, [this, gainPerNode](double scale) { + m_TreeImpl->m_Regularization.gamma(scale * gainPerNode); + }); + m_GammaSearchInterval = interval.value_or(fallbackInterval) * gainPerNode; + LOG_TRACE(<< "gamma search interval = [" + << m_GammaSearchInterval.toDelimited() << "]"); + + } else if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + m_TreeImpl->m_RegularizationOverride.gamma(0.0); + } + + if (totalCurvaturePerNode > 0.0 && + m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + + TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)); + auto interval = this->candidateRegularizerSearchInterval( + frame, allTrainingRowsMask, [this, totalCurvaturePerNode](double scale) { + m_TreeImpl->m_Regularization.lambda(scale * totalCurvaturePerNode); + }); + m_LambdaSearchInterval = interval.value_or(fallbackInterval) * totalCurvaturePerNode; + LOG_TRACE(<< "lambda search interval = [" + << m_LambdaSearchInterval.toDelimited() << "]"); + + } else if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + m_TreeImpl->m_RegularizationOverride.lambda(0.0); + } + + double scale{ + static_cast(m_TreeImpl->m_NumberFolds - 1) / + static_cast(m_TreeImpl->m_NumberFolds) / + ((m_TreeImpl->m_RegularizationOverride.gamma() != boost::none ? 0.0 : 1.0) + + (m_TreeImpl->m_RegularizationOverride.lambda() != boost::none ? 0.0 : 1.0))}; + + if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + m_GammaSearchInterval *= scale; + m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(BEST_REGULARIZER_INDEX)); + } + if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + m_LambdaSearchInterval *= scale; + m_TreeImpl->m_Regularization.lambda(m_LambdaSearchInterval(BEST_REGULARIZER_INDEX)); + } + LOG_TRACE(<< "regularization(initial) = " << m_TreeImpl->m_Regularization.print()); +} + +CBoostedTreeFactory::TDoubleDoublePr +CBoostedTreeFactory::estimateTreeGainAndCurvature(core::CDataFrame& frame, + const core::CPackedBitVector& trainingRowMask) const { + + std::size_t maximumNumberOfTrees{1}; + std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees); + auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordMemoryUsage); + std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees); + + double gain; + double curvature; + std::tie(gain, curvature) = m_TreeImpl->gainAndCurvatureAtPercentile(75.0, forest); + + LOG_TRACE(<< "gain = " << gain << ", curvature = " << curvature); + + return {gain, curvature}; +} + +CBoostedTreeFactory::TOptionalVector +CBoostedTreeFactory::candidateRegularizerSearchInterval(core::CDataFrame& frame, + core::CPackedBitVector trainingRowMask, + TScaleRegularization scaleRegularization) const { + + // This uses a quadratic approximation to the test loss function w.r.t. 
+    // the scaled regularization hyperparameter, from which it estimates the
+    // minimum error point in the interval we search here. Separately, it
+    // examines the size of the residual errors w.r.t. the variation in the
+    // best fit curve over the interval. We truncate the interval the main
+    // hyperparameter optimisation loop searches if we determine there is a
+    // low chance of missing the best solution by doing so.
+
+    using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
+
+    double pSample{1.0 / static_cast<double>(m_TreeImpl->m_NumberFolds)};
+
+    core::CPackedBitVector testRowMask;
+    for (auto row = trainingRowMask.beginOneBits();
+         row != trainingRowMask.endOneBits(); ++row) {
+        if (CSampling::uniformSample(m_TreeImpl->m_Rng, 0.0, 1.0) < pSample) {
+            testRowMask.extend(false, *row - testRowMask.size());
+            testRowMask.extend(true);
+        }
+    }
+    testRowMask.extend(false, trainingRowMask.size() - testRowMask.size());
+    trainingRowMask ^= testRowMask;
+
+    double maximumTreeSizeMultiplier{MAIN_TRAINING_LOOP_TREE_SIZE_MULTIPLIER};
+    std::swap(maximumTreeSizeMultiplier, m_TreeImpl->m_MaximumTreeSizeMultiplier);
+
+    double multiplier{std::exp(
+        -std::log(1024.0) / static_cast<double>(INITIAL_REGULARIZER_SEARCH_ITERATIONS))};
+
+    CLeastSquaresOnlineRegression<2, double> leastSquaresQuadraticTestLoss;
+    TDoubleVec testLosses(INITIAL_REGULARIZER_SEARCH_ITERATIONS);
+
+    double scale{1.0};
+    for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) {
+        scaleRegularization(scale);
+        scale *= multiplier;
+        auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordMemoryUsage);
+        double testLoss{m_TreeImpl->meanLoss(frame, testRowMask, forest)};
+        leastSquaresQuadraticTestLoss.add(static_cast<double>(i), testLoss);
+        testLosses[i] = testLoss;
+        m_TreeImpl->m_TrainingProgress.increment();
+    }
+    LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses));
+
+    std::swap(maximumTreeSizeMultiplier, m_TreeImpl->m_MaximumTreeSizeMultiplier);
+
+    CLeastSquaresOnlineRegression<2, double>::TArray params;
+    bool successful{leastSquaresQuadraticTestLoss.parameters(params)};
+    double gradient{params[1]};
+    double curvature{params[2]};
+    LOG_TRACE(<< "[intercept, slope, curvature] = "
+              << core::CContainerPrinter::print(params));
+
+    // Find the scale at the minimum of the least squares quadratic fit
+    // to the test loss in the search interval.
+    double leftEndpoint{0.0};
+    double rightEndpoint{static_cast<double>(INITIAL_REGULARIZER_SEARCH_ITERATIONS - 1)};
+    double stationaryPoint{-gradient / 2.0 / curvature};
+    double distanceToLeftEndpoint{std::fabs(leftEndpoint - stationaryPoint)};
+    double distanceToRightEndpoint{std::fabs(rightEndpoint - stationaryPoint)};
+    double logBestRegularizerScale{
+        curvature < 0.0
+            ? (distanceToLeftEndpoint > distanceToRightEndpoint ? leftEndpoint : rightEndpoint)
+            : CTools::truncate(stationaryPoint, leftEndpoint, rightEndpoint)};
+    double bestRegularizerScale{std::pow(0.5, logBestRegularizerScale)};
+
+    // Find an interval with a high probability of containing the optimal
+    // regularisation parameter if the interval we searched has a minimum.
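+    //
+    // For the fitted quadratic f(x) = a + b*x + c*x^2 over the probe index x,
+    // f'(x) = b + 2*c*x vanishes at x* = -b / (2*c), the stationaryPoint above.
+    // With c > 0 this is a minimum, so we centre the returned interval on it;
+    // with c <= 0 the fit is concave and the smallest loss must sit at whichever
+    // endpoint is further from x*. For example (illustrative numbers), b = -3.0
+    // and c = 0.5 give x* = 3.0, i.e. the loss bottoms out at the fourth probe.
+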
+ TVector interval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + if (curvature > 0.0) { + TMeanVarAccumulator residualMoments; + for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) { + residualMoments.add(testLosses[i] - leastSquaresQuadraticTestLoss.predict( + static_cast(i))); + } + double margin{2.0 * std::sqrt(CBasicStatistics::variance(residualMoments)) / curvature}; + if (logBestRegularizerScale - margin >= leftEndpoint) { + interval(MIN_REGULARIZER_INDEX) = + std::max(std::pow(0.5, margin), MIN_REGULARIZER_SCALE); + } + if (logBestRegularizerScale + margin <= rightEndpoint) { + interval(MAX_REGULARIZER_INDEX) = + std::min(std::pow(2.0, margin), MAX_REGULARIZER_SCALE); + } + } + interval *= bestRegularizerScale; + + return successful ? TOptionalVector{interval} : TOptionalVector{}; +} + CBoostedTreeFactory CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, TLossFunctionUPtr loss) { return {numberThreads, std::move(loss)}; @@ -334,6 +481,8 @@ CBoostedTreeFactory::constructFromString(std::istream& jsonStringStream, if (treePtr->acceptRestoreTraverser(traverser) == false || traverser.haveBadState()) { throw std::runtime_error{"failed to restore boosted tree"}; } + treePtr->m_Impl->m_TrainingProgress.attach(recordProgress); + treePtr->m_Impl->m_TrainingProgress.resumeRestored(); frame.resizeColumns(treePtr->m_Impl->m_NumberThreads, frame.numberColumns() + treePtr->m_Impl->numberExtraColumnsForTrain()); @@ -345,7 +494,8 @@ CBoostedTreeFactory::constructFromString(std::istream& jsonStringStream, } CBoostedTreeFactory::CBoostedTreeFactory(std::size_t numberThreads, TLossFunctionUPtr loss) - : m_TreeImpl{std::make_unique(numberThreads, std::move(loss))} { + : m_TreeImpl{std::make_unique(numberThreads, std::move(loss))}, + m_GammaSearchInterval{0.0}, m_LambdaSearchInterval{0.0} { } CBoostedTreeFactory::CBoostedTreeFactory(CBoostedTreeFactory&&) = default; @@ -377,7 +527,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::lambda(double lambda) { LOG_WARN(<< "Lambda must be non-negative"); lambda = 0.0; } - m_TreeImpl->m_LambdaOverride = lambda; + m_TreeImpl->m_RegularizationOverride.lambda(lambda); return *this; } @@ -386,7 +536,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::gamma(double gamma) { LOG_WARN(<< "Gamma must be non-negative"); gamma = 0.0; } - m_TreeImpl->m_GammaOverride = gamma; + m_TreeImpl->m_RegularizationOverride.gamma(gamma); return *this; } @@ -485,6 +635,30 @@ std::size_t CBoostedTreeFactory::numberExtraColumnsForTrain() const { return m_TreeImpl->numberExtraColumnsForTrain(); } +void CBoostedTreeFactory::setupTrainingProgressMonitoring() { + + // The base unit is the cost of training on one fold. + // + // This comprises: + // - The cost of category encoding and feature selection which we count as + // one unit, + // - INITIAL_REGULARIZER_SEARCH_ITERATIONS units per regularization parameter + // which isn't user defined, + // - The main optimisation loop which costs number folds units per iteration, + // - The cost of the final train which we count as number folds units. 
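+    //
+    // Worked example (assuming the defaults: 4 folds, no user overrides and
+    // R = numberHyperparameterTuningRounds()): the total is
+    //
+    //   1               // encoding and feature selection
+    //   + 8 + 8         // gamma and lambda line searches
+    //   + (R + 1) * 4   // main loop rounds plus the final train
+    //
+    // so each unit of progress is 1 / (17 + 4 * (R + 1)) of the whole.
+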
+ + std::size_t totalNumberSteps{1}; + if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + totalNumberSteps += INITIAL_REGULARIZER_SEARCH_ITERATIONS; + } + if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + totalNumberSteps += INITIAL_REGULARIZER_SEARCH_ITERATIONS; + } + totalNumberSteps += (this->numberHyperparameterTuningRounds() + 1) * + m_TreeImpl->m_NumberFolds; + m_TreeImpl->m_TrainingProgress = core::CLoopProgress{totalNumberSteps, m_RecordProgress}; +} + void CBoostedTreeFactory::noopRecordTrainingState(std::function) { } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 90ef01bf27..4e618beabd 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -84,6 +84,9 @@ double readLossCurvature(const TRowRef& row) { double readActual(const TRowRef& row, std::size_t dependentVariable) { return row[dependentVariable]; } + +const std::size_t ASSIGN_MISSING_TO_LEFT{0}; +const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; } void CBoostedTreeImpl::CLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, @@ -111,7 +114,7 @@ void CBoostedTreeImpl::CLeafNodeStatistics::addRowDerivatives(const CEncodedData CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, CBoostedTree::TLossFunctionUPtr loss) : m_NumberThreads{numberThreads}, m_Loss{std::move(loss)}, - m_BestHyperparameters{m_Lambda, m_Gamma, m_Eta, m_EtaGrowthRatePerTree, m_FeatureBagFraction, m_FeatureSampleProbabilities} { + m_BestHyperparameters{m_Regularization, m_Eta, m_EtaGrowthRatePerTree, m_FeatureBagFraction} { } CBoostedTreeImpl::CBoostedTreeImpl() = default; @@ -133,11 +136,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, LOG_TRACE(<< "Main training loop..."); - // We account for cost of setup as one round. The main optimisation loop runs - // for "m_NumberRounds + 1" rounds and training on the choosen hyperparameter - // values is counted as one round. This gives a total of m_NumberRounds + 3. 
- core::CLoopProgress progress{m_NumberRounds + 3 - m_CurrentRound, recordProgress}; - progress.increment(); + m_TrainingProgress.attach(recordProgress); std::uint64_t lastMemoryUsage(this->memoryUsage()); recordMemoryUsage(lastMemoryUsage); @@ -172,8 +171,6 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, break; } - progress.increment(); - std::int64_t memoryUsage(this->memoryUsage()); recordMemoryUsage(memoryUsage - lastMemoryUsage); lastMemoryUsage = memoryUsage; @@ -280,40 +277,32 @@ core::CPackedBitVector CBoostedTreeImpl::allTrainingRowsMask() const { return ~m_MissingFeatureRowMasks[m_DependentVariable]; } -CBoostedTreeImpl::TDoubleDoubleDoubleTr -CBoostedTreeImpl::regularisedLoss(const core::CDataFrame& frame, - const core::CPackedBitVector& trainingRowMask, - const TNodeVecVec& forest) const { +CBoostedTreeImpl::TDoubleDoublePr +CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, + const TNodeVecVec& forest) const { - auto results = frame.readRows( - m_NumberThreads, 0, frame.numberRows(), - core::bindRetrievableState( - [&](double& loss, TRowItr beginRows, TRowItr endRows) { - for (auto row = beginRows; row != endRows; ++row) { - loss += m_Loss->value(readPrediction(*row), - readActual(*row, m_DependentVariable)); - } - }, - 0.0), - &trainingRowMask); + TDoubleVec gains; + TDoubleVec curvatures; - double loss{0.0}; - for (const auto& result : results.first) { - loss += result.s_FunctionState; - } - - double leafCount{0.0}; - double sumSquareLeafWeights{0.0}; for (const auto& tree : forest) { for (const auto& node : tree) { - if (node.isLeaf()) { - leafCount += 1.0; - sumSquareLeafWeights += CTools::pow2(node.value()); + if (node.isLeaf() == false) { + gains.push_back(node.gain()); + curvatures.push_back(node.curvature()); } } } - return {loss, leafCount, 0.5 * sumSquareLeafWeights}; + if (gains.size() == 0) { + return {0.0, 0.0}; + } + + std::size_t index{static_cast( + percentile * static_cast(gains.size()) / 100.0 + 0.5)}; + std::nth_element(gains.begin(), gains.begin() + index, gains.end()); + std::nth_element(curvatures.begin(), curvatures.begin() + index, curvatures.end()); + + return {gains[index], curvatures[index]}; } CBoostedTreeImpl::TMeanVarAccumulator @@ -326,6 +315,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame, lossMoments.add(loss); LOG_TRACE(<< "fold = " << i << " forest size = " << forest.size() << " test set loss = " << loss); + m_TrainingProgress.increment(); } LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments) << ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments))); @@ -514,8 +504,8 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtrQueue leaves; leaves.push(std::make_shared( - 0 /*root*/, m_NumberThreads, frame, *m_Encoder, m_Lambda, m_Gamma, - candidateSplits, this->featureBag(), trainingRowMask)); + 0 /*root*/, m_NumberThreads, frame, *m_Encoder, m_Regularization, + candidateSplits, 0 /*depth*/, this->featureBag(), trainingRowMask)); // We update local variables because the callback can be expensive if it // requires accessing atomics. 
@@ -555,8 +545,9 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, bool assignMissingToLeft{leaf->assignMissingToLeft()}; std::size_t leftChildId, rightChildId; - std::tie(leftChildId, rightChildId) = tree[leaf->id()].split( - splitFeature, splitValue, assignMissingToLeft, tree); + std::tie(leftChildId, rightChildId) = + tree[leaf->id()].split(splitFeature, splitValue, assignMissingToLeft, + leaf->gain(), leaf->curvature(), tree); TSizeVec featureBag{this->featureBag()}; @@ -569,11 +560,10 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtr leftChild; TLeafNodeStatisticsPtr rightChild; - std::tie(leftChild, rightChild) = - leaf->split(leftChildId, rightChildId, m_NumberThreads, frame, - *m_Encoder, m_Lambda, m_Gamma, candidateSplits, - std::move(featureBag), std::move(leftChildRowMask), - std::move(rightChildRowMask), leftChildHasFewerRows); + std::tie(leftChild, rightChild) = leaf->split( + leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, m_Regularization, + candidateSplits, std::move(featureBag), std::move(leftChildRowMask), + std::move(rightChildRowMask), leftChildHasFewerRows); scopeMemoryUsage.add(leftChild); scopeMemoryUsage.add(rightChild); @@ -740,11 +730,11 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss // Read parameters for last round. int i{0}; - if (m_LambdaOverride == boost::none) { - parameters(i++) = std::log(m_Lambda); + if (m_RegularizationOverride.lambda() == boost::none) { + parameters(i++) = std::log(m_Regularization.lambda()); } - if (m_GammaOverride == boost::none) { - parameters(i++) = std::log(m_Gamma); + if (m_RegularizationOverride.gamma() == boost::none) { + parameters(i++) = std::log(m_Regularization.gamma()); } if (m_EtaOverride == boost::none) { parameters(i++) = std::log(m_Eta); @@ -757,6 +747,11 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss double meanLoss{CBasicStatistics::mean(lossMoments)}; double lossVariance{CBasicStatistics::variance(lossMoments)}; + LOG_TRACE(<< "round = " << m_CurrentRound << " loss = " << meanLoss + << ": regularization = " << m_Regularization.print() << ", eta = " << m_Eta + << ", eta growth rate per tree = " << m_EtaGrowthRatePerTree + << ", feature bag fraction = " << m_FeatureBagFraction); + bopt.add(parameters, meanLoss, lossVariance); if (3 * m_CurrentRound < m_NumberRounds) { std::generate_n(parameters.data(), parameters.size(), [&]() { @@ -772,11 +767,11 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss // Write parameters for next round. 
i = 0; - if (m_LambdaOverride == boost::none) { - m_Lambda = std::exp(parameters(i++)); + if (m_RegularizationOverride.lambda() == boost::none) { + m_Regularization.lambda(std::exp(parameters(i++))); } - if (m_GammaOverride == boost::none) { - m_Gamma = std::exp(parameters(i++)); + if (m_RegularizationOverride.gamma() == boost::none) { + m_Regularization.gamma(std::exp(parameters(i++))); } if (m_EtaOverride == boost::none) { m_Eta = std::exp(parameters(i++)); @@ -786,10 +781,6 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss m_FeatureBagFraction = parameters(i++); } - LOG_TRACE(<< "round = " << m_CurrentRound << ": lambda = " << m_Lambda - << ", gamma = " << m_Gamma << ", eta = " << m_Eta - << ", eta growth rate per tree = " << m_EtaGrowthRatePerTree - << ", feature bag fraction = " << m_FeatureBagFraction); return true; } @@ -802,25 +793,24 @@ void CBoostedTreeImpl::captureBestHyperparameters(const TMeanVarAccumulator& los if (loss < m_BestForestTestLoss) { m_BestForestTestLoss = loss; m_BestHyperparameters = SHyperparameters{ - m_Lambda, m_Gamma, m_Eta, m_EtaGrowthRatePerTree, m_FeatureBagFraction, m_FeatureSampleProbabilities}; + m_Regularization, m_Eta, m_EtaGrowthRatePerTree, m_FeatureBagFraction}; } } void CBoostedTreeImpl::restoreBestHyperparameters() { - m_Lambda = m_BestHyperparameters.s_Lambda; - m_Gamma = m_BestHyperparameters.s_Gamma; + m_Regularization = m_BestHyperparameters.s_Regularization; m_Eta = m_BestHyperparameters.s_Eta; m_EtaGrowthRatePerTree = m_BestHyperparameters.s_EtaGrowthRatePerTree; m_FeatureBagFraction = m_BestHyperparameters.s_FeatureBagFraction; - m_FeatureSampleProbabilities = m_BestHyperparameters.s_FeatureSampleProbabilities; - LOG_TRACE(<< "lambda* = " << m_Lambda << ", gamma* = " << m_Gamma - << ", eta* = " << m_Eta << ", eta growth rate per tree* = " << m_EtaGrowthRatePerTree + LOG_TRACE(<< "regularization* = " << m_Regularization.print() << ", eta* = " << m_Eta + << ", eta growth rate per tree* = " << m_EtaGrowthRatePerTree << ", feature bag fraction* = " << m_FeatureBagFraction); } std::size_t CBoostedTreeImpl::numberHyperparametersToTune() const { - return (m_LambdaOverride ? 0 : 1) + (m_GammaOverride ? 0 : 1) + - (m_EtaOverride ? 0 : 2) + (m_FeatureBagFractionOverride ? 0 : 1); + return m_RegularizationOverride.countNotSet() + + (m_EtaOverride != boost::none ? 0 : 2) + + (m_FeatureBagFractionOverride != boost::none ? 
0 : 1);
 }
 
 std::size_t CBoostedTreeImpl::maximumTreeSize(const core::CPackedBitVector& trainingRowMask) const {
@@ -835,7 +825,6 @@ std::size_t CBoostedTreeImpl::maximumTreeSize(std::size_t numberRows) const {
 const std::size_t CBoostedTreeImpl::PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE{256};
 
 namespace {
-const std::string RANDOM_NUMBER_GENERATOR_TAG{"random_number_generator"};
 const std::string BAYESIAN_OPTIMIZATION_TAG{"bayesian_optimization"};
 const std::string BEST_FOREST_TAG{"best_forest"};
 const std::string BEST_FOREST_TEST_LOSS_TAG{"best_forest_test_loss"};
@@ -851,9 +840,7 @@ const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"};
 const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"};
 const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"};
 const std::string GAMMA_OVERRIDE_TAG{"gamma_override"};
-const std::string GAMMA_TAG{"gamma"};
 const std::string LAMBDA_OVERRIDE_TAG{"lambda_override"};
-const std::string LAMBDA_TAG{"lambda"};
 const std::string LOSS_TAG{"loss"};
 const std::string MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG{"maximum_attempts_to_add_tree"};
 const std::string MAXIMUM_NUMBER_TREES_OVERRIDE_TAG{"maximum_number_trees_override"};
@@ -866,9 +853,21 @@ const std::string NUMBER_FOLDS_TAG{"number_folds"};
 const std::string NUMBER_ROUNDS_TAG{"number_rounds"};
 const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"};
 const std::string NUMBER_THREADS_TAG{"number_threads"};
+const std::string RANDOM_NUMBER_GENERATOR_TAG{"random_number_generator"};
+const std::string REGULARIZATION_TAG{"regularization"};
+const std::string REGULARIZATION_OVERRIDE_TAG{"regularization_override"};
 const std::string ROWS_PER_FEATURE_TAG{"rows_per_feature"};
 const std::string TESTING_ROW_MASKS_TAG{"testing_row_masks"};
 const std::string TRAINING_ROW_MASKS_TAG{"training_row_masks"};
+const std::string TRAINING_PROGRESS_TAG{"training_progress"};
+
+const std::string REGULARIZATION_GAMMA_TAG{"gamma"};
+const std::string REGULARIZATION_LAMBDA_TAG{"lambda"};
+
+const std::string HYPERPARAM_ETA_TAG{"hyperparam_eta"};
+const std::string HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG{"hyperparam_eta_growth_rate_per_tree"};
+const std::string HYPERPARAM_FEATURE_BAG_FRACTION_TAG{"hyperparam_feature_bag_fraction"};
+const std::string HYPERPARAM_REGULARIZATION_TAG{"hyperparam_regularization"};
 
 const std::string LEFT_CHILD_TAG{"left_child"};
 const std::string RIGHT_CHILD_TAG{"right_child"};
@@ -876,19 +875,35 @@ const std::string SPLIT_FEATURE_TAG{"split_feature"};
 const std::string ASSIGN_MISSING_TO_LEFT_TAG{"assign_missing_to_left "};
 const std::string NODE_VALUE_TAG{"node_value"};
 const std::string SPLIT_VALUE_TAG{"split_value"};
+}
 
-const std::string HYPERPARAM_LAMBDA_TAG{"hyperparam_lambda"};
-const std::string HYPERPARAM_GAMMA_TAG{"hyperparam_gamma"};
-const std::string HYPERPARAM_ETA_TAG{"hyperparam_eta"};
-const std::string HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG{"hyperparam_eta_growth_rate_per_tree"};
-const std::string HYPERPARAM_FEATURE_BAG_FRACTION_TAG{"hyperparam_feature_bag_fraction"};
-const std::string HYPERPARAM_FEATURE_SAMPLE_PROBABILITIES_TAG{"hyperparam_feature_sample_probabilities"};
+template<typename T>
+void CBoostedTreeImpl::CRegularization<T>::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
+    core::CPersistUtils::persist(REGULARIZATION_GAMMA_TAG, m_Gamma, inserter);
+    core::CPersistUtils::persist(REGULARIZATION_LAMBDA_TAG, m_Lambda, inserter);
+}
+
+void
CBoostedTreeImpl::SHyperparameters::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + core::CPersistUtils::persist(HYPERPARAM_ETA_TAG, s_Eta, inserter); + core::CPersistUtils::persist(HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG, + s_EtaGrowthRatePerTree, inserter); + core::CPersistUtils::persist(HYPERPARAM_FEATURE_BAG_FRACTION_TAG, + s_FeatureBagFraction, inserter); + core::CPersistUtils::persist(HYPERPARAM_REGULARIZATION_TAG, s_Regularization, inserter); +} + +void CBoostedTreeImpl::CNode::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + core::CPersistUtils::persist(LEFT_CHILD_TAG, m_LeftChild, inserter); + core::CPersistUtils::persist(RIGHT_CHILD_TAG, m_RightChild, inserter); + core::CPersistUtils::persist(SPLIT_FEATURE_TAG, m_SplitFeature, inserter); + core::CPersistUtils::persist(ASSIGN_MISSING_TO_LEFT_TAG, m_AssignMissingToLeft, inserter); + core::CPersistUtils::persist(NODE_VALUE_TAG, m_NodeValue, inserter); + core::CPersistUtils::persist(SPLIT_VALUE_TAG, m_SplitValue, inserter); } void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& inserter) const { core::CPersistUtils::persist(BAYESIAN_OPTIMIZATION_TAG, *m_BayesianOptimization, inserter); core::CPersistUtils::persist(BEST_FOREST_TEST_LOSS_TAG, m_BestForestTestLoss, inserter); - inserter.insertValue(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.toString()); core::CPersistUtils::persist(CURRENT_ROUND_TAG, m_CurrentRound, inserter); core::CPersistUtils::persist(DEPENDENT_VARIABLE_TAG, m_DependentVariable, inserter); core::CPersistUtils::persist(ENCODER_TAG, *m_Encoder, inserter); @@ -899,8 +914,6 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_DATA_TYPES_TAG, m_FeatureDataTypes, inserter); core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); - core::CPersistUtils::persist(GAMMA_TAG, m_Gamma, inserter); - core::CPersistUtils::persist(LAMBDA_TAG, m_Lambda, inserter); core::CPersistUtils::persist(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, m_MaximumAttemptsToAddTree, inserter); core::CPersistUtils::persist(MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, @@ -914,50 +927,40 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, m_NumberSplitsPerFeature, inserter); core::CPersistUtils::persist(NUMBER_THREADS_TAG, m_NumberThreads, inserter); + inserter.insertValue(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.toString()); + core::CPersistUtils::persist(REGULARIZATION_OVERRIDE_TAG, + m_RegularizationOverride, inserter); + core::CPersistUtils::persist(REGULARIZATION_TAG, m_Regularization, inserter); core::CPersistUtils::persist(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, inserter); core::CPersistUtils::persist(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, inserter); core::CPersistUtils::persist(MAXIMUM_NUMBER_TREES_TAG, m_MaximumNumberTrees, inserter); core::CPersistUtils::persist(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, inserter); + core::CPersistUtils::persist(TRAINING_PROGRESS_TAG, m_TrainingProgress, inserter); core::CPersistUtils::persist(BEST_FOREST_TAG, m_BestForest, inserter); core::CPersistUtils::persist(BEST_HYPERPARAMETERS_TAG, m_BestHyperparameters, inserter); core::CPersistUtils::persist(ETA_OVERRIDE_TAG, m_EtaOverride, inserter); core::CPersistUtils::persist(FEATURE_BAG_FRACTION_OVERRIDE_TAG, m_FeatureBagFractionOverride, inserter); - core::CPersistUtils::persist(GAMMA_OVERRIDE_TAG, 
m_GammaOverride, inserter); - core::CPersistUtils::persist(LAMBDA_OVERRIDE_TAG, m_LambdaOverride, inserter); core::CPersistUtils::persist(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, m_MaximumNumberTreesOverride, inserter); inserter.insertValue(LOSS_TAG, m_Loss->name()); } -void CBoostedTreeImpl::CNode::acceptPersistInserter(core::CStatePersistInserter& inserter) const { - core::CPersistUtils::persist(LEFT_CHILD_TAG, m_LeftChild, inserter); - core::CPersistUtils::persist(RIGHT_CHILD_TAG, m_RightChild, inserter); - core::CPersistUtils::persist(SPLIT_FEATURE_TAG, m_SplitFeature, inserter); - core::CPersistUtils::persist(ASSIGN_MISSING_TO_LEFT_TAG, m_AssignMissingToLeft, inserter); - core::CPersistUtils::persist(NODE_VALUE_TAG, m_NodeValue, inserter); - core::CPersistUtils::persist(SPLIT_VALUE_TAG, m_SplitValue, inserter); -} - -void CBoostedTreeImpl::SHyperparameters::acceptPersistInserter(core::CStatePersistInserter& inserter) const { - core::CPersistUtils::persist(HYPERPARAM_LAMBDA_TAG, s_Lambda, inserter); - core::CPersistUtils::persist(HYPERPARAM_GAMMA_TAG, s_Gamma, inserter); - core::CPersistUtils::persist(HYPERPARAM_ETA_TAG, s_Eta, inserter); - core::CPersistUtils::persist(HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG, - s_EtaGrowthRatePerTree, inserter); - core::CPersistUtils::persist(HYPERPARAM_FEATURE_BAG_FRACTION_TAG, - s_FeatureBagFraction, inserter); - core::CPersistUtils::persist(HYPERPARAM_FEATURE_SAMPLE_PROBABILITIES_TAG, - s_FeatureSampleProbabilities, inserter); +template +bool CBoostedTreeImpl::CRegularization::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { + do { + const std::string& name = traverser.name(); + RESTORE(REGULARIZATION_GAMMA_TAG, + core::CPersistUtils::restore(REGULARIZATION_GAMMA_TAG, m_Gamma, traverser)) + RESTORE(REGULARIZATION_LAMBDA_TAG, + core::CPersistUtils::restore(REGULARIZATION_LAMBDA_TAG, m_Lambda, traverser)) + } while (traverser.next()); + return true; } bool CBoostedTreeImpl::SHyperparameters::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { do { const std::string& name = traverser.name(); - RESTORE(HYPERPARAM_LAMBDA_TAG, - core::CPersistUtils::restore(HYPERPARAM_LAMBDA_TAG, s_Lambda, traverser)) - RESTORE(HYPERPARAM_GAMMA_TAG, - core::CPersistUtils::restore(HYPERPARAM_GAMMA_TAG, s_Gamma, traverser)) RESTORE(HYPERPARAM_ETA_TAG, core::CPersistUtils::restore(HYPERPARAM_ETA_TAG, s_Eta, traverser)) RESTORE(HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG, @@ -966,9 +969,9 @@ bool CBoostedTreeImpl::SHyperparameters::acceptRestoreTraverser(core::CStateRest RESTORE(HYPERPARAM_FEATURE_BAG_FRACTION_TAG, core::CPersistUtils::restore(HYPERPARAM_FEATURE_BAG_FRACTION_TAG, s_FeatureBagFraction, traverser)) - RESTORE(HYPERPARAM_FEATURE_SAMPLE_PROBABILITIES_TAG, - core::CPersistUtils::restore(HYPERPARAM_FEATURE_SAMPLE_PROBABILITIES_TAG, - s_FeatureSampleProbabilities, traverser)) + RESTORE(HYPERPARAM_REGULARIZATION_TAG, + core::CPersistUtils::restore(HYPERPARAM_REGULARIZATION_TAG, + s_Regularization, traverser)) } while (traverser.next()); return true; } @@ -993,18 +996,6 @@ bool CBoostedTreeImpl::CNode::acceptRestoreTraverser(core::CStateRestoreTraverse return true; } -bool CBoostedTreeImpl::restoreLoss(CBoostedTree::TLossFunctionUPtr& loss, - core::CStateRestoreTraverser& traverser) { - const std::string& lossFunctionName{traverser.value()}; - if (lossFunctionName == CMse::NAME) { - loss = std::make_unique(); - return true; - } - LOG_ERROR(<< "Error restoring loss function. 
Unknown loss function type '" - << lossFunctionName << "'."); - return false; -} - bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { do { const std::string& name = traverser.name(); @@ -1014,8 +1005,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(BEST_FOREST_TEST_LOSS_TAG, core::CPersistUtils::restore(BEST_FOREST_TEST_LOSS_TAG, m_BestForestTestLoss, traverser)) - RESTORE(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.fromString(traverser.value())) - RESTORE(CURRENT_ROUND_TAG, core::CPersistUtils::restore(CURRENT_ROUND_TAG, m_CurrentRound, traverser)) RESTORE(DEPENDENT_VARIABLE_TAG, @@ -1036,8 +1025,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FEATURE_SAMPLE_PROBABILITIES_TAG, core::CPersistUtils::restore(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, traverser)) - RESTORE(GAMMA_TAG, core::CPersistUtils::restore(GAMMA_TAG, m_Gamma, traverser)) - RESTORE(LAMBDA_TAG, core::CPersistUtils::restore(LAMBDA_TAG, m_Lambda, traverser)) RESTORE(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, core::CPersistUtils::restore(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, m_MaximumAttemptsToAddTree, traverser)) @@ -1060,6 +1047,12 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav m_NumberSplitsPerFeature, traverser)) RESTORE(NUMBER_THREADS_TAG, core::CPersistUtils::restore(NUMBER_THREADS_TAG, m_NumberThreads, traverser)) + RESTORE(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.fromString(traverser.value())) + RESTORE(REGULARIZATION_TAG, + core::CPersistUtils::restore(REGULARIZATION_TAG, m_Regularization, traverser)) + RESTORE(REGULARIZATION_OVERRIDE_TAG, + core::CPersistUtils::restore(REGULARIZATION_OVERRIDE_TAG, + m_RegularizationOverride, traverser)) RESTORE(ROWS_PER_FEATURE_TAG, core::CPersistUtils::restore(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, traverser)) RESTORE(TESTING_ROW_MASKS_TAG, @@ -1069,6 +1062,8 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav m_MaximumNumberTrees, traverser)) RESTORE(TRAINING_ROW_MASKS_TAG, core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, traverser)) + RESTORE(TRAINING_PROGRESS_TAG, + core::CPersistUtils::restore(TRAINING_PROGRESS_TAG, m_TrainingProgress, traverser)) RESTORE(BEST_FOREST_TAG, core::CPersistUtils::restore(BEST_FOREST_TAG, m_BestForest, traverser)) RESTORE(BEST_HYPERPARAMETERS_TAG, @@ -1079,10 +1074,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FEATURE_BAG_FRACTION_OVERRIDE_TAG, core::CPersistUtils::restore(FEATURE_BAG_FRACTION_OVERRIDE_TAG, m_FeatureBagFractionOverride, traverser)) - RESTORE(GAMMA_OVERRIDE_TAG, - core::CPersistUtils::restore(GAMMA_OVERRIDE_TAG, m_GammaOverride, traverser)) - RESTORE(LAMBDA_OVERRIDE_TAG, - core::CPersistUtils::restore(LAMBDA_OVERRIDE_TAG, m_LambdaOverride, traverser)) RESTORE(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, core::CPersistUtils::restore(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, m_MaximumNumberTreesOverride, traverser)) @@ -1091,6 +1082,18 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav return true; } +bool CBoostedTreeImpl::restoreLoss(CBoostedTree::TLossFunctionUPtr& loss, + core::CStateRestoreTraverser& traverser) { + const std::string& lossFunctionName{traverser.value()}; + if (lossFunctionName == CMse::NAME) { + loss = std::make_unique(); + return true; + } + LOG_ERROR(<< "Error restoring loss function. 
Unknown loss function type '" + << lossFunctionName << "'."); + return false; +} + std::size_t CBoostedTreeImpl::memoryUsage() const { std::size_t mem{core::CMemory::dynamicSize(m_Loss)}; mem += core::CMemory::dynamicSize(m_Encoder); diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index b5178b80be..99dd04242f 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -602,7 +602,7 @@ void CBoostedTreeTest::testCategoricalRegressors() { LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); - CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.13); CPPUNIT_ASSERT(modelRSquared > 0.9); } From 31796bdcad665d83161f1760637bbe59659064e0 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 14:06:53 +0100 Subject: [PATCH 02/23] Docs --- docs/CHANGELOG.asciidoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 3f1ca56001..629a9f01ff 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -36,6 +36,8 @@ For large data sets this change was observed to give a 10% to 20% decrease in train time. (See {ml-pull}622[#622].) * Upgrade Boost libraries to version 1.71. (See {ml-pull}638[#638].) +* Improve initialisation of boosted tree training. This generally enables us to +find lower loss models faster. (See {ml-pull}686[#686].) == {es} version 7.4.0 From d163dfd3157292049b7354f32e90e0417064c4f3 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 15:00:31 +0100 Subject: [PATCH 03/23] Typos --- lib/maths/CBoostedTreeFactory.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index ebb76ef141..037c14ff2f 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -283,13 +283,14 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) { void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDataFrame& frame) { // The strategy here is to: - // 1) Get percentile estimates of the gain in and sum curvature of the loss - // function at splits in a single tree, + // 1) Get percentile estimates of the gain in the loss function and its sum + // curvature from the splits selected in a single tree with regulizers + // zeroed, // 2) Use these to upper bound the size of gamma and lambda, that is find - // values we for which we expect to underfit the data, - // 3) Decrease each regularizer and look for turning point in the test loss, - // i.e. the point at which transition to overfit occurs. - // We'll search intervals in the vicinity of this values in the hyperparameter + // values for which we expect to underfit the data, + // 3) Decrease each regularizer and look for a turning point in the test + // loss, i.e. the point at which transition to overfit occurs. + // We'll search intervals in the vicinity of these values in the hyperparameter // optimisation loop. 
core::CPackedBitVector allTrainingRowsMask{m_TreeImpl->allTrainingRowsMask()}; From dcbe3feb3e9ac0fa36bdd07df4c1e0c0d3510cf0 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 15:04:03 +0100 Subject: [PATCH 04/23] Build fix --- include/maths/CBoostedTreeImpl.h | 133 ++++++++----------------------- lib/maths/CBoostedTreeImpl.cc | 65 +++++++++++++++ 2 files changed, 97 insertions(+), 101 deletions(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index d5d8201b47..d8cf8c4703 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -398,41 +398,38 @@ class MATHS_EXPORT CBoostedTreeImpl final { const CLeafNodeStatistics& parent, const CLeafNodeStatistics& sibling, core::CPackedBitVector rowMask) - : m_Id{id}, m_Regularization{sibling.m_Regularization}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, - m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { - - LOG_TRACE(<< "row mask = " << m_RowMask); - LOG_TRACE(<< "feature bag = " << core::CContainerPrinter::print(m_FeatureBag)); - - m_Gradients.resize(m_CandidateSplits.size()); - m_Curvatures.resize(m_CandidateSplits.size()); - m_MissingGradients.resize(m_CandidateSplits.size(), 0.0); - m_MissingCurvatures.resize(m_CandidateSplits.size(), 0.0); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; - m_Gradients[i].resize(numberSplits); - m_Curvatures[i].resize(numberSplits); - for (std::size_t j = 0; j < numberSplits; ++j) { - m_Gradients[i][j] = parent.m_Gradients[i][j] - - sibling.m_Gradients[i][j]; - m_Curvatures[i][j] = parent.m_Curvatures[i][j] - - sibling.m_Curvatures[i][j]; - } - m_MissingGradients[i] = parent.m_MissingGradients[i] - - sibling.m_MissingGradients[i]; - m_MissingCurvatures[i] = parent.m_MissingCurvatures[i] - - sibling.m_MissingCurvatures[i]; - } - - LOG_TRACE(<< "gradients = " << core::CContainerPrinter::print(m_Gradients)); - LOG_TRACE(<< "curvatures = " << core::CContainerPrinter::print(m_Curvatures)); - LOG_TRACE(<< "missing gradients = " - << core::CContainerPrinter::print(m_MissingGradients)); - LOG_TRACE(<< "missing curvatures = " - << core::CContainerPrinter::print(m_MissingCurvatures)); + : m_Id{id}, m_Regularization{sibling.m_Regularization}, + m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, + m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { + + LOG_TRACE(<< "row mask = " << m_RowMask); + LOG_TRACE(<< "feature bag = " << core::CContainerPrinter::print(m_FeatureBag)); + + m_Gradients.resize(m_CandidateSplits.size()); + m_Curvatures.resize(m_CandidateSplits.size()); + m_MissingGradients.resize(m_CandidateSplits.size(), 0.0); + m_MissingCurvatures.resize(m_CandidateSplits.size(), 0.0); + + for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { + std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; + m_Gradients[i].resize(numberSplits); + m_Curvatures[i].resize(numberSplits); + for (std::size_t j = 0; j < numberSplits; ++j) { + m_Gradients[i][j] = parent.m_Gradients[i][j] - sibling.m_Gradients[i][j]; + m_Curvatures[i][j] = parent.m_Curvatures[i][j] - sibling.m_Curvatures[i][j]; } + m_MissingGradients[i] = parent.m_MissingGradients[i] - + sibling.m_MissingGradients[i]; + m_MissingCurvatures[i] = parent.m_MissingCurvatures[i] - + sibling.m_MissingCurvatures[i]; + } + + LOG_TRACE(<< "gradients = " << core::CContainerPrinter::print(m_Gradients)); + LOG_TRACE(<< "curvatures = " 
<< core::CContainerPrinter::print(m_Curvatures)); + LOG_TRACE(<< "missing gradients = " << core::CContainerPrinter::print(m_MissingGradients)); + LOG_TRACE(<< "missing curvatures = " + << core::CContainerPrinter::print(m_MissingCurvatures)); +} CLeafNodeStatistics(const CLeafNodeStatistics&) = delete; @@ -647,73 +644,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { return *m_BestSplit; } - SSplitStatistics computeBestSplitStatistics() const { - - // We have two possible regularisation terms we'll use: - // 1. Tree size: gamma * "node count" - // 2. Sum square weights: lambda * sum{"leaf weight" ^ 2)} - - SSplitStatistics result{-INF, 0.0, m_FeatureBag.size(), INF, true}; - - for (auto i : m_FeatureBag) { - double g{std::accumulate(m_Gradients[i].begin(), m_Gradients[i].end(), 0.0) + - m_MissingGradients[i]}; - double h{std::accumulate(m_Curvatures[i].begin(), - m_Curvatures[i].end(), 0.0) + - m_MissingCurvatures[i]}; - double gl[]{m_MissingGradients[i], 0.0}; - double hl[]{m_MissingCurvatures[i], 0.0}; - - double maximumGain{-INF}; - double splitAt{-INF}; - bool assignMissingToLeft{true}; - - for (std::size_t j = 0; j + 1 < m_Gradients[i].size(); ++j) { - gl[ASSIGN_MISSING_TO_LEFT] += m_Gradients[i][j]; - hl[ASSIGN_MISSING_TO_LEFT] += m_Curvatures[i][j]; - gl[ASSIGN_MISSING_TO_RIGHT] += m_Gradients[i][j]; - hl[ASSIGN_MISSING_TO_RIGHT] += m_Curvatures[i][j]; - - double gain[]{ - CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / - (hl[ASSIGN_MISSING_TO_LEFT] + m_Regularization.lambda()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / - (h - hl[ASSIGN_MISSING_TO_LEFT] + - m_Regularization.lambda()), - CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / - (hl[ASSIGN_MISSING_TO_RIGHT] + m_Regularization.lambda()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / - (h - hl[ASSIGN_MISSING_TO_RIGHT] + - m_Regularization.lambda())}; - - if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { - maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; - splitAt = m_CandidateSplits[i][j]; - assignMissingToLeft = true; - } - if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) { - maximumGain = gain[ASSIGN_MISSING_TO_RIGHT]; - splitAt = m_CandidateSplits[i][j]; - assignMissingToLeft = false; - } - } - - double gain{0.5 * (maximumGain - - CTools::pow2(g) / (h + m_Regularization.lambda())) - - m_Regularization.gamma()}; - - SSplitStatistics candidate{gain, h, i, splitAt, assignMissingToLeft}; - LOG_TRACE(<< "candidate split: " << candidate.print()); - - if (candidate > result) { - result = candidate; - } - } - - LOG_TRACE(<< "best split: " << result.print()); - - return result; - } + SSplitStatistics computeBestSplitStatistics() const; private: std::size_t m_Id; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 4e618beabd..ae656a6d8b 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -112,6 +112,71 @@ void CBoostedTreeImpl::CLeafNodeStatistics::addRowDerivatives(const CEncodedData } } +CBoostedTreeImpl::CLeafNodeStatistics::SSplitStatistics +CBoostedTreeImpl::CLeafNodeStatistics::computeBestSplitStatistics() const { + + // We have two possible regularisation terms we'll use: + // 1. Tree size: gamma * "node count" + // 2. 
Sum square weights: lambda * sum{"leaf weight" ^ 2)} + + SSplitStatistics result{-INF, 0.0, m_FeatureBag.size(), INF, true}; + + for (auto i : m_FeatureBag) { + double g{std::accumulate(m_Gradients[i].begin(), m_Gradients[i].end(), 0.0) + + m_MissingGradients[i]}; + double h{std::accumulate(m_Curvatures[i].begin(), m_Curvatures[i].end(), 0.0) + + m_MissingCurvatures[i]}; + double gl[]{m_MissingGradients[i], 0.0}; + double hl[]{m_MissingCurvatures[i], 0.0}; + + double maximumGain{-INF}; + double splitAt{-INF}; + bool assignMissingToLeft{true}; + + for (std::size_t j = 0; j + 1 < m_Gradients[i].size(); ++j) { + gl[ASSIGN_MISSING_TO_LEFT] += m_Gradients[i][j]; + hl[ASSIGN_MISSING_TO_LEFT] += m_Curvatures[i][j]; + gl[ASSIGN_MISSING_TO_RIGHT] += m_Gradients[i][j]; + hl[ASSIGN_MISSING_TO_RIGHT] += m_Curvatures[i][j]; + + double gain[]{ + CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / + (hl[ASSIGN_MISSING_TO_LEFT] + m_Regularization.lambda()) + + CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / + (h - hl[ASSIGN_MISSING_TO_LEFT] + m_Regularization.lambda()), + CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / + (hl[ASSIGN_MISSING_TO_RIGHT] + m_Regularization.lambda()) + + CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / + (h - hl[ASSIGN_MISSING_TO_RIGHT] + m_Regularization.lambda())}; + + if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { + maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; + splitAt = m_CandidateSplits[i][j]; + assignMissingToLeft = true; + } + if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) { + maximumGain = gain[ASSIGN_MISSING_TO_RIGHT]; + splitAt = m_CandidateSplits[i][j]; + assignMissingToLeft = false; + } + } + + double gain{0.5 * (maximumGain - CTools::pow2(g) / (h + m_Regularization.lambda())) - + m_Regularization.gamma()}; + + SSplitStatistics candidate{gain, h, i, splitAt, assignMissingToLeft}; + LOG_TRACE(<< "candidate split: " << candidate.print()); + + if (candidate > result) { + result = candidate; + } + } + + LOG_TRACE(<< "best split: " << result.print()); + + return result; +} + CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, CBoostedTree::TLossFunctionUPtr loss) : m_NumberThreads{numberThreads}, m_Loss{std::move(loss)}, m_BestHyperparameters{m_Regularization, m_Eta, m_EtaGrowthRatePerTree, m_FeatureBagFraction} { From 841ff03c950b224cbdbf1d34d56fabc3124b58df Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 15:04:56 +0100 Subject: [PATCH 05/23] Formatting --- include/maths/CBoostedTreeImpl.h | 65 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index d8cf8c4703..c0becaf137 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -398,38 +398,41 @@ class MATHS_EXPORT CBoostedTreeImpl final { const CLeafNodeStatistics& parent, const CLeafNodeStatistics& sibling, core::CPackedBitVector rowMask) - : m_Id{id}, m_Regularization{sibling.m_Regularization}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, - m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { - - LOG_TRACE(<< "row mask = " << m_RowMask); - LOG_TRACE(<< "feature bag = " << core::CContainerPrinter::print(m_FeatureBag)); - - m_Gradients.resize(m_CandidateSplits.size()); - m_Curvatures.resize(m_CandidateSplits.size()); - m_MissingGradients.resize(m_CandidateSplits.size(), 0.0); - m_MissingCurvatures.resize(m_CandidateSplits.size(), 0.0); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - std::size_t 
numberSplits{m_CandidateSplits[i].size() + 1}; - m_Gradients[i].resize(numberSplits); - m_Curvatures[i].resize(numberSplits); - for (std::size_t j = 0; j < numberSplits; ++j) { - m_Gradients[i][j] = parent.m_Gradients[i][j] - sibling.m_Gradients[i][j]; - m_Curvatures[i][j] = parent.m_Curvatures[i][j] - sibling.m_Curvatures[i][j]; + : m_Id{id}, m_Regularization{sibling.m_Regularization}, + m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, + m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { + + LOG_TRACE(<< "row mask = " << m_RowMask); + LOG_TRACE(<< "feature bag = " << core::CContainerPrinter::print(m_FeatureBag)); + + m_Gradients.resize(m_CandidateSplits.size()); + m_Curvatures.resize(m_CandidateSplits.size()); + m_MissingGradients.resize(m_CandidateSplits.size(), 0.0); + m_MissingCurvatures.resize(m_CandidateSplits.size(), 0.0); + + for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { + std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; + m_Gradients[i].resize(numberSplits); + m_Curvatures[i].resize(numberSplits); + for (std::size_t j = 0; j < numberSplits; ++j) { + m_Gradients[i][j] = parent.m_Gradients[i][j] - + sibling.m_Gradients[i][j]; + m_Curvatures[i][j] = parent.m_Curvatures[i][j] - + sibling.m_Curvatures[i][j]; + } + m_MissingGradients[i] = parent.m_MissingGradients[i] - + sibling.m_MissingGradients[i]; + m_MissingCurvatures[i] = parent.m_MissingCurvatures[i] - + sibling.m_MissingCurvatures[i]; + } + + LOG_TRACE(<< "gradients = " << core::CContainerPrinter::print(m_Gradients)); + LOG_TRACE(<< "curvatures = " << core::CContainerPrinter::print(m_Curvatures)); + LOG_TRACE(<< "missing gradients = " + << core::CContainerPrinter::print(m_MissingGradients)); + LOG_TRACE(<< "missing curvatures = " + << core::CContainerPrinter::print(m_MissingCurvatures)); } - m_MissingGradients[i] = parent.m_MissingGradients[i] - - sibling.m_MissingGradients[i]; - m_MissingCurvatures[i] = parent.m_MissingCurvatures[i] - - sibling.m_MissingCurvatures[i]; - } - - LOG_TRACE(<< "gradients = " << core::CContainerPrinter::print(m_Gradients)); - LOG_TRACE(<< "curvatures = " << core::CContainerPrinter::print(m_Curvatures)); - LOG_TRACE(<< "missing gradients = " << core::CContainerPrinter::print(m_MissingGradients)); - LOG_TRACE(<< "missing curvatures = " - << core::CContainerPrinter::print(m_MissingCurvatures)); -} CLeafNodeStatistics(const CLeafNodeStatistics&) = delete; From 7a57883e05fc55dcf878201b6be5ebde4cc13a4a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 15:10:45 +0100 Subject: [PATCH 06/23] Remove depth: this isn't needed yet --- include/maths/CBoostedTreeImpl.h | 14 +++++--------- lib/maths/CBoostedTreeImpl.cc | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index c0becaf137..0c11fd3f0b 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -379,11 +379,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { const CDataFrameCategoryEncoder& encoder, const TRegularization& regularization, const TDoubleVecVec& candidateSplits, - std::size_t depth, TSizeVec featureBag, core::CPackedBitVector rowMask) - : m_Id{id}, m_Regularization{regularization}, - m_CandidateSplits{candidateSplits}, m_Depth{depth}, + : m_Id{id}, m_Regularization{regularization}, m_CandidateSplits{candidateSplits}, m_FeatureBag{std::move(featureBag)}, m_RowMask{std::move(rowMask)} { std::sort(m_FeatureBag.begin(), m_FeatureBag.end()); @@ -399,7 +397,7 
@@ class MATHS_EXPORT CBoostedTreeImpl final { const CLeafNodeStatistics& sibling, core::CPackedBitVector rowMask) : m_Id{id}, m_Regularization{sibling.m_Regularization}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_Depth{sibling.m_Depth}, + m_CandidateSplits{sibling.m_CandidateSplits}, m_FeatureBag{sibling.m_FeatureBag}, m_RowMask{std::move(rowMask)} { LOG_TRACE(<< "row mask = " << m_RowMask); @@ -458,8 +456,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { if (leftChildHasFewerRows) { auto leftChild = std::make_shared( leftChildId, numberThreads, frame, encoder, regularization, - candidateSplits, m_Depth + 1, std::move(featureBag), - std::move(leftChildRowMask)); + candidateSplits, std::move(featureBag), std::move(leftChildRowMask)); auto rightChild = std::make_shared( rightChildId, *this, *leftChild, std::move(rightChildRowMask)); @@ -467,8 +464,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { } auto rightChild = std::make_shared( - rightChildId, numberThreads, frame, encoder, regularization, candidateSplits, - m_Depth + 1, std::move(featureBag), std::move(rightChildRowMask)); + rightChildId, numberThreads, frame, encoder, regularization, + candidateSplits, std::move(featureBag), std::move(rightChildRowMask)); auto leftChild = std::make_shared( leftChildId, *this, *rightChild, std::move(leftChildRowMask)); @@ -653,7 +650,6 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t m_Id; const TRegularization& m_Regularization; const TDoubleVecVec& m_CandidateSplits; - std::size_t m_Depth; TSizeVec m_FeatureBag; core::CPackedBitVector m_RowMask; TDoubleVecVec m_Gradients; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index ae656a6d8b..9f74d9ae05 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -570,7 +570,7 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtrQueue leaves; leaves.push(std::make_shared( 0 /*root*/, m_NumberThreads, frame, *m_Encoder, m_Regularization, - candidateSplits, 0 /*depth*/, this->featureBag(), trainingRowMask)); + candidateSplits, this->featureBag(), trainingRowMask)); // We update local variables because the callback can be expensive if it // requires accessing atomics. 
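[Editor's note] For orientation between patches: the score maximised in
CLeafNodeStatistics::computeBestSplitStatistics above is the standard gradient
boosting split gain, in which lambda damps the leaf weights and gamma charges a
flat penalty per additional node. A minimal, self-contained restatement of the
formula, assuming g and h are the node's summed loss gradient and curvature and
gl, hl the portions routed to the left child (the function name is illustrative):

    double splitGain(double gl, double hl, double g, double h,
                     double lambda, double gamma) {
        auto pow2 = [](double x) { return x * x; };
        double gr{g - gl};
        double hr{h - hl};
        // Loss reduction from splitting one leaf into two, less the flat
        // per-split size penalty.
        return 0.5 * (pow2(gl) / (hl + lambda) + pow2(gr) / (hr + lambda) -
                      pow2(g) / (h + lambda)) -
               gamma;
    }

This is also why the per-node gain and curvature returned by
estimateTreeGainAndCurvature give natural units for gamma and lambda
respectively when the factory chooses its search intervals.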
From cd71cd6a62794ea5dfd89e052b9afa658dce0df6 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 23 Sep 2019 18:28:39 +0100 Subject: [PATCH 07/23] Fix tests --- lib/api/unittest/CDataFrameAnalyzerTest.cc | 35 ++++++++-------------- lib/maths/CBoostedTreeImpl.cc | 6 ++-- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 06c133982b..7c82fdbe5f 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -653,7 +653,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { LOG_DEBUG(<< "time to train = " << core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) << "ms"); CPPUNIT_ASSERT(core::CProgramCounters::counter( - counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 2300000); + counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 2600000); CPPUNIT_ASSERT(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1050000); CPPUNIT_ASSERT(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); CPPUNIT_ASSERT(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); @@ -1176,34 +1176,25 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti rapidjson::Document expectedResults{treeToJsonDocument(*expectedTree)}; const auto& expectedHyperparameters = expectedResults["best_hyperparameters"]; + const auto& expectedRegularizationHyperparameters = + expectedHyperparameters["hyperparam_regularization"]; rapidjson::Document actualResults{treeToJsonDocument(*actualTree)}; const auto& actualHyperparameters = actualResults["best_hyperparameters"]; + const auto& actualRegularizationHyperparameters = + actualHyperparameters["hyperparam_regularization"]; - auto assertDoublesEqual = [&expectedHyperparameters, - &actualHyperparameters](std::string key) { + for (const auto& key : {"hyperparam_eta", "hyperparam_eta_growth_rate_per_tree", + "hyperparam_feature_bag_fraction"}) { double expected{std::stod(expectedHyperparameters[key].GetString())}; double actual{std::stod(actualHyperparameters[key].GetString())}; CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); - }; - auto assertDoublesArrayEqual = [&expectedHyperparameters, - &actualHyperparameters](std::string key) { - TDoubleVec expectedVector; - core::CPersistUtils::fromString(expectedHyperparameters[key].GetString(), expectedVector); - TDoubleVec actualVector; - core::CPersistUtils::fromString(actualHyperparameters[key].GetString(), actualVector); - CPPUNIT_ASSERT_EQUAL(expectedVector.size(), actualVector.size()); - for (size_t i = 0; i < expectedVector.size(); i++) { - CPPUNIT_ASSERT_DOUBLES_EQUAL(expectedVector[i], actualVector[i], - 1e-4 * expectedVector[i]); - } - }; - assertDoublesEqual("hyperparam_lambda"); - assertDoublesEqual("hyperparam_gamma"); - assertDoublesEqual("hyperparam_eta"); - assertDoublesEqual("hyperparam_eta_growth_rate_per_tree"); - assertDoublesEqual("hyperparam_feature_bag_fraction"); - assertDoublesArrayEqual("hyperparam_feature_sample_probabilities"); + } + for (const auto& key : {"regularization_gamma", "regularization_lambda"}) { + double expected{std::stod(expectedRegularizationHyperparameters[key].GetString())}; + double actual{std::stod(actualRegularizationHyperparameters[key].GetString())}; + CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, 1e-4 * expected); + } } maths::CBoostedTreeFactory::TBoostedTreeUPtr diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc 
index 9f74d9ae05..5a46ae07b2 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -924,10 +924,10 @@ const std::string REGULARIZATION_OVERRIDE_TAG{"regularization_override"};
 const std::string ROWS_PER_FEATURE_TAG{"rows_per_feature"};
 const std::string TESTING_ROW_MASKS_TAG{"testing_row_masks"};
 const std::string TRAINING_ROW_MASKS_TAG{"training_row_masks"};
-const std::string TRAINING_PROGRESS_TAG{"training_row_masks"};
+const std::string TRAINING_PROGRESS_TAG{"training_progress"};

-const std::string REGULARIZATION_GAMMA_TAG{"gamma"};
-const std::string REGULARIZATION_LAMBDA_TAG{"lambda"};
+const std::string REGULARIZATION_GAMMA_TAG{"regularization_gamma"};
+const std::string REGULARIZATION_LAMBDA_TAG{"regularization_lambda"};

 const std::string HYPERPARAM_ETA_TAG{"hyperparam_eta"};
 const std::string HYPERPARAM_ETA_GROWTH_RATE_PER_TREE_TAG{"hyperparam_eta_growth_rate_per_tree"};

From e42208c2eb880fa90fcc29467d1afdef348cf3dc Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 24 Sep 2019 12:11:19 +0100
Subject: [PATCH 08/23] Create tree trainer in buildFor for both restore and creation from scratch

---
 include/api/CDataFrameBoostedTreeRunner.h |  4 +-
 include/maths/CBoostedTreeFactory.h       | 15 ++---
 lib/api/CDataFrameBoostedTreeRunner.cc    | 17 +++--
 lib/maths/CBoostedTreeFactory.cc          | 77 ++++++++++++-----------
 lib/maths/CBoostedTreeImpl.cc             |  2 +
 lib/maths/unittest/CBoostedTreeTest.cc    | 13 ++--
 6 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h
index 7924ea44f6..13b0a6863f 100644
--- a/include/api/CDataFrameBoostedTreeRunner.h
+++ b/include/api/CDataFrameBoostedTreeRunner.h
@@ -47,6 +47,7 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu
 private:
     using TBoostedTreeUPtr = std::unique_ptr;
     using TBoostedTreeFactoryUPtr = std::unique_ptr;
+    using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr;
     using TMemoryEstimator = std::function;

 private:
@@ -58,7 +59,8 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu
     TMemoryEstimator memoryEstimator();

     bool restoreBoostedTree(core::CDataFrame& frame,
-                            CDataFrameAnalysisSpecification::TDataSearcherUPtr& restoreSearcher);
+                            std::size_t dependentVariableColumn,
+                            TDataSearcherUPtr& restoreSearcher);

 private:
     // Note custom config is written directly to the factory object.
diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h
index 3dd579a4a8..fa33e8324f 100644
--- a/include/maths/CBoostedTreeFactory.h
+++ b/include/maths/CBoostedTreeFactory.h
@@ -44,12 +44,9 @@ class MATHS_EXPORT CBoostedTreeFactory final {
                                                       TLossFunctionUPtr loss);

     //! Construct a boosted tree object from its serialized version.
-    static TBoostedTreeUPtr
-    constructFromString(std::istream& jsonStringStream,
-                        core::CDataFrame& frame,
-                        TProgressCallback recordProgress = noopRecordProgress,
-                        TMemoryUsageCallback recordMemoryUsage = noopRecordMemoryUsage,
-                        TTrainingStateCallback recordTrainingState = noopRecordTrainingState);
+    //!
+    //! \warning Throws a runtime error on failure to restore.
+ static CBoostedTreeFactory constructFromString(std::istream& jsonStringStream); ~CBoostedTreeFactory(); CBoostedTreeFactory(CBoostedTreeFactory&) = delete; @@ -108,7 +105,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { static const std::size_t MAXIMUM_NUMBER_TREES; private: - CBoostedTreeFactory(std::size_t numberThreads, TLossFunctionUPtr loss); + CBoostedTreeFactory(bool restored, std::size_t numberThreads, TLossFunctionUPtr loss); //! Compute the row masks for the missing values for each feature. void initializeMissingFeatureMasks(const core::CDataFrame& frame) const; @@ -152,6 +149,9 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Setup monitoring for training progress. void setupTrainingProgressMonitoring(); + //! Refresh progress monitoring after restoring from saved training state. + void restoreTrainingProgressMonitoring(); + static void noopRecordProgress(double); static void noopRecordMemoryUsage(std::int64_t); static void noopRecordTrainingState(CDataFrameRegressionModel::TPersistFunc); @@ -159,6 +159,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { private: TOptionalDouble m_MinimumFrequencyToOneHotEncode; TOptionalSize m_BayesianOptimisationRestarts; + bool m_Restored = false; TBoostedTreeImplUPtr m_TreeImpl; TVector m_GammaSearchInterval; TVector m_LambdaSearchInterval; diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 0023c99d04..f4267519ad 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -191,7 +191,8 @@ void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, auto restoreSearcher{this->spec().restoreSearcher()}; bool treeRestored{false}; if (restoreSearcher != nullptr) { - treeRestored = this->restoreBoostedTree(frame, restoreSearcher); + treeRestored = this->restoreBoostedTree( + frame, dependentVariableColumn - featureNames.begin(), restoreSearcher); } if (treeRestored == false) { @@ -204,9 +205,10 @@ void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) = watch.stop(); } -bool CDataFrameBoostedTreeRunner::restoreBoostedTree( - core::CDataFrame& frame, - CDataFrameAnalysisSpecification::TDataSearcherUPtr& restoreSearcher) { // Restore from Elasticsearch compressed data +bool CDataFrameBoostedTreeRunner::restoreBoostedTree(core::CDataFrame& frame, + std::size_t dependentVariableColumn, + TDataSearcherUPtr& restoreSearcher) { + // Restore from Elasticsearch compressed data try { core::CStateDecompressor decompressor(*restoreSearcher); decompressor.setStateRestoreSearch( @@ -228,8 +230,11 @@ bool CDataFrameBoostedTreeRunner::restoreBoostedTree( return false; } - m_BoostedTree = maths::CBoostedTreeFactory::constructFromString( - *inputStream, frame, progressRecorder(), memoryEstimator(), statePersister()); + m_BoostedTree = maths::CBoostedTreeFactory::constructFromString(*inputStream) + .progressCallback(this->progressRecorder()) + .trainingStateCallback(this->statePersister()) + .memoryUsageCallback(this->memoryEstimator()) + .buildFor(frame, dependentVariableColumn); } catch (std::exception& e) { LOG_ERROR(<< "Failed to restore state! 
" << e.what()); return false; diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 037c14ff2f..25c8cb7343 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -43,25 +43,38 @@ const double MAIN_TRAINING_LOOP_TREE_SIZE_MULTIPLIER{10.0}; CBoostedTreeFactory::TBoostedTreeUPtr CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVariable) { - m_TreeImpl->m_DependentVariable = dependentVariable; + if (m_Restored) { - this->setupTrainingProgressMonitoring(); + if (dependentVariable != m_TreeImpl->m_DependentVariable) { + HANDLE_FATAL(<< "Internal error: expected dependent variable " + << m_TreeImpl->m_DependentVariable << " got " << dependentVariable); + } + + this->restoreTrainingProgressMonitoring(); + + frame.resizeColumns(m_TreeImpl->m_NumberThreads, + frame.numberColumns() + this->numberExtraColumnsForTrain()); + + } else { + + m_TreeImpl->m_DependentVariable = dependentVariable; - this->initializeMissingFeatureMasks(frame); - std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks) = - this->crossValidationRowMasks(); + this->setupTrainingProgressMonitoring(); - // We store the gradient and curvature of the loss function and the predicted - // value for the dependent variable of the regression. - frame.resizeColumns(m_TreeImpl->m_NumberThreads, - frame.numberColumns() + this->numberExtraColumnsForTrain()); + this->initializeMissingFeatureMasks(frame); + std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks) = + this->crossValidationRowMasks(); - this->selectFeaturesAndEncodeCategories(frame); - this->determineFeatureDataTypes(frame); + frame.resizeColumns(m_TreeImpl->m_NumberThreads, + frame.numberColumns() + this->numberExtraColumnsForTrain()); - if (this->initializeFeatureSampleDistribution()) { - this->initializeHyperparameters(frame); - this->initializeHyperparameterOptimisation(); + this->selectFeaturesAndEncodeCategories(frame); + this->determineFeatureDataTypes(frame); + + if (this->initializeFeatureSampleDistribution()) { + this->initializeHyperparameters(frame); + this->initializeHyperparameterOptimisation(); + } } // TODO can only use factory to create one object since this is moved. This seems trappy. 
@@ -465,37 +478,26 @@ CBoostedTreeFactory::candidateRegularizerSearchInterval(core::CDataFrame& frame, CBoostedTreeFactory CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, TLossFunctionUPtr loss) { - return {numberThreads, std::move(loss)}; + return {false, numberThreads, std::move(loss)}; } -CBoostedTreeFactory::TBoostedTreeUPtr -CBoostedTreeFactory::constructFromString(std::istream& jsonStringStream, - core::CDataFrame& frame, - TProgressCallback recordProgress, - TMemoryUsageCallback recordMemoryUsage, - TTrainingStateCallback recordTrainingState) { +CBoostedTreeFactory CBoostedTreeFactory::constructFromString(std::istream& jsonStringStream) { + CBoostedTreeFactory result{true, 1, nullptr}; try { - TBoostedTreeUPtr treePtr{new CBoostedTree{ - frame, std::move(recordProgress), std::move(recordMemoryUsage), - std::move(recordTrainingState), TBoostedTreeImplUPtr{new CBoostedTreeImpl{}}}}; core::CJsonStateRestoreTraverser traverser(jsonStringStream); - if (treePtr->acceptRestoreTraverser(traverser) == false || traverser.haveBadState()) { + if (result.m_TreeImpl->acceptRestoreTraverser(traverser) == false || + traverser.haveBadState()) { throw std::runtime_error{"failed to restore boosted tree"}; } - treePtr->m_Impl->m_TrainingProgress.attach(recordProgress); - treePtr->m_Impl->m_TrainingProgress.resumeRestored(); - frame.resizeColumns(treePtr->m_Impl->m_NumberThreads, - frame.numberColumns() + - treePtr->m_Impl->numberExtraColumnsForTrain()); - return treePtr; } catch (const std::exception& e) { - HANDLE_FATAL(<< "Input error: '" << e.what() << "'. Check logs for more details."); + throw std::runtime_error{std::string{"Input error: '"} + e.what() + "'"}; } - return nullptr; + return result; } -CBoostedTreeFactory::CBoostedTreeFactory(std::size_t numberThreads, TLossFunctionUPtr loss) - : m_TreeImpl{std::make_unique(numberThreads, std::move(loss))}, +CBoostedTreeFactory::CBoostedTreeFactory(bool restored, std::size_t numberThreads, TLossFunctionUPtr loss) + : m_Restored{restored}, m_TreeImpl{std::make_unique(numberThreads, + std::move(loss))}, m_GammaSearchInterval{0.0}, m_LambdaSearchInterval{0.0} { } @@ -660,6 +662,11 @@ void CBoostedTreeFactory::setupTrainingProgressMonitoring() { m_TreeImpl->m_TrainingProgress = core::CLoopProgress{totalNumberSteps, m_RecordProgress}; } +void CBoostedTreeFactory::restoreTrainingProgressMonitoring() { + m_TreeImpl->m_TrainingProgress.attach(m_RecordProgress); + m_TreeImpl->m_TrainingProgress.resumeRestored(); +} + void CBoostedTreeFactory::noopRecordTrainingState(std::function) { } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 5a46ae07b2..b564d8f613 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -302,6 +302,8 @@ std::size_t CBoostedTreeImpl::columnHoldingDependentVariable() const { } std::size_t CBoostedTreeImpl::numberExtraColumnsForTrain() { + // We store the gradient and curvature of the loss function and the predicted + // value for the dependent variable of the regression in the data frame. 
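    // [Editor's note] That is, one extra slot per row for each of the
    // prediction, the loss gradient and the loss curvature, which is what the
    // return value below counts.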
return 3; } diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 99dd04242f..13c9dcc543 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -908,7 +908,7 @@ void CBoostedTreeTest::testPersistRestore() { } // restore auto boostedTree = - maths::CBoostedTreeFactory::constructFromString(persistOnceSStream, *frame); + maths::CBoostedTreeFactory::constructFromString(persistOnceSStream).buildFor(*frame, cols - 1); { core::CJsonStatePersistInserter inserter(persistTwiceSStream); boostedTree->acceptPersistInserter(inserter); @@ -965,8 +965,8 @@ void CBoostedTreeTest::testRestoreErrorHandling() { bool throwsExceptions{false}; try { - auto boostedTree = maths::CBoostedTreeFactory::constructFromString( - errorInBayesianOptimisationState, *frame); + auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInBayesianOptimisationState) + .buildFor(*frame, 2); } catch (const std::exception& e) { LOG_DEBUG(<< "got = " << e.what()); throwsExceptions = true; @@ -1004,11 +1004,8 @@ void CBoostedTreeTest::testRestoreErrorHandling() { throwsExceptions = false; try { - auto boostedTree = maths::CBoostedTreeFactory::constructFromString( - errorInBoostedTreeImplState, *frame, - ml::maths::CBoostedTreeFactory::TProgressCallback(), - ml::maths::CBoostedTreeFactory::TMemoryUsageCallback(), - ml::maths::CBoostedTreeFactory::TTrainingStateCallback()); + auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInBoostedTreeImplState) + .buildFor(*frame, 2); } catch (const std::exception& e) { LOG_DEBUG(<< "got = " << e.what()); throwsExceptions = true; From 79f3b433cc9085a4e776a08bd8a1ca7f69bb96c9 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 12:18:15 +0100 Subject: [PATCH 09/23] Better naming plus add explanation of persist/restore strategy for loop progress --- include/core/CLoopProgress.h | 2 +- lib/core/CLoopProgress.cc | 5 ++++- lib/maths/CBoostedTreeFactory.cc | 2 +- lib/maths/CBoostedTreeImpl.cc | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/core/CLoopProgress.h b/include/core/CLoopProgress.h index d37de8c3bb..4d551e7707 100644 --- a/include/core/CLoopProgress.h +++ b/include/core/CLoopProgress.h @@ -57,7 +57,7 @@ class CORE_EXPORT CLoopProgress { double scale = 1.0); //! Attach a new progress monitor callback. - void attach(const TProgressCallback& recordProgress); + void progressCallback(const TProgressCallback& recordProgress); //! Increment the progress by \p i. void increment(std::size_t i = 1); diff --git a/lib/core/CLoopProgress.cc b/lib/core/CLoopProgress.cc index 608892f754..679dd9d6f2 100644 --- a/lib/core/CLoopProgress.cc +++ b/lib/core/CLoopProgress.cc @@ -35,7 +35,7 @@ CLoopProgress::CLoopProgress(std::size_t size, const TProgressCallback& recordPr m_StepProgress{scale / static_cast(m_Steps)}, m_RecordProgress{recordProgress} { } -void CLoopProgress::attach(const TProgressCallback& recordProgress) { +void CLoopProgress::progressCallback(const TProgressCallback& recordProgress) { m_RecordProgress = recordProgress; } @@ -52,6 +52,7 @@ void CLoopProgress::increment(std::size_t i) { } void CLoopProgress::resumeRestored() { + // This outputs progress and updates m_LastProgress to the correct value. 
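    // [Editor's note] Because m_LastProgress is deliberately not persisted (see
    // the acceptPersistInserter comment below), the restored object has recorded
    // nothing yet, so increment(0) reports all progress made before the restart
    // to the newly attached callback.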
this->increment(0);
 }

@@ -70,6 +71,8 @@ void CLoopProgress::acceptPersistInserter(CStatePersistInserter& inserter) const
     inserter.insertValue(CURRENT_STEP_PROGRESS_TAG, m_StepProgress,
                          core::CIEEE754::E_DoublePrecision);
     inserter.insertValue(LOOP_POS_TAG, m_Pos);
+    // m_LastProgress is not persisted because, immediately after restoring, we
+    // will not yet have recorded any progress.
 }

 bool CLoopProgress::acceptRestoreTraverser(CStateRestoreTraverser& traverser) {
diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc
index 25c8cb7343..c78a95bd75 100644
--- a/lib/maths/CBoostedTreeFactory.cc
+++ b/lib/maths/CBoostedTreeFactory.cc
@@ -663,7 +663,7 @@ void CBoostedTreeFactory::setupTrainingProgressMonitoring() {
 }

 void CBoostedTreeFactory::restoreTrainingProgressMonitoring() {
-    m_TreeImpl->m_TrainingProgress.attach(m_RecordProgress);
+    m_TreeImpl->m_TrainingProgress.progressCallback(m_RecordProgress);
     m_TreeImpl->m_TrainingProgress.resumeRestored();
 }

diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc
index b564d8f613..42c1be3bdb 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -201,7 +201,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame,

     LOG_TRACE(<< "Main training loop...");

-    m_TrainingProgress.attach(recordProgress);
+    m_TrainingProgress.progressCallback(recordProgress);

     std::uint64_t lastMemoryUsage(this->memoryUsage());
     recordMemoryUsage(lastMemoryUsage);

From aeae71e0cdd8d13237f71d22d1ba24f221e71fb0 Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 24 Sep 2019 12:27:52 +0100
Subject: [PATCH 10/23] Improve progress related function names

---
 include/maths/CBoostedTreeFactory.h | 4 ++--
 lib/maths/CBoostedTreeFactory.cc    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h
index fa33e8324f..570684ee14 100644
--- a/include/maths/CBoostedTreeFactory.h
+++ b/include/maths/CBoostedTreeFactory.h
@@ -147,10 +147,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
     std::size_t numberHyperparameterTuningRounds() const;

     //! Setup monitoring for training progress.
-    void setupTrainingProgressMonitoring();
+    void initializeTrainingProgressMonitoring();

     //! Refresh progress monitoring after restoring from saved training state.
- void restoreTrainingProgressMonitoring(); + void resumeRestoredTrainingProgressMonitoring(); static void noopRecordProgress(double); static void noopRecordMemoryUsage(std::int64_t); diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index c78a95bd75..7b4002d402 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -50,7 +50,7 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari << m_TreeImpl->m_DependentVariable << " got " << dependentVariable); } - this->restoreTrainingProgressMonitoring(); + this->resumeRestoredTrainingProgressMonitoring(); frame.resizeColumns(m_TreeImpl->m_NumberThreads, frame.numberColumns() + this->numberExtraColumnsForTrain()); @@ -59,7 +59,7 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari m_TreeImpl->m_DependentVariable = dependentVariable; - this->setupTrainingProgressMonitoring(); + this->initializeTrainingProgressMonitoring(); this->initializeMissingFeatureMasks(frame); std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks) = @@ -638,7 +638,7 @@ std::size_t CBoostedTreeFactory::numberExtraColumnsForTrain() const { return m_TreeImpl->numberExtraColumnsForTrain(); } -void CBoostedTreeFactory::setupTrainingProgressMonitoring() { +void CBoostedTreeFactory::initializeTrainingProgressMonitoring() { // The base unit is the cost of training on one fold. // @@ -662,7 +662,7 @@ void CBoostedTreeFactory::setupTrainingProgressMonitoring() { m_TreeImpl->m_TrainingProgress = core::CLoopProgress{totalNumberSteps, m_RecordProgress}; } -void CBoostedTreeFactory::restoreTrainingProgressMonitoring() { +void CBoostedTreeFactory::resumeRestoredTrainingProgressMonitoring() { m_TreeImpl->m_TrainingProgress.progressCallback(m_RecordProgress); m_TreeImpl->m_TrainingProgress.resumeRestored(); } From 7111620b7fb8a677758c81af6f8a889a1a671a98 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 12:30:36 +0100 Subject: [PATCH 11/23] Improve logic readability --- lib/maths/CBoostedTreeFactory.cc | 61 ++++++++++++++++---------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 7b4002d402..3936f9ff75 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -313,37 +313,36 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa std::tie(gainPerNode, totalCurvaturePerNode) = this->estimateTreeGainAndCurvature(frame, allTrainingRowsMask); - if (gainPerNode > 0.0 && m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { - - TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; - fallbackInterval *= m_TreeImpl->m_Eta; - auto interval = this->candidateRegularizerSearchInterval( - frame, allTrainingRowsMask, [this, gainPerNode](double scale) { - m_TreeImpl->m_Regularization.gamma(scale * gainPerNode); - }); - m_GammaSearchInterval = interval.value_or(fallbackInterval) * gainPerNode; - LOG_TRACE(<< "gamma search interval = [" - << m_GammaSearchInterval.toDelimited() << "]"); - - } else if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { - m_TreeImpl->m_RegularizationOverride.gamma(0.0); - } - - if (totalCurvaturePerNode > 0.0 && - m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { - - TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; - 
m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)); - auto interval = this->candidateRegularizerSearchInterval( - frame, allTrainingRowsMask, [this, totalCurvaturePerNode](double scale) { - m_TreeImpl->m_Regularization.lambda(scale * totalCurvaturePerNode); - }); - m_LambdaSearchInterval = interval.value_or(fallbackInterval) * totalCurvaturePerNode; - LOG_TRACE(<< "lambda search interval = [" - << m_LambdaSearchInterval.toDelimited() << "]"); - - } else if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { - m_TreeImpl->m_RegularizationOverride.lambda(0.0); + if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { + if (gainPerNode > 0.0) { + TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + fallbackInterval *= m_TreeImpl->m_Eta; + auto interval = this->candidateRegularizerSearchInterval( + frame, allTrainingRowsMask, [this, gainPerNode](double scale) { + m_TreeImpl->m_Regularization.gamma(scale * gainPerNode); + }); + m_GammaSearchInterval = interval.value_or(fallbackInterval) * gainPerNode; + LOG_TRACE(<< "gamma search interval = [" + << m_GammaSearchInterval.toDelimited() << "]"); + } else { + m_TreeImpl->m_RegularizationOverride.gamma(0.0); + } + } + + if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { + if (totalCurvaturePerNode > 0.0) { + TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)); + auto interval = this->candidateRegularizerSearchInterval( + frame, allTrainingRowsMask, [this, totalCurvaturePerNode](double scale) { + m_TreeImpl->m_Regularization.lambda(scale * totalCurvaturePerNode); + }); + m_LambdaSearchInterval = interval.value_or(fallbackInterval) * totalCurvaturePerNode; + LOG_TRACE(<< "lambda search interval = [" + << m_LambdaSearchInterval.toDelimited() << "]"); + } else { + m_TreeImpl->m_RegularizationOverride.lambda(0.0); + } } double scale{ From 1f5d4c0f3135f1a432d2d4b27cfe7953889d35b4 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 12:32:30 +0100 Subject: [PATCH 12/23] Unpack long line --- lib/maths/CBoostedTreeFactory.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 3936f9ff75..0817b693cd 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -345,11 +345,11 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa } } - double scale{ - static_cast(m_TreeImpl->m_NumberFolds - 1) / - static_cast(m_TreeImpl->m_NumberFolds) / - ((m_TreeImpl->m_RegularizationOverride.gamma() != boost::none ? 0.0 : 1.0) + - (m_TreeImpl->m_RegularizationOverride.lambda() != boost::none ? 0.0 : 1.0))}; + double freeRegularizationParameters{ + (m_TreeImpl->m_RegularizationOverride.gamma() != boost::none ? 0.0 : 1.0) + + (m_TreeImpl->m_RegularizationOverride.lambda() != boost::none ? 
0.0 : 1.0)}; + double scale{static_cast(m_TreeImpl->m_NumberFolds - 1) / + static_cast(m_TreeImpl->m_NumberFolds) / freeRegularizationParameters}; if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { m_GammaSearchInterval *= scale; From f63c2289883ea053fdf643ec38e19a7885a97c56 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 13:14:42 +0100 Subject: [PATCH 13/23] Improve comments --- include/maths/CBoostedTreeImpl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 0c11fd3f0b..f2e9c6ac49 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -120,18 +120,18 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TNodeVec = std::vector; using TNodeVecVec = std::vector; - //! \brief Holds the parameters associated with the different types of regulariser + //! \brief Holds the parameters associated with the different types of regularizer //! terms available. template class CRegularization final { public: - //! Set the multiplier of the tree size regularizer. + //! Set the multiplier of the tree size penalty. CRegularization& gamma(double gamma) { m_Gamma = gamma; return *this; } - //! Set the multiplier of the square leaf weight regularizer. + //! Set the multiplier of the square leaf weight penalty. CRegularization& lambda(double lambda) { m_Lambda = lambda; return *this; @@ -142,10 +142,10 @@ class MATHS_EXPORT CBoostedTreeImpl final { return (m_Gamma == T{} ? 1 : 0) + (m_Lambda == T{} ? 1 : 0); } - //! Multiplier of the tree size regularizer. + //! Multiplier of the tree size penalty. T gamma() const { return m_Gamma; } - //! Multiplier of the square leaf weight regularizer. + //! Multiplier of the square leaf weight penalty. T lambda() const { return m_Lambda; } //! Get description of the regularization parameters. From e0dd26ede432e16cd19d7ce0961f925db920f2ef Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 13:17:21 +0100 Subject: [PATCH 14/23] Correct out-of-date comment --- include/maths/CBoostedTreeImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index f2e9c6ac49..820ab324d2 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -674,8 +674,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! the dependent variable. core::CPackedBitVector allTrainingRowsMask() const; - //! Compute the sum loss for the predictions from \p frame and the leaf - //! count and squared weight sum from \p forest. + //! Compute the \p percentile percentile gain per split and the sum of row + //! curvatures per internal node of \p forest. 
    TDoubleDoublePr gainAndCurvatureAtPercentile(double percentile,
                                                 const TNodeVecVec& forest) const;

From 4cee56fb0a27c62d34fe3490e140ee32d673d34e Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 24 Sep 2019 13:50:57 +0100
Subject: [PATCH 15/23] Improve function naming and unpack expression

---
 include/maths/CBoostedTreeFactory.h | 15 +++++---
 lib/maths/CBoostedTreeFactory.cc    | 54 ++++++++++++++++++-----------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h
index 570684ee14..e661fcb9fa 100644
--- a/include/maths/CBoostedTreeFactory.h
+++ b/include/maths/CBoostedTreeFactory.h
@@ -98,7 +98,7 @@ class MATHS_EXPORT CBoostedTreeFactory final {
     using TOptionalVector = boost::optional<TVector>;
     using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
     using TBoostedTreeImplUPtr = std::unique_ptr<CBoostedTreeImpl>;
-    using TScaleRegularization = std::function<void(double)>;
+    using TScaleRegularization = std::function<void(CBoostedTreeImpl&, double)>;

 private:
     static const double MINIMUM_ETA;
@@ -135,10 +135,15 @@ class MATHS_EXPORT CBoostedTreeFactory final {
     TDoubleDoublePr estimateTreeGainAndCurvature(core::CDataFrame& frame,
                                                  const core::CPackedBitVector& trainingRowMask) const;

-    //! Get the regularizer value at the point the model starts to overfit.
-    TOptionalVector candidateRegularizerSearchInterval(core::CDataFrame& frame,
-                                                       core::CPackedBitVector trainingRowMask,
-                                                       TScaleRegularization scale) const;
+    //! Perform a line search with quadratic approximation for the regularizer
+    //! value at which the model starts to overfit.
+    //!
+    //! \param applyScaleToRegularizer Applies a specified scale to the
+    //! initially chosen value in the tree implementation.
+    TOptionalVector
+    lineSearchWithQuadraticApproxToTestError(core::CDataFrame& frame,
+                                             core::CPackedBitVector trainingRowMask,
+                                             const TScaleRegularization& applyScaleToRegularizer) const;

     //! Initialize the state for hyperparameter optimisation. 
void initializeHyperparameterOptimisation() const; diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 0817b693cd..1c676db3cb 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -317,11 +317,15 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa if (gainPerNode > 0.0) { TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; fallbackInterval *= m_TreeImpl->m_Eta; - auto interval = this->candidateRegularizerSearchInterval( - frame, allTrainingRowsMask, [this, gainPerNode](double scale) { - m_TreeImpl->m_Regularization.gamma(scale * gainPerNode); - }); - m_GammaSearchInterval = interval.value_or(fallbackInterval) * gainPerNode; + + double initialGamma{gainPerNode}; + auto gammaStep = [initialGamma](CBoostedTreeImpl& tree, double scale) { + tree.m_Regularization.gamma(scale * initialGamma); + }; + m_GammaSearchInterval = this->lineSearchWithQuadraticApproxToTestError( + frame, allTrainingRowsMask, gammaStep) + .value_or(fallbackInterval) * + gainPerNode; LOG_TRACE(<< "gamma search interval = [" << m_GammaSearchInterval.toDelimited() << "]"); } else { @@ -333,11 +337,15 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa if (totalCurvaturePerNode > 0.0) { TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)); - auto interval = this->candidateRegularizerSearchInterval( - frame, allTrainingRowsMask, [this, totalCurvaturePerNode](double scale) { - m_TreeImpl->m_Regularization.lambda(scale * totalCurvaturePerNode); - }); - m_LambdaSearchInterval = interval.value_or(fallbackInterval) * totalCurvaturePerNode; + + double initialLambda{totalCurvaturePerNode}; + auto lambdaStep = [initialLambda](CBoostedTreeImpl& tree, double scale) { + tree.m_Regularization.lambda(scale * initialLambda); + }; + m_LambdaSearchInterval = this->lineSearchWithQuadraticApproxToTestError( + frame, allTrainingRowsMask, lambdaStep) + .value_or(fallbackInterval) * + totalCurvaturePerNode; LOG_TRACE(<< "lambda search interval = [" << m_LambdaSearchInterval.toDelimited() << "]"); } else { @@ -380,10 +388,10 @@ CBoostedTreeFactory::estimateTreeGainAndCurvature(core::CDataFrame& frame, return {gain, curvature}; } -CBoostedTreeFactory::TOptionalVector -CBoostedTreeFactory::candidateRegularizerSearchInterval(core::CDataFrame& frame, - core::CPackedBitVector trainingRowMask, - TScaleRegularization scaleRegularization) const { +CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadraticApproxToTestError( + core::CDataFrame& frame, + core::CPackedBitVector trainingRowMask, + const TScaleRegularization& regularizerStep) const { // This uses a quadratic approximation to the test loss function w.r.t. 
// the scaled regularization hyperparameter from which it estimates the @@ -419,7 +427,7 @@ CBoostedTreeFactory::candidateRegularizerSearchInterval(core::CDataFrame& frame, double scale{1.0}; for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) { - scaleRegularization(scale); + regularizerStep(*m_TreeImpl, scale); scale *= multiplier; auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordMemoryUsage); double testLoss{m_TreeImpl->meanLoss(frame, testRowMask, forest)}; @@ -443,12 +451,16 @@ CBoostedTreeFactory::candidateRegularizerSearchInterval(core::CDataFrame& frame, double leftEndpoint{0.0}; double rightEndpoint{static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS - 1)}; double stationaryPoint{-gradient / 2.0 / curvature}; - double distanceToLeftEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; - double distanceToRightEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; - double logBestRegularizerScale{ - curvature < 0.0 - ? (distanceToLeftEndpoint > distanceToRightEndpoint ? leftEndpoint : rightEndpoint) - : CTools::truncate(stationaryPoint, leftEndpoint, rightEndpoint)}; + double logBestRegularizerScale{[&] { + double distanceToLeftEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; + double distanceToRightEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; + if (curvature < 0.0) { + // Stationary point is a maximum so use furthest point in interval. + return distanceToLeftEndpoint > distanceToRightEndpoint ? leftEndpoint : rightEndpoint; + } + // Stationary point is a minimum so use nearest point in the interval. + return CTools::truncate(stationaryPoint, leftEndpoint, rightEndpoint); + }()}; double bestRegularizerScale{std::pow(0.5, logBestRegularizerScale)}; // Find an interval with a high probability of containing the optimal From cbe44d64570bf1d000cca3e1714df0f29e5e0d8b Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 14:35:29 +0100 Subject: [PATCH 16/23] Corrections to search endpoint estimates --- lib/maths/CBoostedTreeFactory.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 1c676db3cb..f766eb7dd8 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -463,23 +463,28 @@ CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadrati }()}; double bestRegularizerScale{std::pow(0.5, logBestRegularizerScale)}; - // Find an interval with a high probability of containing the optimal - // regularisation parameter if the interval we searched has a minimum. TVector interval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; if (curvature > 0.0) { + // Find a short interval with a high probability of containing the optimal + // regularisation parameter if we found a minimum. In particular, we solve + // curvature * (x - best)^2 = 3 sigma where sigma is the standard deviation + // of the test loss residuals. We don't extrapolate so don't truncate if a + // crossing point lies outside the searched interval. 
TMeanVarAccumulator residualMoments; for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) { residualMoments.add(testLosses[i] - leastSquaresQuadraticTestLoss.predict( static_cast(i))); } - double margin{2.0 * std::sqrt(CBasicStatistics::variance(residualMoments)) / curvature}; - if (logBestRegularizerScale - margin >= leftEndpoint) { - interval(MIN_REGULARIZER_INDEX) = - std::max(std::pow(0.5, margin), MIN_REGULARIZER_SCALE); + double sigma{std::sqrt(CBasicStatistics::variance(residualMoments))}; + double logScaleAtThreeSigma{std::sqrt(3.0 * sigma / curvature)}; + if (logBestRegularizerScale - logScaleAtThreeSigma >= leftEndpoint) { + // These are scales > bestRegularizerScale hence 1 / multiplier. + interval(MAX_REGULARIZER_INDEX) = std::min( + std::pow(1.0 / multiplier, logScaleAtThreeSigma), MAX_REGULARIZER_SCALE); } - if (logBestRegularizerScale + margin <= rightEndpoint) { - interval(MAX_REGULARIZER_INDEX) = - std::min(std::pow(2.0, margin), MAX_REGULARIZER_SCALE); + if (logBestRegularizerScale + logScaleAtThreeSigma <= rightEndpoint) { + interval(MIN_REGULARIZER_INDEX) = std::max( + std::pow(multiplier, logScaleAtThreeSigma), MIN_REGULARIZER_SCALE); } } interval *= bestRegularizerScale; From e2c53d9008990cc7f5125f40ace72b33c769240a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 14:40:35 +0100 Subject: [PATCH 17/23] Formatting --- include/maths/CBoostedTreeImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 820ab324d2..893dbc51f8 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -674,7 +674,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! the dependent variable. core::CPackedBitVector allTrainingRowsMask() const; - //! Compute the \p percentile percentile gain per split and the sum of row + //! Compute the \p percentile percentile gain per split and the sum of row //! curvatures per internal node of \p forest. 
TDoubleDoublePr gainAndCurvatureAtPercentile(double percentile, const TNodeVecVec& forest) const; From 1370c18769d7b5aa0665e2487ff80b190f4c0027 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 14:42:09 +0100 Subject: [PATCH 18/23] Test fix for rename --- lib/core/unittest/CLoopProgressTest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/core/unittest/CLoopProgressTest.cc b/lib/core/unittest/CLoopProgressTest.cc index bdcae9fb76..ac31deb6fd 100644 --- a/lib/core/unittest/CLoopProgressTest.cc +++ b/lib/core/unittest/CLoopProgressTest.cc @@ -165,7 +165,7 @@ void CLoopProgressTest::testSerialization() { auto restoredRecordProgress = [&restoredProgress](double p) { restoredProgress += p; }; - restoredLoopProgress.attach(restoredRecordProgress); + restoredLoopProgress.progressCallback(restoredRecordProgress); restoredLoopProgress.resumeRestored(); CPPUNIT_ASSERT_EQUAL(loopProgress.checksum(), restoredLoopProgress.checksum()); From 958f45a1d1af37c53fcd32b218f340f78b793db6 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 24 Sep 2019 16:48:50 +0100 Subject: [PATCH 19/23] Update test for refactor --- lib/api/CDataFrameAnalysisRunner.cc | 3 ++- lib/api/unittest/CDataFrameAnalyzerTest.cc | 20 +++++++++++--------- lib/api/unittest/CDataFrameAnalyzerTest.h | 4 +++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index 4dc3872da7..2acf7264ab 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -84,7 +84,8 @@ void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() { if (memoryUsage <= memoryLimit) { break; } - // if we are not allowed to spill over to disk then only one partition is possible + // If we are not allowed to spill over to disk then only one partition + // is possible. 
if (m_Spec.diskUsageAllowed() == false) { LOG_TRACE(<< "stop partition number computation since disk usage is turned off"); break; diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index d833f03bef..0f9e161082 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -83,7 +83,6 @@ class CTestDataAdder : public core::CDataAdder { private: TOStreamP m_Stream; }; -} std::vector streamToStringVector(std::stringstream&& tokenStream) { std::vector results; @@ -362,6 +361,7 @@ void addRegressionTestData(const TStrVec& fieldNames, } }); } +} void CDataFrameAnalyzerTest::testWithoutControlMessages() { @@ -1130,8 +1130,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, values); auto persistenceStream{std::make_shared()}; - CDataFrameAnalyzerTest::TPersisterSupplier persisterSupplier = - [&persistenceStream]() -> TDataAdderUPtr { + TPersisterSupplier persisterSupplier = [&persistenceStream]() -> TDataAdderUPtr { return std::make_unique(persistenceStream); }; @@ -1142,20 +1141,21 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, &persisterSupplier), outputWriterFactory}; + std::size_t dependentVariable( + std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin()); auto frame{passDataToAnalyzer(fieldNames, fieldValues, analyzer, weights, values)}; analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); TStrVec persistedStatesString{ streamToStringVector(std::stringstream(persistenceStream->str()))}; - auto expectedTree{getFinalTree(persistedStatesString, frame)}; + auto expectedTree{this->getFinalTree(persistedStatesString, frame, dependentVariable)}; // Compute actual tree persistenceStream->str(""); std::istringstream intermediateStateStream{persistedStatesString[iterationToRestartFrom]}; - CDataFrameAnalyzerTest::TRestoreSearcherSupplier restoreSearcherSupplier = - [&intermediateStateStream]() -> TDataSearcherUPtr { + TRestoreSearcherSupplier restoreSearcherSupplier = [&intermediateStateStream]() -> TDataSearcherUPtr { return std::make_unique(intermediateStateStream.str()); }; @@ -1170,7 +1170,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti persistedStatesString = streamToStringVector(std::stringstream(persistenceStream->str())); - auto actualTree{getFinalTree(persistedStatesString, frame)}; + auto actualTree{this->getFinalTree(persistedStatesString, frame, dependentVariable)}; // compare hyperparameter @@ -1199,11 +1199,13 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti maths::CBoostedTreeFactory::TBoostedTreeUPtr CDataFrameAnalyzerTest::getFinalTree(const TStrVec& persistedStates, - std::unique_ptr& frame) const { + std::unique_ptr& frame, + std::size_t dependentVariable) const { CTestDataSearcher dataSearcher(persistedStates.back()); auto decompressor{std::make_unique(dataSearcher)}; decompressor->setStateRestoreSearch(api::ML_STATE_INDEX, api::getRegressionStateId("testJob")); auto stream{decompressor->search(1, 1)}; - return maths::CBoostedTreeFactory::constructFromString(*stream, *frame); + return maths::CBoostedTreeFactory::constructFromString(*stream).buildFor( + *frame, dependentVariable); } diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.h 
b/lib/api/unittest/CDataFrameAnalyzerTest.h index 7b307a0f80..7943653c08 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.h +++ b/lib/api/unittest/CDataFrameAnalyzerTest.h @@ -57,7 +57,9 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture { std::size_t iterationToRestartFrom) const; ml::maths::CBoostedTreeFactory::TBoostedTreeUPtr - getFinalTree(const TStrVec& persistedStates, TDataFrameUPtr& frame) const; + getFinalTree(const TStrVec& persistedStates, + TDataFrameUPtr& frame, + std::size_t dependentVariable) const; }; #endif // INCLUDED_CDataFrameAnalyzerTest_h From 937df235947e1c0302eb346225aa545b26903d32 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 25 Sep 2019 16:52:11 +0100 Subject: [PATCH 20/23] Rejig line search to hide the fact it's working on exponential scale --- include/maths/CBoostedTreeFactory.h | 23 ++-- lib/maths/CBoostedTree.cc | 2 +- lib/maths/CBoostedTreeFactory.cc | 148 ++++++++++++++----------- lib/maths/unittest/CBoostedTreeTest.cc | 8 +- 4 files changed, 100 insertions(+), 81 deletions(-) diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index e661fcb9fa..1f90683ba0 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -98,7 +98,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { using TOptionalVector = boost::optional; using TPackedBitVectorVec = std::vector; using TBoostedTreeImplUPtr = std::unique_ptr; - using TScaleRegularization = std::function; + using TApplyRegularizerStep = + std::function; private: static const double MINIMUM_ETA; @@ -135,15 +136,17 @@ class MATHS_EXPORT CBoostedTreeFactory final { TDoubleDoublePr estimateTreeGainAndCurvature(core::CDataFrame& frame, const core::CPackedBitVector& trainingRowMask) const; - //! Perform a line search with quadratic approximation for the regularizer - //! value at the model starts to overfit. + //! Perform a line search for the test loss w.r.t. a single regularization + //! hyperparameter and apply Newton's method to find the minimum. The plan + //! is to find a value near where the model starts to overfit. //! - //! \note applyScaleToRegularizer Applies a specified scale to the initial - //! choosen value for tree implemenation. - TOptionalVector - lineSearchWithQuadraticApproxToTestError(core::CDataFrame& frame, + //! \return The interval to search during the main hyperparameter optimisation + //! loop or null if this couldn't be found. + TOptionalVector testLossNewtonLineSearch(core::CDataFrame& frame, core::CPackedBitVector trainingRowMask, - const TScaleRegularization& applyScaleToRegularizer) const; + const TApplyRegularizerStep& applyRegularizerStep, + double returnedIntervalLeftEndOffset, + double returnedIntervalRightEndOffset) const; //! Initialize the state for hyperparameter optimisation. 
void initializeHyperparameterOptimisation() const; @@ -166,8 +169,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_Restored = false; TBoostedTreeImplUPtr m_TreeImpl; - TVector m_GammaSearchInterval; - TVector m_LambdaSearchInterval; + TVector m_LogGammaSearchInterval; + TVector m_LogLambdaSearchInterval; TProgressCallback m_RecordProgress = noopRecordProgress; TMemoryUsageCallback m_RecordMemoryUsage = noopRecordMemoryUsage; TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState; diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 0df48cd91b..a8ad65c078 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -76,7 +76,7 @@ double CMse::value(double prediction, double actual) const { } double CMse::gradient(double prediction, double actual) const { - return prediction - actual; + return 2.0 * (prediction - actual); } double CMse::curvature(double /*prediction*/, double /*actual*/) const { diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index f766eb7dd8..73c6d02b38 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -102,12 +102,12 @@ void CBoostedTreeFactory::initializeHyperparameterOptimisation() const { CBayesianOptimisation::TDoubleDoublePrVec boundingBox; if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { - boundingBox.emplace_back(std::log(m_LambdaSearchInterval(MIN_REGULARIZER_INDEX)), - std::log(m_LambdaSearchInterval(MAX_REGULARIZER_INDEX))); + boundingBox.emplace_back(m_LogLambdaSearchInterval(MIN_REGULARIZER_INDEX), + m_LogLambdaSearchInterval(MAX_REGULARIZER_INDEX)); } if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { - boundingBox.emplace_back(std::log(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)), - std::log(m_GammaSearchInterval(MAX_REGULARIZER_INDEX))); + boundingBox.emplace_back(m_LogGammaSearchInterval(MIN_REGULARIZER_INDEX), + m_LogGammaSearchInterval(MAX_REGULARIZER_INDEX)); } if (m_TreeImpl->m_EtaOverride == boost::none) { double rate{m_TreeImpl->m_EtaGrowthRatePerTree - 1.0}; @@ -315,19 +315,24 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { if (gainPerNode > 0.0) { - TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; - fallbackInterval *= m_TreeImpl->m_Eta; - - double initialGamma{gainPerNode}; - auto gammaStep = [initialGamma](CBoostedTreeImpl& tree, double scale) { - tree.m_Regularization.gamma(scale * initialGamma); + TVector fallbackInterval{{std::log(MIN_REGULARIZER_SCALE), 0.0, + std::log(MAX_REGULARIZER_SCALE)}}; + fallbackInterval += TVector{std::log(m_TreeImpl->m_Eta)}; + + double logInitialGamma{std::log(gainPerNode)}; + auto applyGammaStep = [logInitialGamma](CBoostedTreeImpl& tree, + double stepSize, std::size_t step) { + tree.m_Regularization.gamma( + std::exp(logInitialGamma + static_cast(step) * stepSize)); }; - m_GammaSearchInterval = this->lineSearchWithQuadraticApproxToTestError( - frame, allTrainingRowsMask, gammaStep) - .value_or(fallbackInterval) * - gainPerNode; - LOG_TRACE(<< "gamma search interval = [" - << m_GammaSearchInterval.toDelimited() << "]"); + m_LogGammaSearchInterval = + TVector{std::log(gainPerNode)} + + this->testLossNewtonLineSearch(frame, allTrainingRowsMask, applyGammaStep, + std::log(MIN_REGULARIZER_SCALE), + std::log(MAX_REGULARIZER_SCALE)) + .value_or(fallbackInterval); + LOG_TRACE(<< 
"log gamma search interval = [" + << m_LogGammaSearchInterval.toDelimited() << "]"); } else { m_TreeImpl->m_RegularizationOverride.gamma(0.0); } @@ -335,19 +340,25 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { if (totalCurvaturePerNode > 0.0) { - TVector fallbackInterval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; - m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(MIN_REGULARIZER_INDEX)); - - double initialLambda{totalCurvaturePerNode}; - auto lambdaStep = [initialLambda](CBoostedTreeImpl& tree, double scale) { - tree.m_Regularization.lambda(scale * initialLambda); + TVector fallbackInterval{{std::log(MIN_REGULARIZER_SCALE), 0.0, + std::log(MAX_REGULARIZER_SCALE)}}; + m_TreeImpl->m_Regularization.gamma( + std::exp(m_LogGammaSearchInterval(MIN_REGULARIZER_INDEX))); + + double logInitialLambda{std::log(totalCurvaturePerNode)}; + auto applyLambdaStep = [logInitialLambda](CBoostedTreeImpl& tree, + double stepSize, std::size_t step) { + tree.m_Regularization.lambda(std::exp( + logInitialLambda + static_cast(step) * stepSize)); }; - m_LambdaSearchInterval = this->lineSearchWithQuadraticApproxToTestError( - frame, allTrainingRowsMask, lambdaStep) - .value_or(fallbackInterval) * - totalCurvaturePerNode; - LOG_TRACE(<< "lambda search interval = [" - << m_LambdaSearchInterval.toDelimited() << "]"); + m_LogLambdaSearchInterval = + TVector{std::log(totalCurvaturePerNode)} + + this->testLossNewtonLineSearch(frame, allTrainingRowsMask, applyLambdaStep, + std::log(MIN_REGULARIZER_SCALE), + std::log(MAX_REGULARIZER_SCALE)) + .value_or(fallbackInterval); + LOG_TRACE(<< "log lambda search interval = [" + << m_LogLambdaSearchInterval.toDelimited() << "]"); } else { m_TreeImpl->m_RegularizationOverride.lambda(0.0); } @@ -360,12 +371,14 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa static_cast(m_TreeImpl->m_NumberFolds) / freeRegularizationParameters}; if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { - m_GammaSearchInterval *= scale; - m_TreeImpl->m_Regularization.gamma(m_GammaSearchInterval(BEST_REGULARIZER_INDEX)); + m_LogGammaSearchInterval += TVector{std::log(scale)}; + m_TreeImpl->m_Regularization.gamma( + std::exp(m_LogGammaSearchInterval(BEST_REGULARIZER_INDEX))); } if (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none) { - m_LambdaSearchInterval *= scale; - m_TreeImpl->m_Regularization.lambda(m_LambdaSearchInterval(BEST_REGULARIZER_INDEX)); + m_LogLambdaSearchInterval += TVector{std::log(scale)}; + m_TreeImpl->m_Regularization.lambda( + std::exp(m_LogLambdaSearchInterval(BEST_REGULARIZER_INDEX))); } LOG_TRACE(<< "regularization(initial) = " << m_TreeImpl->m_Regularization.print()); } @@ -388,10 +401,12 @@ CBoostedTreeFactory::estimateTreeGainAndCurvature(core::CDataFrame& frame, return {gain, curvature}; } -CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadraticApproxToTestError( - core::CDataFrame& frame, - core::CPackedBitVector trainingRowMask, - const TScaleRegularization& regularizerStep) const { +CBoostedTreeFactory::TOptionalVector +CBoostedTreeFactory::testLossNewtonLineSearch(core::CDataFrame& frame, + core::CPackedBitVector trainingRowMask, + const TApplyRegularizerStep& applyRegularizerStep, + double returnedIntervalLeftEndOffset, + double returnedIntervalRightEndOffset) const { // This uses a quadratic approximation to the test loss function w.r.t. 
// the scaled regularization hyperparameter from which it estimates the @@ -419,19 +434,17 @@ CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadrati double maximumTreeSizeMultiplier{MAIN_TRAINING_LOOP_TREE_SIZE_MULTIPLIER}; std::swap(maximumTreeSizeMultiplier, m_TreeImpl->m_MaximumTreeSizeMultiplier); - double multiplier{std::exp( - -std::log(1024.0) / static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS))}; + double stepSize{-std::log(1024.0) / + static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS)}; CLeastSquaresOnlineRegression<2, double> leastSquaresQuadraticTestLoss; TDoubleVec testLosses(INITIAL_REGULARIZER_SEARCH_ITERATIONS); - double scale{1.0}; for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) { - regularizerStep(*m_TreeImpl, scale); - scale *= multiplier; + applyRegularizerStep(*m_TreeImpl, stepSize, i); auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordMemoryUsage); double testLoss{m_TreeImpl->meanLoss(frame, testRowMask, forest)}; - leastSquaresQuadraticTestLoss.add(static_cast(i), testLoss); + leastSquaresQuadraticTestLoss.add(static_cast(i) * stepSize, testLoss); testLosses[i] = testLoss; m_TreeImpl->m_TrainingProgress.increment(); } @@ -440,20 +453,23 @@ CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadrati std::swap(maximumTreeSizeMultiplier, m_TreeImpl->m_MaximumTreeSizeMultiplier); CLeastSquaresOnlineRegression<2, double>::TArray params; - bool successful{leastSquaresQuadraticTestLoss.parameters(params)}; + if (leastSquaresQuadraticTestLoss.parameters(params) == false) { + return TOptionalVector{}; + } + double gradient{params[1]}; double curvature{params[2]}; LOG_TRACE(<< "[intercept, slope, curvature] = " << core::CContainerPrinter::print(params)); - // Find the scale at the minimum of the least squares quadratic fit - // to the test loss in the search interval. - double leftEndpoint{0.0}; - double rightEndpoint{static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS - 1)}; + // Find the scale at the minimum of the least squares quadratic fit to + // the test loss in the search interval. Note step size is negative. + double leftEndpoint{static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS - 1) * stepSize}; + double rightEndpoint{0.0}; double stationaryPoint{-gradient / 2.0 / curvature}; - double logBestRegularizerScale{[&] { - double distanceToLeftEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; - double distanceToRightEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; + double bestRegularizer{[&] { + double distanceToLeftEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; + double distanceToRightEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; if (curvature < 0.0) { // Stationary point is a maximum so use furthest point in interval. return distanceToLeftEndpoint > distanceToRightEndpoint ? leftEndpoint : rightEndpoint; @@ -461,35 +477,35 @@ CBoostedTreeFactory::TOptionalVector CBoostedTreeFactory::lineSearchWithQuadrati // Stationary point is a minimum so use nearest point in the interval. 
return CTools::truncate(stationaryPoint, leftEndpoint, rightEndpoint); }()}; - double bestRegularizerScale{std::pow(0.5, logBestRegularizerScale)}; + LOG_TRACE(<< "best regularizer = " << bestRegularizer); - TVector interval{{MIN_REGULARIZER_SCALE, 1.0, MAX_REGULARIZER_SCALE}}; + TVector interval{{returnedIntervalLeftEndOffset, 0.0, returnedIntervalRightEndOffset}}; if (curvature > 0.0) { // Find a short interval with a high probability of containing the optimal // regularisation parameter if we found a minimum. In particular, we solve // curvature * (x - best)^2 = 3 sigma where sigma is the standard deviation - // of the test loss residuals. We don't extrapolate so don't truncate if a - // crossing point lies outside the searched interval. + // of the test loss residuals to get the interval endpoints. We don't + // extrapolate the loss function outside the line segment we searched so + // don't truncate if an endpoint lies outside the searched interval. TMeanVarAccumulator residualMoments; for (std::size_t i = 0; i < INITIAL_REGULARIZER_SEARCH_ITERATIONS; ++i) { residualMoments.add(testLosses[i] - leastSquaresQuadraticTestLoss.predict( - static_cast(i))); + static_cast(i) * stepSize)); } double sigma{std::sqrt(CBasicStatistics::variance(residualMoments))}; - double logScaleAtThreeSigma{std::sqrt(3.0 * sigma / curvature)}; - if (logBestRegularizerScale - logScaleAtThreeSigma >= leftEndpoint) { - // These are scales > bestRegularizerScale hence 1 / multiplier. - interval(MAX_REGULARIZER_INDEX) = std::min( - std::pow(1.0 / multiplier, logScaleAtThreeSigma), MAX_REGULARIZER_SCALE); + double threeSigmaInterval{std::sqrt(3.0 * sigma / curvature)}; + if (bestRegularizer - threeSigmaInterval >= leftEndpoint) { + interval(MIN_REGULARIZER_INDEX) = + std::max(-threeSigmaInterval, returnedIntervalLeftEndOffset); } - if (logBestRegularizerScale + logScaleAtThreeSigma <= rightEndpoint) { - interval(MIN_REGULARIZER_INDEX) = std::max( - std::pow(multiplier, logScaleAtThreeSigma), MIN_REGULARIZER_SCALE); + if (bestRegularizer + threeSigmaInterval <= rightEndpoint) { + interval(MAX_REGULARIZER_INDEX) = + std::min(threeSigmaInterval, returnedIntervalRightEndOffset); } } - interval *= bestRegularizerScale; + interval += TVector{bestRegularizer}; - return successful ? TOptionalVector{interval} : TOptionalVector{}; + return TOptionalVector{interval}; } CBoostedTreeFactory CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, @@ -514,7 +530,7 @@ CBoostedTreeFactory CBoostedTreeFactory::constructFromString(std::istream& jsonS CBoostedTreeFactory::CBoostedTreeFactory(bool restored, std::size_t numberThreads, TLossFunctionUPtr loss) : m_Restored{restored}, m_TreeImpl{std::make_unique(numberThreads, std::move(loss))}, - m_GammaSearchInterval{0.0}, m_LambdaSearchInterval{0.0} { + m_LogGammaSearchInterval{0.0}, m_LogLambdaSearchInterval{0.0} { } CBoostedTreeFactory::CBoostedTreeFactory(CBoostedTreeFactory&&) = default; diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 13c9dcc543..e7afb4c336 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -199,7 +199,7 @@ void CBoostedTreeTest::testPiecewiseConstant() { 0.0, modelBias[i][0], 7.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... 
- CPPUNIT_ASSERT(modelRSquared[i][0] > 0.94); + CPPUNIT_ASSERT(modelRSquared[i][0] > 0.93); meanModelRSquared.add(modelRSquared[i][0]); } @@ -321,7 +321,7 @@ void CBoostedTreeTest::testNonLinear() { 0.0, modelBias[i][0], 8.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... - CPPUNIT_ASSERT(modelRSquared[i][0] > 0.92); + CPPUNIT_ASSERT(modelRSquared[i][0] > 0.91); meanModelRSquared.add(modelRSquared[i][0]); } @@ -602,8 +602,8 @@ void CBoostedTreeTest::testCategoricalRegressors() { LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); - CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.13); - CPPUNIT_ASSERT(modelRSquared > 0.9); + CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.2); + CPPUNIT_ASSERT(modelRSquared > 0.92); } void CBoostedTreeTest::testIntegerRegressor() { From 7d2ccb0c4be88a24d40bfdc018188e3d5df25dc2 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 25 Sep 2019 17:37:21 +0100 Subject: [PATCH 21/23] Comment plus correct scale --- lib/maths/CBoostedTreeFactory.cc | 12 +++++++----- lib/maths/unittest/CBoostedTreeTest.cc | 8 ++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 73c6d02b38..48dc09c384 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -364,11 +364,13 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa } } - double freeRegularizationParameters{ - (m_TreeImpl->m_RegularizationOverride.gamma() != boost::none ? 0.0 : 1.0) + - (m_TreeImpl->m_RegularizationOverride.lambda() != boost::none ? 0.0 : 1.0)}; - double scale{static_cast(m_TreeImpl->m_NumberFolds - 1) / - static_cast(m_TreeImpl->m_NumberFolds) / freeRegularizationParameters}; + // If we aren't supplied a fixed value for a parameter, we find its "best" + // value forcing the other regularizers to zero. Therefore, we divide here + // by the number of unspecified parameters so the sum of the regularization + // terms is about the same in the first loop. + double scale{ + 1.0 / ((m_TreeImpl->m_RegularizationOverride.gamma() == boost::none ? 1.0 : 0.0) + + (m_TreeImpl->m_RegularizationOverride.lambda() == boost::none ? 1.0 : 0.0))}; if (m_TreeImpl->m_RegularizationOverride.gamma() == boost::none) { m_LogGammaSearchInterval += TVector{std::log(scale)}; diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index e7afb4c336..7a8096df0c 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -199,7 +199,7 @@ void CBoostedTreeTest::testPiecewiseConstant() { 0.0, modelBias[i][0], 7.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... - CPPUNIT_ASSERT(modelRSquared[i][0] > 0.93); + CPPUNIT_ASSERT(modelRSquared[i][0] > 0.94); meanModelRSquared.add(modelRSquared[i][0]); } @@ -321,7 +321,7 @@ void CBoostedTreeTest::testNonLinear() { 0.0, modelBias[i][0], 8.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... 
- CPPUNIT_ASSERT(modelRSquared[i][0] > 0.91); + CPPUNIT_ASSERT(modelRSquared[i][0] > 0.92); meanModelRSquared.add(modelRSquared[i][0]); } @@ -602,8 +602,8 @@ void CBoostedTreeTest::testCategoricalRegressors() { LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); - CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.2); - CPPUNIT_ASSERT(modelRSquared > 0.92); + CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.05); + CPPUNIT_ASSERT(modelRSquared > 0.91); } void CBoostedTreeTest::testIntegerRegressor() { From afc570b68d24a0b6cee924ae91f5407217495cdb Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 25 Sep 2019 17:51:13 +0100 Subject: [PATCH 22/23] Typo in refactor --- lib/maths/CBoostedTreeFactory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 48dc09c384..8a847f6407 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -470,10 +470,10 @@ CBoostedTreeFactory::testLossNewtonLineSearch(core::CDataFrame& frame, double rightEndpoint{0.0}; double stationaryPoint{-gradient / 2.0 / curvature}; double bestRegularizer{[&] { - double distanceToLeftEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; - double distanceToRightEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; if (curvature < 0.0) { // Stationary point is a maximum so use furthest point in interval. + double distanceToLeftEndpoint{std::fabs(leftEndpoint - stationaryPoint)}; + double distanceToRightEndpoint{std::fabs(rightEndpoint - stationaryPoint)}; return distanceToLeftEndpoint > distanceToRightEndpoint ? leftEndpoint : rightEndpoint; } // Stationary point is a minimum so use nearest point in the interval. From 311ade134dbbcf22ad12997c8ad1e75c135aa770 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 26 Sep 2019 14:15:33 +0100 Subject: [PATCH 23/23] Improve comment --- lib/maths/CBoostedTreeFactory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 8a847f6407..c4ee31fdfc 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -464,8 +464,8 @@ CBoostedTreeFactory::testLossNewtonLineSearch(core::CDataFrame& frame, LOG_TRACE(<< "[intercept, slope, curvature] = " << core::CContainerPrinter::print(params)); - // Find the scale at the minimum of the least squares quadratic fit to - // the test loss in the search interval. Note step size is negative. + // Find the minimizer of the least squares quadratic fit to the test loss + // in the search interval. (Note step size is negative.) double leftEndpoint{static_cast(INITIAL_REGULARIZER_SEARCH_ITERATIONS - 1) * stepSize}; double rightEndpoint{0.0}; double stationaryPoint{-gradient / 2.0 / curvature};
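
---

Note on the line search reworked across patches 15, 16, 20, 22 and 23: the end-to-end
logic is easier to follow outside the diff context. The sketch below is illustrative
only -- the data are invented, the helpers are not library code, and the production
implementation is CBoostedTreeFactory::testLossNewtonLineSearch -- but it walks the
same steps: sample the test loss at log-scale offsets of one regularizer, fit a least
squares quadratic, take its minimizer (or the furthest searched endpoint if the fit
is concave), then widen the best offset to the points where the fitted loss exceeds
the minimum by three standard deviations of the fit residuals.

// Standalone sketch of the quadratic line search; all names and data here
// are invented for illustration.
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Fit y = a + b x + c x^2 by solving the 3x3 normal equations with unpivoted
// Gaussian elimination (adequate for this small, well-conditioned system).
std::array<double, 3> fitQuadratic(const std::vector<double>& x,
                                   const std::vector<double>& y) {
    double s0{static_cast<double>(x.size())};
    double s1{0}, s2{0}, s3{0}, s4{0}, t0{0}, t1{0}, t2{0};
    for (std::size_t i = 0; i < x.size(); ++i) {
        double xi{x[i]};
        s1 += xi; s2 += xi * xi; s3 += xi * xi * xi; s4 += xi * xi * xi * xi;
        t0 += y[i]; t1 += xi * y[i]; t2 += xi * xi * y[i];
    }
    double m[3][4]{{s0, s1, s2, t0}, {s1, s2, s3, t1}, {s2, s3, s4, t2}};
    for (int k = 0; k < 3; ++k) {
        for (int r = k + 1; r < 3; ++r) {
            double f{m[r][k] / m[k][k]};
            for (int c = k; c < 4; ++c) { m[r][c] -= f * m[k][c]; }
        }
    }
    std::array<double, 3> p{};
    for (int r = 2; r >= 0; --r) {
        double rhs{m[r][3]};
        for (int c = r + 1; c < 3; ++c) { rhs -= m[r][c] * p[c]; }
        p[r] = rhs / m[r][r];
    }
    return p; // {intercept, gradient, curvature}
}

int main() {
    // Hypothetical test losses measured at offsets 0, stepSize, 2 * stepSize,
    // ... where stepSize = -log(1024) / #iterations as in patch 20.
    double stepSize{-std::log(1024.0) / 6.0};
    std::vector<double> loss{1.30, 1.18, 1.12, 1.11, 1.16, 1.27};
    std::vector<double> offset;
    for (std::size_t i = 0; i < loss.size(); ++i) {
        offset.push_back(static_cast<double>(i) * stepSize);
    }

    auto p = fitQuadratic(offset, loss);
    double gradient{p[1]}, curvature{p[2]};

    // The searched interval runs from the most negative offset up to zero.
    double leftEndpoint{offset.back()}, rightEndpoint{0.0};
    double stationaryPoint{-gradient / (2.0 * curvature)};
    double best{curvature > 0.0
                    // Minimum: use the nearest point in the interval.
                    ? std::clamp(stationaryPoint, leftEndpoint, rightEndpoint)
                    // Maximum: use the furthest endpoint of the interval.
                    : (std::fabs(leftEndpoint - stationaryPoint) >
                               std::fabs(rightEndpoint - stationaryPoint)
                           ? leftEndpoint
                           : rightEndpoint)};

    // Residual standard deviation of the fit, and the half width at which
    // curvature * (x - best)^2 = 3 * sigma, i.e. x = best +/- sqrt(3 sigma / c).
    double variance{0.0};
    for (std::size_t i = 0; i < loss.size(); ++i) {
        double xi{offset[i]};
        double r{loss[i] - (p[0] + p[1] * xi + p[2] * xi * xi)};
        variance += r * r / static_cast<double>(loss.size());
    }
    double halfWidth{std::sqrt(3.0 * std::sqrt(variance) / curvature)};

    std::cout << "best log-scale offset = " << best << ", search interval = ["
              << best - halfWidth << ", " << best + halfWidth << "]\n";
    return 0;
}

Working in log offsets is what patch 20 buys: the returned interval feeds the
Bayesian optimisation bounding box by simple addition rather than by multiplying
scales, and exponentiating recovers the regularizer values themselves.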
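Patch 20 also tightens the squared-error gradient convention: for l(p, a) = (p - a)^2
the derivative with respect to the prediction is dl/dp = 2 (p - a) and the second
derivative is 2, so returning 2.0 * (prediction - actual) is the exact gradient where
the old code dropped the factor of two. A standalone finite-difference check
(illustrative only, not library code):

#include <cassert>
#include <cmath>

double mseValue(double p, double a) { return (p - a) * (p - a); }
double mseGradient(double p, double a) { return 2.0 * (p - a); }

int main() {
    // Central difference check of the gradient at an arbitrary point.
    double p{1.3}, a{0.7}, h{1e-6};
    double numeric{(mseValue(p + h, a) - mseValue(p - h, a)) / (2.0 * h)};
    assert(std::fabs(numeric - mseGradient(p, a)) < 1e-6);
    return 0;
}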