diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index c62e1bc526..7c1690ab13 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -132,12 +132,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel { ~CBoostedTree() override; //! Train on the examples in the data frame supplied to the constructor. - void train(TProgressCallback recordProgress = noop) override; + void train() override; //! Write the predictions to the data frame supplied to the constructor. //! //! \warning This can only be called after train. - void predict(TProgressCallback recordProgress = noop) const override; + void predict() const override; //! Write the trained model to \p writer. //! @@ -160,7 +160,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel { using TImplUPtr = std::unique_ptr; private: - CBoostedTree(core::CDataFrame& frame, TImplUPtr&& impl); + CBoostedTree(core::CDataFrame& frame, TProgressCallback recordProgress, TImplUPtr&& impl); private: TImplUPtr m_Impl; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 35621f9ec8..595b1a1f10 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -30,6 +30,7 @@ class CBoostedTreeImpl; class MATHS_EXPORT CBoostedTreeFactory final { public: using TBoostedTreeUPtr = std::unique_ptr; + using TProgressCallback = CBoostedTree::TProgressCallback; public: //! Construct a boosted tree object from parameters. @@ -38,7 +39,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Construct a boosted tree object from its serialized version. static TBoostedTreeUPtr constructFromString(std::stringstream& jsonStringStream, - core::CDataFrame& frame); + core::CDataFrame& frame, + TProgressCallback recordProgress = noop); ~CBoostedTreeFactory(); CBoostedTreeFactory(CBoostedTreeFactory&) = delete; @@ -66,7 +68,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! 
Set the number of training examples we need per feature we'll include. CBoostedTreeFactory& rowsPerFeature(std::size_t rowsPerFeature); //! Set the callback function for progress monitoring. - CBoostedTreeFactory& progressCallback(CBoostedTree::TProgressCallback callback); + CBoostedTreeFactory& progressCallback(TProgressCallback callback); //! Estimate the maximum booking memory that training the boosted tree on a data //! frame with \p numberRows row and \p numberColumns columns will use. @@ -103,8 +105,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Read overrides for hyperparameters and if necessary estimate the initial //! values for \f$\lambda\f$ and \f$\gamma\f$ which match the gain from an //! overfit tree. - void initializeHyperparameters(core::CDataFrame& frame, - CBoostedTree::TProgressCallback recordProgress) const; + void initializeHyperparameters(core::CDataFrame& frame) const; //! Initialize the state for hyperparameter optimisation. void initializeHyperparameterOptimisation() const; @@ -112,10 +113,12 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Get the number of hyperparameter tuning rounds to use. std::size_t numberHyperparameterTuningRounds() const; + static void noop(double); + private: double m_MinimumFrequencyToOneHotEncode; TBoostedTreeImplUPtr m_TreeImpl; - CBoostedTree::TProgressCallback m_ProgressCallback; + TProgressCallback m_RecordProgress = noop; }; } } diff --git a/include/maths/CDataFrameRegressionModel.h b/include/maths/CDataFrameRegressionModel.h index c05fb68803..2c7d04b144 100644 --- a/include/maths/CDataFrameRegressionModel.h +++ b/include/maths/CDataFrameRegressionModel.h @@ -33,12 +33,12 @@ class MATHS_EXPORT CDataFrameRegressionModel { CDataFrameRegressionModel& operator=(const CDataFrameRegressionModel&) = delete; //! Train on the examples in the data frame supplied to the constructor. - virtual void train(TProgressCallback recordProgress = noop) = 0; + virtual void train() = 0; //! 
Write the predictions to the data frame supplied to the constructor. //! //! \warning This can only be called after train. - virtual void predict(TProgressCallback recordProgress = noop) const = 0; + virtual void predict() const = 0; //! Write this model to \p writer. //! @@ -52,12 +52,13 @@ class MATHS_EXPORT CDataFrameRegressionModel { virtual std::size_t columnHoldingPrediction(std::size_t numberColumns) const = 0; protected: - CDataFrameRegressionModel(core::CDataFrame& frame) : m_Frame{frame} {} - core::CDataFrame& frame() const { return m_Frame; } - static void noop(double); + CDataFrameRegressionModel(core::CDataFrame& frame, TProgressCallback recordProgress); + core::CDataFrame& frame() const; + const TProgressCallback& progressRecorder() const; private: core::CDataFrame& m_Frame; + TProgressCallback m_RecordProgress; }; } } diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 017a0b1a07..7d386f442c 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -86,6 +86,7 @@ CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysi maths::CBoostedTreeFactory::constructFromParameters( this->spec().numberThreads(), std::make_unique())); + m_BoostedTreeFactory->progressCallback(this->progressRecorder()); if (lambda >= 0.0) { m_BoostedTreeFactory->lambda(lambda); } @@ -139,8 +140,8 @@ void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, m_BoostedTree = m_BoostedTreeFactory->buildFor( frame, dependentVariableColumn - featureNames.begin()); - m_BoostedTree->train(this->progressRecorder()); - m_BoostedTree->predict(this->progressRecorder()); + m_BoostedTree->train(); + m_BoostedTree->predict(); } std::size_t CDataFrameBoostedTreeRunner::estimateBookkeepingMemoryUsage( diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 442d62d35f..c378cc62df 100644 --- 
a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -261,7 +261,7 @@ void addRegressionTestData(TStrVec fieldNames, std::unique_ptr tree = treeFactory.buildFor(*frame, weights.size()); - tree->train(ml::maths::CBoostedTree::TProgressCallback()); + tree->train(); frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index d08cca57ef..a71b701f77 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -57,18 +57,18 @@ double CArgMinMse::value() const { } } -CBoostedTree::CBoostedTree(core::CDataFrame& frame, TImplUPtr&& impl) - : CDataFrameRegressionModel{frame}, m_Impl{std::move(impl)} { +CBoostedTree::CBoostedTree(core::CDataFrame& frame, TProgressCallback recordProgress, TImplUPtr&& impl) + : CDataFrameRegressionModel{frame, std::move(recordProgress)}, m_Impl{std::move(impl)} { } CBoostedTree::~CBoostedTree() = default; -void CBoostedTree::train(TProgressCallback recordProgress) { - m_Impl->train(this->frame(), recordProgress); +void CBoostedTree::train() { + m_Impl->train(this->frame(), this->progressRecorder()); } -void CBoostedTree::predict(TProgressCallback recordProgress) const { - m_Impl->predict(this->frame(), recordProgress); +void CBoostedTree::predict() const { + m_Impl->predict(this->frame(), this->progressRecorder()); } void CBoostedTree::write(core::CRapidJsonConcurrentLineWriter& writer) const { diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 04b5226323..276a9b0753 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -37,12 +37,12 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->selectFeaturesAndEncodeCategories(frame); if (this->initializeFeatureSampleDistribution()) { - this->initializeHyperparameters(frame, m_ProgressCallback); + 
this->initializeHyperparameters(frame); this->initializeHyperparameterOptimisation(); } // TODO can only use factory to create one object since this is moved. This seems trappy. - return TBoostedTreeUPtr{new CBoostedTree(frame, std::move(m_TreeImpl))}; + return TBoostedTreeUPtr{new CBoostedTree(frame, m_RecordProgress, std::move(m_TreeImpl))}; } std::size_t CBoostedTreeFactory::numberHyperparameterTuningRounds() const { @@ -180,8 +180,7 @@ bool CBoostedTreeFactory::initializeFeatureSampleDistribution() const { return false; } -void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame, - CBoostedTree::TProgressCallback recordProgress) const { +void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) const { m_TreeImpl->m_Lambda = m_TreeImpl->m_LambdaOverride.value_or(0.0); m_TreeImpl->m_Gamma = m_TreeImpl->m_GammaOverride.value_or(0.0); @@ -232,7 +231,7 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame, LOG_TRACE(<< "loss = " << L[0] << ", # leaves = " << T[0] << ", sum square weights = " << W[0]); - auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, recordProgress); + auto forest = m_TreeImpl->trainForest(frame, trainingRowMask, m_RecordProgress); std::tie(L[1], T[1], W[1]) = m_TreeImpl->regularisedLoss(frame, trainingRowMask, forest); @@ -272,10 +271,12 @@ CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, CBoostedTreeFactory::TBoostedTreeUPtr CBoostedTreeFactory::constructFromString(std::stringstream& jsonStringStream, - core::CDataFrame& frame) { + core::CDataFrame& frame, + TProgressCallback recordProgress) { try { TBoostedTreeUPtr treePtr{ - new CBoostedTree{frame, TBoostedTreeImplUPtr{new CBoostedTreeImpl{}}}}; + new CBoostedTree{frame, std::move(recordProgress), + TBoostedTreeImplUPtr{new CBoostedTreeImpl{}}}}; core::CJsonStateRestoreTraverser traverser(jsonStringStream); if (treePtr->acceptRestoreTraverser(traverser) == false || traverser.haveBadState()) { 
throw std::runtime_error{"failed to restore boosted tree"}; @@ -388,9 +389,8 @@ CBoostedTreeFactory& CBoostedTreeFactory::rowsPerFeature(std::size_t rowsPerFeat return *this; } -CBoostedTreeFactory& -CBoostedTreeFactory::progressCallback(CBoostedTree::TProgressCallback callback) { - m_ProgressCallback = callback; +CBoostedTreeFactory& CBoostedTreeFactory::progressCallback(TProgressCallback callback) { + m_RecordProgress = callback; return *this; } @@ -403,6 +403,9 @@ std::size_t CBoostedTreeFactory::numberExtraColumnsForTrain() const { return m_TreeImpl->numberExtraColumnsForTrain(); } +void CBoostedTreeFactory::noop(double) { +} + const double CBoostedTreeFactory::MINIMUM_ETA{1e-3}; const std::size_t CBoostedTreeFactory::MAXIMUM_NUMBER_TREES{ static_cast(2.0 / MINIMUM_ETA + 0.5)}; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 638cc3fd81..59dc455e25 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -6,6 +6,7 @@ #include +#include #include #include @@ -88,6 +89,12 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, LOG_TRACE(<< "Main training loop..."); + // We account for the cost of setup as one round. The main optimisation loop runs + // for "m_NumberRounds + 1" rounds and training on the chosen hyperparameter + // values is counted as one round. This gives a total of m_NumberRounds + 3. + core::CLoopProgress progress{m_NumberRounds + 3, recordProgress}; + progress.increment(); + if (this->canTrain() == false) { // Fallback to using the constant predictor which minimises the loss.
@@ -121,6 +128,9 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, if (this->selectNextHyperparameters(lossMoments, *m_BayesianOptimization) == false) { break; } + + progress.increment(); + } while (m_CurrentRound++ < m_NumberRounds); LOG_TRACE(<< "Test loss = " << m_BestForestTestLoss); @@ -128,6 +138,10 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, this->restoreBestHyperparameters(); m_BestForest = this->trainForest(frame, this->allTrainingRowsMask(), recordProgress); } + + // Force progress to at least one here because we can exit the loop early + // or take a different path. + recordProgress(1.0); } void CBoostedTreeImpl::predict(core::CDataFrame& frame, diff --git a/lib/maths/CDataFrameRegressionModel.cc b/lib/maths/CDataFrameRegressionModel.cc index ce1d9f2ee9..0181d0e750 100644 --- a/lib/maths/CDataFrameRegressionModel.cc +++ b/lib/maths/CDataFrameRegressionModel.cc @@ -9,7 +9,18 @@ namespace ml { namespace maths { -void CDataFrameRegressionModel::noop(double) { +CDataFrameRegressionModel::CDataFrameRegressionModel(core::CDataFrame& frame, + TProgressCallback recordProgress) + : m_Frame{frame}, m_RecordProgress{recordProgress} { +} + +core::CDataFrame& CDataFrameRegressionModel::frame() const { + return m_Frame; +} + +const CDataFrameRegressionModel::TProgressCallback& +CDataFrameRegressionModel::progressRecorder() const { + return m_RecordProgress; } } } diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 468272ce47..09c155c14f 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -331,11 +331,13 @@ void CBoostedTreeTest::testNonLinear() { void CBoostedTreeTest::testThreading() { - // Test we get the same results whether we thread the code or not. Note - // because we compute approximate quantiles for each thread and merge we - // get slightly different results if threaded vs single threaded.
However, - // we should get the same results whether executed asynchronously or not - // so test with and without starting the thread pool. + // Test we get the same results whether we run with multiple threads or not. + // Note because we compute approximate quantiles for a partition of the data + // (one subset for each thread) and merge we get slightly different results + // if running multiple vs single threaded. However, we should get the same + // results whether we actually execute in parallel or not provided we perform + // the same partitioning. Therefore, we test with two logical threads but + // with and without starting the thread pool. test::CRandomNumbers rng; @@ -480,7 +482,7 @@ void CBoostedTreeTest::testConstantFeatures() { CPPUNIT_ASSERT(featureWeights[cols - 2] < 1e-4); } -void CBoostedTreeTest::testConstantObjective() { +void CBoostedTreeTest::testConstantTarget() { // Test we correctly deal with a constant dependent variable. @@ -604,6 +606,82 @@ void CBoostedTreeTest::testCategoricalRegressors() { CPPUNIT_ASSERT(modelRSquared > 0.97); } +void CBoostedTreeTest::testProgressMonitoring() { + + // Test progress monitoring invariants. + + test::CRandomNumbers rng; + + std::size_t rows{500}; + std::size_t cols{6}; + std::size_t capacity{600}; + + TDoubleVecVec x(cols); + for (std::size_t i = 0; i < cols; ++i) { + rng.generateUniformSamples(0.0, 10.0, rows, x[i]); + } + + core::stopDefaultAsyncExecutor(); + + std::string tests[]{"serial", "parallel"}; + + for (std::size_t threads : {1, 2}) { + + LOG_DEBUG(<< tests[threads == 1 ? 
0 : 1]); + + auto frame = core::makeMainStorageDataFrame(cols, capacity).first; + for (std::size_t i = 0; i < rows; ++i) { + frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { + for (std::size_t j = 0; j < cols; ++j, ++column) { + *column = x[j][i]; + } + }); + } + frame->finishWritingRows(); + + std::atomic_int totalFractionalProgress{0}; + + auto reportProgress = [&totalFractionalProgress](double fractionalProgress) { + totalFractionalProgress.fetch_add( + static_cast(65536.0 * fractionalProgress + 0.5)); + }; + + std::atomic_bool finished{false}; + + std::thread worker{[&]() { + auto regression = maths::CBoostedTreeFactory::constructFromParameters( + threads, std::make_unique()) + .progressCallback(reportProgress) + .buildFor(*frame, cols - 1); + + regression->train(); + finished.store(true); + }}; + + int lastTotalFractionalProgress{0}; + int lastProgressReport{0}; + + bool monotonic{true}; + std::size_t percentage{0}; + while (finished.load() == false) { + if (totalFractionalProgress.load() > lastProgressReport) { + LOG_DEBUG(<< percentage << "% complete"); + percentage += 10; + lastProgressReport += 6554; + } + monotonic &= (totalFractionalProgress.load() >= lastTotalFractionalProgress); + lastTotalFractionalProgress = totalFractionalProgress.load(); + } + worker.join(); + + CPPUNIT_ASSERT(monotonic); + + core::startDefaultAsyncExecutor(); + } + + core::stopDefaultAsyncExecutor(); +} + void CBoostedTreeTest::testMissingData() { } @@ -769,10 +847,12 @@ CppUnit::Test* CBoostedTreeTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CBoostedTreeTest::testConstantFeatures", &CBoostedTreeTest::testConstantFeatures)); suiteOfTests->addTest(new CppUnit::TestCaller( - "CBoostedTreeTest::testConstantObjective", &CBoostedTreeTest::testConstantObjective)); + "CBoostedTreeTest::testConstantTarget", &CBoostedTreeTest::testConstantTarget)); suiteOfTests->addTest(new CppUnit::TestCaller( "CBoostedTreeTest::testCategoricalRegressors", 
&CBoostedTreeTest::testCategoricalRegressors)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CBoostedTreeTest::testProgressMonitoring", &CBoostedTreeTest::testProgressMonitoring)); suiteOfTests->addTest(new CppUnit::TestCaller( "CBoostedTreeTest::testMissingData", &CBoostedTreeTest::testMissingData)); suiteOfTests->addTest(new CppUnit::TestCaller( diff --git a/lib/maths/unittest/CBoostedTreeTest.h b/lib/maths/unittest/CBoostedTreeTest.h index af920e338e..37a5765de4 100644 --- a/lib/maths/unittest/CBoostedTreeTest.h +++ b/lib/maths/unittest/CBoostedTreeTest.h @@ -16,8 +16,9 @@ class CBoostedTreeTest : public CppUnit::TestFixture { void testNonLinear(); void testThreading(); void testConstantFeatures(); - void testConstantObjective(); + void testConstantTarget(); void testCategoricalRegressors(); + void testProgressMonitoring(); void testMissingData(); // TODO void testFeatureWeights(); // TODO void testNuisanceFeatures();