[ML] Logistic regression loss function for boosted tree training #713

Merged: 23 commits, Oct 9, 2019
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -43,6 +43,7 @@ boosted tree training. Hard depth-based regularization is often the strategy of
choice to prevent overfitting for XGBoost. By smoothing we can make better tradeoffs.
Also, the parameters of the penalty function are more suited to optimising with our
Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
* Binomial logistic regression targeting cross entropy. (See {ml-pull}713[#713].)
* Improvements to count and sum anomaly detection for sparse data. This primarily
aims to improve handling of data which are predictably present: detecting when they
are unexpectedly missing. (See {ml-pull}721[#721].)
101 changes: 98 additions & 3 deletions include/maths/CBoostedTree.h
@@ -13,10 +13,13 @@

#include <maths/CBasicStatistics.h>
#include <maths/CDataFrameRegressionModel.h>
#include <maths/CLinearAlgebra.h>
#include <maths/ImportExport.h>

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

namespace ml {
namespace core {
@@ -29,18 +32,29 @@ class CEncodedDataFrameRowRef;
namespace boosted_tree_detail {
class MATHS_EXPORT CArgMinLossImpl {
public:
CArgMinLossImpl(double lambda);
virtual ~CArgMinLossImpl() = default;

virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
virtual bool nextPass() = 0;
virtual void add(double prediction, double actual) = 0;
virtual void merge(const CArgMinLossImpl& other) = 0;
virtual double value() const = 0;

protected:
double lambda() const;

private:
double m_Lambda;
};
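The m_Lambda member is the leaf value penalty threaded through CLoss::minimizer(double lambda) below: each concrete minimiser optimises a regularised objective, i.e. the loss plus (presumably — the headers shown here do not pin down the exact form) a penalty lambda * x^2 on the candidate leaf increment x.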

-//! \brief Finds the value to add to a set of predictions which minimises the MSE.
+//! \brief Finds the value to add to a set of predictions which minimises the
+//! regularized MSE w.r.t. the actual values.
class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
public:
CArgMinMseImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(double prediction, double actual) override;
void merge(const CArgMinLossImpl& other) override;
double value() const override;
@@ -51,6 +65,46 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
private:
TMeanAccumulator m_MeanError;
};
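For MSE a single pass over the data suffices: assuming the penalty takes the form lambda * x^2, minimising sum_i (a_i - p_i - x)^2 + lambda * x^2 over the increment x has the closed form solution x* = n * e / (n + lambda), where e is the mean of the errors a_i - p_i that m_MeanError accumulates. That is why a lone mean accumulator is all the state this minimiser needs.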

//! \brief Finds the value to add to a set of predicted log-odds which minimises
//! regularised cross entropy loss w.r.t. the actual categories.
class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
public:
CArgMinLogisticImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(double prediction, double actual) override;
void merge(const CArgMinLossImpl& other) override;
double value() const override;

private:
using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
using TSizeVector = CVectorNx1<std::size_t, 2>;
using TSizeVectorVec = std::vector<TSizeVector>;

private:
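//! Get the index of the bucket containing \p prediction. Predictions are
//! binned into equal width buckets spanning their observed [min, max]
//! range, with values on the upper boundary clamped into the last bucket.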
std::size_t bucket(double prediction) const {
double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
return std::min(static_cast<std::size_t>(bucket),
m_BucketCategoryCounts.size() - 1);
}

double bucketCentre(std::size_t bucket) const {
return m_PredictionMinMax.min() +
(static_cast<double>(bucket) + 0.5) * this->bucketWidth();
}

double bucketWidth() const {
return m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size());
}

private:
std::size_t m_CurrentPass = 0;
TMinMaxAccumulator m_PredictionMinMax;
TSizeVector m_CategoryCounts;
TSizeVectorVec m_BucketCategoryCounts;
};
}
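Judging by its members, the logistic minimiser needs two passes over the data: the first accumulates m_PredictionMinMax to fix the bucketing, and the second fills m_BucketCategoryCounts with counts of each actual category per prediction bucket, after which value() can search the bucketed cross entropy objective for the best increment. Supporting such multi-pass minimisers is what the nextPass() hook on the interface below exists for.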

namespace boosted_tree {
@@ -64,6 +118,11 @@ class MATHS_EXPORT CArgMinLoss {
CArgMinLoss& operator=(const CArgMinLoss& other);
CArgMinLoss& operator=(CArgMinLoss&& other) = default;

//! Start another pass over the predictions and actuals.
//!
//! \return True if we need to perform another pass to compute value().
bool nextPass() const;

//! Update with a point prediction and actual value.
void add(double prediction, double actual);

@@ -94,6 +153,8 @@
class MATHS_EXPORT CLoss {
public:
virtual ~CLoss() = default;
//! Clone the loss.
virtual std::unique_ptr<CLoss> clone() const = 0;
//! The value of the loss function.
virtual double value(double prediction, double actual) const = 0;
//! The slope of the loss function.
@@ -103,7 +164,7 @@
//! Returns true if the loss curvature is constant.
virtual bool isCurvatureConstant() const = 0;
//! Get an object which computes the leaf value that minimises loss.
-virtual CArgMinLoss minimizer() const = 0;
+virtual CArgMinLoss minimizer(double lambda) const = 0;
//! Get the name of the loss function
virtual const std::string& name() const = 0;

@@ -114,11 +175,34 @@
//! \brief The MSE loss function.
class MATHS_EXPORT CMse final : public CLoss {
public:
std::unique_ptr<CLoss> clone() const override;
double value(double prediction, double actual) const override;
double gradient(double prediction, double actual) const override;
double curvature(double prediction, double actual) const override;
bool isCurvatureConstant() const override;
-CArgMinLoss minimizer() const override;
+CArgMinLoss minimizer(double lambda) const override;
const std::string& name() const override;

public:
static const std::string NAME;
};
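To make the interplay concrete, here is a minimal sketch of how a loss function and its minimiser might be driven. The function leafValue and the vectors standing in for a traversal of the data frame's prediction and target columns are hypothetical, and it assumes CArgMinLoss exposes value() mirroring the impl classes (that part of the class is elided in the hunk above):

#include <maths/CBoostedTree.h>

#include <vector>

using namespace ml::maths;

// Hypothetical driver: compute the value to add to every prediction in a
// leaf, making as many passes over the data as the minimiser requests.
double leafValue(const boosted_tree::CLoss& loss, double lambda,
                 const std::vector<double>& predictions,
                 const std::vector<double>& actuals) {
    boosted_tree::CArgMinLoss minimiser{loss.minimizer(lambda)};
    do {
        for (std::size_t i = 0; i < predictions.size(); ++i) {
            minimiser.add(predictions[i], actuals[i]);
        }
    } while (minimiser.nextPass());
    return minimiser.value();
}

With CMse this loop makes a single pass; with CLogistic it would presumably make the two passes described above.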

//! \brief Implements loss for binomial logistic regression.
//!
//! DESCRIPTION:\n
//! This targets the cross entropy loss using the tree to predict class log-odds:
//! <pre class="fragment">
//! \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
//! </pre>
//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
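//!
//! Differentiating w.r.t. the prediction \f$p\f$ gives gradient
//! \f$S(p) - a_i\f$ and curvature \f$S(p)(1 - S(p))\f$, so the curvature
//! is not constant in the prediction.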
class MATHS_EXPORT CLogistic final : public CLoss {
public:
std::unique_ptr<CLoss> clone() const override;
double value(double prediction, double actual) const override;
double gradient(double prediction, double actual) const override;
double curvature(double prediction, double actual) const override;
bool isCurvatureConstant() const override;
CArgMinLoss minimizer(double lambda) const override;
const std::string& name() const override;

public:
@@ -248,6 +332,7 @@ class MATHS_EXPORT CBoostedTreeNode final {
//! proposed by Reshef for this purpose. See CDataFrameCategoryEncoder for more details.
class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
public:
using TStrVec = std::vector<std::string>;
using TRowRef = core::CDataFrame::TRowRef;
using TLossFunctionUPtr = std::unique_ptr<boosted_tree::CLoss>;
using TDataFramePtr = core::CDataFrame*;
@@ -285,6 +370,16 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
//! Get the model produced by training if it has been run.
const TNodeVecVec& trainedModel() const;

//! The name of the object holding the best hyperparameters in the state document.
static const std::string& bestHyperparametersName();

//! The name of the object holding the best regularisation hyperparameters in the
//! state document.
static const std::string& bestRegularizationHyperparametersName();

//! A list of the names of the best individual hyperparameters in the state document.
static TStrVec bestHyperparameterNames();

//! Persist by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

2 changes: 2 additions & 0 deletions include/maths/CBoostedTreeFactory.h
@@ -177,6 +177,8 @@ class MATHS_EXPORT CBoostedTreeFactory final {
TOptionalDouble m_MinimumFrequencyToOneHotEncode;
TOptionalSize m_BayesianOptimisationRestarts;
bool m_Restored = false;
std::size_t m_NumberThreads;
TLossFunctionUPtr m_Loss;
TBoostedTreeImplUPtr m_TreeImpl;
TVector m_LogDepthPenaltyMultiplierSearchInterval;
TVector m_LogTreeSizePenaltyMultiplierSearchInterval;
11 changes: 11 additions & 0 deletions include/maths/CBoostedTreeImpl.h
@@ -48,6 +48,7 @@ inline std::size_t predictionColumn(std::size_t numberColumns) {
class MATHS_EXPORT CBoostedTreeImpl final {
public:
using TDoubleVec = std::vector<double>;
using TStrVec = std::vector<std::string>;
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TBayesinOptimizationUPtr = std::unique_ptr<maths::CBayesianOptimisation>;
@@ -101,6 +102,16 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! frame with \p numberRows rows and \p numberColumns columns will use.
std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberColumns) const;

//! The name of the object holding the best hyperparameters in the state document.
static const std::string& bestHyperparametersName();

//! The name of the object holding the best regularisation hyperparameters in the
//! state document.
static const std::string& bestRegularizationHyperparametersName();

//! A list of the names of the best individual hyperparameters in the state document.
static TStrVec bestHyperparameterNames();

//! Persist by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

3 changes: 2 additions & 1 deletion include/maths/CTools.h
@@ -678,7 +678,8 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
//! \param[in] width The step width.
//! \param[in] x0 The centre of the step.
//! \param[in] sign Determines whether it's a step up or down.
-static double logisticFunction(double x, double width, double x0 = 0.0, double sign = 1.0) {
+static double
+logisticFunction(double x, double width = 1.0, double x0 = 0.0, double sign = 1.0) {
return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
}
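Note: assuming CTools::sigmoid(p) computes 1 / (1 + 1/p), the composition above evaluates to 1 / (1 + exp(-s * (x - x0) / width)), where s is the sign of the \p sign argument — a smooth unit step of the stated width centred at x0, rising for positive sign and falling for negative.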
