[ML] Logistic regression loss function for boosted tree training #713

Merged: 23 commits, Oct 9, 2019
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -43,6 +43,7 @@ boosted tree training. Hard depth-based regularization is often the strategy of
choice to prevent overfitting for XGBoost. By smoothing we can make better tradeoffs.
Also, the parameters of the penalty function are more suited to optimising with our
Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
* Binomial logistic regression targeting cross entropy. (See {ml-pull}713[#713].)
* Improvements to count and sum anomaly detection for sparse data. This primarily
aims to improve handling of data which are predictably present: detecting when they
are unexpectedly missing. (See {ml-pull}721[#721].)
101 changes: 98 additions & 3 deletions include/maths/CBoostedTree.h
@@ -13,10 +13,13 @@

#include <maths/CBasicStatistics.h>
#include <maths/CDataFrameRegressionModel.h>
#include <maths/CLinearAlgebra.h>
#include <maths/ImportExport.h>

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

namespace ml {
namespace core {
@@ -29,18 +32,29 @@ class CEncodedDataFrameRowRef;
namespace boosted_tree_detail {
class MATHS_EXPORT CArgMinLossImpl {
public:
CArgMinLossImpl(double lambda);
virtual ~CArgMinLossImpl() = default;

virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
virtual bool nextPass() = 0;
virtual void add(double prediction, double actual) = 0;
virtual void merge(const CArgMinLossImpl& other) = 0;
virtual double value() const = 0;

protected:
double lambda() const;

private:
double m_Lambda;
};
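The m_Lambda member is the leaf value penalty threaded through CLoss::minimizer(double lambda) below: each concrete minimiser optimises a regularised objective, i.e. the loss plus (presumably — the headers shown here do not pin down the exact form) a penalty lambda * x^2 on the candidate leaf increment x.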

-//! \brief Finds the value to add to a set of predictions which minimises the MSE.
+//! \brief Finds the value to add to a set of predictions which minimises the
+//! regularized MSE w.r.t. the actual values.
class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
public:
CArgMinMseImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(double prediction, double actual) override;
void merge(const CArgMinLossImpl& other) override;
double value() const override;
@@ -51,6 +65,46 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
private:
TMeanAccumulator m_MeanError;
};
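For MSE a single pass over the data suffices: assuming the penalty takes the form lambda * x^2, minimising sum_i (a_i - p_i - x)^2 + lambda * x^2 over the increment x has the closed form solution x* = n * e / (n + lambda), where e is the mean of the errors a_i - p_i that m_MeanError accumulates. That is why a lone mean accumulator is all the state this minimiser needs.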

//! \brief Finds the value to add to a set of predicted log-odds which minimises
//! regularised cross entropy loss w.r.t. the actual categories.
class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
public:
CArgMinLogisticImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(double prediction, double actual) override;
void merge(const CArgMinLossImpl& other) override;
double value() const override;

private:
using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
using TSizeVector = CVectorNx1<std::size_t, 2>;
using TSizeVectorVec = std::vector<TSizeVector>;

private:
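//! Get the index of the bucket containing \p prediction. Predictions are
//! binned into equal width buckets spanning their observed [min, max]
//! range, with values on the upper boundary clamped into the last bucket.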
std::size_t bucket(double prediction) const {
double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
return std::min(static_cast<std::size_t>(bucket),
m_BucketCategoryCounts.size() - 1);
}

double bucketCentre(std::size_t bucket) const {
return m_PredictionMinMax.min() +
(static_cast<double>(bucket) + 0.5) * this->bucketWidth();
}

double bucketWidth() const {
return m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size());
}

private:
std::size_t m_CurrentPass = 0;
TMinMaxAccumulator m_PredictionMinMax;
TSizeVector m_CategoryCounts;
TSizeVectorVec m_BucketCategoryCounts;
};
}
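Judging by its members, the logistic minimiser needs two passes over the data: the first accumulates m_PredictionMinMax to fix the bucketing, and the second fills m_BucketCategoryCounts with counts of each actual category per prediction bucket, after which value() can search the bucketed cross entropy objective for the best increment. Supporting such multi-pass minimisers is what the nextPass() hook on the interface below exists for.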

namespace boosted_tree {
@@ -64,6 +118,11 @@ class MATHS_EXPORT CArgMinLoss {
CArgMinLoss& operator=(const CArgMinLoss& other);
CArgMinLoss& operator=(CArgMinLoss&& other) = default;

//! Start another pass over the predictions and actuals.
//!
//! \return True if we need to perform another pass to compute value().
bool nextPass() const;

//! Update with a point prediction and actual value.
void add(double prediction, double actual);

@@ -94,6 +153,8 @@
class MATHS_EXPORT CLoss {
public:
virtual ~CLoss() = default;
//! Clone the loss.
virtual std::unique_ptr<CLoss> clone() const = 0;
//! The value of the loss function.
virtual double value(double prediction, double actual) const = 0;
//! The slope of the loss function.
@@ -103,7 +164,7 @@
//! Returns true if the loss curvature is constant.
virtual bool isCurvatureConstant() const = 0;
//! Get an object which computes the leaf value that minimises loss.
-virtual CArgMinLoss minimizer() const = 0;
+virtual CArgMinLoss minimizer(double lambda) const = 0;
//! Get the name of the loss function
virtual const std::string& name() const = 0;

@@ -114,11 +175,34 @@
//! \brief The MSE loss function.
class MATHS_EXPORT CMse final : public CLoss {
public:
std::unique_ptr<CLoss> clone() const override;
double value(double prediction, double actual) const override;
double gradient(double prediction, double actual) const override;
double curvature(double prediction, double actual) const override;
bool isCurvatureConstant() const override;
-CArgMinLoss minimizer() const override;
+CArgMinLoss minimizer(double lambda) const override;
const std::string& name() const override;

public:
static const std::string NAME;
};
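To make the interplay concrete, here is a minimal sketch of how a loss function and its minimiser might be driven. The function leafValue and the vectors standing in for a traversal of the data frame's prediction and target columns are hypothetical, and it assumes CArgMinLoss exposes value() mirroring the impl classes (that part of the class is elided in the hunk above):

#include <maths/CBoostedTree.h>

#include <vector>

using namespace ml::maths;

// Hypothetical driver: compute the value to add to every prediction in a
// leaf, making as many passes over the data as the minimiser requests.
double leafValue(const boosted_tree::CLoss& loss, double lambda,
                 const std::vector<double>& predictions,
                 const std::vector<double>& actuals) {
    boosted_tree::CArgMinLoss minimiser{loss.minimizer(lambda)};
    do {
        for (std::size_t i = 0; i < predictions.size(); ++i) {
            minimiser.add(predictions[i], actuals[i]);
        }
    } while (minimiser.nextPass());
    return minimiser.value();
}

With CMse this loop makes a single pass; with CLogistic it would presumably make the two passes described above.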

//! \brief Implements loss for binomial logistic regression.
//!
//! DESCRIPTION:\n
//! This targets the cross entropy loss using the tree to predict class log-odds:
//! <pre class="fragment">
//! \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
//! </pre>
//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
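//!
//! Differentiating w.r.t. the prediction \f$p\f$ gives gradient
//! \f$S(p) - a_i\f$ and curvature \f$S(p)(1 - S(p))\f$, so the curvature
//! is not constant in the prediction.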
class MATHS_EXPORT CLogistic final : public CLoss {
public:
std::unique_ptr<CLoss> clone() const override;
double value(double prediction, double actual) const override;
double gradient(double prediction, double actual) const override;
double curvature(double prediction, double actual) const override;
bool isCurvatureConstant() const override;
CArgMinLoss minimizer(double lambda) const override;
const std::string& name() const override;

public:
@@ -248,6 +332,7 @@ class MATHS_EXPORT CBoostedTreeNode final {
//! proposed by Reshef for this purpose. See CDataFrameCategoryEncoder for more details.
class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
public:
using TStrVec = std::vector<std::string>;
using TRowRef = core::CDataFrame::TRowRef;
using TLossFunctionUPtr = std::unique_ptr<boosted_tree::CLoss>;
using TDataFramePtr = core::CDataFrame*;
@@ -285,6 +370,16 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
//! Get the model produced by training if it has been run.
const TNodeVecVec& trainedModel() const;

//! The name of the object holding the best hyperparameters in the state document.
static const std::string& bestHyperparametersName();

//! The name of the object holding the best regularisation hyperparameters in the
//! state document.
static const std::string& bestRegularizationHyperparametersName();

//! A list of the names of the best individual hyperparameters in the state document.
static TStrVec bestHyperparameterNames();

//! Persist by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

2 changes: 2 additions & 0 deletions include/maths/CBoostedTreeFactory.h
@@ -177,6 +177,8 @@ class MATHS_EXPORT CBoostedTreeFactory final {
TOptionalDouble m_MinimumFrequencyToOneHotEncode;
TOptionalSize m_BayesianOptimisationRestarts;
bool m_Restored = false;
std::size_t m_NumberThreads;
TLossFunctionUPtr m_Loss;
TBoostedTreeImplUPtr m_TreeImpl;
TVector m_LogDepthPenaltyMultiplierSearchInterval;
TVector m_LogTreeSizePenaltyMultiplierSearchInterval;
11 changes: 11 additions & 0 deletions include/maths/CBoostedTreeImpl.h
@@ -48,6 +48,7 @@ inline std::size_t predictionColumn(std::size_t numberColumns) {
class MATHS_EXPORT CBoostedTreeImpl final {
public:
using TDoubleVec = std::vector<double>;
using TStrVec = std::vector<std::string>;
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TBayesinOptimizationUPtr = std::unique_ptr<maths::CBayesianOptimisation>;
@@ -101,6 +102,16 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! frame with \p numberRows rows and \p numberColumns columns will use.
std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberColumns) const;

//! The name of the object holding the best hyperparameters in the state document.
static const std::string& bestHyperparametersName();

//! The name of the object holding the best regularisation hyperparameters in the
//! state document.
static const std::string& bestRegularizationHyperparametersName();

//! A list of the names of the best individual hyperparameters in the state document.
static TStrVec bestHyperparameterNames();

//! Persist by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

3 changes: 2 additions & 1 deletion include/maths/CTools.h
@@ -678,7 +678,8 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
//! \param[in] width The step width.
//! \param[in] x0 The centre of the step.
//! \param[in] sign Determines whether it's a step up or down.
-static double logisticFunction(double x, double width, double x0 = 0.0, double sign = 1.0) {
+static double
+logisticFunction(double x, double width = 1.0, double x0 = 0.0, double sign = 1.0) {
return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
}
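Note: assuming CTools::sigmoid(p) computes 1 / (1 + 1/p), the composition above evaluates to 1 / (1 + exp(-s * (x - x0) / width)), where s is the sign of the \p sign argument — a smooth unit step of the stated width centred at x0, rising for positive sign and falling for negative.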
