 
 #include <maths/CBasicStatistics.h>
 #include <maths/CDataFrameRegressionModel.h>
+#include <maths/CLinearAlgebra.h>
 #include <maths/ImportExport.h>
 
 #include <cstddef>
 #include <memory>
+#include <string>
+#include <vector>
 
 namespace ml {
 namespace core {
@@ -29,18 +32,29 @@ class CEncodedDataFrameRowRef;
 namespace boosted_tree_detail {
 class MATHS_EXPORT CArgMinLossImpl {
 public:
+    CArgMinLossImpl(double lambda);
     virtual ~CArgMinLossImpl() = default;
 
     virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
+    virtual bool nextPass() = 0;
     virtual void add(double prediction, double actual) = 0;
     virtual void merge(const CArgMinLossImpl& other) = 0;
     virtual double value() const = 0;
+
+protected:
+    double lambda() const;
+
+private:
+    double m_Lambda;
 };
 
-//! \brief Finds the value to add to a set of predictions which minimises the MSE.
+//! \brief Finds the value to add to a set of predictions which minimises the
+//! regularized MSE w.r.t. the actual values.
 class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
 public:
+    CArgMinMseImpl(double lambda);
    std::unique_ptr<CArgMinLossImpl> clone() const override;
+    bool nextPass() override;
    void add(double prediction, double actual) override;
    void merge(const CArgMinLossImpl& other) override;
    double value() const override;
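Here lambda is the leaf weight penalty handed down from CLoss::minimizer(). A minimal sketch of the closed form this implies, assuming the objective is sum_i (a_i - p_i - w)^2 + lambda * w^2 over the candidate leaf value w; the actual implementation lives in the .cc file and is not part of this diff, and the helper name below is illustrative only.

#include <cstddef>

// Sketch only: argmin_w sum_i (a_i - p_i - w)^2 + lambda * w^2 has the closed
// form w* = sum_i (a_i - p_i) / (n + lambda). Assumes the mean error accumulator
// averages (actual - prediction), which this header does not spell out.
double argMinRegularizedMse(double meanError, double count, double lambda) {
    // count * meanError recovers sum_i (a_i - p_i); lambda shrinks the
    // unregularized mean error towards zero.
    return count * meanError / (count + lambda);
}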
@@ -51,6 +65,46 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
 private:
     TMeanAccumulator m_MeanError;
 };
+
+//! \brief Finds the value to add to a set of predicted log-odds which minimises
+//! regularised cross entropy loss w.r.t. the actual categories.
+class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
+public:
+    CArgMinLogisticImpl(double lambda);
+    std::unique_ptr<CArgMinLossImpl> clone() const override;
+    bool nextPass() override;
+    void add(double prediction, double actual) override;
+    void merge(const CArgMinLossImpl& other) override;
+    double value() const override;
+
+private:
+    using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
+    using TSizeVector = CVectorNx1<std::size_t, 2>;
+    using TSizeVectorVec = std::vector<TSizeVector>;
+
+private:
+    std::size_t bucket(double prediction) const {
+        double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
+        return std::min(static_cast<std::size_t>(bucket),
+                        m_BucketCategoryCounts.size() - 1);
+    }
+
+    double bucketCentre(std::size_t bucket) const {
+        return m_PredictionMinMax.min() +
+               (static_cast<double>(bucket) + 0.5) * this->bucketWidth();
+    }
+
+    double bucketWidth() const {
+        return m_PredictionMinMax.range() /
+               static_cast<double>(m_BucketCategoryCounts.size());
+    }
+
+private:
+    std::size_t m_CurrentPass = 0;
+    TMinMaxAccumulator m_PredictionMinMax;
+    TSizeVector m_CategoryCounts;
+    TSizeVectorVec m_BucketCategoryCounts;
+};
 }
 
 namespace boosted_tree {
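The inline helpers above carve the observed prediction range into equal-width buckets. A small worked restatement of that arithmetic, with an assumed bucket count, since this header leaves m_BucketCategoryCounts.size() unspecified:

#include <algorithm>
#include <cstddef>

// Illustrative restatement of bucket()/bucketWidth() above with free
// parameters; min and max stand in for m_PredictionMinMax.
std::size_t bucketOf(double prediction, double min, double max, std::size_t buckets) {
    double width{(max - min) / static_cast<double>(buckets)};
    // Clamp so that prediction == max still lands in the last bucket.
    return std::min(static_cast<std::size_t>((prediction - min) / width), buckets - 1);
}

// For example, with min = -2, max = 2 and 128 buckets the width is 0.03125,
// so a predicted log-odds of 0.5 falls in bucket floor(2.5 / 0.03125) = 80.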
@@ -64,6 +118,11 @@ class MATHS_EXPORT CArgMinLoss {
     CArgMinLoss& operator=(const CArgMinLoss& other);
     CArgMinLoss& operator=(CArgMinLoss&& other) = default;
 
+    //! Start another pass over the predictions and actuals.
+    //!
+    //! \return True if we need to perform another pass to compute value().
+    bool nextPass() const;
+
     //! Update with a point prediction and actual value.
     void add(double prediction, double actual);
 
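Together with add() and value(), nextPass() defines a simple multi-pass protocol: keep sweeping the (prediction, actual) pairs until the minimiser reports it needs no further pass. A sketch of a driver loop under that reading; the Example record and loop are illustrative, and only minimizer(), add(), nextPass() and value() come from this header.

#include <maths/CBoostedTree.h>

#include <vector>

struct Example {
    double prediction;
    double actual;
};

// Sketch of how a caller could drive the minimiser; not taken from ml-cpp.
double minimiseLeafValue(const ml::maths::boosted_tree::CLoss& loss,
                         const std::vector<Example>& examples,
                         double lambda) {
    auto minimizer = loss.minimizer(lambda);
    do {
        for (const auto& example : examples) {
            minimizer.add(example.prediction, example.actual);
        }
        // nextPass() returns true while another sweep over the data is needed,
        // e.g. a first pass to learn the prediction range before bucketing.
    } while (minimizer.nextPass());
    return minimizer.value();
}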
@@ -94,6 +153,8 @@ class MATHS_EXPORT CArgMinLoss {
 class MATHS_EXPORT CLoss {
 public:
     virtual ~CLoss() = default;
+    //! Clone the loss.
+    virtual std::unique_ptr<CLoss> clone() const = 0;
     //! The value of the loss function.
     virtual double value(double prediction, double actual) const = 0;
     //! The slope of the loss function.
@@ -103,7 +164,7 @@ class MATHS_EXPORT CLoss {
     //! Returns true if the loss curvature is constant.
     virtual bool isCurvatureConstant() const = 0;
     //! Get an object which computes the leaf value that minimises loss.
-    virtual CArgMinLoss minimizer() const = 0;
+    virtual CArgMinLoss minimizer(double lambda) const = 0;
     //! Get the name of the loss function
     virtual const std::string& name() const = 0;
 
@@ -114,11 +175,34 @@ class MATHS_EXPORT CLoss {
 //! \brief The MSE loss function.
 class MATHS_EXPORT CMse final : public CLoss {
 public:
+    std::unique_ptr<CLoss> clone() const override;
     double value(double prediction, double actual) const override;
     double gradient(double prediction, double actual) const override;
     double curvature(double prediction, double actual) const override;
     bool isCurvatureConstant() const override;
-    CArgMinLoss minimizer() const override;
+    CArgMinLoss minimizer(double lambda) const override;
+    const std::string& name() const override;
+
+public:
+    static const std::string NAME;
+};
+
+//! \brief Implements loss for binomial logistic regression.
+//!
+//! DESCRIPTION:\n
+//! This targets the cross entropy loss using the tree to predict class log-odds:
+//! <pre class="fragment">
+//! \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
+//! </pre>
+//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
+//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
+class MATHS_EXPORT CLogistic final : public CLoss {
+    std::unique_ptr<CLoss> clone() const override;
+    double value(double prediction, double actual) const override;
+    double gradient(double prediction, double actual) const override;
+    double curvature(double prediction, double actual) const override;
+    bool isCurvatureConstant() const override;
+    CArgMinLoss minimizer(double lambda) const override;
     const std::string& name() const override;
 
 public:
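Since S'(p) = S(p)(1 - S(p)), the loss documented above has gradient S(p) - a_i and curvature S(p)(1 - S(p)), so its curvature varies with the prediction, unlike MSE. A direct, unoptimised transcription of those formulae follows; it is a sketch rather than the ml-cpp implementation, which would also need to guard log and exp against extreme predictions.

#include <cmath>

// Sketch only: the documented cross entropy loss and its first two derivatives.
double logisticFn(double p) {
    return 1.0 / (1.0 + std::exp(-p));
}

double logisticLoss(double prediction, double actual) {
    double s{logisticFn(prediction)};
    return -(1.0 - actual) * std::log(1.0 - s) - actual * std::log(s);
}

double logisticLossGradient(double prediction, double actual) {
    // d/dp l(p) = S(p) - a.
    return logisticFn(prediction) - actual;
}

double logisticLossCurvature(double prediction, double /*actual*/) {
    // d^2/dp^2 l(p) = S(p) (1 - S(p)); depends on p, so it is not constant.
    double s{logisticFn(prediction)};
    return s * (1.0 - s);
}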
@@ -248,6 +332,7 @@ class MATHS_EXPORT CBoostedTreeNode final {
 //! proposed by Reshef for this purpose. See CDataFrameCategoryEncoder for more details.
 class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
 public:
+    using TStrVec = std::vector<std::string>;
     using TRowRef = core::CDataFrame::TRowRef;
     using TLossFunctionUPtr = std::unique_ptr<boosted_tree::CLoss>;
     using TDataFramePtr = core::CDataFrame*;
@@ -285,6 +370,16 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
     //! Get the model produced by training if it has been run.
     const TNodeVecVec& trainedModel() const;
 
+    //! The name of the object holding the best hyperparameters in the state document.
+    static const std::string& bestHyperparametersName();
+
+    //! The name of the object holding the best regularisation hyperparameters in the
+    //! state document.
+    static const std::string& bestRegularizationHyperparametersName();
+
+    //! A list of the names of the best individual hyperparameters in the state document.
+    static TStrVec bestHyperparameterNames();
+
     //! Persist by passing information to \p inserter.
     void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
 