 #include <core/CStatePersistInserter.h>
 #include <core/CStateRestoreTraverser.h>
 
-#include <maths/CBasicStatistics.h>
 #include <maths/CBoostedTreeHyperparameters.h>
 #include <maths/CDataFrameCategoryEncoder.h>
 #include <maths/CDataFramePredictiveModel.h>
-#include <maths/CLinearAlgebra.h>
+#include <maths/CLinearAlgebraEigen.h>
 #include <maths/ImportExport.h>
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace ml {
 namespace core {
 class CPackedBitVector;
 }
 namespace maths {
-class CDataFrameCategoryEncoder;
-class CEncodedDataFrameRowRef;
-
-namespace boosted_tree_detail {
-class MATHS_EXPORT CArgMinLossImpl {
-public:
-    using TDoubleVector = CDenseVector<double>;
-    using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
-
-public:
-    CArgMinLossImpl(double lambda);
-    virtual ~CArgMinLossImpl() = default;
-
-    virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
-    virtual bool nextPass() = 0;
-    virtual void add(const TMemoryMappedFloatVector& prediction,
-                     double actual,
-                     double weight = 1.0) = 0;
-    virtual void merge(const CArgMinLossImpl& other) = 0;
-    virtual TDoubleVector value() const = 0;
-
-protected:
-    double lambda() const;
-
-private:
-    double m_Lambda;
-};
-
-//! \brief Finds the value to add to a set of predictions which minimises the
-//! regularized MSE w.r.t. the actual values.
-class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
-public:
-    CArgMinMseImpl(double lambda);
-    std::unique_ptr<CArgMinLossImpl> clone() const override;
-    bool nextPass() override;
-    void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
-    void merge(const CArgMinLossImpl& other) override;
-    TDoubleVector value() const override;
-
-private:
-    using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
-
-private:
-    TMeanAccumulator m_MeanError;
-};
-
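Worth noting why `CArgMinMseImpl` above gets away with a single mean accumulator: the regularised MSE objective its comment describes has a closed-form minimiser. A sketch of the algebra (the derivation is not in the source; \(w_i\), \(p_i\), \(a_i\) and \(\lambda\) are the weights, predictions, actuals and regulariser from the declarations):

```latex
x^* = \arg\min_x \Big\{\sum_i w_i (p_i + x - a_i)^2 + \lambda x^2\Big\}
    = \frac{\sum_i w_i (a_i - p_i)}{\sum_i w_i + \lambda}
```

That is a \(\lambda\)-damped weighted mean of the errors \(a_i - p_i\), which a mean-and-count accumulator such as `m_MeanError` plus the stored `lambda()` can reproduce exactly; presumably this is why its `nextPass()` can finish after a single pass.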
-//! \brief Finds the value to add to a set of predicted log-odds which minimises
-//! regularised cross entropy loss w.r.t. the actual categories.
-class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
-public:
-    CArgMinLogisticImpl(double lambda);
-    std::unique_ptr<CArgMinLossImpl> clone() const override;
-    bool nextPass() override;
-    void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
-    void merge(const CArgMinLossImpl& other) override;
-    TDoubleVector value() const override;
-
-private:
-    using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
-    using TDoubleVector2x1 = CVectorNx1<double, 2>;
-    using TDoubleVector2x1Vec = std::vector<TDoubleVector2x1>;
-
-private:
-    std::size_t bucket(double prediction) const {
-        double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
-        return std::min(static_cast<std::size_t>(bucket),
-                        m_BucketCategoryCounts.size() - 1);
-    }
-
-    double bucketCentre(std::size_t bucket) const {
-        return m_PredictionMinMax.min() +
-               (static_cast<double>(bucket) + 0.5) * this->bucketWidth();
-    }
-
-    double bucketWidth() const {
-        return m_PredictionMinMax.range() /
-               static_cast<double>(m_BucketCategoryCounts.size());
-    }
-
-private:
-    std::size_t m_CurrentPass = 0;
-    TMinMaxAccumulator m_PredictionMinMax;
-    TDoubleVector2x1 m_CategoryCounts;
-    TDoubleVector2x1Vec m_BucketCategoryCounts;
-};
-}
-
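The `CArgMinLogisticImpl` declarations just removed hint at a two-pass scheme: the first pass fills `m_PredictionMinMax`, after which each prediction can be assigned to one of the fixed-width buckets whose per-class counts live in `m_BucketCategoryCounts`. One plausible reading (the minimisation itself is not shown in this diff) is that the second pass lets the minimiser score candidate shifts \(x\) against the bucketed counts, using each bucket centre as a surrogate prediction:

```latex
x^* \approx \arg\min_x \Big\{\sum_b \big[n_{b,0}\, l(c_b + x, 0) + n_{b,1}\, l(c_b + x, 1)\big] + \lambda x^2\Big\}
```

with \(c_b\) given by `bucketCentre(b)` and \(n_{b,0}, n_{b,1}\) the class counts in bucket \(b\).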
 namespace boosted_tree {
-
-//! \brief Computes the leaf value which minimizes the loss function.
-class MATHS_EXPORT CArgMinLoss {
-public:
-    using TDoubleVector = CDenseVector<double>;
-    using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
-
-public:
-    CArgMinLoss(const CArgMinLoss& other);
-    CArgMinLoss(CArgMinLoss&& other) = default;
-
-    CArgMinLoss& operator=(const CArgMinLoss& other);
-    CArgMinLoss& operator=(CArgMinLoss&& other) = default;
-
-    //! Start another pass over the predictions and actuals.
-    //!
-    //! \return True if we need to perform another pass to compute value().
-    bool nextPass() const;
-
-    //! Update with a point prediction and actual value.
-    void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0);
-
-    //! Get the minimiser over the predictions and actual values added to both
-    //! this and \p other.
-    void merge(CArgMinLoss& other);
-
-    //! Returns the value to add to the predictions which minimises the loss
-    //! with respect to the actuals.
-    //!
-    //! Formally, returns \f$x^* = arg\min_x\{\sum_i{L(p_i + x, a_i)}\}\f$
-    //! for predictions and actuals \f$p_i\f$ and \f$a_i\f$, respectively.
-    TDoubleVector value() const;
-
-private:
-    using TArgMinLossImplUPtr = std::unique_ptr<boosted_tree_detail::CArgMinLossImpl>;
-
-private:
-    CArgMinLoss(const boosted_tree_detail::CArgMinLossImpl& impl);
-
-private:
-    TArgMinLossImplUPtr m_Impl;
-
-    friend class CLoss;
-};
-
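The `nextPass()`/`add()`/`value()` doc comments above imply a multi-pass protocol for computing a leaf value. A minimal sketch of how a caller might drive it; `readPrediction`, `readActual` and the `ROWS` range are hypothetical stand-ins, not names from this commit:

```cpp
// Hypothetical driver: repeat full passes over the leaf's rows until the
// minimiser reports it has all the statistics it needs (e.g. the logistic
// arg-min first learns the prediction range, then fills its buckets).
template<typename ROWS>
boosted_tree::CLoss::TDoubleVector
computeLeafValue(const boosted_tree::CLoss& loss, double lambda, const ROWS& rows) {
    boosted_tree::CArgMinLoss minimizer{loss.minimizer(lambda)};
    do {
        for (const auto& row : rows) {
            minimizer.add(readPrediction(row), readActual(row)); // assumed helpers
        }
    } while (minimizer.nextPass()); // true while another pass is required
    return minimizer.value(); // the loss-minimising value to add to predictions
}
```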
-//! \brief Defines the loss function for the regression problem.
-class MATHS_EXPORT CLoss {
-public:
-    using TDoubleVector = CDenseVector<double>;
-    using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
-    using TWriter = std::function<void(std::size_t, double)>;
-
-public:
-    virtual ~CLoss() = default;
-    //! Clone the loss.
-    virtual std::unique_ptr<CLoss> clone() const = 0;
-    //! The number of parameters to the loss function.
-    virtual std::size_t numberParameters() const = 0;
-    //! The value of the loss function.
-    virtual double value(const TMemoryMappedFloatVector& prediction,
-                         double actual,
-                         double weight = 1.0) const = 0;
-    //! The gradient of the loss function.
-    virtual void gradient(const TMemoryMappedFloatVector& prediction,
-                          double actual,
-                          TWriter writer,
-                          double weight = 1.0) const = 0;
-    //! The Hessian of the loss function (flattened).
-    virtual void curvature(const TMemoryMappedFloatVector& prediction,
-                           double actual,
-                           TWriter writer,
-                           double weight = 1.0) const = 0;
-    //! Returns true if the loss curvature is constant.
-    virtual bool isCurvatureConstant() const = 0;
-    //! Transforms a prediction from the forest to the target space.
-    virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0;
-    //! Get an object which computes the leaf value that minimises loss.
-    virtual CArgMinLoss minimizer(double lambda) const = 0;
-    //! Get the name of the loss function
-    virtual const std::string& name() const = 0;
-
-protected:
-    CArgMinLoss makeMinimizer(const boosted_tree_detail::CArgMinLossImpl& impl) const;
-};
-
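One detail of the removed interface worth making concrete: `gradient` and `curvature` hand each component to the `TWriter` callback instead of returning a vector. An illustrative single-parameter squared-error implementation of the three computational virtuals (a sketch for exposition only, assuming the Eigen-style `prediction(0)` element accessor; this is not the code the commit moves):

```cpp
// Sketch: how a one-parameter loss satisfies the TWriter contract.
double value(const TMemoryMappedFloatVector& prediction, double actual,
             double weight = 1.0) const override {
    double error{prediction(0) - actual};
    return weight * error * error; // w (p - a)^2
}
void gradient(const TMemoryMappedFloatVector& prediction, double actual,
              TWriter writer, double weight = 1.0) const override {
    writer(0, 2.0 * weight * (prediction(0) - actual)); // d/dp of w (p - a)^2
}
void curvature(const TMemoryMappedFloatVector& prediction, double actual,
               TWriter writer, double weight = 1.0) const override {
    writer(0, 2.0 * weight); // the second derivative is constant
}
```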
-//! \brief The MSE loss function.
-class MATHS_EXPORT CMse final : public CLoss {
-public:
-    static const std::string NAME;
-
-public:
-    std::unique_ptr<CLoss> clone() const override;
-    std::size_t numberParameters() const override;
-    double value(const TMemoryMappedFloatVector& prediction,
-                 double actual,
-                 double weight = 1.0) const override;
-    void gradient(const TMemoryMappedFloatVector& prediction,
-                  double actual,
-                  TWriter writer,
-                  double weight = 1.0) const override;
-    void curvature(const TMemoryMappedFloatVector& prediction,
-                   double actual,
-                   TWriter writer,
-                   double weight = 1.0) const override;
-    bool isCurvatureConstant() const override;
-    TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
-    CArgMinLoss minimizer(double lambda) const override;
-    const std::string& name() const override;
-};
-
-//! \brief Implements loss for binomial logistic regression.
-//!
-//! DESCRIPTION:\n
-//! This targets the cross entropy loss using the tree to predict class log-odds:
-//! <pre class="fragment">
-//! \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
-//! </pre>
-//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
-//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
-class MATHS_EXPORT CBinomialLogistic final : public CLoss {
-public:
-    static const std::string NAME;
-
-public:
-    std::unique_ptr<CLoss> clone() const override;
-    std::size_t numberParameters() const override;
-    double value(const TMemoryMappedFloatVector& prediction,
-                 double actual,
-                 double weight = 1.0) const override;
-    void gradient(const TMemoryMappedFloatVector& prediction,
-                  double actual,
-                  TWriter writer,
-                  double weight = 1.0) const override;
-    void curvature(const TMemoryMappedFloatVector& prediction,
-                   double actual,
-                   TWriter writer,
-                   double weight = 1.0) const override;
-    bool isCurvatureConstant() const override;
-    TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
-    CArgMinLoss minimizer(double lambda) const override;
-    const std::string& name() const override;
-};
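For reference, the gradient and curvature that `CBinomialLogistic` must supply follow directly from the loss quoted in its class comment. With \(S(p) = 1/(1 + e^{-p})\) and the identity \(S'(p) = S(p)(1 - S(p))\):

```latex
\frac{\partial l_i}{\partial p} = S(p) - a_i, \qquad
\frac{\partial^2 l_i}{\partial p^2} = S(p)\,(1 - S(p))
```

Unlike MSE, the curvature varies with the prediction, which is exactly the distinction `isCurvatureConstant()` exposes.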
+class CLoss;
 }
-
+class CDataFrameCategoryEncoder;
+class CEncodedDataFrameRowRef;
 class CBoostedTreeImpl;
 
 //! \brief A node of a regression tree.
|