Skip to content

Commit 56550ee

Browse files
authored
[ML] Move classification and regression loss functions into their own file (#1027)
1 parent 2fcd104 commit 56550ee

13 files changed

+629
-572
lines changed

include/maths/CBoostedTree.h

Lines changed: 6 additions & 232 deletions
Original file line numberDiff line numberDiff line change
@@ -12,255 +12,29 @@
1212
#include <core/CStatePersistInserter.h>
1313
#include <core/CStateRestoreTraverser.h>
1414

15-
#include <maths/CBasicStatistics.h>
1615
#include <maths/CBoostedTreeHyperparameters.h>
1716
#include <maths/CDataFrameCategoryEncoder.h>
1817
#include <maths/CDataFramePredictiveModel.h>
19-
#include <maths/CLinearAlgebra.h>
18+
#include <maths/CLinearAlgebraEigen.h>
2019
#include <maths/ImportExport.h>
2120

2221
#include <cstddef>
22+
#include <cstdint>
2323
#include <memory>
2424
#include <string>
25+
#include <utility>
2526
#include <vector>
2627

2728
namespace ml {
2829
namespace core {
2930
class CPackedBitVector;
3031
}
3132
namespace maths {
32-
class CDataFrameCategoryEncoder;
33-
class CEncodedDataFrameRowRef;
34-
35-
namespace boosted_tree_detail {
36-
//! \brief Abstract interface for the objects which compute the value that
//! minimises a loss.
//!
//! boosted_tree::CArgMinLoss owns one of these through a std::unique_ptr and
//! exposes the same five operations (clone via copy, nextPass, add, merge,
//! value).
//!
//! NOTE(review): the single-argument constructor is not marked explicit.
class MATHS_EXPORT CArgMinLossImpl {
37-
public:
38-
using TDoubleVector = CDenseVector<double>;
39-
using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
40-
41-
public:
42-
//! \param[in] lambda Presumably the regularisation strength which is kept
//! in m_Lambda (the definition is not visible here; TODO confirm).
CArgMinLossImpl(double lambda);
43-
virtual ~CArgMinLossImpl() = default;
44-
45-
//! Get a heap allocated copy with the same dynamic type.
virtual std::unique_ptr<CArgMinLossImpl> clone() const = 0;
46-
//! Start another pass over the data; presumably returns true if another
//! pass is needed before value() can be computed (TODO confirm against the
//! implementations).
virtual bool nextPass() = 0;
47-
//! Update with one \p prediction and \p actual pair; \p weight defaults
//! to 1.0.
virtual void add(const TMemoryMappedFloatVector& prediction,
48-
double actual,
49-
double weight = 1.0) = 0;
50-
//! Combine with the state of \p other.
virtual void merge(const CArgMinLossImpl& other) = 0;
51-
//! Get the minimiser for everything added so far.
virtual TDoubleVector value() const = 0;
52-
53-
protected:
54-
//! Read access to m_Lambda for derived classes (presumably; the definition
//! is not visible here).
double lambda() const;
55-
56-
private:
57-
double m_Lambda;
58-
};
59-
60-
//! \brief Finds the value to add to a set of predictions which minimises the
61-
//! regularized MSE w.r.t. the actual values.
//!
//! All member functions are defined out of line, so only the interface is
//! documented here.
62-
class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {
63-
public:
64-
//! \param[in] lambda Presumably forwarded to CArgMinLossImpl (TODO confirm).
CArgMinMseImpl(double lambda);
65-
std::unique_ptr<CArgMinLossImpl> clone() const override;
66-
bool nextPass() override;
67-
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
68-
void merge(const CArgMinLossImpl& other) override;
69-
TDoubleVector value() const override;
70-
71-
private:
72-
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
73-
74-
private:
75-
//! Presumably the weighted mean of actual minus prediction over the values
//! passed to add() (TODO confirm against the implementation).
TMeanAccumulator m_MeanError;
76-
};
77-
78-
//! \brief Finds the value to add to a set of predicted log-odds which minimises
79-
//! regularised cross entropy loss w.r.t. the actual categories.
//!
//! The inline helpers below partition the interval
//! [m_PredictionMinMax.min(), m_PredictionMinMax.max()] into
//! m_BucketCategoryCounts.size() equal width buckets.
80-
class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
81-
public:
82-
//! \param[in] lambda Presumably forwarded to CArgMinLossImpl (TODO confirm).
CArgMinLogisticImpl(double lambda);
83-
std::unique_ptr<CArgMinLossImpl> clone() const override;
84-
bool nextPass() override;
85-
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
86-
void merge(const CArgMinLossImpl& other) override;
87-
TDoubleVector value() const override;
88-
89-
private:
90-
using TMinMaxAccumulator = CBasicStatistics::CMinMax<double>;
91-
using TDoubleVector2x1 = CVectorNx1<double, 2>;
92-
using TDoubleVector2x1Vec = std::vector<TDoubleVector2x1>;
93-
94-
private:
95-
//! Get the index of the bucket containing \p prediction.
//!
//! The offset of \p prediction from the minimum prediction is divided by
//! the bucket width, truncated to an integer and clamped to the last
//! bucket index.
//!
//! NOTE(review): this assumes \p prediction is not less than the minimum,
//! bucketWidth() is non-zero and there is at least one bucket; otherwise
//! the double to std::size_t conversion is out of range or size() - 1
//! wraps. TODO confirm callers guarantee this.
//! NOTE(review): std::min needs <algorithm>, which is not among the
//! includes visible in this view.
std::size_t bucket(double prediction) const {
96-
double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
97-
return std::min(static_cast<std::size_t>(bucket),
98-
m_BucketCategoryCounts.size() - 1);
99-
}
100-
101-
//! Get the mid-point of \p bucket, i.e. the minimum prediction plus
//! (\p bucket + 0.5) bucket widths.
double bucketCentre(std::size_t bucket) const {
102-
return m_PredictionMinMax.min() +
103-
(static_cast<double>(bucket) + 0.5) * this->bucketWidth();
104-
}
105-
106-
//! Get the width of one bucket: the prediction range divided by the
//! number of buckets.
//!
//! NOTE(review): this divides by zero if m_BucketCategoryCounts is empty.
double bucketWidth() const {
107-
return m_PredictionMinMax.range() /
108-
static_cast<double>(m_BucketCategoryCounts.size());
109-
}
110-
111-
private:
112-
//! Starts at zero; presumably advanced by nextPass() (TODO confirm).
std::size_t m_CurrentPass = 0;
113-
//! Supplies the min() and range() used to define the buckets.
TMinMaxAccumulator m_PredictionMinMax;
114-
//! A 2-vector; presumably the weighted count of each of the two
//! categories (TODO confirm).
TDoubleVector2x1 m_CategoryCounts;
115-
//! One 2-vector per bucket; its size is the number of buckets.
TDoubleVector2x1Vec m_BucketCategoryCounts;
116-
};
117-
}
118-
11933
namespace boosted_tree {
120-
121-
//! \brief Computes the leaf value which minimizes the loss function.
//!
//! DESCRIPTION:\n
//! A copyable and movable handle which owns a
//! boosted_tree_detail::CArgMinLossImpl through a std::unique_ptr. Copying is
//! user-declared (presumably it clones the implementation; the definitions
//! are not visible here) and moving is defaulted. Objects can only be created
//! from an implementation via the private constructor, to which CLoss has
//! access as a friend.
122-
class MATHS_EXPORT CArgMinLoss {
123-
public:
124-
using TDoubleVector = CDenseVector<double>;
125-
using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
126-
127-
public:
128-
CArgMinLoss(const CArgMinLoss& other);
129-
CArgMinLoss(CArgMinLoss&& other) = default;
130-
131-
CArgMinLoss& operator=(const CArgMinLoss& other);
132-
CArgMinLoss& operator=(CArgMinLoss&& other) = default;
133-
134-
//! Start another pass over the predictions and actuals.
135-
//!
136-
//! \return True if we need to perform another pass to compute value().
//!
//! NOTE(review): this is const although CArgMinLossImpl::nextPass is not;
//! it compiles because constness does not propagate through m_Impl.
137-
bool nextPass() const;
138-
139-
//! Update with a point prediction and actual value.
//!
//! \param[in] weight The weight of the example; defaults to 1.0.
140-
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0);
141-
142-
//! Get the minimiser over the predictions and actual values added to both
143-
//! this and \p other.
//!
//! NOTE(review): \p other is taken by non-const reference whereas
//! CArgMinLossImpl::merge takes a const reference; confirm whether \p other
//! is modified.
144-
void merge(CArgMinLoss& other);
145-
146-
//! Returns the value to add to the predictions which minimises the loss
147-
//! with respect to the actuals.
148-
//!
149-
//! Formally, returns \f$x^* = \arg\min_x\{\sum_i{L(p_i + x, a_i)}\}\f$
150-
//! for predictions and actuals \f$p_i\f$ and \f$a_i\f$, respectively.
151-
TDoubleVector value() const;
152-
153-
private:
154-
using TArgMinLossImplUPtr = std::unique_ptr<boosted_tree_detail::CArgMinLossImpl>;
155-
156-
private:
157-
//! Create from \p impl (presumably by cloning it; TODO confirm). Private
//! so that only the friend CLoss can create these objects.
CArgMinLoss(const boosted_tree_detail::CArgMinLossImpl& impl);
158-
159-
private:
160-
//! The object which implements the minimisation.
TArgMinLossImplUPtr m_Impl;
161-
162-
friend class CLoss;
163-
};
164-
165-
//! \brief Defines the interface of the loss functions used by the boosted
//! tree.
//!
//! DESCRIPTION:\n
//! This is implemented by CMse and CBinomialLogistic in this file.
//!
//! NOTE(review): value, gradient and curvature are virtual and have a default
//! weight of 1.0. Default arguments are bound using the static type of the
//! call, so every override must repeat the same default (the two visible
//! implementations do).
//! NOTE(review): TWriter uses std::function, but <functional> is not among
//! the includes visible in this view.
166-
class MATHS_EXPORT CLoss {
167-
public:
168-
using TDoubleVector = CDenseVector<double>;
169-
using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
//! The callback gradient and curvature use to return their results;
//! presumably called with (component index, component value) for each
//! component (TODO confirm against the implementations).
170-
using TWriter = std::function<void(std::size_t, double)>;
171-
172-
public:
173-
virtual ~CLoss() = default;
174-
//! Clone the loss.
175-
virtual std::unique_ptr<CLoss> clone() const = 0;
176-
//! The number of parameters to the loss function.
177-
virtual std::size_t numberParameters() const = 0;
178-
//! The value of the loss function.
179-
virtual double value(const TMemoryMappedFloatVector& prediction,
180-
double actual,
181-
double weight = 1.0) const = 0;
182-
//! The gradient of the loss function.
//!
//! The result is passed to \p writer rather than returned.
183-
virtual void gradient(const TMemoryMappedFloatVector& prediction,
184-
double actual,
185-
TWriter writer,
186-
double weight = 1.0) const = 0;
187-
//! The Hessian of the loss function (flattened).
//!
//! The result is passed to \p writer rather than returned.
188-
virtual void curvature(const TMemoryMappedFloatVector& prediction,
189-
double actual,
190-
TWriter writer,
191-
double weight = 1.0) const = 0;
192-
//! Returns true if the loss curvature is constant.
193-
virtual bool isCurvatureConstant() const = 0;
194-
//! Transforms a prediction from the forest to the target space.
195-
virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0;
196-
//! Get an object which computes the leaf value that minimises loss.
//!
//! \param[in] lambda Passed on to the minimiser; presumably the
//! regularisation strength (TODO confirm).
197-
virtual CArgMinLoss minimizer(double lambda) const = 0;
198-
//! Get the name of the loss function.
199-
virtual const std::string& name() const = 0;
200-
201-
protected:
202-
//! Wrap \p impl in a CArgMinLoss. Derived classes need this because the
//! constructor of CArgMinLoss is private and only CLoss is its friend.
CArgMinLoss makeMinimizer(const boosted_tree_detail::CArgMinLossImpl& impl) const;
203-
};
204-
205-
//! \brief The MSE loss function.
//!
//! All member functions are defined out of line; see CLoss for the contract
//! of each override.
206-
class MATHS_EXPORT CMse final : public CLoss {
207-
public:
208-
//! Presumably the string returned by name() (TODO confirm).
static const std::string NAME;
209-
210-
public:
211-
std::unique_ptr<CLoss> clone() const override;
212-
std::size_t numberParameters() const override;
213-
//! The default \p weight must match CLoss::value.
double value(const TMemoryMappedFloatVector& prediction,
214-
double actual,
215-
double weight = 1.0) const override;
216-
//! The default \p weight must match CLoss::gradient.
void gradient(const TMemoryMappedFloatVector& prediction,
217-
double actual,
218-
TWriter writer,
219-
double weight = 1.0) const override;
220-
//! The default \p weight must match CLoss::curvature.
void curvature(const TMemoryMappedFloatVector& prediction,
221-
double actual,
222-
TWriter writer,
223-
double weight = 1.0) const override;
224-
bool isCurvatureConstant() const override;
225-
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
226-
//! Presumably returns a minimiser backed by
//! boosted_tree_detail::CArgMinMseImpl (TODO confirm).
CArgMinLoss minimizer(double lambda) const override;
227-
const std::string& name() const override;
228-
};
229-
230-
//! \brief Implements loss for binomial logistic regression.
231-
//!
232-
//! DESCRIPTION:\n
233-
//! This targets the cross entropy loss using the tree to predict class log-odds:
234-
//! <pre class="fragment">
235-
//! \f$\displaystyle l_i(p) = -(1 - a_i) \log(1 - S(p)) - a_i \log(S(p))\f$
236-
//! </pre>
237-
//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
238-
//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
//!
//! All member functions are defined out of line; see CLoss for the contract
//! of each override.
239-
class MATHS_EXPORT CBinomialLogistic final : public CLoss {
240-
public:
241-
//! Presumably the string returned by name() (TODO confirm).
static const std::string NAME;
242-
243-
public:
244-
std::unique_ptr<CLoss> clone() const override;
245-
std::size_t numberParameters() const override;
246-
//! The default \p weight must match CLoss::value.
double value(const TMemoryMappedFloatVector& prediction,
247-
double actual,
248-
double weight = 1.0) const override;
249-
//! The default \p weight must match CLoss::gradient.
void gradient(const TMemoryMappedFloatVector& prediction,
250-
double actual,
251-
TWriter writer,
252-
double weight = 1.0) const override;
253-
//! The default \p weight must match CLoss::curvature.
void curvature(const TMemoryMappedFloatVector& prediction,
254-
double actual,
255-
TWriter writer,
256-
double weight = 1.0) const override;
257-
bool isCurvatureConstant() const override;
258-
//! Presumably maps the predicted log-odds to class probabilities using
//! the logistic function (TODO confirm).
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
259-
//! Presumably returns a minimiser backed by
//! boosted_tree_detail::CArgMinLogisticImpl (TODO confirm).
CArgMinLoss minimizer(double lambda) const override;
260-
const std::string& name() const override;
261-
};
34+
class CLoss;
26235
}
263-
36+
class CDataFrameCategoryEncoder;
37+
class CEncodedDataFrameRowRef;
26438
class CBoostedTreeImpl;
26539

26640
//! \brief A node of a regression tree.

0 commit comments

Comments
 (0)