Skip to content

[ML] Multiclass maximise minimum recall #1113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 7, 2020
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
when CPU is constrained. (See {ml-pull}1109[#1109].)
* Take `training_percent` into account when estimating memory usage for classification and regression.
(See {ml-pull}1111[#1111].)
* Support maximising the minimum per-class recall when assigning class labels for multiclass classification.
(See {ml-pull}1113[#1113].)
* Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].)
* Add new `num_matches` and `preferred_to_categories` fields to category output.
(See {ml-pull}1062[#1062].)
Expand Down
2 changes: 1 addition & 1 deletion include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Get the number of columns training the model will add to the data frame.
static std::size_t numberExtraColumnsForTrain(std::size_t numberLossParameters) {
// We store as follows:
// 1. The predicted values for the dependent variables
// 1. The predicted values for the dependent variable
// 2. The gradient of the loss function
// 3. The upper triangle of the hessian of the loss function
// 4. The example's weight
Expand Down
16 changes: 15 additions & 1 deletion include/maths/CDataFrameUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
using TRowRef = core::CDataFrame::TRowRef;
using TWeightFunc = std::function<double(const TRowRef&)>;
using TDoubleVector = CDenseVector<double>;
using TReadPredictionFunc = std::function<TDoubleVector(const TRowRef)>;
using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
using TReadPredictionFunc = std::function<TMemoryMappedFloatVector(const TRowRef&)>;
using TQuantileSketchVec = std::vector<CQuantileSketch>;
using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;

Expand Down Expand Up @@ -408,6 +409,19 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
const core::CPackedBitVector& rowMask,
const TSizeVec& columnMask,
std::size_t numberSamples);
//! Compute class weights for a binary classification problem which
//! maximise the minimum per-class recall.
//!
//! NOTE(review): semantics inferred from the method name and this
//! change's changelog entry ("maximise minimum recall when assigning
//! class labels") - confirm against the definition.
//!
//! \param[in] numberThreads The number of threads available.
//! \param[in] frame The data frame holding the predictions.
//! \param[in] rowMask The rows to use.
//! \param[in] targetColumn The dependent variable column.
//! \param[in] readPrediction Reads the prediction from a row.
static TDoubleVector
maximizeMinimumRecallForBinary(std::size_t numberThreads,
const core::CDataFrame& frame,
const core::CPackedBitVector& rowMask,
std::size_t targetColumn,
const TReadPredictionFunc& readPrediction);
//! Compute class weights for a multiclass classification problem which
//! maximise the minimum per-class recall.
//!
//! NOTE(review): semantics inferred from the method name and this
//! change's changelog entry - confirm against the definition.
//!
//! \param[in] numberThreads The number of threads available.
//! \param[in] frame The data frame holding the predictions.
//! \param[in] rowMask The rows to use.
//! \param[in] numberClasses The number of distinct classes.
//! \param[in] targetColumn The dependent variable column.
//! \param[in] readPrediction Reads the prediction from a row.
static TDoubleVector
maximizeMinimumRecallForMulticlass(std::size_t numberThreads,
const core::CDataFrame& frame,
const core::CPackedBitVector& rowMask,
std::size_t numberClasses,
std::size_t targetColumn,
const TReadPredictionFunc& readPrediction);
static void removeMetricColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
static void removeCategoricalColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
static double unitWeight(const TRowRef&);
Expand Down
24 changes: 22 additions & 2 deletions include/maths/CTools.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cstring>
#include <iosfwd>
#include <limits>
#include <numeric>
#include <vector>

namespace ml {
Expand Down Expand Up @@ -684,7 +685,7 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
}

//! Compute the softmax from the multinomial logit values \p logit.
//! Compute the softmax for the multinomial logit values \p logit.
//!
//! i.e. \f$[\sigma(z)]_i = \frac{exp(z_i)}{\sum_j exp(z_j)}\f$.
//!
Expand All @@ -703,10 +704,29 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
}
}

//! Specialize the softmax for our dense vector type.
//! Compute the log of the softmax for the multinomial logit values \p logit.
template<typename COLLECTION>
static void inplaceLogSoftmax(COLLECTION& z) {
    // An empty collection has no maximum element: dereferencing
    // std::max_element's end iterator would be undefined behaviour,
    // so treat "no components" as a no-op.
    if (z.begin() == z.end()) {
        return;
    }
    // Shift by the maximum component first so no exponential can
    // overflow. The result is unchanged because the softmax is
    // invariant to adding a constant to every component.
    double zmax{*std::max_element(z.begin(), z.end())};
    for (auto& zi : z) {
        zi -= zmax;
    }
    // log(softmax(z))_i = z_i - log(sum_j exp(z_j)).
    double logZ{std::log(std::accumulate(
        z.begin(), z.end(), 0.0,
        [](double sum, const auto& zi) { return sum + std::exp(zi); }))};
    for (auto& zi : z) {
        zi -= logZ;
    }
}

//! Specialize the softmax for CDenseVector.
template<typename T>
static void inplaceSoftmax(CDenseVector<T>& z);

//! Specialize the log(softmax) for CDenseVector.
template<typename SCALAR>
static void inplaceLogSoftmax(CDenseVector<SCALAR>& z);

//! Linearly interpolate a function on the interval [\p a, \p b].
static double linearlyInterpolate(double a, double b, double fa, double fb, double x);

Expand Down
10 changes: 10 additions & 0 deletions include/maths/CToolsDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <maths/CCompositeFunctions.h>
#include <maths/CIntegration.h>
#include <maths/CLinearAlgebraEigen.h>
#include <maths/CMixtureDistribution.h>
#include <maths/COrderings.h>
#include <maths/CTools.h>
Expand Down Expand Up @@ -308,6 +309,15 @@ void CTools::inplaceSoftmax(CDenseVector<T>& z) {
z.array() = z.array().exp();
z /= z.sum();
}

template<typename SCALAR>
void CTools::inplaceLogSoftmax(CDenseVector<SCALAR>& z) {
    // Convert logits to log-probabilities in place, i.e. compute
    // z_i - log(sum_j exp(z_j)) for every component. Centre on the
    // largest coefficient first: every exponential then lies in (0, 1]
    // so none can overflow, and the softmax is invariant to the shift.
    double largest{z.maxCoeff()};
    z.array() -= largest;
    double logNormalizer{std::log(z.array().exp().sum())};
    z.array() -= logNormalizer;
}
}
}

Expand Down
21 changes: 18 additions & 3 deletions lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,12 +377,16 @@ void CBoostedTreeImpl::initializePerFoldTestLosses() {
}

void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& frame) {

using TFloatStorageVec = std::vector<CFloatStorage>;

if (m_Loss->type() == CLoss::E_BinaryClassification ||
m_Loss->type() == CLoss::E_MulticlassClassification) {

std::size_t numberClasses{m_Loss->type() == CLoss::E_BinaryClassification
? 2
: m_Loss->numberParameters()};
TFloatStorageVec storage(2);

switch (m_ClassAssignmentObjective) {
case CBoostedTree::E_Accuracy:
Expand All @@ -391,9 +395,20 @@ void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& fram
case CBoostedTree::E_MinimumRecall:
m_ClassificationWeights = CDataFrameUtils::maximumMinimumRecallClassWeights(
m_NumberThreads, frame, this->allTrainingRowsMask(),
numberClasses, m_DependentVariable, [this](const TRowRef& row) {
return m_Loss->transform(readPrediction(
row, m_NumberInputColumns, m_Loss->numberParameters()));
numberClasses, m_DependentVariable,
[storage, numberClasses, this](const TRowRef& row) mutable {
if (m_Loss->type() == CLoss::E_BinaryClassification) {
// We predict the log-odds but this is expected to return
// the log of the predicted class probabilities.
TMemoryMappedFloatVector result{&storage[0], 2};
result.array() = m_Loss
->transform(readPrediction(
row, m_NumberInputColumns, numberClasses))
.array()
.log();
return result;
}
return readPrediction(row, m_NumberInputColumns, numberClasses);
});
break;
}
Expand Down
13 changes: 2 additions & 11 deletions lib/maths/CBoostedTreeLoss.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,6 @@ double logLogistic(double logOdds) {
}
return std::log(CTools::logisticFunction(logOdds));
}

template<typename SCALAR>
void inplaceLogSoftmax(CDenseVector<SCALAR>& z) {
    // Transform logits into log-probabilities in place. Subtracting the
    // maximum coefficient first prevents overflow in the exponentials
    // (log-softmax is unchanged by adding a constant to each component);
    // subtracting the log partition function then normalises.
    double shift{z.maxCoeff()};
    z.array() -= shift;
    z.array() -= std::log(z.array().exp().sum());
}
}

namespace boosted_tree_detail {
Expand Down Expand Up @@ -332,7 +323,7 @@ CArgMinMultinomialLogisticLossImpl::objective() const {
if (m_Centres.size() == 1) {
return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable {
logProbabilities = m_Centres[0] + weight;
inplaceLogSoftmax(logProbabilities);
CTools::inplaceLogSoftmax(logProbabilities);
return lambda * weight.squaredNorm() - m_ClassCounts.transpose() * logProbabilities;
};
}
Expand All @@ -341,7 +332,7 @@ CArgMinMultinomialLogisticLossImpl::objective() const {
for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
if (m_CentresClassCounts[i].sum() > 0.0) {
logProbabilities = m_Centres[i] + weight;
inplaceLogSoftmax(logProbabilities);
CTools::inplaceLogSoftmax(logProbabilities);
loss -= m_CentresClassCounts[i].transpose() * logProbabilities;
}
}
Expand Down
Loading