Skip to content

[ML] Correct logistic loss function #1032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged 5 commits on Feb 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
* Account for the data frame's memory when estimating the peak memory used by classification
and regression model training. (See {ml-pull}996[#996].)

== {es} version 7.6.2

=== Bug Fixes

* Fix a bug in the calculation of the minimum loss leaf values for classification.
(See {ml-pull}1032[#1032].)

== {es} version 7.6.0

=== New Features
Expand Down
6 changes: 4 additions & 2 deletions include/maths/CBoostedTreeLoss.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
}

double bucketWidth() const {
return m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size());
return m_PredictionMinMax.initialized()
? m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size())
: 0.0;
}

private:
Expand Down
4 changes: 3 additions & 1 deletion lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,9 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr
} while (nextPass());

for (std::size_t i = 0; i < tree.size(); ++i) {
tree[i].value(eta * leafValues[i].value());
if (tree[i].isLeaf()) {
tree[i].value(eta * leafValues[i].value());
}
}

LOG_TRACE(<< "tree =\n" << root(tree).print(tree));
Expand Down
17 changes: 12 additions & 5 deletions lib/maths/CBoostedTreeLoss.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,17 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
// case we only need one pass over the data and can compute the optimal
// value from the counts of the two categories.
if (this->bucketWidth() == 0.0) {
objective = [this](double weight) {
// This is the (unique) predicted value for the rows in leaf by the forest
// so far (i.e. without the weight for the leaf we're about to add).
double prediction{m_PredictionMinMax.initialized()
? (m_PredictionMinMax.min() + m_PredictionMinMax.max()) / 2.0
: 0.0};
objective = [prediction, this](double weight) {
double logOdds{prediction + weight};
double c0{m_CategoryCounts(0)};
double c1{m_CategoryCounts(1)};
return this->lambda() * CTools::pow2(weight) -
c0 * logOneMinusLogistic(weight) - c1 * logLogistic(weight);
c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds);
};

// Weight shrinkage means the optimal weight will be somewhere between
Expand All @@ -158,8 +164,8 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
double empiricalProbabilityC1{c1 / (c0 + c1)};
double empiricalLogOddsC1{
std::log(empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))};
minWeight = empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0;
maxWeight = empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1;
minWeight = (empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0) - prediction;
maxWeight = (empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1) - prediction;

} else {
objective = [this](double weight) {
Expand Down Expand Up @@ -200,6 +206,7 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
return result;
}
}

namespace boosted_tree {

CArgMinLoss::CArgMinLoss(const CArgMinLoss& other)
Expand Down Expand Up @@ -338,4 +345,4 @@ const std::string& CBinomialLogistic::name() const {
const std::string CBinomialLogistic::NAME{"binomial_logistic"};
}
}
}
}
4 changes: 2 additions & 2 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ BOOST_AUTO_TEST_CASE(testLogisticRegression) {
LOG_DEBUG(<< "log relative error = "
<< maths::CBasicStatistics::mean(logRelativeError));

BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.7);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71);
meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError));
}

Expand Down Expand Up @@ -1307,7 +1307,7 @@ BOOST_AUTO_TEST_CASE(testImbalancedClasses) {
LOG_DEBUG(<< "recalls = " << core::CContainerPrinter::print(recalls));

BOOST_TEST_REQUIRE(std::fabs(precisions[0] - precisions[1]) < 0.1);
BOOST_TEST_REQUIRE(std::fabs(recalls[0] - recalls[1]) < 0.15);
BOOST_TEST_REQUIRE(std::fabs(recalls[0] - recalls[1]) < 0.13);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrain) {
Expand Down