Skip to content

[ML] Correct logistic loss function #1032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged 5 commits on Feb 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
* Account for the data frame's memory when estimating the peak memory used by classification
and regression model training. (See {ml-pull}996[#996].)

== {es} version 7.6.2

=== Bug Fixes

* Fix a bug in the calculation of the minimum loss leaf values for classification.
(See {ml-pull}1032[#1032].)

== {es} version 7.6.0

=== New Features
Expand Down
6 changes: 4 additions & 2 deletions include/maths/CBoostedTreeLoss.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
}

double bucketWidth() const {
return m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size());
return m_PredictionMinMax.initialized()
? m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size())
: 0.0;
}

private:
Expand Down
4 changes: 3 additions & 1 deletion lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,9 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr
} while (nextPass());

for (std::size_t i = 0; i < tree.size(); ++i) {
tree[i].value(eta * leafValues[i].value());
if (tree[i].isLeaf()) {
tree[i].value(eta * leafValues[i].value());
}
}

LOG_TRACE(<< "tree =\n" << root(tree).print(tree));
Expand Down
17 changes: 12 additions & 5 deletions lib/maths/CBoostedTreeLoss.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,17 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
// case we only need one pass over the data and can compute the optimal
// value from the counts of the two categories.
if (this->bucketWidth() == 0.0) {
objective = [this](double weight) {
// This is the (unique) predicted value for the rows in leaf by the forest
// so far (i.e. without the weight for the leaf we're about to add).
double prediction{m_PredictionMinMax.initialized()
? (m_PredictionMinMax.min() + m_PredictionMinMax.max()) / 2.0
: 0.0};
objective = [prediction, this](double weight) {
double logOdds{prediction + weight};
double c0{m_CategoryCounts(0)};
double c1{m_CategoryCounts(1)};
return this->lambda() * CTools::pow2(weight) -
c0 * logOneMinusLogistic(weight) - c1 * logLogistic(weight);
c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds);
};

// Weight shrinkage means the optimal weight will be somewhere between
Expand All @@ -158,8 +164,8 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
double empiricalProbabilityC1{c1 / (c0 + c1)};
double empiricalLogOddsC1{
std::log(empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))};
minWeight = empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0;
maxWeight = empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1;
minWeight = (empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0) - prediction;
maxWeight = (empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1) - prediction;

} else {
objective = [this](double weight) {
Expand Down Expand Up @@ -200,6 +206,7 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const {
return result;
}
}

namespace boosted_tree {

CArgMinLoss::CArgMinLoss(const CArgMinLoss& other)
Expand Down Expand Up @@ -338,4 +345,4 @@ const std::string& CBinomialLogistic::name() const {
const std::string CBinomialLogistic::NAME{"binomial_logistic"};
}
}
}
}
4 changes: 2 additions & 2 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ BOOST_AUTO_TEST_CASE(testLogisticRegression) {
LOG_DEBUG(<< "log relative error = "
<< maths::CBasicStatistics::mean(logRelativeError));

BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.7);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71);
meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError));
}

Expand Down Expand Up @@ -1307,7 +1307,7 @@ BOOST_AUTO_TEST_CASE(testImbalancedClasses) {
LOG_DEBUG(<< "recalls = " << core::CContainerPrinter::print(recalls));

BOOST_TEST_REQUIRE(std::fabs(precisions[0] - precisions[1]) < 0.1);
BOOST_TEST_REQUIRE(std::fabs(recalls[0] - recalls[1]) < 0.15);
BOOST_TEST_REQUIRE(std::fabs(recalls[0] - recalls[1]) < 0.13);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrain) {
Expand Down