[7.8][ML] Multiclass maximise minimum recall #1133

Merged: 1 commit, Apr 9, 2020
3 changes: 3 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -37,6 +37,9 @@
when CPU is constrained. (See {ml-pull}1109[#1109].)
* Take `training_percent` into account when estimating memory usage for classification and regression.
(See {ml-pull}1111[#1111].)
+ * Support maximizing minimum recall when assigning class labels for multiclass classification.
+ (See {ml-pull}1113[#1113].)
* Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].)
* Adds new `num_matches` and `preferred_to_categories` fields to category output.
(See {ml-pull}1062[#1062])
2 changes: 1 addition & 1 deletion include/maths/CBoostedTreeImpl.h
@@ -112,7 +112,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Get the number of columns training the model will add to the data frame.
static std::size_t numberExtraColumnsForTrain(std::size_t numberLossParameters) {
// We store as follows:
- // 1. The predicted values for the dependent variables
+ // 1. The predicted values for the dependent variable
// 2. The gradient of the loss function
// 3. The upper triangle of the hessian of the loss function
// 4. The example's weight
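As a quick check on the layout listed in that comment: for a loss with m parameters the frame gains m prediction columns, m gradient components, m(m + 1)/2 entries for the upper triangle of the Hessian, and one weight column. A minimal sketch (extraColumnsForTrain here is a hypothetical stand-in, not the library's function):

#include <cstddef>
#include <iostream>

// Column count implied by the four items stored per row.
std::size_t extraColumnsForTrain(std::size_t m) {
    return m                 // 1. predictions for the dependent variable
           + m               // 2. gradient of the loss function
           + m * (m + 1) / 2 // 3. upper triangle of the Hessian
           + 1;              // 4. the example's weight
}

int main() {
    std::cout << extraColumnsForTrain(1) << '\n'; // regression or binary: 4
    std::cout << extraColumnsForTrain(3) << '\n'; // three classes: 13
}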
16 changes: 15 additions & 1 deletion include/maths/CDataFrameUtils.h
@@ -69,7 +69,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
using TRowRef = core::CDataFrame::TRowRef;
using TWeightFunc = std::function<double(const TRowRef&)>;
using TDoubleVector = CDenseVector<double>;
- using TReadPredictionFunc = std::function<TDoubleVector(const TRowRef)>;
+ using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
+ using TReadPredictionFunc = std::function<TMemoryMappedFloatVector(const TRowRef&)>;
using TQuantileSketchVec = std::vector<CQuantileSketch>;
using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;

@@ -408,6 +409,19 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
const core::CPackedBitVector& rowMask,
const TSizeVec& columnMask,
std::size_t numberSamples);
+ static TDoubleVector
+ maximizeMinimumRecallForBinary(std::size_t numberThreads,
+ const core::CDataFrame& frame,
+ const core::CPackedBitVector& rowMask,
+ std::size_t targetColumn,
+ const TReadPredictionFunc& readPrediction);
+ static TDoubleVector
+ maximizeMinimumRecallForMulticlass(std::size_t numberThreads,
+ const core::CDataFrame& frame,
+ const core::CPackedBitVector& rowMask,
+ std::size_t numberClasses,
+ std::size_t targetColumn,
+ const TReadPredictionFunc& readPrediction);
static void removeMetricColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
static void removeCategoricalColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
static double unitWeight(const TRowRef&);
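To make the objective behind these declarations concrete, here is a minimal sketch of the binary case under simplifying assumptions (a plain threshold sweep over precomputed probabilities; the library's implementation works directly on the data frame and differs in detail). It returns the decision threshold whose smaller per-class recall is largest:

#include <algorithm>
#include <cstddef>
#include <vector>

// probabilities[i] is the predicted P(class 1) for row i; labels[i] is 0 or 1.
double maxMinRecallThreshold(const std::vector<double>& probabilities,
                             const std::vector<int>& labels) {
    double bestThreshold{0.5};
    double bestMinRecall{-1.0};
    for (double threshold = 0.05; threshold < 1.0; threshold += 0.05) {
        std::size_t correct[2]{0, 0};
        std::size_t total[2]{0, 0};
        for (std::size_t i = 0; i < labels.size(); ++i) {
            int predicted{probabilities[i] >= threshold ? 1 : 0};
            ++total[labels[i]];
            if (predicted == labels[i]) {
                ++correct[labels[i]];
            }
        }
        double recall0{total[0] > 0 ? static_cast<double>(correct[0]) / total[0] : 1.0};
        double recall1{total[1] > 0 ? static_cast<double>(correct[1]) / total[1] : 1.0};
        double minRecall{std::min(recall0, recall1)};
        if (minRecall > bestMinRecall) {
            bestMinRecall = minRecall;
            bestThreshold = threshold;
        }
    }
    return bestThreshold;
}

The class weights the real functions return play the same role: rescaling the predicted class probabilities before taking the arg-max shifts the decision boundary just as a threshold does.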
24 changes: 22 additions & 2 deletions include/maths/CTools.h
@@ -26,6 +26,7 @@
#include <cstring>
#include <iosfwd>
#include <limits>
+ #include <numeric>
#include <vector>

namespace ml {
@@ -684,7 +685,7 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
}

- //! Compute the softmax from the multinomial logit values \p logit.
+ //! Compute the softmax for the multinomial logit values \p logit.
//!
//! i.e. \f$[\sigma(z)]_i = \frac{exp(z_i)}{\sum_j exp(z_j)}\f$.
//!
@@ -703,10 +704,29 @@
}
}

- //! Specialize the softmax for our dense vector type.
+ //! Compute the log of the softmax for the multinomial logit values \p logit.
+ template<typename COLLECTION>
+ static void inplaceLogSoftmax(COLLECTION& z) {
+ double zmax{*std::max_element(z.begin(), z.end())};
+ for (auto& zi : z) {
+ zi -= zmax;
+ }
+ double logZ{std::log(std::accumulate(
+ z.begin(), z.end(), 0.0,
+ [](double sum, const auto& zi) { return sum + std::exp(zi); }))};
+ for (auto& zi : z) {
+ zi -= logZ;
+ }
+ }

+ //! Specialize the softmax for CDenseVector.
template<typename T>
static void inplaceSoftmax(CDenseVector<T>& z);

+ //! Specialize the log(softmax) for CDenseVector.
+ template<typename SCALAR>
+ static void inplaceLogSoftmax(CDenseVector<SCALAR>& z);

//! Linearly interpolate a function on the interval [\p a, \p b].
static double linearlyInterpolate(double a, double b, double fa, double fb, double x);

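The zmax subtraction in the new inplaceLogSoftmax is the standard log-sum-exp stabilisation: without it, exponentiating large logits overflows to infinity and the result becomes NaN. A self-contained demonstration mirroring the template above:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

void logSoftmax(std::vector<double>& z) {
    // Subtract the maximum so every exponent is <= 0 and cannot overflow.
    double zmax{*std::max_element(z.begin(), z.end())};
    for (auto& zi : z) { zi -= zmax; }
    double logZ{std::log(std::accumulate(
        z.begin(), z.end(), 0.0,
        [](double sum, double zi) { return sum + std::exp(zi); }))};
    for (auto& zi : z) { zi -= logZ; }
}

int main() {
    std::vector<double> z{1000.0, 1001.0, 1002.0}; // naive exp() overflows here
    logSoftmax(z); // finite: roughly {-2.41, -1.41, -0.41}
    for (double zi : z) { std::cout << zi << '\n'; }
}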
10 changes: 10 additions & 0 deletions include/maths/CToolsDetail.h
@@ -11,6 +11,7 @@

#include <maths/CCompositeFunctions.h>
#include <maths/CIntegration.h>
+ #include <maths/CLinearAlgebraEigen.h>
#include <maths/CMixtureDistribution.h>
#include <maths/COrderings.h>
#include <maths/CTools.h>
@@ -308,6 +309,15 @@ void CTools::inplaceSoftmax(CDenseVector<T>& z) {
z.array() = z.array().exp();
z /= z.sum();
}

+ template<typename SCALAR>
+ void CTools::inplaceLogSoftmax(CDenseVector<SCALAR>& z) {
+ // Handle under/overflow when taking exponentials by subtracting zmax.
+ double zmax{z.maxCoeff()};
+ z.array() -= zmax;
+ double Z{z.array().exp().sum()};
+ z.array() -= std::log(Z);
+ }
}
}

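The CDenseVector overload expresses the same computation with Eigen array expressions. An equivalent standalone sketch, assuming plain Eigen::VectorXd in place of CDenseVector (which wraps an Eigen dense vector):

#include <Eigen/Core>
#include <cmath>
#include <iostream>

int main() {
    Eigen::VectorXd z(3);
    z << 1000.0, 1001.0, 1002.0;
    double zmax{z.maxCoeff()};
    z.array() -= zmax;                  // guard against overflow in exp
    double Z{z.array().exp().sum()};
    z.array() -= std::log(Z);           // z now holds log(softmax(z))
    std::cout << z.transpose() << '\n'; // approx -2.41 -1.41 -0.41
}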
15 changes: 9 additions & 6 deletions include/test/CDataFrameAnalyzerTrainingFactory.h
@@ -8,6 +8,7 @@
#define INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h

#include <core/CDataFrame.h>
+ #include <core/CSmallVector.h>

#include <maths/CBoostedTreeFactory.h>
#include <maths/CBoostedTreeLoss.h>
@@ -122,13 +123,11 @@
auto prediction = tree->readAndAdjustPrediction(*row);
switch (type) {
case E_Regression:
- appendPrediction(*frame, weights.size(), prediction[0], expectedPredictions);
+ appendPrediction(*frame, weights.size(), prediction, expectedPredictions);
break;
case E_BinaryClassification:
- appendPrediction(*frame, weights.size(), prediction[1], expectedPredictions);
- break;
case E_MulticlassClassification:
- // TODO.
+ appendPrediction(*frame, weights.size(), prediction, expectedPredictions);
break;
}
}
@@ -149,15 +148,19 @@
TStrVec& targets);

private:
+ using TDouble2Vec = core::CSmallVector<double, 2>;
using TBoolVec = std::vector<bool>;
using TRowItr = core::CDataFrame::TRowItr;

private:
- static void appendPrediction(core::CDataFrame&, std::size_t, double prediction, TDoubleVec& predictions);
+ static void appendPrediction(core::CDataFrame&,
+ std::size_t,
+ const TDouble2Vec& prediction,
+ TDoubleVec& predictions);

static void appendPrediction(core::CDataFrame& frame,
std::size_t target,
- double class1Score,
+ const TDouble2Vec& class1Score,
TStrVec& predictions);
};
}
21 changes: 18 additions & 3 deletions lib/maths/CBoostedTreeImpl.cc
@@ -377,12 +377,16 @@ void CBoostedTreeImpl::initializePerFoldTestLosses() {
}

void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& frame) {

+ using TFloatStorageVec = std::vector<CFloatStorage>;

if (m_Loss->type() == CLoss::E_BinaryClassification ||
m_Loss->type() == CLoss::E_MulticlassClassification) {

std::size_t numberClasses{m_Loss->type() == CLoss::E_BinaryClassification
? 2
: m_Loss->numberParameters()};
+ TFloatStorageVec storage(2);

switch (m_ClassAssignmentObjective) {
case CBoostedTree::E_Accuracy:
@@ -391,9 +395,20 @@
case CBoostedTree::E_MinimumRecall:
m_ClassificationWeights = CDataFrameUtils::maximumMinimumRecallClassWeights(
m_NumberThreads, frame, this->allTrainingRowsMask(),
- numberClasses, m_DependentVariable, [this](const TRowRef& row) {
- return m_Loss->transform(readPrediction(
- row, m_NumberInputColumns, m_Loss->numberParameters()));
+ numberClasses, m_DependentVariable,
+ [storage, numberClasses, this](const TRowRef& row) mutable {
+ if (m_Loss->type() == CLoss::E_BinaryClassification) {
+ // We predict the log-odds but this is expected to return
+ // the log of the predicted class probabilities.
+ TMemoryMappedFloatVector result{&storage[0], 2};
+ result.array() = m_Loss
+ ->transform(readPrediction(
+ row, m_NumberInputColumns, numberClasses))
+ .array()
+ .log();
+ return result;
+ }
+ return readPrediction(row, m_NumberInputColumns, numberClasses);
});
break;
}
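In the binary branch above the model predicts log-odds, while the callback must hand back log class probabilities. A standalone sketch of that conversion (illustrative only; the real code writes the result into the memory-mapped storage vector, and a production version would want a numerically stabler log(1 - p)):

#include <cmath>
#include <utility>

// Map predicted log-odds z to {log P(class 0), log P(class 1)}.
std::pair<double, double> logClassProbabilities(double logOdds) {
    double p1{1.0 / (1.0 + std::exp(-logOdds))}; // logistic function
    return {std::log(1.0 - p1), std::log(p1)};
}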
13 changes: 2 additions & 11 deletions lib/maths/CBoostedTreeLoss.cc
@@ -39,15 +39,6 @@ double logLogistic(double logOdds) {
}
return std::log(CTools::logisticFunction(logOdds));
}

- template<typename SCALAR>
- void inplaceLogSoftmax(CDenseVector<SCALAR>& z) {
- // Handle under/overflow when taking exponentials by subtracting zmax.
- double zmax{z.maxCoeff()};
- z.array() -= zmax;
- double Z{z.array().exp().sum()};
- z.array() -= std::log(Z);
- }
}

namespace boosted_tree_detail {
@@ -332,7 +323,7 @@ CArgMinMultinomialLogisticLossImpl::objective() const {
if (m_Centres.size() == 1) {
return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable {
logProbabilities = m_Centres[0] + weight;
- inplaceLogSoftmax(logProbabilities);
+ CTools::inplaceLogSoftmax(logProbabilities);
return lambda * weight.squaredNorm() - m_ClassCounts.transpose() * logProbabilities;
};
}
@@ -341,7 +332,7 @@
for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
if (m_CentresClassCounts[i].sum() > 0.0) {
logProbabilities = m_Centres[i] + weight;
- inplaceLogSoftmax(logProbabilities);
+ CTools::inplaceLogSoftmax(logProbabilities);
loss -= m_CentresClassCounts[i].transpose() * logProbabilities;
}
}
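Reading the single-centre branch of objective() above, the function being minimised is, in the code's own notation (weight \f$w\f$, centre \f$\bar{z}\f$, class counts \f$c\f$, regulariser \f$\lambda\f$):

\f$J(w) = \lambda \lVert w \rVert^2 - c^{\top} \log(\mathrm{softmax}(\bar{z} + w))\f$

The multi-centre branch sums the second term over all centres with non-zero class counts.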