diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index ec1892d7e3..b6a61f9ca6 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -37,6 +37,8 @@ when CPU is constrained. (See {ml-pull}1109[#1109].)
 * Take `training_percent` into account when estimating memory usage for classification and regression.
   (See {ml-pull}1111[#1111].)
+* Support maximizing minimum recall when assigning class labels for multiclass classification.
+  (See {ml-pull}1113[#1113].)
 * Adds new `num_matches` and `preferred_to_categories` fields to category output.
   (See {ml-pull}1062[#1062])
 * Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].)
diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h
index 1c3cdca371..8969a743c9 100644
--- a/include/maths/CBoostedTreeImpl.h
+++ b/include/maths/CBoostedTreeImpl.h
@@ -112,7 +112,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     //! Get the number of columns training the model will add to the data frame.
     static std::size_t numberExtraColumnsForTrain(std::size_t numberLossParameters) {
         // We store as follows:
-        // 1. The predicted values for the dependent variables
+        // 1. The predicted values for the dependent variable
         // 2. The gradient of the loss function
         // 3. The upper triangle of the hessian of the loss function
         // 4. The example's weight
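Taken together, the storage list above fixes the number of extra columns as a function of the loss parameter count p: p predictions, p gradient components, p (p + 1) / 2 Hessian upper-triangle entries and one weight. The function body is elided from this hunk, so the following is an illustrative sketch rather than the patch's code:

#include <cstddef>

// 1. prediction: p columns, 2. loss gradient: p columns,
// 3. upper triangle of the loss hessian: p (p + 1) / 2 columns,
// 4. the example's weight: 1 column.
std::size_t numberExtraColumnsForTrain(std::size_t p) {
    return 2 * p + p * (p + 1) / 2 + 1;
}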
diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h
index 636380024c..ab822d34c1 100644
--- a/include/maths/CDataFrameUtils.h
+++ b/include/maths/CDataFrameUtils.h
@@ -69,7 +69,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
     using TRowRef = core::CDataFrame::TRowRef;
     using TWeightFunc = std::function<double(const TRowRef&)>;
     using TDoubleVector = CDenseVector<double>;
-    using TReadPredictionFunc = std::function<TDoubleVector(const TRowRef&)>;
+    using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
+    using TReadPredictionFunc = std::function<TMemoryMappedFloatVector(const TRowRef&)>;
     using TQuantileSketchVec = std::vector<CQuantileSketch>;
     using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
@@ -408,6 +409,19 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
                                           const core::CPackedBitVector& rowMask,
                                           const TSizeVec& columnMask,
                                           std::size_t numberSamples);
+    static TDoubleVector
+    maximizeMinimumRecallForBinary(std::size_t numberThreads,
+                                   const core::CDataFrame& frame,
+                                   const core::CPackedBitVector& rowMask,
+                                   std::size_t targetColumn,
+                                   const TReadPredictionFunc& readPrediction);
+    static TDoubleVector
+    maximizeMinimumRecallForMulticlass(std::size_t numberThreads,
+                                       const core::CDataFrame& frame,
+                                       const core::CPackedBitVector& rowMask,
+                                       std::size_t numberClasses,
+                                       std::size_t targetColumn,
+                                       const TReadPredictionFunc& readPrediction);
     static void removeMetricColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
     static void removeCategoricalColumns(const core::CDataFrame& frame, TSizeVec& columnMask);
     static double unitWeight(const TRowRef&);
diff --git a/include/maths/CTools.h b/include/maths/CTools.h
index cb84754487..0a02d7ceeb 100644
--- a/include/maths/CTools.h
+++ b/include/maths/CTools.h
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <numeric>
 #include

 namespace ml {
@@ -684,7 +685,7 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
         return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width));
     }

-    //! Compute the softmax from the multinomial logit values \p logit.
+    //! Compute the softmax for the multinomial logit values \p logit.
     //!
     //! i.e. \f$[\sigma(z)]_i = \frac{exp(z_i)}{\sum_j exp(z_j)}\f$.
     //!
@@ -703,10 +704,29 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable {
         }
     }

-    //! Specialize the softmax for our dense vector type.
+    //! Compute the log of the softmax for the multinomial logit values \p logit.
+    template<typename COLLECTION>
+    static void inplaceLogSoftmax(COLLECTION& z) {
+        double zmax{*std::max_element(z.begin(), z.end())};
+        for (auto& zi : z) {
+            zi -= zmax;
+        }
+        double logZ{std::log(std::accumulate(
+            z.begin(), z.end(), 0.0,
+            [](double sum, const auto& zi) { return sum + std::exp(zi); }))};
+        for (auto& zi : z) {
+            zi -= logZ;
+        }
+    }
+
+    //! Specialize the softmax for CDenseVector.
     template<typename T>
     static void inplaceSoftmax(CDenseVector<T>& z);

+    //! Specialize the log(softmax) for CDenseVector.
+    template<typename T>
+    static void inplaceLogSoftmax(CDenseVector<T>& z);
+
     //! Linearly interpolate a function on the interval [\p a, \p b].
     static double linearlyInterpolate(double a, double b, double fa, double fb, double x);
diff --git a/include/maths/CToolsDetail.h b/include/maths/CToolsDetail.h
index ade77a745b..ac4e02e8ba 100644
--- a/include/maths/CToolsDetail.h
+++ b/include/maths/CToolsDetail.h
@@ -11,6 +11,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -308,6 +309,15 @@ void CTools::inplaceSoftmax(CDenseVector<T>& z) {
     z.array() = z.array().exp();
     z /= z.sum();
 }
+
+template<typename T>
+void CTools::inplaceLogSoftmax(CDenseVector<T>& z) {
+    // Handle under/overflow when taking exponentials by subtracting zmax.
+    double zmax{z.maxCoeff()};
+    z.array() -= zmax;
+    double Z{z.array().exp().sum()};
+    z.array() -= std::log(Z);
+}
 }
 }
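Both new overloads subtract zmax before exponentiating, which is the standard log-sum-exp stabilisation: it guarantees every exponential lies in (0, 1], so the normaliser can neither overflow nor vanish. A self-contained sketch, using a free function with a hypothetical name that mirrors the collection overload above:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

// log(softmax(z))_i = z_i - zmax - log(sum_j exp(z_j - zmax)). Subtracting
// zmax first keeps every exponential in (0, 1], so the sum cannot overflow.
std::vector<double> logSoftmax(std::vector<double> z) {
    double zmax{*std::max_element(z.begin(), z.end())};
    for (auto& zi : z) {
        zi -= zmax;
    }
    double logZ{std::log(std::accumulate(
        z.begin(), z.end(), 0.0,
        [](double sum, double zi) { return sum + std::exp(zi); }))};
    for (auto& zi : z) {
        zi -= logZ;
    }
    return z;
}

int main() {
    // The naive form log(exp(1000) / (exp(1000) + exp(0))) overflows;
    // the stabilised form returns {0, -1000} exactly.
    for (double li : logSoftmax({1000.0, 0.0})) {
        std::cout << li << '\n';
    }
    return 0;
}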
diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h
index 1f79bfd72a..63bfad67ab 100644
--- a/include/test/CDataFrameAnalyzerTrainingFactory.h
+++ b/include/test/CDataFrameAnalyzerTrainingFactory.h
@@ -8,6 +8,7 @@
 #define INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h

 #include
+#include <core/CSmallVector.h>
 #include

@@ -122,13 +123,11 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory {
                 auto prediction = tree->readAndAdjustPrediction(*row);
                 switch (type) {
                 case E_Regression:
-                    appendPrediction(*frame, weights.size(), prediction[0], expectedPredictions);
+                    appendPrediction(*frame, weights.size(), prediction, expectedPredictions);
                     break;
                 case E_BinaryClassification:
-                    appendPrediction(*frame, weights.size(), prediction[1], expectedPredictions);
-                    break;
                 case E_MulticlassClassification:
-                    // TODO.
+                    appendPrediction(*frame, weights.size(), prediction, expectedPredictions);
                     break;
                 }
             }
@@ -149,15 +148,19 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory {
                                TStrVec& targets);

 private:
+    using TDouble2Vec = core::CSmallVector<double, 2>;
     using TBoolVec = std::vector<bool>;
     using TRowItr = core::CDataFrame::TRowItr;

 private:
-    static void appendPrediction(core::CDataFrame&, std::size_t, double prediction, TDoubleVec& predictions);
+    static void appendPrediction(core::CDataFrame&,
+                                 std::size_t,
+                                 const TDouble2Vec& prediction,
+                                 TDoubleVec& predictions);

     static void appendPrediction(core::CDataFrame& frame,
                                  std::size_t target,
-                                 double class1Score,
+                                 const TDouble2Vec& scores,
                                  TStrVec& predictions);
 };
 }
diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc
index 4f1cb2e251..0e65870281 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -377,12 +377,16 @@ void CBoostedTreeImpl::initializePerFoldTestLosses() {
 }

 void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& frame) {
+
+    using TFloatStorageVec = std::vector<CFloatStorage>;
+
     if (m_Loss->type() == CLoss::E_BinaryClassification ||
         m_Loss->type() == CLoss::E_MulticlassClassification) {

         std::size_t numberClasses{m_Loss->type() == CLoss::E_BinaryClassification
                                       ? 2
                                       : m_Loss->numberParameters()};
+        TFloatStorageVec storage(2);

         switch (m_ClassAssignmentObjective) {
         case CBoostedTree::E_Accuracy:
@@ -391,9 +395,20 @@ void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& fram
         case CBoostedTree::E_MinimumRecall:
             m_ClassificationWeights = CDataFrameUtils::maximumMinimumRecallClassWeights(
                 m_NumberThreads, frame, this->allTrainingRowsMask(),
-                numberClasses, m_DependentVariable, [this](const TRowRef& row) {
-                    return m_Loss->transform(readPrediction(
-                        row, m_NumberInputColumns, m_Loss->numberParameters()));
+                numberClasses, m_DependentVariable,
+                [storage, numberClasses, this](const TRowRef& row) mutable {
+                    if (m_Loss->type() == CLoss::E_BinaryClassification) {
+                        // We predict the log-odds but this is expected to return
+                        // the log of the predicted class probabilities.
+                        TMemoryMappedFloatVector result{&storage[0], 2};
+                        result.array() = m_Loss
+                                             ->transform(readPrediction(
+                                                 row, m_NumberInputColumns, numberClasses))
+                                             .array()
+                                             .log();
+                        return result;
+                    }
+                    return readPrediction(row, m_NumberInputColumns, numberClasses);
                 });
             break;
         }
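The mutable lambda above adapts the binary case to the recall maximiser's contract: the tree predicts the log-odds of class one, `m_Loss->transform` maps that to the two class probabilities, and the element-wise log produces the log probabilities the callee expects. A standalone sketch of that mapping, assuming the logistic transform (illustrative names, not this codebase's API):

#include <cmath>
#include <iostream>

int main() {
    // z is the predicted log-odds of class one; the logistic function recovers
    // P(class 1) and the recall maximiser is handed the elementwise log.
    double z{1.5};
    double p1{1.0 / (1.0 + std::exp(-z))};
    double logP[2]{std::log(1.0 - p1), std::log(p1)};
    std::cout << logP[0] << ' ' << logP[1] << '\n';
    return 0;
}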
diff --git a/lib/maths/CBoostedTreeLoss.cc b/lib/maths/CBoostedTreeLoss.cc
index 61394d5cb7..c43a358144 100644
--- a/lib/maths/CBoostedTreeLoss.cc
+++ b/lib/maths/CBoostedTreeLoss.cc
@@ -39,15 +39,6 @@ double logLogistic(double logOdds) {
     }
     return std::log(CTools::logisticFunction(logOdds));
 }
-
-template<typename T>
-void inplaceLogSoftmax(CDenseVector<T>& z) {
-    // Handle under/overflow when taking exponentials by subtracting zmax.
-    double zmax{z.maxCoeff()};
-    z.array() -= zmax;
-    double Z{z.array().exp().sum()};
-    z.array() -= std::log(Z);
-}
 }

 namespace boosted_tree_detail {
@@ -332,7 +323,7 @@ CArgMinMultinomialLogisticLossImpl::objective() const {
     if (m_Centres.size() == 1) {
         return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable {
             logProbabilities = m_Centres[0] + weight;
-            inplaceLogSoftmax(logProbabilities);
+            CTools::inplaceLogSoftmax(logProbabilities);
             return lambda * weight.squaredNorm() -
                    m_ClassCounts.transpose() * logProbabilities;
         };
     }
@@ -341,7 +332,7 @@ CArgMinMultinomialLogisticLossImpl::objective() const {
         for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
             if (m_CentresClassCounts[i].sum() > 0.0) {
                 logProbabilities = m_Centres[i] + weight;
-                inplaceLogSoftmax(logProbabilities);
+                CTools::inplaceLogSoftmax(logProbabilities);
                 loss -= m_CentresClassCounts[i].transpose() * logProbabilities;
             }
         }
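For reference, the objective these lambdas evaluate is the penalised multinomial negative log-likelihood lambda * |w|^2 - sum_j{c_j * log(softmax(centre + w))_j}, which is why a log-softmax, rather than a softmax followed by a log, is wanted here. A minimal sketch of the single-centre case, with std::vector standing in for the Eigen types (hypothetical names):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

using TDoubleVec = std::vector<double>;

// log(softmax(z)) computed with the same zmax stabilisation as above.
TDoubleVec logSoftmax(TDoubleVec z) {
    double zmax{*std::max_element(z.begin(), z.end())};
    double Z{0.0};
    for (double zi : z) { Z += std::exp(zi - zmax); }
    for (auto& zi : z) { zi -= zmax + std::log(Z); }
    return z;
}

// lambda * |w|^2 - sum_j{count_j * log(softmax(centre + w))_j}, i.e. the
// penalised negative log-likelihood of the observed class counts.
double objective(const TDoubleVec& centre, const TDoubleVec& counts,
                 const TDoubleVec& w, double lambda) {
    TDoubleVec logP(centre);
    for (std::size_t j = 0; j < w.size(); ++j) { logP[j] += w[j]; }
    logP = logSoftmax(logP);
    double loss{0.0};
    for (std::size_t j = 0; j < w.size(); ++j) {
        loss += lambda * w[j] * w[j] - counts[j] * logP[j];
    }
    return loss;
}

int main() {
    // Three classes with counts 5, 3 and 2 and a zero prior centre.
    std::cout << objective({0.0, 0.0, 0.0}, {5.0, 3.0, 2.0}, {0.4, -0.1, -0.3}, 0.1) << '\n';
    return 0;
}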
diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc
index 6d411ee463..ffa157347a 100644
--- a/lib/maths/CDataFrameUtils.cc
+++ b/lib/maths/CDataFrameUtils.cc
@@ -13,16 +13,21 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -117,23 +122,21 @@ class CStratifiedSampler {
     TRowSamplerVec m_Samplers;
     TSamplerSelector m_Selector;
 };
+using TStratifiedSamplerPtr = std::unique_ptr<CStratifiedSampler>;

 //! Get a classifier stratified row sampler for cross fold validation.
-std::pair<std::unique_ptr<CStratifiedSampler>, TDoubleVec>
+std::pair<TStratifiedSamplerPtr, TDoubleVec>
 classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads,
                                               const core::CDataFrame& frame,
                                               std::size_t targetColumn,
                                               CPRNG::CXorOShiro128Plus rng,
-                                              std::size_t numberFolds,
-                                              const core::CPackedBitVector& allTrainingRowsMask) {
+                                              std::size_t desiredCount,
+                                              const core::CPackedBitVector& rowMask) {

     TDoubleVec categoryFrequencies{CDataFrameUtils::categoryFrequencies(
-        numberThreads, frame, allTrainingRowsMask, {targetColumn})[targetColumn]};
+        numberThreads, frame, rowMask, {targetColumn})[targetColumn]};

     TSizeVec categoryCounts;
-    double numberTrainingRows{allTrainingRowsMask.manhattan()};
-    std::size_t desiredCount{
-        (static_cast<std::size_t>(numberTrainingRows) + numberFolds / 2) / numberFolds};
     CSampling::weightedSample(desiredCount, categoryFrequencies, categoryCounts);
     LOG_TRACE(<< "desired category counts per test fold = "
               << core::CContainerPrinter::print(categoryCounts));
@@ -150,17 +153,17 @@ classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads,
 }

 //! Get a regression stratified row sampler for cross fold validation.
-std::unique_ptr<CStratifiedSampler>
+TStratifiedSamplerPtr
 regressionStratifiedCrossValiationRowSampler(std::size_t numberThreads,
                                              const core::CDataFrame& frame,
                                              std::size_t targetColumn,
                                              CPRNG::CXorOShiro128Plus rng,
-                                             std::size_t numberFolds,
+                                             std::size_t desiredCount,
                                              std::size_t numberBuckets,
-                                             const core::CPackedBitVector& allTrainingRowsMask) {
+                                             const core::CPackedBitVector& rowMask) {

     auto quantiles = CDataFrameUtils::columnQuantiles(
-                         numberThreads, frame, allTrainingRowsMask, {targetColumn},
+                         numberThreads, frame, rowMask, {targetColumn},
                          CQuantileSketch{CQuantileSketch::E_Linear, 50})
                          .first;
@@ -198,8 +201,7 @@ regressionStratifiedCrossValiationRowSampler(std::size_t numberThreads,
     };

     TDoubleVec bucketFrequencies;
-    doReduce(frame.readRows(numberThreads, 0, frame.numberRows(),
-                            countBucketRows, &allTrainingRowsMask),
+    doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), countBucketRows, &rowMask),
              copyBucketRowCounts, reduceBucketRowCounts, bucketFrequencies);

     double totalCount{std::accumulate(bucketFrequencies.begin(), bucketFrequencies.end(), 0.0)};
@@ -208,8 +210,6 @@ regressionStratifiedCrossValiationRowSampler(std::size_t numberThreads,
     }

     TSizeVec bucketCounts;
-    std::size_t desiredCount{
-        (static_cast<std::size_t>(totalCount) + numberFolds / 2) / numberFolds};
     CSampling::weightedSample(desiredCount, bucketFrequencies, bucketCounts);
     LOG_TRACE(<< "desired bucket counts per fold = "
               << core::CContainerPrinter::print(bucketCounts));
@@ -495,17 +495,20 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
                                                    std::size_t numberFolds,
                                                    std::size_t numberBuckets,
                                                    const core::CPackedBitVector& allTrainingRowsMask) {
     TDoubleVec frequencies;
-    std::unique_ptr<CStratifiedSampler> sampler;
+    TStratifiedSamplerPtr sampler;
+
+    double numberTrainingRows{allTrainingRowsMask.manhattan()};
+    std::size_t desiredCount{
+        (static_cast<std::size_t>(numberTrainingRows) + numberFolds / 2) / numberFolds};

     if (frame.columnIsCategorical()[targetColumn]) {
         std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler(
-            numberThreads, frame, targetColumn, rng, numberFolds, allTrainingRowsMask);
+            numberThreads, frame, targetColumn, rng, desiredCount, allTrainingRowsMask);
     } else {
         sampler = regressionStratifiedCrossValiationRowSampler(
-            numberThreads, frame, targetColumn, rng, numberFolds, numberBuckets,
-            allTrainingRowsMask);
+            numberThreads, frame, targetColumn, rng, desiredCount,
+            numberBuckets, allTrainingRowsMask);
     }

     LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan());
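The hoisted desiredCount expression relies on the usual integer rounding identity: dividing n + k / 2 by k in integer arithmetic yields n / k rounded to nearest, so each fold is asked for roughly one numberFolds-th of the training rows. A small illustrative check:

#include <cstddef>
#include <iostream>

int main() {
    // (n + k / 2) / k in integer arithmetic is n / k rounded to nearest.
    std::size_t numberTrainingRows{1003};
    std::size_t numberFolds{5};
    std::size_t desiredCount{(numberTrainingRows + numberFolds / 2) / numberFolds};
    std::cout << desiredCount << '\n'; // 201, i.e. round(1003 / 5.0)
    return 0;
}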
@@ -586,13 +589,9 @@ CDataFrameUtils::categoryFrequencies(std::size_t numberThreads,

     TDoubleVecVec result;
     try {
-        if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(),
-                                    readCategoryCounts, &rowMask),
-                     copyCategoryCounts, reduceCategoryCounts, result) == false) {
-            HANDLE_FATAL(<< "Internal error: failed to calculate category"
-                         << " frequencies. Please report this problem.")
-            return result;
-        }
+        doReduce(frame.readRows(numberThreads, 0, frame.numberRows(),
+                                readCategoryCounts, &rowMask),
+                 copyCategoryCounts, reduceCategoryCounts, result);
     } catch (const std::exception& e) {
         HANDLE_FATAL(<< "Internal error: '" << e.what() << "' exception calculating"
                      << " category frequencies. Please report this problem.")
@@ -653,12 +652,8 @@ CDataFrameUtils::meanValueOfTargetForCategories(const CColumnValue& target,

     TMeanAccumulatorVecVec means;
     try {
-        if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readColumnMeans, &rowMask),
-                     copyColumnMeans, reduceColumnMeans, means) == false) {
-            HANDLE_FATAL(<< "Internal error: failed to calculate mean target values"
-                         << " for categories. Please report this problem.")
-            return result;
-        }
+        doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readColumnMeans, &rowMask),
+                 copyColumnMeans, reduceColumnMeans, means);
     } catch (const std::exception& e) {
         HANDLE_FATAL(<< "Internal error: '" << e.what() << "' exception calculating"
                      << " mean target values for categories. Please report this problem.")
@@ -741,58 +736,11 @@ CDataFrameUtils::maximumMinimumRecallClassWeights(std::size_t numberThreads,
                                                   std::size_t targetColumn,
                                                   const TReadPredictionFunc& readPrediction) {
-    if (numberClasses == 2) {
-        auto readQuantiles = core::bindRetrievableState(
-            [&](TQuantileSketchVec& quantiles, TRowItr beginRows, TRowItr endRows) {
-                for (auto row = beginRows; row != endRows; ++row) {
-                    if (isMissing((*row)[targetColumn]) == false) {
-                        quantiles[static_cast<std::size_t>((*row)[targetColumn])]
-                            .add(readPrediction(*row)(1));
-                    }
-                }
-            },
-            TQuantileSketchVec(2, CQuantileSketch{CQuantileSketch::E_Linear, 100}));
-        auto copyQuantiles = [](TQuantileSketchVec quantiles, TQuantileSketchVec& result) {
-            result = std::move(quantiles);
-        };
-        auto reduceQuantiles = [&](TQuantileSketchVec quantiles, TQuantileSketchVec& result) {
-            for (std::size_t i = 0; i < 2; ++i) {
-                result[i] += quantiles[i];
-            }
-        };
-
-        TQuantileSketchVec classProbabilityClassOneQuantiles;
-        if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readQuantiles, &rowMask),
-                     copyQuantiles, reduceQuantiles,
-                     classProbabilityClassOneQuantiles) == false) {
-            HANDLE_FATAL(<< "Failed to compute category quantiles")
-            return TDoubleVector::Ones(2);
-        }
-
-        auto minRecall = [&](double threshold) {
-            double cdf[2];
-            classProbabilityClassOneQuantiles[0].cdf(threshold, cdf[0]);
-            classProbabilityClassOneQuantiles[1].cdf(threshold, cdf[1]);
-            double recalls[]{cdf[0], 1.0 - cdf[1]};
-            return std::min(recalls[0], recalls[1]);
-        };
-
-        double threshold;
-        double minRecallAtThreshold;
-        std::size_t maxIterations{20};
-        CSolvers::maximize(0.0, 1.0, minRecall(0.0), minRecall(1.0), minRecall,
-                           1e-3, maxIterations, threshold, minRecallAtThreshold);
-        LOG_TRACE(<< "threshold = " << threshold
-                  << ", min recall at threshold = " << minRecallAtThreshold);
-
-        TDoubleVector result{2};
-        result(0) = 0.5 / (1.0 - threshold);
-        result(1) = 0.5 / threshold;
-        return result;
-    }
-
-    // TODO fixme
-    return TDoubleVector::Ones(numberClasses);
+    return numberClasses == 2
+               ? maximizeMinimumRecallForBinary(numberThreads, frame, rowMask,
+                                                targetColumn, readPrediction)
+               : maximizeMinimumRecallForMulticlass(numberThreads, frame, rowMask, numberClasses,
+                                                    targetColumn, readPrediction);
 }

 bool CDataFrameUtils::isMissing(double x) {
@@ -1049,6 +997,250 @@ CDataFrameUtils::metricMicWithColumnDataFrameOnDisk(const CColumnValue& target,
     return mics;
 }

+CDataFrameUtils::TDoubleVector
+CDataFrameUtils::maximizeMinimumRecallForBinary(std::size_t numberThreads,
+                                                const core::CDataFrame& frame,
+                                                const core::CPackedBitVector& rowMask,
+                                                std::size_t targetColumn,
+                                                const TReadPredictionFunc& readPrediction) {
+    TDoubleVector probabilities;
+    auto readQuantiles = core::bindRetrievableState(
+        [=](TQuantileSketchVec& quantiles, TRowItr beginRows, TRowItr endRows) mutable {
+            for (auto row = beginRows; row != endRows; ++row) {
+                if (isMissing((*row)[targetColumn]) == false) {
+                    std::size_t actualClass{static_cast<std::size_t>((*row)[targetColumn])};
+                    probabilities = readPrediction(*row);
+                    CTools::inplaceSoftmax(probabilities);
+                    quantiles[actualClass].add(probabilities(1));
+                }
+            }
+        },
+        TQuantileSketchVec(2, CQuantileSketch{CQuantileSketch::E_Linear, 100}));
+    auto copyQuantiles = [](TQuantileSketchVec quantiles, TQuantileSketchVec& result) {
+        result = std::move(quantiles);
+    };
+    auto reduceQuantiles = [&](TQuantileSketchVec quantiles, TQuantileSketchVec& result) {
+        for (std::size_t i = 0; i < 2; ++i) {
+            result[i] += quantiles[i];
+        }
+    };
+
+    TQuantileSketchVec classProbabilityClassOneQuantiles;
+    if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readQuantiles, &rowMask),
+                 copyQuantiles, reduceQuantiles, classProbabilityClassOneQuantiles) == false) {
+        HANDLE_FATAL(<< "Failed to compute category quantiles")
+        return TDoubleVector::Ones(2);
+    }
+
+    auto minRecall = [&](double threshold) {
+        double cdf[2];
+        classProbabilityClassOneQuantiles[0].cdf(threshold, cdf[0]);
+        classProbabilityClassOneQuantiles[1].cdf(threshold, cdf[1]);
+        double recalls[]{cdf[0], 1.0 - cdf[1]};
+        return std::min(recalls[0], recalls[1]);
+    };
+
+    double threshold;
+    double minRecallAtThreshold;
+    std::size_t maxIterations{20};
+    CSolvers::maximize(0.0, 1.0, minRecall(0.0), minRecall(1.0), minRecall,
+                       1e-3, maxIterations, threshold, minRecallAtThreshold);
+    LOG_TRACE(<< "threshold = " << threshold
+              << ", min recall at threshold = " << minRecallAtThreshold);
+
+    TDoubleVector result{2};
+    result(0) = threshold < 0.5 ? threshold / (1.0 - threshold) : 1.0;
+    result(1) = threshold < 0.5 ? 1.0 : (1.0 - threshold) / threshold;
+    return result;
+}
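The returned weights encode the recall-maximising threshold as a weighted argmax: for threshold < 0.5 they are (threshold / (1 - threshold), 1), and the decision w1 * p1 > w0 * (1 - p1) reduces to p1 > threshold because x / (1 - x) is increasing on (0, 1); the threshold >= 0.5 branch is symmetric. A small sketch of that equivalence (illustrative names):

#include <iostream>

// Decide the class from the class-one probability p1, once directly via the
// threshold and once via the weighted argmax the patch produces.
int thresholdDecision(double p1, double threshold) {
    return p1 > threshold ? 1 : 0;
}

int weightedDecision(double p1, double threshold) {
    // Weights as returned above for threshold < 0.5.
    double w0{threshold / (1.0 - threshold)};
    double w1{1.0};
    // w1 * p1 > w0 * (1 - p1)  <=>  p1 / (1 - p1) > threshold / (1 - threshold)
    //                          <=>  p1 > threshold.
    return w1 * p1 > w0 * (1.0 - p1) ? 1 : 0;
}

int main() {
    double threshold{0.3};
    for (double p1 : {0.1, 0.29, 0.31, 0.9}) {
        std::cout << p1 << ": " << thresholdDecision(p1, threshold) << ' '
                  << weightedDecision(p1, threshold) << '\n'; // always agree
    }
    return 0;
}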
+
+CDataFrameUtils::TDoubleVector
+CDataFrameUtils::maximizeMinimumRecallForMulticlass(std::size_t numberThreads,
+                                                    const core::CDataFrame& frame,
+                                                    const core::CPackedBitVector& rowMask,
+                                                    std::size_t numberClasses,
+                                                    std::size_t targetColumn,
+                                                    const TReadPredictionFunc& readPrediction) {
+
+    // Use a large random sample of the data frame and compute the expected
+    // optimisation objective for the whole data set from this.
+
+    using TDoubleMatrix = CDenseMatrix<double>;
+    using TMinAccumulator = CBasicStatistics::SMin<double>::TAccumulator;
+
+    CPRNG::CXorOShiro128Plus rng;
+    std::size_t numberSamples{std::min(std::size_t{1000}, rowMask.size())};
+
+    TStratifiedSamplerPtr sampler;
+    std::tie(sampler, std::ignore) = classifierStratifiedCrossValidationRowSampler(
+        numberThreads, frame, targetColumn, rng, numberSamples, rowMask);
+
+    TSizeVec rowIndices;
+    frame.readRows(1, 0, frame.numberRows(),
+                   [&](TRowItr beginRows, TRowItr endRows) {
+                       for (auto row = beginRows; row != endRows; ++row) {
+                           sampler->sample(*row);
+                       }
+                   },
+                   &rowMask);
+    sampler->finishSampling(rng, rowIndices);
+    std::sort(rowIndices.begin(), rowIndices.end());
+    core::CPackedBitVector sampleMask;
+    for (auto row : rowIndices) {
+        sampleMask.extend(false, row - sampleMask.size());
+        sampleMask.extend(true);
+    }
+    sampleMask.extend(false, rowMask.size() - sampleMask.size());
+    LOG_TRACE(<< "# row indices = " << rowIndices.size());
+
+    // Compute the count of each class in the sample set.
+    auto readClassCountsAndRecalls = core::bindRetrievableState(
+        [&](TDoubleVector& state, TRowItr beginRows, TRowItr endRows) {
+            for (auto row = beginRows; row != endRows; ++row) {
+                if (isMissing((*row)[targetColumn]) == false) {
+                    int j{static_cast<int>((*row)[targetColumn])};
+                    int k;
+                    readPrediction(*row).maxCoeff(&k);
+                    state(j) += 1.0;
+                    state(numberClasses + j) += j == k ? 1.0 : 0.0;
+                }
+            }
+        },
+        TDoubleVector{TDoubleVector::Zero(2 * numberClasses)});
+    auto copyClassCountsAndRecalls = [](TDoubleVector state, TDoubleVector& result) {
+        result = std::move(state);
+    };
+    auto reduceClassCountsAndRecalls =
+        [&](TDoubleVector state, TDoubleVector& result) { result += state; };
+    TDoubleVector classCountsAndRecalls;
+    doReduce(frame.readRows(numberThreads, 0, frame.numberRows(),
+                            readClassCountsAndRecalls, &sampleMask),
+             copyClassCountsAndRecalls, reduceClassCountsAndRecalls, classCountsAndRecalls);
+    TDoubleVector classCounts{classCountsAndRecalls.topRows(numberClasses)};
+    TDoubleVector classRecalls{classCountsAndRecalls.bottomRows(numberClasses)};
+    classRecalls.array() /= classCounts.array();
+    LOG_TRACE(<< "class counts = " << classCounts.transpose());
+    LOG_TRACE(<< "class recalls = " << classRecalls.transpose());
+
+    TSizeVec recallOrder(numberClasses);
+    std::iota(recallOrder.begin(), recallOrder.end(), 0);
+    std::sort(recallOrder.begin(), recallOrder.end(), [&](std::size_t lhs, std::size_t rhs) {
+        return classRecalls(lhs) > classRecalls(rhs);
+    });
+    LOG_TRACE(<< "decreasing recall order = " << core::CContainerPrinter::print(recallOrder));
+
+    // We want to solve max_w{min_j{recall(class_j)}} = max_w{min_j{c_j(w) / n_j}}
+    // where c_j(w) and n_j are the correct predictions for weight w and the count
+    // of class_j in the sample set, respectively. We use the equivalent formulation
+    //
+    //   min_w{max_j{f_j(w)}} = min_w{max_j{1 - c_j(w) / n_j}}
+    //
+    // and we can write f_j(w) as
+    //
+    //   f_j(w) = sum_i{1 - 1{argmax_k(w_k p_k(i)) == j}} / n_j                (1)
+    //
+    // where the sum runs over the rows of class j, p(i) denotes the predicted
+    // class probabilities for row i and 1{.} denotes the indicator function.
+    // (1) has a smooth relaxation given by
+    //
+    //   f_j(w) = sum_i{1 - softmax_j(w_1 p_1(i), ..., w_n p_n(i))} / n_j
+    //
+    // Note that this isn't convex so we use multiple restarts.
+
+    auto objective = [&](const TDoubleVector& weights) {
+        TDoubleVector probabilities;
+        TDoubleVector scores;
+        auto computeObjective = core::bindRetrievableState(
+            [=](TDoubleVector& state, TRowItr beginRows, TRowItr endRows) mutable {
+                for (auto row = beginRows; row != endRows; ++row) {
+                    if (isMissing((*row)[targetColumn]) == false) {
+                        std::size_t j{static_cast<std::size_t>((*row)[targetColumn])};
+                        probabilities = readPrediction(*row);
+                        CTools::inplaceSoftmax(probabilities);
+                        scores = probabilities.cwiseProduct(weights);
+                        CTools::inplaceSoftmax(scores);
+                        state(j) += (1.0 - scores(j)) / classCounts(j);
+                    }
+                }
+            },
+            TDoubleVector{TDoubleVector::Zero(numberClasses)});
+        auto copyObjective = [](TDoubleVector state, TDoubleVector& result) {
+            result = std::move(state);
+        };
+        auto reduceObjective = [&](TDoubleVector state, TDoubleVector& result) {
+            result += state;
+        };
+
+        TDoubleVector objective_;
+        doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), computeObjective, &rowMask),
+                 copyObjective, reduceObjective, objective_);
+        return objective_.maxCoeff();
+    };
+
+    auto objectiveGradient = [&](const TDoubleVector& weights) {
+        TDoubleVector probabilities;
+        TDoubleVector scores;
+        auto computeObjectiveAndGradient = core::bindRetrievableState(
+            [=](TDoubleMatrix& state, TRowItr beginRows, TRowItr endRows) mutable {
+                for (auto row = beginRows; row != endRows; ++row) {
+                    if (isMissing((*row)[targetColumn]) == false) {
+                        std::size_t j{static_cast<std::size_t>((*row)[targetColumn])};
+                        probabilities = readPrediction(*row);
+                        CTools::inplaceSoftmax(probabilities);
+                        scores = probabilities.cwiseProduct(weights);
+                        CTools::inplaceSoftmax(scores);
+                        state(j, 0) += (1.0 - scores(j)) / classCounts(j);
+                        state.col(j + 1) +=
+                            scores(j) *
+                            probabilities
+                                .cwiseProduct(scores - TDoubleVector::Unit(numberClasses, j))
+                                .cwiseQuotient(classCounts);
+                    }
+                }
+            },
+            TDoubleMatrix{TDoubleMatrix::Zero(numberClasses, numberClasses + 1)});
+        auto copyObjectiveAndGradient = [](TDoubleMatrix state, TDoubleMatrix& result) {
+            result = std::move(state);
+        };
+        auto reduceObjectiveAndGradient =
+            [&](TDoubleMatrix state, TDoubleMatrix& result) { result += state; };
+
+        TDoubleMatrix objectiveAndGradient;
+        doReduce(frame.readRows(numberThreads, 0, frame.numberRows(),
+                                computeObjectiveAndGradient, &rowMask),
+                 copyObjectiveAndGradient, reduceObjectiveAndGradient, objectiveAndGradient);
+        std::size_t max;
+        objectiveAndGradient.col(0).maxCoeff(&max);
+        return TDoubleVector{objectiveAndGradient.col(max + 1)};
+    };
+
+    // We always try initialising with all weights equal because our minimization
+    // algorithm ensures we'll only decrease the initial value of the optimisation
+    // objective. This means we'll never do worse than not reweighting. Also, we
+    // expect weights to be (roughly) monotonic increasing for decreasing recall
+    // and use this to bias initial values.
+
+    TMinAccumulator minLoss;
+    TDoubleVector minWeights;
+    TDoubleVector w0{TDoubleVector::Ones(numberClasses)};
+    for (std::size_t i = 0; i < 5; ++i) {
+        CLbfgs<TDoubleVector> lbfgs{5};
+        double loss;
+        std::tie(w0, loss) = lbfgs.minimize(objective, objectiveGradient, std::move(w0));
+        LOG_TRACE(<< "weights* = " << w0.transpose() << ", loss* = " << loss);
+        if (minLoss.add(loss)) {
+            minWeights = std::move(w0);
+        }
+        w0 = TDoubleVector::Ones(numberClasses);
+        for (std::size_t j = 1; j < numberClasses; ++j) {
+            w0(j) = w0(j - 1) * CSampling::uniformSample(rng, 0.9, 1.3);
+        }
+    }
+
+    // Since we take argmax_i w_i p_i we can multiply by a constant. We arrange
+    // for the largest weight to always be one.
+    minWeights.array() /= minWeights.maxCoeff();
+    LOG_TRACE(<< "weights = " << minWeights.transpose());
+
+    return minWeights;
+}
+
 void CDataFrameUtils::removeMetricColumns(const core::CDataFrame& frame, TSizeVec& columnMask) {
     const auto& columnIsCategorical = frame.columnIsCategorical();
     columnMask.erase(std::remove_if(columnMask.begin(), columnMask.end(),
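The per-row gradient accumulated in objectiveGradient follows from differentiating the smooth relaxation: for a row of class j, d(1 - softmax_j(w . p)) / dw_k = softmax_j * p_k * (softmax_k - 1{k == j}), which is the cwiseProduct expression above up to the per-class count normalisation. A self-contained finite-difference check of that formula, with std::vector standing in for the Eigen types (illustrative names):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

using TDoubleVec = std::vector<double>;

// softmax of the weighted probabilities w[k] * p[k].
TDoubleVec weightedSoftmax(const TDoubleVec& p, const TDoubleVec& w) {
    TDoubleVec s(p.size());
    double Z{0.0};
    for (std::size_t k = 0; k < p.size(); ++k) {
        s[k] = std::exp(w[k] * p[k]);
        Z += s[k];
    }
    for (auto& sk : s) { sk /= Z; }
    return s;
}

// Smooth relaxation of "a row of class j is misclassified".
double relaxedError(const TDoubleVec& p, const TDoubleVec& w, std::size_t j) {
    return 1.0 - weightedSoftmax(p, w)[j];
}

int main() {
    TDoubleVec p{0.2, 0.5, 0.3}; // predicted class probabilities for one row
    TDoubleVec w{1.0, 0.8, 1.2}; // candidate class weights
    std::size_t j{0};            // the row's actual class

    // Analytic gradient: d(1 - s_j)/dw_k = s_j * p_k * (s_k - 1{k == j}).
    TDoubleVec s{weightedSoftmax(p, w)};
    for (std::size_t k = 0; k < p.size(); ++k) {
        double analytic{s[j] * p[k] * (s[k] - (k == j ? 1.0 : 0.0))};
        TDoubleVec wPlus{w};
        wPlus[k] += 1e-6;
        double numeric{(relaxedError(p, wPlus, j) - relaxedError(p, w, j)) / 1e-6};
        std::cout << analytic << " ~ " << numeric << '\n';
    }
    return 0;
}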
diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc
index fab188bd16..19712ea302 100644
--- a/lib/maths/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/unittest/CBoostedTreeTest.cc
@@ -1039,17 +1039,17 @@ BOOST_AUTO_TEST_CASE(testImbalancedClasses) {
     TDoubleVec falseNegatives(2, 0.0);
     frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) {
         for (auto row = beginRows; row != endRows; ++row) {
-            double prediction{
-                classification->readAndAdjustPrediction(*row)[1] < 0.5 ? 0.0 : 1.0};
+            auto scores = classification->readAndAdjustPrediction(*row);
+            std::size_t prediction(scores[1] < scores[0] ? 0 : 1);
             if (row->index() >= trainRows && row->index() < trainRows + classesRowCounts[2]) {
                 // Actual is zero.
-                (prediction == 0.0 ? truePositives[0] : falseNegatives[0]) += 1.0;
-                (prediction == 0.0 ? trueNegatives[1] : falsePositives[1]) += 1.0;
+                (prediction == 0 ? truePositives[0] : falseNegatives[0]) += 1.0;
+                (prediction == 0 ? trueNegatives[1] : falsePositives[1]) += 1.0;
             } else if (row->index() >= trainRows + classesRowCounts[2]) {
                 // Actual is one.
-                (prediction == 1.0 ? truePositives[1] : falseNegatives[1]) += 1.0;
-                (prediction == 1.0 ? trueNegatives[0] : falsePositives[0]) += 1.0;
+                (prediction == 1 ? truePositives[1] : falseNegatives[1]) += 1.0;
+                (prediction == 1 ? trueNegatives[0] : falsePositives[0]) += 1.0;
             }
         }
     });
diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc
index cb89e921bb..b005e46096 100644
--- a/lib/maths/unittest/CDataFrameUtilsTest.cc
+++ b/lib/maths/unittest/CDataFrameUtilsTest.cc
@@ -10,9 +10,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -1202,4 +1205,115 @@ BOOST_AUTO_TEST_CASE(testCategoryMicWithColumnWithMissing) {
     }
 }

+BOOST_AUTO_TEST_CASE(testMaximumMinimumRecallClassWeights) {
+
+    // Test we reliably increase the minimum class recall for predictions with
+    // uneven accuracy.
+
+    using TDoubleVector = maths::CDenseVector<double>;
+    using TMemoryMappedFloatVector = maths::CMemoryMappedDenseVector<maths::CFloatStorage>;
+
+    std::size_t rows{5000};
+    std::size_t capacity{2000};
+
+    test::CRandomNumbers rng;
+
+    for (std::size_t numberClasses : {2, 3}) {
+
+        std::size_t cols{numberClasses + 1};
+
+        auto readPrediction = [&](const core::CDataFrame::TRowRef& row) {
+            return TMemoryMappedFloatVector{row.data(), static_cast<int>(numberClasses)};
+        };
+
+        TBoolVec categoricalColumns(cols, false);
+        categoricalColumns[numberClasses] = true;
+
+        for (std::size_t t = 0; t < 5; ++t) {
+            core::stopDefaultAsyncExecutor();
+
+            TDoubleVec predictions;
+            TSizeVec category;
+            auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
+            frame->categoricalColumns(categoricalColumns);
+            for (std::size_t i = 0; i < rows; ++i) {
+                frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
+                    rng.generateUniformSamples(0, numberClasses, 1, category);
+                    rng.generateNormalSamples(0.0, 1.0, numberClasses, predictions);
+                    for (std::size_t j = 0; j < numberClasses; ++j) {
+                        column[j] += predictions[j];
+                    }
+                    column[category[0]] += static_cast<double>(category[0] + 1);
+                    column[numberClasses] = static_cast<double>(category[0]);
+                });
+            }
+            frame->finishWritingRows();
+
+            TDoubleVecVec minRecalls(2, TDoubleVec(2));
+            TDoubleVecVec maxRecalls(2, TDoubleVec(2));
+
+            std::size_t i{0};
+            for (auto numberThreads : {1, 4}) {
+
+                auto weights = maths::CDataFrameUtils::maximumMinimumRecallClassWeights(
+                    numberThreads, *frame, maskAll(rows), numberClasses,
+                    numberClasses, readPrediction);
+
+                TDoubleVector prediction;
+                TDoubleVector correct[2]{TDoubleVector::Zero(numberClasses),
+                                         TDoubleVector::Zero(numberClasses)};
+                TDoubleVector counts{TDoubleVector::Zero(numberClasses)};
+
+                frame->readRows(1, [&](core::CDataFrame::TRowItr beginRows,
+                                       core::CDataFrame::TRowItr endRows) {
+                    for (auto row = beginRows; row != endRows; ++row) {
+                        prediction = readPrediction(*row);
+                        maths::CTools::inplaceSoftmax(prediction);
+                        std::size_t weightedPredictedClass;
+                        weights.cwiseProduct(prediction).maxCoeff(&weightedPredictedClass);
+                        std::size_t actualClass{
+                            static_cast<std::size_t>((*row)[numberClasses])};
+                        if (weightedPredictedClass == actualClass) {
+                            correct[0](actualClass) += 1.0;
+                        }
+                        std::size_t unweightedPredictedClass;
+                        prediction.maxCoeff(&unweightedPredictedClass);
+                        if (unweightedPredictedClass == actualClass) {
+                            correct[1](actualClass) += 1.0;
+                        }
+                        counts(actualClass) += 1.0;
+                    }
+                });
+
+                LOG_TRACE(<< "weighted class recalls = "
+                          << correct[0].cwiseQuotient(counts).transpose());
+                LOG_TRACE(<< "unweighted class recalls = "
+                          << correct[1].cwiseQuotient(counts).transpose());
+
+                minRecalls[i][0] = correct[0].cwiseQuotient(counts).minCoeff();
+                maxRecalls[i][0] = correct[0].cwiseQuotient(counts).maxCoeff();
+                minRecalls[i][1] = correct[1].cwiseQuotient(counts).minCoeff();
+                maxRecalls[i][1] = correct[1].cwiseQuotient(counts).maxCoeff();
+
+                ++i;
+                core::startDefaultAsyncExecutor();
+            }
+
+            LOG_DEBUG(<< "min recalls = " << core::CContainerPrinter::print(minRecalls));
+            LOG_DEBUG(<< "max recalls = " << core::CContainerPrinter::print(maxRecalls));
+
+            // Threaded and non-threaded results are close.
+            BOOST_REQUIRE_CLOSE(minRecalls[0][0], minRecalls[1][0], 1.0); // 1 %
+
+            // We improved the minimum class recall by at least 10%.
+            BOOST_TEST_REQUIRE(minRecalls[0][0] > 1.1 * minRecalls[0][1]);
+
+            // The minimum and maximum class recalls are close: we're at the
+            // global maximum.
+            BOOST_TEST_REQUIRE(1.02 * minRecalls[0][0] > maxRecalls[0][0]);
+            BOOST_TEST_REQUIRE(1.02 * minRecalls[1][0] > maxRecalls[1][0]);
+        }
+    }
+
+    core::stopDefaultAsyncExecutor();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/maths/unittest/CToolsTest.cc b/lib/maths/unittest/CToolsTest.cc
index 0351236c51..d1f601cbac 100644
--- a/lib/maths/unittest/CToolsTest.cc
+++ b/lib/maths/unittest/CToolsTest.cc
@@ -1206,7 +1206,7 @@ BOOST_AUTO_TEST_CASE(testLgamma) {
     BOOST_REQUIRE_EQUAL(result, std::numeric_limits<double>::infinity());
 }

-BOOST_AUTO_TEST_CASE(testSoftMax) {
+BOOST_AUTO_TEST_CASE(testSoftmax) {
     // Test some invariants and that std::vector and maths::CDenseVector versions agree.

     using TDoubleVector = maths::CDenseVector<double>;
@@ -1231,4 +1231,33 @@ BOOST_AUTO_TEST_CASE(testSoftmax) {
     }
 }

+BOOST_AUTO_TEST_CASE(testLogSoftmax) {
+    // Test some invariants and that std::vector and maths::CDenseVector versions agree.
+
+    using TDoubleVector = maths::CDenseVector<double>;
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec z;
+    for (std::size_t t = 0; t < 100; ++t) {
+
+        rng.generateUniformSamples(-3.0, 3.0, 5, z);
+        TDoubleVec p{z};
+        CTools::inplaceSoftmax(p);
+        TDoubleVec logP{z};
+        CTools::inplaceLogSoftmax(logP);
+
+        BOOST_TEST_REQUIRE(*std::max_element(logP.begin(), logP.end()) <= 0.0);
+        for (std::size_t i = 0; i < 5; ++i) {
+            BOOST_REQUIRE_CLOSE(std::log(p[i]), logP[i], 1e-6);
+        }
+
+        TDoubleVector logP_{TDoubleVector::fromStdVector(z)};
+        CTools::inplaceLogSoftmax(logP_);
+        for (std::size_t i = 0; i < 5; ++i) {
+            BOOST_REQUIRE_CLOSE(logP[i], logP_[i], 1e-6);
+        }
+    }
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/test/CDataFrameAnalyzerTrainingFactory.cc b/lib/test/CDataFrameAnalyzerTrainingFactory.cc
index fa28917bcd..5ebf1aa412 100644
--- a/lib/test/CDataFrameAnalyzerTrainingFactory.cc
+++ b/lib/test/CDataFrameAnalyzerTrainingFactory.cc
@@ -11,18 +11,17 @@ namespace test {

 void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame&,
                                                          std::size_t,
-                                                         double prediction,
+                                                         const TDouble2Vec& prediction,
                                                          TDoubleVec& predictions) {
-    predictions.push_back(prediction);
+    predictions.push_back(prediction[0]);
 }

 void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame& frame,
                                                          std::size_t target,
-                                                         double class1Score,
+                                                         const TDouble2Vec& scores,
                                                          TStrVec& predictions) {
-    predictions.push_back(class1Score < 0.5
-                              ? frame.categoricalColumnValues()[target][0]
-                              : frame.categoricalColumnValues()[target][1]);
+    std::size_t i(std::max_element(scores.begin(), scores.end()) - scores.begin());
+    predictions.push_back(frame.categoricalColumnValues()[target][i]);
 }

 CDataFrameAnalyzerTrainingFactory::TDataFrameUPtr