[ML] Improve residual model selection #468

Merged
merged 9 commits on Apr 30, 2019

2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -40,6 +40,8 @@ to the model. (See {ml-pull}214[#214].)
* Only select more complex trend models for forecasting if there is evidence that they are needed.
(See {ml-pull}463[#463].)

* Improve residual model selection. (See {ml-pull}468[#468].)

=== Bug Fixes

* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
5 changes: 4 additions & 1 deletion include/maths/COneOfNPrior.h
@@ -324,7 +324,7 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
using TDoubleSizePr5Vec = core::CSmallVector<TDoubleSizePr, 5>;
using TWeightPriorPtrPr = std::pair<CModelWeight, TPriorPtr>;
using TWeightPriorPtrPrVec = std::vector<TWeightPriorPtrPr>;
using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;

private:
//! Read parameters from \p traverser.
@@ -350,6 +350,9 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
private:
//! A collection of component models and their probabilities.
TWeightPriorPtrPrVec m_Models;

//! The moments of the samples added.
TMeanVarAccumulator m_SampleMoments;
};
}
}
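Note: the new m_SampleMoments member tracks the mean and variance of the samples added to the prior. As a rough illustration of what such an accumulator computes, here is a generic Welford-style sketch; this is not the CBasicStatistics::SSampleMeanVar implementation, and all names in it are illustrative only.

#include <initializer_list>
#include <iostream>

// Generic running mean/variance accumulator (Welford's algorithm). Illustrative
// stand-in only; the ml-cpp accumulator has its own implementation and interface.
struct MeanVarAccumulator {
    double count{0.0};
    double mean{0.0};
    double m2{0.0}; // sum of squared deviations from the running mean
    void add(double x) {
        count += 1.0;
        double delta{x - mean};
        mean += delta / count;
        m2 += delta * (x - mean);
    }
    double variance() const { return count > 1.0 ? m2 / (count - 1.0) : 0.0; }
};

int main() {
    MeanVarAccumulator moments;
    for (double sample : {1.0, 2.0, 4.0, 8.0}) {
        moments.add(sample);
    }
    std::cout << "mean = " << moments.mean << ", variance = " << moments.variance() << '\n';
    return 0;
}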
3 changes: 0 additions & 3 deletions include/maths/Constants.h
@@ -116,9 +116,6 @@ const double MINIMUM_CLUSTER_SPLIT_COUNT{24.0};

//! The minimum count of a category in the sketch to cluster.
const double MINIMUM_CATEGORY_COUNT{0.5};

//! Get the maximum amount we'll penalize a model in addSamples.
MATHS_EXPORT double maxModelPenalty(double numberSamples);
}
}

18 changes: 9 additions & 9 deletions lib/api/unittest/CAnomalyJobLimitTest.cc
@@ -349,12 +349,12 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
std::size_t s_ExpectedByFields;
std::size_t s_ExpectedOverFields;
std::size_t s_ExpectedPartitionFields;
std::size_t s_ExpectedByLowerMemoryLimit;
std::size_t s_ExpectedPartitionLowerMemoryLimit;
std::size_t s_ExpectedOverLowerMemoryLimit;
} testParams[] = {{600, 550, 6000, 300, 33, 40, 40},
{3600, 550, 5500, 300, 30, 25, 20},
{172800, 150, 850, 120, 6, 6, 3}};
std::size_t s_ExpectedByMemoryUsageRelativeErrorDivisor;
std::size_t s_ExpectedPartitionUsageRelativeErrorDivisor;
std::size_t s_ExpectedOverUsageRelativeErrorDivisor;
} testParams[]{{600, 550, 6000, 300, 33, 40, 40},
{3600, 550, 5500, 300, 27, 25, 20},
{172800, 150, 850, 120, 6, 6, 3}};

for (const auto& testParam : testParams) {
TGeneratorVec generators{periodic, tradingDays, level, ramp, sparse};
@@ -410,7 +410,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
CPPUNIT_ASSERT_EQUAL(std::size_t(2), used.s_PartitionFields);
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedByLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedByMemoryUsageRelativeErrorDivisor);
}
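(The renamed divisor fields set the assertion tolerance: with an expected usage of memoryLimit * 1024 * 1024 / 2 bytes, a divisor of 40 allows the measured usage to deviate by memoryLimit * 1024 * 1024 / 40 bytes, i.e. by 2.5% of the memory limit or 5% of the expected value.)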

LOG_DEBUG(<< "**** Test partition with bucketLength = " << testParam.s_BucketLength
@@ -461,7 +461,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
0.96 * static_cast<double>(used.s_PartitionFields));
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionUsageRelativeErrorDivisor);
}

LOG_DEBUG(<< "**** Test over with bucketLength = " << testParam.s_BucketLength
@@ -508,7 +508,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
used.s_OverFields < 7000);
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverUsageRelativeErrorDivisor);
}
}
}
131 changes: 66 additions & 65 deletions lib/maths/CMultivariateOneOfNPrior.cc
@@ -54,13 +54,6 @@ const std::string WEIGHT_TAG("c");
const std::string PRIOR_TAG("d");
const std::string DECAY_RATE_TAG("e");

//! Add elements of \p x to \p y.
void add(const TDouble10Vec& x, TDouble10Vec& y) {
for (std::size_t i = 0u; i < x.size(); ++i) {
y[i] += x[i];
}
}

//! Get the min of \p x and \p y.
TDouble10Vec min(const TDouble10Vec& x, const TDouble10Vec& y) {
TDouble10Vec result(x);
@@ -103,11 +96,6 @@ void updateMean(const TDouble10Vec10Vec& x, double nx, TDouble10Vec10Vec& mean,
n += nx;
}

//! Get the largest element of \p x.
double largest(const TDouble10Vec& x) {
return *std::max_element(x.begin(), x.end());
}

//! Add a model vector entry reading parameters from \p traverser.
bool modelAcceptRestoreTraverser(const SDistributionRestoreParams& params,
TWeightPriorPtrPrVec& models,
@@ -174,7 +162,7 @@ void modelAcceptPersistInserter(const CModelWeight& weight,
const double DERATE = 0.99999;
const double MINUS_INF = DERATE * boost::numeric::bounds<double>::lowest();
const double INF = DERATE * boost::numeric::bounds<double>::highest();
const double LOG_INITIAL_WEIGHT = std::log(1e-6);
const double MAXIMUM_LOG_BAYES_FACTOR = std::log(1e6);
const double MINIMUM_SIGNIFICANT_WEIGHT = 0.01;
}

@@ -306,81 +294,94 @@ void CMultivariateOneOfNPrior::addSamples(const TDouble10Vec1Vec& samples,

this->adjustOffset(samples, weights);

double penalty = CTools::fastLog(this->numberSamples());
double n{this->numberSamples()};
this->CMultivariatePrior::addSamples(samples, weights);
penalty = (penalty - CTools::fastLog(this->numberSamples())) / 2.0;
n = this->numberSamples() - n;

// See COneOfNPrior::addSamples for a discussion.

CScopeCanonicalizeWeights<TPriorPtr> canonicalize(m_Models);

// We need to check *before* adding samples to the constituent models.
bool isNonInformative = this->isNonInformative();
bool isNonInformative{this->isNonInformative()};

// Compute the unnormalized posterior weights and update the component
// priors. These weights are computed on the side since they are only
// updated if all marginal likelihoods can be computed.
TDouble3Vec logLikelihoods;
TMaxAccumulator maxLogLikelihood;
TBool3Vec used, uses;
TDouble3Vec minusBics;
TBool3Vec used;
TBool3Vec uses;
bool failed{false};

// If none of the models are a good fit for the new data then restrict
// the maximum Bayes Factor since the data likelihood given the model
// is most useful if one of the models is (mostly) correct.
double m{std::max(n, 1.0)};
double maxLogBayesFactor{-m * MAXIMUM_LOG_BAYES_FACTOR};

for (auto& model : m_Models) {
bool use = model.second->participatesInModelSelection();

// Update the weights with the marginal likelihoods.
double logLikelihood = 0.0;
maths_t::EFloatingPointErrorStatus status =
use ? model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood)
: maths_t::E_FpOverflowed;
if (status & maths_t::E_FpFailed) {
LOG_ERROR(<< "Failed to compute log-likelihood");
LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
} else {
if (!(status & maths_t::E_FpOverflowed)) {
logLikelihood += model.second->unmarginalizedParameters() * penalty;
logLikelihoods.push_back(logLikelihood);
maxLogLikelihood.add(logLikelihood);
} else {
logLikelihoods.push_back(MINUS_INF);

double minusBic{0.0};
maths_t::EFloatingPointErrorStatus status{maths_t::E_FpOverflowed};

used.push_back(model.second->participatesInModelSelection());
if (used.back()) {
double logLikelihood;
status = model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood);

minusBic = 2.0 * logLikelihood -
model.second->unmarginalizedParameters() * CTools::fastLog(n);

double modeLogLikelihood;
TDouble10Vec1Vec modes;
modes.reserve(weights.size());
for (const auto& weight : weights) {
modes.push_back(model.second->marginalLikelihoodMode(weight));
}
model.second->jointLogMarginalLikelihood(modes, weights, modeLogLikelihood);
maxLogBayesFactor = std::max(
maxLogBayesFactor, logLikelihood - std::max(modeLogLikelihood, logLikelihood));
}

minusBics.push_back((status & maths_t::E_FpOverflowed) ? MINUS_INF : minusBic);
failed |= (status & maths_t::E_FpFailed);

// Update the component prior distribution.
model.second->addSamples(samples, weights);
// Update the component prior distribution.
model.second->addSamples(samples, weights);

used.push_back(use);
uses.push_back(model.second->participatesInModelSelection());
uses.push_back(model.second->participatesInModelSelection());
if (!uses.back()) {
model.first.logWeight(MINUS_INF);
}
}

TDouble10Vec n(m_Dimension, 0.0);
for (const auto& weight : weights) {
add(maths_t::count(weight), n);
if (failed) {
LOG_ERROR(<< "Failed to compute log-likelihood");
LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
Contributor review comment: Would it be useful to print out the weights here as well?

LOG_ERROR(<< "weights = " << core::CContainerPrinter::print(weights));
return;
}
if (isNonInformative) {
return;
}

if (!isNonInformative && maxLogLikelihood.count() > 0) {
LOG_TRACE(<< "logLikelihoods = " << core::CContainerPrinter::print(logLikelihoods));
maxLogBayesFactor += m * MAXIMUM_LOG_BAYES_FACTOR;

// The idea here is to limit the amount which extreme samples
// affect model selection, particularly early on in the model
// life-cycle.
double l = largest(n);
double minLogLikelihood =
maxLogLikelihood[0] -
l * std::min(maxModelPenalty(this->numberSamples()), 100.0);
LOG_TRACE(<< "BICs = " << core::CContainerPrinter::print(minusBics));
LOG_TRACE(<< "max Bayes Factor = " << maxLogBayesFactor);

TMaxAccumulator maxLogWeight;
for (std::size_t i = 0; i < logLikelihoods.size(); ++i) {
CModelWeight& weight = m_Models[i].first;
if (!uses[i]) {
weight.logWeight(MINUS_INF);
} else if (used[i]) {
weight.addLogFactor(std::max(logLikelihoods[i], minLogLikelihood));
maxLogWeight.add(weight.logWeight());
}
double maxLogModelWeight{MINUS_INF + m * MAXIMUM_LOG_BAYES_FACTOR};
double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};
for (std::size_t i = 0; i < m_Models.size(); ++i) {
if (used[i] && uses[i]) {
m_Models[i].first.addLogFactor(
std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor));
maxLogModelWeight = std::max(maxLogModelWeight, m_Models[i].first.logWeight());
}
for (std::size_t i = 0u; i < m_Models.size(); ++i) {
if (!used[i] && uses[i]) {
m_Models[i].first.logWeight(maxLogWeight[0] + LOG_INITIAL_WEIGHT);
}
}
for (std::size_t i = 0; i < m_Models.size(); ++i) {
if (!used[i] && uses[i]) {
m_Models[i].first.logWeight(maxLogModelWeight - m * MAXIMUM_LOG_BAYES_FACTOR);
}
}
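For context, the new selection logic in this hunk scores each candidate residual model by minus BIC, 2 * log(likelihood) - parameters * log(n), and floors each model's per-update evidence so that one batch of samples cannot shift the log odds by more than the maximum log Bayes factor. The snippet below is a standalone sketch of that update, not ml-cpp code: the names are illustrative and the cap is shown as a constant, whereas in the diff it is derived per update from how far each model's likelihood falls below its own mode, limited to n * log(1e6).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical per-model state: log-likelihood of the new samples, number of
    // unmarginalized parameters and the current log selection weight.
    struct Model { double logLikelihood; double parameters; double logWeight; };
    std::vector<Model> models{{-12.0, 0.0, 0.0}, {-9.5, 2.0, 0.0}, {-30.0, 1.0, 0.0}};

    double n{20.0};                              // samples added in this update
    double maxLogBayesFactor{n * std::log(1e6)}; // fixed cap, for illustration only

    // Score each model by minus BIC = 2 * log(L) - k * log(n).
    std::vector<double> minusBics;
    for (const auto& model : models) {
        minusBics.push_back(2.0 * model.logLikelihood - model.parameters * std::log(n));
    }
    double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};

    // Bump each log-weight by max(-BIC/2, bestMinusBic/2 - cap), so no model can
    // lose more than the capped log Bayes factor in a single update.
    for (std::size_t i = 0; i < models.size(); ++i) {
        models[i].logWeight +=
            std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor);
        std::cout << "model " << i << " log-weight = " << models[i].logWeight << '\n';
    }
    return 0;
}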
