Skip to content

[ML] Improvements to sparse count modelling #721

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ boosted tree training. Hard depth based regularization is often the strategy of
choice to prevent overfitting for XGBoost. By smoothing we can make better tradeoffs.
Also, the parameters of the penalty function are more suited to optimising with our
Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
* Improvements to count and sum anomaly detection for sparse data. This primarily
aims to improve handling of data which are predictably present: detecting when they
are unexpectedly missing. (See {ml-pull}721[#721].)

== {es} version 7.4.1

Expand Down
27 changes: 0 additions & 27 deletions include/maths/CModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,6 @@ class MATHS_EXPORT CModelParams {
//! Get the maximum time to test for a change point in the model.
core_t::TTime maximumTimeToTestForChange() const;

//! Set the probability that the bucket will be empty for the model.
void probabilityBucketEmpty(double probability);

//! Get the probability that the bucket will be empty for the model.
double probabilityBucketEmpty() const;

private:
//! The data bucketing length.
core_t::TTime m_BucketLength;
Expand All @@ -93,8 +87,6 @@ class MATHS_EXPORT CModelParams {
core_t::TTime m_MinimumTimeToDetectChange;
//! The maximum time permitted to test for a change in the model.
core_t::TTime m_MaximumTimeToTestForChange;
//! The probability that a bucket will be empty for the model.
double m_ProbabilityBucketEmpty;
};

//! \brief The extra parameters needed by CModel::addSamples.
Expand Down Expand Up @@ -169,13 +161,6 @@ class MATHS_EXPORT CModelProbabilityParams {
//! Get the confidence interval to use when detrending.
double seasonalConfidenceInterval() const;

//! Add whether a value's bucket is empty.
CModelProbabilityParams& addBucketEmpty(const TBool2Vec& empty);
//! Set whether or not the values' bucket is empty.
CModelProbabilityParams& bucketEmpty(const TBool2Vec1Vec& empty);
//! Get whether the values' bucket is empty.
const TBool2Vec1Vec& bucketEmpty() const;

//! Add a value's weights.
CModelProbabilityParams& addWeights(const TDouble2VecWeightsAry& weights);
//! Set the values' weights.
Expand Down Expand Up @@ -215,8 +200,6 @@ class MATHS_EXPORT CModelProbabilityParams {
TProbabilityCalculation2Vec m_Calculations;
//! The confidence interval to use when detrending.
double m_SeasonalConfidenceInterval;
//! True if the bucket is empty and false otherwise.
TBool2Vec1Vec m_BucketEmpty;
//! The sample weights.
TDouble2VecWeightsAry1Vec m_Weights;
//! The coordinates for which to compute the probability.
Expand Down Expand Up @@ -469,16 +452,6 @@ class MATHS_EXPORT CModel {
static boost::optional<VECTOR>
predictionError(double propagationInterval, const PRIOR& prior, const VECTOR& sample);

//! Correct \p probability with \p probabilityEmptyBucket.
static double jointProbabilityGivenBucket(bool bucketEmpty,
double probabilityBucketEmpty,
double probability);

//! Correct \p probability with \p probabilityEmptyBucket.
static double jointProbabilityGivenBucket(const TBool2Vec& bucketEmpty,
const TDouble2Vec& probabilityEmptyBucket,
double probability);

private:
//! The model parameters.
CModelParams m_Params;
Expand Down
4 changes: 0 additions & 4 deletions include/model/CAnomalyDetectorModelConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! category from the sketch to cluster.
static const double DEFAULT_CATEGORY_DELETE_FRACTION;

//! The default minimum frequency of non-empty buckets at which we model
//! all buckets.
static const double DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS;

//! The default size of the seasonal components we will model.
static const std::size_t DEFAULT_COMPONENT_SIZE;

Expand Down
4 changes: 0 additions & 4 deletions include/model/CIndividualModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,6 @@ class MODEL_EXPORT CIndividualModel : public CAnomalyDetectorModel {
//! for features which count empty buckets.
double emptyBucketWeight(model_t::EFeature feature, std::size_t pid, core_t::TTime time) const;

//! Get the "probability the bucket is empty" to use to correct probabilities
//! for features which count empty buckets.
double probabilityBucketEmpty(model_t::EFeature feature, std::size_t pid) const;

//! Get a read only model corresponding to \p feature of the person \p pid.
const maths::CModel* model(model_t::EFeature feature, std::size_t pid) const;

Expand Down
3 changes: 0 additions & 3 deletions include/model/CModelParams.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@ struct MODEL_EXPORT SModelParams {
//! The minimum permitted count of points in a distribution mode.
double s_MinimumModeCount;

//! The minimum frequency of non-empty buckets at which we model all buckets.
double s_CutoffToModelEmptyBuckets;

//! The number of points to use for approximating each seasonal component.
std::size_t s_ComponentSize;

Expand Down
10 changes: 2 additions & 8 deletions include/model/ModelTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -512,15 +512,9 @@ double inverseOffsetCountToZero(EFeature feature, double count);
MODEL_EXPORT
void inverseOffsetCountToZero(EFeature feature, TDouble1Vec& count);

//! Check if the feature counts empty buckets.
//! Check if the feature has a value for empty buckets.
MODEL_EXPORT
bool countsEmptyBuckets(EFeature feature);

//! Get the weight to apply to an empty bucket sample based on the
//! frequency \p feature at which empty buckets are seen for \p feature
//! and the cutoff for empty buckets directly.
MODEL_EXPORT
double emptyBucketCountWeight(EFeature feature, double frequency, double cutoff);
bool includeEmptyBuckets(EFeature feature);

//! Get the rate at which \p feature learns.
MODEL_EXPORT
Expand Down
54 changes: 1 addition & 53 deletions lib/maths/CModel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ CModelParams::CModelParams(core_t::TTime bucketLength,
: m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate),
m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale),
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 6 * bucketLength)),
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)),
m_ProbabilityBucketEmpty(0.0) {
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)) {
}

core_t::TTime CModelParams::bucketLength() const {
Expand Down Expand Up @@ -80,14 +79,6 @@ core_t::TTime CModelParams::maximumTimeToTestForChange() const {
return m_MaximumTimeToTestForChange;
}

//! Set the probability that a bucket will be empty for the model.
void CModelParams::probabilityBucketEmpty(double probability) {
m_ProbabilityBucketEmpty = probability;
}

//! Get the probability that a bucket will be empty for the model.
double CModelParams::probabilityBucketEmpty() const {
return m_ProbabilityBucketEmpty;
}

//////// CModelAddSamplesParams ////////

CModelAddSamplesParams& CModelAddSamplesParams::integer(bool integer) {
Expand Down Expand Up @@ -168,20 +159,6 @@ double CModelProbabilityParams::seasonalConfidenceInterval() const {
return m_SeasonalConfidenceInterval;
}

//! Append \p empty, the empty indicators for one value's bucket(s).
//! Returns *this to allow chained parameter setting.
CModelProbabilityParams& CModelProbabilityParams::addBucketEmpty(const TBool2Vec& empty) {
m_BucketEmpty.push_back(empty);
return *this;
}

//! Set the empty indicators for all values' buckets, replacing any
//! previously added. Returns *this to allow chained parameter setting.
CModelProbabilityParams& CModelProbabilityParams::bucketEmpty(const TBool2Vec1Vec& empty) {
m_BucketEmpty = empty;
return *this;
}

//! Get the empty indicators for the values' buckets.
const CModelProbabilityParams::TBool2Vec1Vec& CModelProbabilityParams::bucketEmpty() const {
return m_BucketEmpty;
}

CModelProbabilityParams&
CModelProbabilityParams::addWeights(const TDouble2VecWeightsAry& weights) {
m_Weights.push_back(weights);
Expand Down Expand Up @@ -288,35 +265,6 @@ CModelParams& CModel::params() {
return m_Params;
}

//! Correct \p probability for the chance the bucket is empty.
//!
//! If the bucket is non-empty the result is scaled by the chance of a
//! non-empty bucket; if it is empty, the empty-bucket probability mass
//! is added so an empty bucket can never look more anomalous than the
//! chance of it being empty.
double CModel::jointProbabilityGivenBucket(bool empty, double probabilityBucketEmpty, double probability) {
if (empty == false) {
return (1.0 - probabilityBucketEmpty) * probability;
}
return probabilityBucketEmpty + (1.0 - probabilityBucketEmpty) * probability;
}

//! Correct \p probability for the chance that either of two (correlated)
//! coordinates' buckets is empty.
//!
//! \param[in] empty Whether each coordinate's bucket is empty.
//! \param[in] probabilityEmptyBucket The probability that each
//! coordinate's bucket is empty.
//! \param[in] probability The raw probability to correct.
// NOTE(review): in the two mixed empty/non-empty branches \p probability
// is multiplied in twice (once standalone and once inside the empty-side
// factor), whereas the both-non-empty branch applies it once — confirm
// this asymmetry is intended rather than a copy/paste slip.
double CModel::jointProbabilityGivenBucket(const TBool2Vec& empty,
const TDouble2Vec& probabilityEmptyBucket,
double probability) {

// Per-coordinate probabilities of an empty bucket.
double p00{probabilityEmptyBucket[0]};
double p10{probabilityEmptyBucket[1]};

if (empty[0] == false && empty[1] == false) {
return (1.0 - p00) * (1.0 - p10) * probability;
}

if (empty[0] == false) {
return (1.0 - p00) * probability * (p10 + (1.0 - p10) * probability);
}

if (empty[1] == false) {
return (p00 + (1.0 - p00) * probability) * (1.0 - p10) * probability;
}

// Both buckets empty: apply the single-coordinate correction to each.
return (p00 + (1.0 - p00) * probability) * (p10 + (1.0 - p10) * probability);
}

//////// CModelStub ////////

CModelStub::CModelStub() : CModel(stubParameters()) {
Expand Down
133 changes: 64 additions & 69 deletions lib/maths/CPeriodicityHypothesisTests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,29 +303,28 @@ void project(const TFloatMeanAccumulatorVec& values,

//! Calculate the number of non-empty buckets at each bucket offset in
//! the period for the \p values in \p windows.
TSizeVec calculateRepeats(const TSizeSizePr2Vec& windows,
std::size_t period,
const TFloatMeanAccumulatorVec& values) {
TSizeVec result(std::min(period, length(windows[0])), 0);
TDoubleVec calculateRepeats(const TSizeSizePr2Vec& windows,
std::size_t period,
const TFloatMeanAccumulatorVec& values) {
TDoubleVec result(std::min(period, length(windows[0])), 0);
std::size_t n{values.size()};
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t i = a; i < b; ++i) {
if (CBasicStatistics::count(values[i % n]) > 0.0) {
++result[(i - a) % period];
}
double count{CBasicStatistics::count(values[i % n])};
result[(i - a) % period] += std::min(count, 1.0);
}
}
return result;
}

//! Calculate the number of non-empty buckets at each bucket offset in
//! the period for the \p values in \p windows.
TSizeVec calculateRepeats(const TTimeTimePr2Vec& windows_,
core_t::TTime period,
core_t::TTime bucketLength,
const TFloatMeanAccumulatorVec& values) {
TDoubleVec calculateRepeats(const TTimeTimePr2Vec& windows_,
core_t::TTime period,
core_t::TTime bucketLength,
const TFloatMeanAccumulatorVec& values) {
TSizeSizePr2Vec windows;
calculateIndexWindows(windows_, bucketLength, windows);
return calculateRepeats(windows, period / bucketLength, values);
Expand All @@ -344,53 +343,53 @@ void reweightOutliers(const std::vector<T>& trend,
using TMaxAccumulator =
CBasicStatistics::COrderStatisticsHeap<TDoubleSizePr, std::greater<TDoubleSizePr>>;

if (values.size() > 0) {
std::size_t period{trend.size()};
std::size_t numberOutliers{static_cast<std::size_t>([&period, &values] {
std::size_t count(std::count_if(
values.begin(), values.end(), [](const TFloatMeanAccumulator& value) {
return CBasicStatistics::count(value) > 0.0;
}));
return SEASONAL_OUTLIER_FRACTION *
static_cast<double>(count - std::min(count, period));
}())};
LOG_TRACE(<< "Number outliers = " << numberOutliers);

if (numberOutliers > 0) {
TSizeSizePr2Vec windows;
calculateIndexWindows(windows_, bucketLength, windows);
std::size_t period{trend.size()};
std::size_t n{values.size()};

TSizeVec repeats{calculateRepeats(windows, period, values)};
double excess{std::accumulate(
repeats.begin(), repeats.end(), 0.0, [](double excess_, std::size_t repeat) {
return excess_ + static_cast<double>(repeat > 1 ? repeat - 1 : 0);
})};
std::size_t numberOutliers{static_cast<std::size_t>(SEASONAL_OUTLIER_FRACTION * excess)};
LOG_TRACE(<< "Number outliers = " << numberOutliers);

if (numberOutliers > 0) {
TMaxAccumulator outliers{numberOutliers};
TMeanAccumulator meanDifference;
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t j = a; j < b; ++j) {
const TFloatMeanAccumulator& value{values[j % n]};
if (CBasicStatistics::count(value) > 0.0) {
std::size_t offset{(j - a) % period};
double difference{std::fabs(CBasicStatistics::mean(value) -
CBasicStatistics::mean(trend[offset]))};
outliers.add({difference, j});
meanDifference.add(difference);
}
TMaxAccumulator outliers{numberOutliers};
TMeanAccumulator meanDifference;
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t j = a; j < b; ++j) {
const TFloatMeanAccumulator& value{values[j % n]};
if (CBasicStatistics::count(value) > 0.0) {
std::size_t offset{(j - a) % period};
double difference{std::fabs(CBasicStatistics::mean(value) -
CBasicStatistics::mean(trend[offset]))};
outliers.add({difference, j});
meanDifference.add(difference);
}
}
TMeanAccumulator meanDifferenceOfOutliers;
for (const auto& outlier : outliers) {
meanDifferenceOfOutliers.add(outlier.first);
}
meanDifference -= meanDifferenceOfOutliers;
LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));

for (const auto& outlier : outliers) {
if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
}
}
TMeanAccumulator meanDifferenceOfOutliers;
for (const auto& outlier : outliers) {
meanDifferenceOfOutliers.add(outlier.first);
}
meanDifference -= meanDifferenceOfOutliers;
LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));

for (const auto& outlier : outliers) {
if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
}
LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
}
LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
}
}

Expand Down Expand Up @@ -1839,10 +1838,12 @@ bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& w
}

// Compute the degrees of freedom given the alternative hypothesis.
TSizeVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
double b{static_cast<double>(
std::count_if(repeats.begin(), repeats.end(),
[](std::size_t repeat) { return repeat > 0; }))};
double b{[&windows, &period_, &values, this] {
TDoubleVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
return static_cast<double>(
std::count_if(repeats.begin(), repeats.end(),
[](double repeat) { return repeat > 0.0; }));
}()};
double df1{stats.s_NonEmptyBuckets - b - static_cast<double>(segmentation.size() - 2)};
LOG_TRACE(<< " populated = " << b);

Expand Down Expand Up @@ -2055,14 +2056,10 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
// 3) The significance of the variance reduction, and
// 4) The amount of variance reduction.

auto calculateMeanRepeats = [&](const TTimeTimePr2Vec& w, core_t::TTime p) {
TSizeVec repeats{calculateRepeats(w, p, m_BucketLength, values)};
return CBasicStatistics::mean(
std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
[](TMeanAccumulator mean, std::size_t r) {
mean.add(static_cast<double>(r));
return mean;
}));
auto calculateMeanRepeats = [&values, this](const TTimeTimePr2Vec& w, core_t::TTime p) {
TMeanAccumulator result;
result.add(calculateRepeats(w, p, m_BucketLength, values));
return CBasicStatistics::mean(result);
};

double p{0.0};
Expand Down Expand Up @@ -2143,13 +2140,11 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
LOG_TRACE(<< " autocorrelation = " << R);
LOG_TRACE(<< " autocorrelationThreshold = " << stats.s_AutocorrelationThreshold);

TSizeVec repeats{calculateRepeats(window, period_, m_BucketLength, buckets)};
meanRepeats = CBasicStatistics::mean(
std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
[](TMeanAccumulator mean, std::size_t repeat) {
mean.add(static_cast<double>(repeat));
return mean;
}));
meanRepeats = [&window, &period_, &buckets, this] {
TMeanAccumulator result;
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
Expand Down
Loading