
Commit 1da7698

[7.5][ML] Improvements to sparse count modelling (elastic#722)
Backport elastic#721.
1 parent 2b7c3c5 commit 1da7698

27 files changed (+240, -529 lines)

docs/CHANGELOG.asciidoc

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ boosted tree training. Hard depth based regularization is often the strategy of
 choice to prevent over fitting for XGBoost. By smoothing we can make better tradeoffs.
 Also, the parameters of the penalty function are mode suited to optimising with our
 Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
+* Improvements to count and sum anomaly detection for sparse data. This primarily
+aims to improve handling of data which are predictably present: detecting when they
+are unexpectedly missing. (See {ml-pull}721[#721].)
 
 == {es} version 7.4.1

include/maths/CModel.h

Lines changed: 0 additions & 27 deletions
@@ -74,12 +74,6 @@ class MATHS_EXPORT CModelParams {
     //! Get the maximum time to test for a change point in the model.
     core_t::TTime maximumTimeToTestForChange() const;
 
-    //! Set the probability that the bucket will be empty for the model.
-    void probabilityBucketEmpty(double probability);
-
-    //! Get the probability that the bucket will be empty for the model.
-    double probabilityBucketEmpty() const;
-
 private:
     //! The data bucketing length.
     core_t::TTime m_BucketLength;
@@ -93,8 +87,6 @@ class MATHS_EXPORT CModelParams {
     core_t::TTime m_MinimumTimeToDetectChange;
     //! The maximum time permitted to test for a change in the model.
     core_t::TTime m_MaximumTimeToTestForChange;
-    //! The probability that a bucket will be empty for the model.
-    double m_ProbabilityBucketEmpty;
 };
 
 //! \brief The extra parameters needed by CModel::addSamples.
@@ -169,13 +161,6 @@ class MATHS_EXPORT CModelProbabilityParams {
     //! Get the confidence interval to use when detrending.
     double seasonalConfidenceInterval() const;
 
-    //! Add whether a value's bucket is empty.
-    CModelProbabilityParams& addBucketEmpty(const TBool2Vec& empty);
-    //! Set whether or not the values' bucket is empty.
-    CModelProbabilityParams& bucketEmpty(const TBool2Vec1Vec& empty);
-    //! Get whether the values' bucket is empty.
-    const TBool2Vec1Vec& bucketEmpty() const;
-
     //! Add a value's weights.
     CModelProbabilityParams& addWeights(const TDouble2VecWeightsAry& weights);
     //! Set the values' weights.
@@ -215,8 +200,6 @@ class MATHS_EXPORT CModelProbabilityParams {
     TProbabilityCalculation2Vec m_Calculations;
     //! The confidence interval to use when detrending.
     double m_SeasonalConfidenceInterval;
-    //! True if the bucket is empty and false otherwise.
-    TBool2Vec1Vec m_BucketEmpty;
     //! The sample weights.
     TDouble2VecWeightsAry1Vec m_Weights;
     //! The coordinates for which to compute the probability.
@@ -469,16 +452,6 @@ class MATHS_EXPORT CModel {
     static boost::optional<VECTOR>
     predictionError(double propagationInterval, const PRIOR& prior, const VECTOR& sample);
 
-    //! Correct \p probability with \p probabilityEmptyBucket.
-    static double jointProbabilityGivenBucket(bool bucketEmpty,
-                                              double probabilityBucketEmpty,
-                                              double probability);
-
-    //! Correct \p probability with \p probabilityEmptyBucket.
-    static double jointProbabilityGivenBucket(const TBool2Vec& bucketEmpty,
-                                              const TDouble2Vec& probabilityEmptyBucket,
-                                              double probability);
-
 private:
     //! The model parameters.
     CModelParams m_Params;

include/model/CAnomalyDetectorModelConfig.h

Lines changed: 0 additions & 4 deletions
@@ -144,10 +144,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     //! category from the sketch to cluster.
     static const double DEFAULT_CATEGORY_DELETE_FRACTION;
 
-    //! The default minimum frequency of non-empty buckets at which we model
-    //! all buckets.
-    static const double DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS;
-
     //! The default size of the seasonal components we will model.
     static const std::size_t DEFAULT_COMPONENT_SIZE;

include/model/CIndividualModel.h

Lines changed: 0 additions & 4 deletions
@@ -236,10 +236,6 @@ class MODEL_EXPORT CIndividualModel : public CAnomalyDetectorModel {
     //! for features which count empty buckets.
     double emptyBucketWeight(model_t::EFeature feature, std::size_t pid, core_t::TTime time) const;
 
-    //! Get the "probability the bucket is empty" to use to correct probabilities
-    //! for features which count empty buckets.
-    double probabilityBucketEmpty(model_t::EFeature feature, std::size_t pid) const;
-
     //! Get a read only model corresponding to \p feature of the person \p pid.
     const maths::CModel* model(model_t::EFeature feature, std::size_t pid) const;

include/model/CModelParams.h

Lines changed: 0 additions & 3 deletions
@@ -83,9 +83,6 @@ struct MODEL_EXPORT SModelParams {
     //! The minimum permitted count of points in a distribution mode.
     double s_MinimumModeCount;
 
-    //! The minimum frequency of non-empty buckets at which we model all buckets.
-    double s_CutoffToModelEmptyBuckets;
-
     //! The number of points to use for approximating each seasonal component.
     std::size_t s_ComponentSize;

include/model/ModelTypes.h

Lines changed: 2 additions & 8 deletions
@@ -512,15 +512,9 @@ double inverseOffsetCountToZero(EFeature feature, double count);
 MODEL_EXPORT
 void inverseOffsetCountToZero(EFeature feature, TDouble1Vec& count);
 
-//! Check if the feature counts empty buckets.
+//! Check if the feature has a value for empty buckets.
 MODEL_EXPORT
-bool countsEmptyBuckets(EFeature feature);
-
-//! Get the weight to apply to an empty bucket sample based on the
-//! frequency \p feature at which empty buckets are seen for \p feature
-//! and the cutoff for empty buckets directly.
-MODEL_EXPORT
-double emptyBucketCountWeight(EFeature feature, double frequency, double cutoff);
+bool includeEmptyBuckets(EFeature feature);
 
 //! Get the rate at which \p feature learns.
 MODEL_EXPORT

lib/maths/CModel.cc

Lines changed: 1 addition & 53 deletions
@@ -44,8 +44,7 @@ CModelParams::CModelParams(core_t::TTime bucketLength,
     : m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate),
       m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale),
       m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 6 * bucketLength)),
-      m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)),
-      m_ProbabilityBucketEmpty(0.0) {
+      m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)) {
 }
 
 core_t::TTime CModelParams::bucketLength() const {
@@ -80,14 +79,6 @@ core_t::TTime CModelParams::maximumTimeToTestForChange() const {
     return m_MaximumTimeToTestForChange;
 }
 
-void CModelParams::probabilityBucketEmpty(double probability) {
-    m_ProbabilityBucketEmpty = probability;
-}
-
-double CModelParams::probabilityBucketEmpty() const {
-    return m_ProbabilityBucketEmpty;
-}
-
 //////// CModelAddSamplesParams ////////
 
 CModelAddSamplesParams& CModelAddSamplesParams::integer(bool integer) {
@@ -168,20 +159,6 @@ double CModelProbabilityParams::seasonalConfidenceInterval() const {
     return m_SeasonalConfidenceInterval;
 }
 
-CModelProbabilityParams& CModelProbabilityParams::addBucketEmpty(const TBool2Vec& empty) {
-    m_BucketEmpty.push_back(empty);
-    return *this;
-}
-
-CModelProbabilityParams& CModelProbabilityParams::bucketEmpty(const TBool2Vec1Vec& empty) {
-    m_BucketEmpty = empty;
-    return *this;
-}
-
-const CModelProbabilityParams::TBool2Vec1Vec& CModelProbabilityParams::bucketEmpty() const {
-    return m_BucketEmpty;
-}
-
 CModelProbabilityParams&
 CModelProbabilityParams::addWeights(const TDouble2VecWeightsAry& weights) {
     m_Weights.push_back(weights);
@@ -288,35 +265,6 @@ CModelParams& CModel::params() {
     return m_Params;
 }
 
-double CModel::jointProbabilityGivenBucket(bool empty, double probabilityBucketEmpty, double probability) {
-    if (empty == false) {
-        return (1.0 - probabilityBucketEmpty) * probability;
-    }
-    return probabilityBucketEmpty + (1.0 - probabilityBucketEmpty) * probability;
-}
-
-double CModel::jointProbabilityGivenBucket(const TBool2Vec& empty,
-                                           const TDouble2Vec& probabilityEmptyBucket,
-                                           double probability) {
-
-    double p00{probabilityEmptyBucket[0]};
-    double p10{probabilityEmptyBucket[1]};
-
-    if (empty[0] == false && empty[1] == false) {
-        return (1.0 - p00) * (1.0 - p10) * probability;
-    }
-
-    if (empty[0] == false) {
-        return (1.0 - p00) * probability * (p10 + (1.0 - p10) * probability);
-    }
-
-    if (empty[1] == false) {
-        return (p00 + (1.0 - p00) * probability) * (1.0 - p10) * probability;
-    }
-
-    return (p00 + (1.0 - p00) * probability) * (p10 + (1.0 - p10) * probability);
-}
-
 //////// CModelStub ////////
 
 CModelStub::CModelStub() : CModel(stubParameters()) {
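
The two jointProbabilityGivenBucket overloads deleted above were the hook through which the empty-bucket probability corrected anomaly probabilities. A minimal standalone sketch of the single-value correction (illustrative values and names, not library code) shows what it computed:

#include <iostream>

// Mirrors the removed single-value overload: a value in a non-empty bucket is
// weighted by the chance of the bucket being non-empty; for an empty bucket the
// corrected probability can never drop below the empty-bucket probability.
double jointProbabilityGivenBucket(bool bucketEmpty, double probabilityBucketEmpty, double probability) {
    if (!bucketEmpty) {
        return (1.0 - probabilityBucketEmpty) * probability;
    }
    return probabilityBucketEmpty + (1.0 - probabilityBucketEmpty) * probability;
}

int main() {
    // Data which are predictably present (buckets empty only 1% of the time):
    // an unexpectedly empty bucket keeps a small corrected probability (~0.01).
    std::cout << jointProbabilityGivenBucket(true, 0.01, 1e-6) << '\n';
    // Genuinely sparse data (buckets empty 90% of the time): an empty bucket
    // can never look more unlikely than 0.9.
    std::cout << jointProbabilityGivenBucket(true, 0.9, 1e-6) << '\n';
    return 0;
}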

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 64 additions & 69 deletions
@@ -303,29 +303,28 @@ void project(const TFloatMeanAccumulatorVec& values,
 
 //! Calculate the number of non-empty buckets at each bucket offset in
 //! the period for the \p values in \p windows.
-TSizeVec calculateRepeats(const TSizeSizePr2Vec& windows,
-                          std::size_t period,
-                          const TFloatMeanAccumulatorVec& values) {
-    TSizeVec result(std::min(period, length(windows[0])), 0);
+TDoubleVec calculateRepeats(const TSizeSizePr2Vec& windows,
+                            std::size_t period,
+                            const TFloatMeanAccumulatorVec& values) {
+    TDoubleVec result(std::min(period, length(windows[0])), 0);
     std::size_t n{values.size()};
     for (const auto& window : windows) {
         std::size_t a{window.first};
         std::size_t b{window.second};
         for (std::size_t i = a; i < b; ++i) {
-            if (CBasicStatistics::count(values[i % n]) > 0.0) {
-                ++result[(i - a) % period];
-            }
+            double count{CBasicStatistics::count(values[i % n])};
+            result[(i - a) % period] += std::min(count, 1.0);
         }
     }
     return result;
 }
 
 //! Calculate the number of non-empty buckets at each bucket offset in
 //! the period for the \p values in \p windows.
-TSizeVec calculateRepeats(const TTimeTimePr2Vec& windows_,
-                          core_t::TTime period,
-                          core_t::TTime bucketLength,
-                          const TFloatMeanAccumulatorVec& values) {
+TDoubleVec calculateRepeats(const TTimeTimePr2Vec& windows_,
+                            core_t::TTime period,
+                            core_t::TTime bucketLength,
+                            const TFloatMeanAccumulatorVec& values) {
     TSizeSizePr2Vec windows;
     calculateIndexWindows(windows_, bucketLength, windows);
     return calculateRepeats(windows, period / bucketLength, values);
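
The heart of this hunk: calculateRepeats now returns TDoubleVec and adds min(count, 1.0) per bucket instead of a hard 0/1 increment, so buckets whose counts have been fractionally down-weighted (for example by outlier reweighting) contribute fractionally to the repeat count for their offset. A self-contained sketch with plain std::vector and made-up counts, standing in for the ml accumulator types:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Counts for twelve buckets spanning three repeats of a period of four.
    // Zero means the bucket is empty; 0.1 mimics a down-weighted bucket.
    std::vector<double> counts{1.0, 0.0, 2.0, 0.1, 1.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1.0, 0.1};
    std::size_t period{4};

    std::vector<double> repeats(period, 0.0);
    for (std::size_t i = 0; i < counts.size(); ++i) {
        // New behaviour: each bucket contributes at most 1.0 to its offset.
        repeats[i % period] += std::min(counts[i], 1.0);
    }
    for (std::size_t offset = 0; offset < period; ++offset) {
        std::cout << "offset " << offset << ": " << repeats[offset] << '\n';
    }
    // Prints 2, 0, 3 and 0.3; the old integer version would have reported 2, 0, 3 and 3,
    // treating the three lightly weighted buckets at offset 3 as full repeats.
    return 0;
}
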
@@ -344,53 +343,53 @@ void reweightOutliers(const std::vector<T>& trend,
     using TMaxAccumulator =
         CBasicStatistics::COrderStatisticsHeap<TDoubleSizePr, std::greater<TDoubleSizePr>>;
 
-    if (values.size() > 0) {
+    std::size_t period{trend.size()};
+    std::size_t numberOutliers{static_cast<std::size_t>([&period, &values] {
+        std::size_t count(std::count_if(
+            values.begin(), values.end(), [](const TFloatMeanAccumulator& value) {
+                return CBasicStatistics::count(value) > 0.0;
+            }));
+        return SEASONAL_OUTLIER_FRACTION *
+               static_cast<double>(count - std::min(count, period));
+    }())};
+    LOG_TRACE(<< "Number outliers = " << numberOutliers);
+
+    if (numberOutliers > 0) {
         TSizeSizePr2Vec windows;
         calculateIndexWindows(windows_, bucketLength, windows);
-        std::size_t period{trend.size()};
         std::size_t n{values.size()};
 
-        TSizeVec repeats{calculateRepeats(windows, period, values)};
-        double excess{std::accumulate(
-            repeats.begin(), repeats.end(), 0.0, [](double excess_, std::size_t repeat) {
-                return excess_ + static_cast<double>(repeat > 1 ? repeat - 1 : 0);
-            })};
-        std::size_t numberOutliers{static_cast<std::size_t>(SEASONAL_OUTLIER_FRACTION * excess)};
-        LOG_TRACE(<< "Number outliers = " << numberOutliers);
-
-        if (numberOutliers > 0) {
-            TMaxAccumulator outliers{numberOutliers};
-            TMeanAccumulator meanDifference;
-            for (const auto& window : windows) {
-                std::size_t a{window.first};
-                std::size_t b{window.second};
-                for (std::size_t j = a; j < b; ++j) {
-                    const TFloatMeanAccumulator& value{values[j % n]};
-                    if (CBasicStatistics::count(value) > 0.0) {
-                        std::size_t offset{(j - a) % period};
-                        double difference{std::fabs(CBasicStatistics::mean(value) -
-                                                    CBasicStatistics::mean(trend[offset]))};
-                        outliers.add({difference, j});
-                        meanDifference.add(difference);
-                    }
+        TMaxAccumulator outliers{numberOutliers};
+        TMeanAccumulator meanDifference;
+        for (const auto& window : windows) {
+            std::size_t a{window.first};
+            std::size_t b{window.second};
+            for (std::size_t j = a; j < b; ++j) {
+                const TFloatMeanAccumulator& value{values[j % n]};
+                if (CBasicStatistics::count(value) > 0.0) {
+                    std::size_t offset{(j - a) % period};
+                    double difference{std::fabs(CBasicStatistics::mean(value) -
+                                                CBasicStatistics::mean(trend[offset]))};
+                    outliers.add({difference, j});
+                    meanDifference.add(difference);
                 }
             }
-            TMeanAccumulator meanDifferenceOfOutliers;
-            for (const auto& outlier : outliers) {
-                meanDifferenceOfOutliers.add(outlier.first);
-            }
-            meanDifference -= meanDifferenceOfOutliers;
-            LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
-            LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));
-
-            for (const auto& outlier : outliers) {
-                if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
-                                        CBasicStatistics::mean(meanDifference)) {
-                    CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
-                }
+        }
+        TMeanAccumulator meanDifferenceOfOutliers;
+        for (const auto& outlier : outliers) {
+            meanDifferenceOfOutliers.add(outlier.first);
+        }
+        meanDifference -= meanDifferenceOfOutliers;
+        LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
+        LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));
+
+        for (const auto& outlier : outliers) {
+            if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
+                                    CBasicStatistics::mean(meanDifference)) {
+                CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
            }
-            LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
         }
+        LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
     }
 }
 
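One behavioural note on the rewrite above: the outlier budget is now fixed up front as a fraction of (non-empty buckets minus one full period), instead of being derived from the per-offset excess repeats, and the body no longer sits inside an if (values.size() > 0) guard. A rough standalone sketch of the new sizing rule (the fraction constant here is illustrative, not taken from the source):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const double SEASONAL_OUTLIER_FRACTION{0.1}; // illustrative value only

    // Bucket counts; zero means the bucket is empty.
    std::vector<double> counts{1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    std::size_t period{4};

    std::size_t nonEmpty(std::count_if(counts.begin(), counts.end(),
                                       [](double count) { return count > 0.0; }));
    std::size_t numberOutliers{static_cast<std::size_t>(
        SEASONAL_OUTLIER_FRACTION * static_cast<double>(nonEmpty - std::min(nonEmpty, period)))};

    std::cout << "non-empty buckets = " << nonEmpty
              << ", outliers to reweight = " << numberOutliers << '\n';
    // With 10 non-empty buckets and a period of 4 the budget is floor(0.1 * 6) = 0,
    // so nothing is down-weighted until more than one repeat of history has accumulated.
    return 0;
}
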
@@ -1839,10 +1838,12 @@ bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& w
     }
 
     // Compute the degrees of freedom given the alternative hypothesis.
-    TSizeVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
-    double b{static_cast<double>(
-        std::count_if(repeats.begin(), repeats.end(),
-                      [](std::size_t repeat) { return repeat > 0; }))};
+    double b{[&windows, &period_, &values, this] {
+        TDoubleVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
+        return static_cast<double>(
+            std::count_if(repeats.begin(), repeats.end(),
+                          [](double repeat) { return repeat > 0.0; }));
+    }()};
     double df1{stats.s_NonEmptyBuckets - b - static_cast<double>(segmentation.size() - 2)};
     LOG_TRACE(<< " populated = " << b);

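For the degrees of freedom, b is now simply the number of bucket offsets which are populated at least fractionally. Continuing the made-up repeats from the earlier sketch (standalone code, not the member-function context):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    std::vector<double> repeats{2.0, 0.0, 3.0, 0.3}; // per-offset repeats from the sketch above
    double b{static_cast<double>(
        std::count_if(repeats.begin(), repeats.end(), [](double repeat) { return repeat > 0.0; }))};
    std::cout << "populated offsets b = " << b << '\n'; // 3
    return 0;
}
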
@@ -2055,14 +2056,10 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
     // 3) The significance of the variance reduction, and
     // 4) The amount of variance reduction.
 
-    auto calculateMeanRepeats = [&](const TTimeTimePr2Vec& w, core_t::TTime p) {
-        TSizeVec repeats{calculateRepeats(w, p, m_BucketLength, values)};
-        return CBasicStatistics::mean(
-            std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
-                            [](TMeanAccumulator mean, std::size_t r) {
-                                mean.add(static_cast<double>(r));
-                                return mean;
-                            }));
+    auto calculateMeanRepeats = [&values, this](const TTimeTimePr2Vec& w, core_t::TTime p) {
+        TMeanAccumulator result;
+        result.add(calculateRepeats(w, p, m_BucketLength, values));
+        return CBasicStatistics::mean(result);
     };
 
     double p{0.0};
@@ -2143,13 +2140,11 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
     LOG_TRACE(<< " autocorrelation = " << R);
     LOG_TRACE(<< " autocorrelationThreshold = " << stats.s_AutocorrelationThreshold);
 
-    TSizeVec repeats{calculateRepeats(window, period_, m_BucketLength, buckets)};
-    meanRepeats = CBasicStatistics::mean(
-        std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
-                        [](TMeanAccumulator mean, std::size_t repeat) {
-                            mean.add(static_cast<double>(repeat));
-                            return mean;
-                        }));
+    meanRepeats = [&window, &period_, &buckets, this] {
+        TMeanAccumulator result;
+        result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
+        return CBasicStatistics::mean(result);
+    }();
     LOG_TRACE(<< " mean repeats = " << meanRepeats);
 
     // We're trading off:
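
Both simplified lambdas above compute the same quantity: the mean number of (possibly fractional) repeats per bucket offset within the period. A standalone sketch with plain doubles in place of the TMeanAccumulator type:

#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Per-offset repeats as produced by the new calculateRepeats.
    std::vector<double> repeats{2.0, 0.0, 3.0, 0.3};
    double meanRepeats{std::accumulate(repeats.begin(), repeats.end(), 0.0) /
                       static_cast<double>(repeats.size())};
    std::cout << "mean repeats = " << meanRepeats << '\n'; // 1.325
    return 0;
}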
