Skip to content

[ML] Improvements to sparse count modelling #721

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ boosted tree training. Hard depth based regularization is often the strategy of
choice to prevent overfitting for XGBoost. By smoothing we can make better tradeoffs.
Also, the parameters of the penalty function are more suited to optimising with our
Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
* Improvements to count and sum anomaly detection for sparse data. This primarily
aims to improve handling of data which are predictably present: detecting when they
are unexpectedly missing. (See {ml-pull}721[#721].)

== {es} version 7.4.1

Expand Down
27 changes: 0 additions & 27 deletions include/maths/CModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,6 @@ class MATHS_EXPORT CModelParams {
//! Get the maximum time to test for a change point in the model.
core_t::TTime maximumTimeToTestForChange() const;

//! Set the probability that the bucket will be empty for the model.
void probabilityBucketEmpty(double probability);

//! Get the probability that the bucket will be empty for the model.
double probabilityBucketEmpty() const;

private:
//! The data bucketing length.
core_t::TTime m_BucketLength;
Expand All @@ -93,8 +87,6 @@ class MATHS_EXPORT CModelParams {
core_t::TTime m_MinimumTimeToDetectChange;
//! The maximum time permitted to test for a change in the model.
core_t::TTime m_MaximumTimeToTestForChange;
//! The probability that a bucket will be empty for the model.
double m_ProbabilityBucketEmpty;
};

//! \brief The extra parameters needed by CModel::addSamples.
Expand Down Expand Up @@ -169,13 +161,6 @@ class MATHS_EXPORT CModelProbabilityParams {
//! Get the confidence interval to use when detrending.
double seasonalConfidenceInterval() const;

//! Add whether a value's bucket is empty.
CModelProbabilityParams& addBucketEmpty(const TBool2Vec& empty);
//! Set whether or not the values' bucket is empty.
CModelProbabilityParams& bucketEmpty(const TBool2Vec1Vec& empty);
//! Get whether the values' bucket is empty.
const TBool2Vec1Vec& bucketEmpty() const;

//! Add a value's weights.
CModelProbabilityParams& addWeights(const TDouble2VecWeightsAry& weights);
//! Set the values' weights.
Expand Down Expand Up @@ -215,8 +200,6 @@ class MATHS_EXPORT CModelProbabilityParams {
TProbabilityCalculation2Vec m_Calculations;
//! The confidence interval to use when detrending.
double m_SeasonalConfidenceInterval;
//! True if the bucket is empty and false otherwise.
TBool2Vec1Vec m_BucketEmpty;
//! The sample weights.
TDouble2VecWeightsAry1Vec m_Weights;
//! The coordinates for which to compute the probability.
Expand Down Expand Up @@ -469,16 +452,6 @@ class MATHS_EXPORT CModel {
static boost::optional<VECTOR>
predictionError(double propagationInterval, const PRIOR& prior, const VECTOR& sample);

//! Correct \p probability with \p probabilityEmptyBucket.
static double jointProbabilityGivenBucket(bool bucketEmpty,
double probabilityBucketEmpty,
double probability);

//! Correct \p probability with \p probabilityEmptyBucket.
static double jointProbabilityGivenBucket(const TBool2Vec& bucketEmpty,
const TDouble2Vec& probabilityEmptyBucket,
double probability);

private:
//! The model parameters.
CModelParams m_Params;
Expand Down
4 changes: 0 additions & 4 deletions include/model/CAnomalyDetectorModelConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! category from the sketch to cluster.
static const double DEFAULT_CATEGORY_DELETE_FRACTION;

//! The default minimum frequency of non-empty buckets at which we model
//! all buckets.
static const double DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS;

//! The default size of the seasonal components we will model.
static const std::size_t DEFAULT_COMPONENT_SIZE;

Expand Down
4 changes: 0 additions & 4 deletions include/model/CIndividualModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,6 @@ class MODEL_EXPORT CIndividualModel : public CAnomalyDetectorModel {
//! for features which count empty buckets.
double emptyBucketWeight(model_t::EFeature feature, std::size_t pid, core_t::TTime time) const;

//! Get the "probability the bucket is empty" to use to correct probabilities
//! for features which count empty buckets.
double probabilityBucketEmpty(model_t::EFeature feature, std::size_t pid) const;

//! Get a read only model corresponding to \p feature of the person \p pid.
const maths::CModel* model(model_t::EFeature feature, std::size_t pid) const;

Expand Down
3 changes: 0 additions & 3 deletions include/model/CModelParams.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@ struct MODEL_EXPORT SModelParams {
//! The minimum permitted count of points in a distribution mode.
double s_MinimumModeCount;

//! The minimum frequency of non-empty buckets at which we model all buckets.
double s_CutoffToModelEmptyBuckets;

//! The number of points to use for approximating each seasonal component.
std::size_t s_ComponentSize;

Expand Down
10 changes: 2 additions & 8 deletions include/model/ModelTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -512,15 +512,9 @@ double inverseOffsetCountToZero(EFeature feature, double count);
MODEL_EXPORT
void inverseOffsetCountToZero(EFeature feature, TDouble1Vec& count);

//! Check if the feature counts empty buckets.
//! Check if the feature has a value for empty buckets.
MODEL_EXPORT
bool countsEmptyBuckets(EFeature feature);

//! Get the weight to apply to an empty bucket sample based on the
//! frequency \p feature at which empty buckets are seen for \p feature
//! and the cutoff for empty buckets directly.
MODEL_EXPORT
double emptyBucketCountWeight(EFeature feature, double frequency, double cutoff);
bool includeEmptyBuckets(EFeature feature);

//! Get the rate at which \p feature learns.
MODEL_EXPORT
Expand Down
54 changes: 1 addition & 53 deletions lib/maths/CModel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ CModelParams::CModelParams(core_t::TTime bucketLength,
: m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate),
m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale),
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 6 * bucketLength)),
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)),
m_ProbabilityBucketEmpty(0.0) {
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)) {
}

core_t::TTime CModelParams::bucketLength() const {
Expand Down Expand Up @@ -80,14 +79,6 @@ core_t::TTime CModelParams::maximumTimeToTestForChange() const {
return m_MaximumTimeToTestForChange;
}

//! Set the probability that a bucket will be empty for the model.
void CModelParams::probabilityBucketEmpty(double probability) {
m_ProbabilityBucketEmpty = probability;
}

//! Get the probability that a bucket will be empty for the model.
double CModelParams::probabilityBucketEmpty() const {
return m_ProbabilityBucketEmpty;
}

//////// CModelAddSamplesParams ////////

CModelAddSamplesParams& CModelAddSamplesParams::integer(bool integer) {
Expand Down Expand Up @@ -168,20 +159,6 @@ double CModelProbabilityParams::seasonalConfidenceInterval() const {
return m_SeasonalConfidenceInterval;
}

//! Append \p empty, the empty indicators for one value's bucket(s).
//! Returns *this to allow chained parameter setting.
CModelProbabilityParams& CModelProbabilityParams::addBucketEmpty(const TBool2Vec& empty) {
m_BucketEmpty.push_back(empty);
return *this;
}

//! Set the empty indicators for all values' buckets, replacing any
//! previously added. Returns *this to allow chained parameter setting.
CModelProbabilityParams& CModelProbabilityParams::bucketEmpty(const TBool2Vec1Vec& empty) {
m_BucketEmpty = empty;
return *this;
}

//! Get the empty indicators for the values' buckets.
const CModelProbabilityParams::TBool2Vec1Vec& CModelProbabilityParams::bucketEmpty() const {
return m_BucketEmpty;
}

CModelProbabilityParams&
CModelProbabilityParams::addWeights(const TDouble2VecWeightsAry& weights) {
m_Weights.push_back(weights);
Expand Down Expand Up @@ -288,35 +265,6 @@ CModelParams& CModel::params() {
return m_Params;
}

//! Correct \p probability for the chance the bucket is empty.
//!
//! If the bucket is non-empty the result is scaled by the chance of a
//! non-empty bucket; if it is empty, the empty-bucket probability mass
//! is added so an empty bucket can never look more anomalous than the
//! chance of it being empty.
double CModel::jointProbabilityGivenBucket(bool empty, double probabilityBucketEmpty, double probability) {
if (empty == false) {
return (1.0 - probabilityBucketEmpty) * probability;
}
return probabilityBucketEmpty + (1.0 - probabilityBucketEmpty) * probability;
}

//! Correct \p probability for the chance that either of two (correlated)
//! coordinates' buckets is empty.
//!
//! \param[in] empty Whether each coordinate's bucket is empty.
//! \param[in] probabilityEmptyBucket The probability that each
//! coordinate's bucket is empty.
//! \param[in] probability The raw probability to correct.
// NOTE(review): in the two mixed empty/non-empty branches \p probability
// is multiplied in twice (once standalone and once inside the empty-side
// factor), whereas the both-non-empty branch applies it once — confirm
// this asymmetry is intended rather than a copy/paste slip.
double CModel::jointProbabilityGivenBucket(const TBool2Vec& empty,
const TDouble2Vec& probabilityEmptyBucket,
double probability) {

// Per-coordinate probabilities of an empty bucket.
double p00{probabilityEmptyBucket[0]};
double p10{probabilityEmptyBucket[1]};

if (empty[0] == false && empty[1] == false) {
return (1.0 - p00) * (1.0 - p10) * probability;
}

if (empty[0] == false) {
return (1.0 - p00) * probability * (p10 + (1.0 - p10) * probability);
}

if (empty[1] == false) {
return (p00 + (1.0 - p00) * probability) * (1.0 - p10) * probability;
}

// Both buckets empty: apply the single-coordinate correction to each.
return (p00 + (1.0 - p00) * probability) * (p10 + (1.0 - p10) * probability);
}

//////// CModelStub ////////

CModelStub::CModelStub() : CModel(stubParameters()) {
Expand Down
133 changes: 64 additions & 69 deletions lib/maths/CPeriodicityHypothesisTests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,29 +303,28 @@ void project(const TFloatMeanAccumulatorVec& values,

//! Calculate the number of non-empty buckets at each bucket offset in
//! the period for the \p values in \p windows.
TSizeVec calculateRepeats(const TSizeSizePr2Vec& windows,
std::size_t period,
const TFloatMeanAccumulatorVec& values) {
TSizeVec result(std::min(period, length(windows[0])), 0);
TDoubleVec calculateRepeats(const TSizeSizePr2Vec& windows,
std::size_t period,
const TFloatMeanAccumulatorVec& values) {
TDoubleVec result(std::min(period, length(windows[0])), 0);
std::size_t n{values.size()};
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t i = a; i < b; ++i) {
if (CBasicStatistics::count(values[i % n]) > 0.0) {
++result[(i - a) % period];
}
double count{CBasicStatistics::count(values[i % n])};
result[(i - a) % period] += std::min(count, 1.0);
}
}
return result;
}

//! Calculate the number of non-empty buckets at each bucket offset in
//! the period for the \p values in \p windows.
TSizeVec calculateRepeats(const TTimeTimePr2Vec& windows_,
core_t::TTime period,
core_t::TTime bucketLength,
const TFloatMeanAccumulatorVec& values) {
TDoubleVec calculateRepeats(const TTimeTimePr2Vec& windows_,
core_t::TTime period,
core_t::TTime bucketLength,
const TFloatMeanAccumulatorVec& values) {
TSizeSizePr2Vec windows;
calculateIndexWindows(windows_, bucketLength, windows);
return calculateRepeats(windows, period / bucketLength, values);
Expand All @@ -344,53 +343,53 @@ void reweightOutliers(const std::vector<T>& trend,
using TMaxAccumulator =
CBasicStatistics::COrderStatisticsHeap<TDoubleSizePr, std::greater<TDoubleSizePr>>;

if (values.size() > 0) {
std::size_t period{trend.size()};
std::size_t numberOutliers{static_cast<std::size_t>([&period, &values] {
std::size_t count(std::count_if(
values.begin(), values.end(), [](const TFloatMeanAccumulator& value) {
return CBasicStatistics::count(value) > 0.0;
}));
return SEASONAL_OUTLIER_FRACTION *
static_cast<double>(count - std::min(count, period));
}())};
LOG_TRACE(<< "Number outliers = " << numberOutliers);

if (numberOutliers > 0) {
TSizeSizePr2Vec windows;
calculateIndexWindows(windows_, bucketLength, windows);
std::size_t period{trend.size()};
std::size_t n{values.size()};

TSizeVec repeats{calculateRepeats(windows, period, values)};
double excess{std::accumulate(
repeats.begin(), repeats.end(), 0.0, [](double excess_, std::size_t repeat) {
return excess_ + static_cast<double>(repeat > 1 ? repeat - 1 : 0);
})};
std::size_t numberOutliers{static_cast<std::size_t>(SEASONAL_OUTLIER_FRACTION * excess)};
LOG_TRACE(<< "Number outliers = " << numberOutliers);

if (numberOutliers > 0) {
TMaxAccumulator outliers{numberOutliers};
TMeanAccumulator meanDifference;
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t j = a; j < b; ++j) {
const TFloatMeanAccumulator& value{values[j % n]};
if (CBasicStatistics::count(value) > 0.0) {
std::size_t offset{(j - a) % period};
double difference{std::fabs(CBasicStatistics::mean(value) -
CBasicStatistics::mean(trend[offset]))};
outliers.add({difference, j});
meanDifference.add(difference);
}
TMaxAccumulator outliers{numberOutliers};
TMeanAccumulator meanDifference;
for (const auto& window : windows) {
std::size_t a{window.first};
std::size_t b{window.second};
for (std::size_t j = a; j < b; ++j) {
const TFloatMeanAccumulator& value{values[j % n]};
if (CBasicStatistics::count(value) > 0.0) {
std::size_t offset{(j - a) % period};
double difference{std::fabs(CBasicStatistics::mean(value) -
CBasicStatistics::mean(trend[offset]))};
outliers.add({difference, j});
meanDifference.add(difference);
}
}
TMeanAccumulator meanDifferenceOfOutliers;
for (const auto& outlier : outliers) {
meanDifferenceOfOutliers.add(outlier.first);
}
meanDifference -= meanDifferenceOfOutliers;
LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));

for (const auto& outlier : outliers) {
if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
}
}
TMeanAccumulator meanDifferenceOfOutliers;
for (const auto& outlier : outliers) {
meanDifferenceOfOutliers.add(outlier.first);
}
meanDifference -= meanDifferenceOfOutliers;
LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference));
LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers));

for (const auto& outlier : outliers) {
if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT;
}
LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
}
LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values));
}
}

Expand Down Expand Up @@ -1839,10 +1838,12 @@ bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& w
}

// Compute the degrees of freedom given the alternative hypothesis.
TSizeVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
double b{static_cast<double>(
std::count_if(repeats.begin(), repeats.end(),
[](std::size_t repeat) { return repeat > 0; }))};
double b{[&windows, &period_, &values, this] {
TDoubleVec repeats(calculateRepeats(windows, period_, m_BucketLength, values));
return static_cast<double>(
std::count_if(repeats.begin(), repeats.end(),
[](double repeat) { return repeat > 0.0; }));
}()};
double df1{stats.s_NonEmptyBuckets - b - static_cast<double>(segmentation.size() - 2)};
LOG_TRACE(<< " populated = " << b);

Expand Down Expand Up @@ -2055,14 +2056,10 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
// 3) The significance of the variance reduction, and
// 4) The amount of variance reduction.

auto calculateMeanRepeats = [&](const TTimeTimePr2Vec& w, core_t::TTime p) {
TSizeVec repeats{calculateRepeats(w, p, m_BucketLength, values)};
return CBasicStatistics::mean(
std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
[](TMeanAccumulator mean, std::size_t r) {
mean.add(static_cast<double>(r));
return mean;
}));
auto calculateMeanRepeats = [&values, this](const TTimeTimePr2Vec& w, core_t::TTime p) {
TMeanAccumulator result;
result.add(calculateRepeats(w, p, m_BucketLength, values));
return CBasicStatistics::mean(result);
};

double p{0.0};
Expand Down Expand Up @@ -2143,13 +2140,11 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
LOG_TRACE(<< " autocorrelation = " << R);
LOG_TRACE(<< " autocorrelationThreshold = " << stats.s_AutocorrelationThreshold);

TSizeVec repeats{calculateRepeats(window, period_, m_BucketLength, buckets)};
meanRepeats = CBasicStatistics::mean(
std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{},
[](TMeanAccumulator mean, std::size_t repeat) {
mean.add(static_cast<double>(repeat));
return mean;
}));
meanRepeats = [&window, &period_, &buckets, this] {
TMeanAccumulator result;
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
Expand Down
Loading