[ML] Improve residual model selection #468

Merged
merged 9 commits on Apr 30, 2019

2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -40,6 +40,8 @@ to the model. (See {ml-pull}214[#214].)
* Only select more complex trend models for forecasting if there is evidence that they are needed.
(See {ml-pull}463[#463].)

* Improve residual model selection. (See {ml-pull}468[#468].)

=== Bug Fixes

* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
5 changes: 4 additions & 1 deletion include/maths/COneOfNPrior.h
@@ -324,7 +324,7 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
using TDoubleSizePr5Vec = core::CSmallVector<TDoubleSizePr, 5>;
using TWeightPriorPtrPr = std::pair<CModelWeight, TPriorPtr>;
using TWeightPriorPtrPrVec = std::vector<TWeightPriorPtrPr>;
using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;

private:
//! Read parameters from \p traverser.
@@ -350,6 +350,9 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
private:
//! A collection of component models and their probabilities.
TWeightPriorPtrPrVec m_Models;

//! The moments of the samples added.
TMeanVarAccumulator m_SampleMoments;
};
}
}
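Note: the new m_SampleMoments member tracks the mean and variance of the samples added to the prior. As a rough illustration of what such an accumulator computes, here is a generic Welford-style sketch; this is not the CBasicStatistics::SSampleMeanVar implementation, and all names in it are illustrative only.

#include <initializer_list>
#include <iostream>

// Generic running mean/variance accumulator (Welford's algorithm). Illustrative
// stand-in only; the ml-cpp accumulator has its own implementation and interface.
struct MeanVarAccumulator {
    double count{0.0};
    double mean{0.0};
    double m2{0.0}; // sum of squared deviations from the running mean
    void add(double x) {
        count += 1.0;
        double delta{x - mean};
        mean += delta / count;
        m2 += delta * (x - mean);
    }
    double variance() const { return count > 1.0 ? m2 / (count - 1.0) : 0.0; }
};

int main() {
    MeanVarAccumulator moments;
    for (double sample : {1.0, 2.0, 4.0, 8.0}) {
        moments.add(sample);
    }
    std::cout << "mean = " << moments.mean << ", variance = " << moments.variance() << '\n';
    return 0;
}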
3 changes: 0 additions & 3 deletions include/maths/Constants.h
@@ -116,9 +116,6 @@ const double MINIMUM_CLUSTER_SPLIT_COUNT{24.0};

//! The minimum count of a category in the sketch to cluster.
const double MINIMUM_CATEGORY_COUNT{0.5};

//! Get the maximum amount we'll penalize a model in addSamples.
MATHS_EXPORT double maxModelPenalty(double numberSamples);
}
}

18 changes: 9 additions & 9 deletions lib/api/unittest/CAnomalyJobLimitTest.cc
@@ -349,12 +349,12 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
std::size_t s_ExpectedByFields;
std::size_t s_ExpectedOverFields;
std::size_t s_ExpectedPartitionFields;
std::size_t s_ExpectedByLowerMemoryLimit;
std::size_t s_ExpectedPartitionLowerMemoryLimit;
std::size_t s_ExpectedOverLowerMemoryLimit;
} testParams[] = {{600, 550, 6000, 300, 33, 40, 40},
{3600, 550, 5500, 300, 30, 25, 20},
{172800, 150, 850, 120, 6, 6, 3}};
std::size_t s_ExpectedByMemoryUsageRelativeErrorDivisor;
std::size_t s_ExpectedPartitionUsageRelativeErrorDivisor;
std::size_t s_ExpectedOverUsageRelativeErrorDivisor;
} testParams[]{{600, 550, 6000, 300, 33, 40, 40},
{3600, 550, 5500, 300, 27, 25, 20},
{172800, 150, 850, 120, 6, 6, 3}};

for (const auto& testParam : testParams) {
TGeneratorVec generators{periodic, tradingDays, level, ramp, sparse};
@@ -410,7 +410,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
CPPUNIT_ASSERT_EQUAL(std::size_t(2), used.s_PartitionFields);
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedByLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedByMemoryUsageRelativeErrorDivisor);
}
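(The renamed divisor fields set the assertion tolerance: with an expected usage of memoryLimit * 1024 * 1024 / 2 bytes, a divisor of 40 allows the measured usage to deviate by memoryLimit * 1024 * 1024 / 40 bytes, i.e. by 2.5% of the memory limit or 5% of the expected value.)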

LOG_DEBUG(<< "**** Test partition with bucketLength = " << testParam.s_BucketLength
@@ -461,7 +461,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
0.96 * static_cast<double>(used.s_PartitionFields));
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionUsageRelativeErrorDivisor);
}

LOG_DEBUG(<< "**** Test over with bucketLength = " << testParam.s_BucketLength
@@ -508,7 +508,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
used.s_OverFields < 7000);
CPPUNIT_ASSERT_DOUBLES_EQUAL(
memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverLowerMemoryLimit);
memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverUsageRelativeErrorDivisor);
}
}
}
131 changes: 66 additions & 65 deletions lib/maths/CMultivariateOneOfNPrior.cc
@@ -54,13 +54,6 @@ const std::string WEIGHT_TAG("c");
const std::string PRIOR_TAG("d");
const std::string DECAY_RATE_TAG("e");

//! Add elements of \p x to \p y.
void add(const TDouble10Vec& x, TDouble10Vec& y) {
for (std::size_t i = 0u; i < x.size(); ++i) {
y[i] += x[i];
}
}

//! Get the min of \p x and \p y.
TDouble10Vec min(const TDouble10Vec& x, const TDouble10Vec& y) {
TDouble10Vec result(x);
@@ -103,11 +96,6 @@ void updateMean(const TDouble10Vec10Vec& x, double nx, TDouble10Vec10Vec& mean,
n += nx;
}

//! Get the largest element of \p x.
double largest(const TDouble10Vec& x) {
return *std::max_element(x.begin(), x.end());
}

//! Add a model vector entry reading parameters from \p traverser.
bool modelAcceptRestoreTraverser(const SDistributionRestoreParams& params,
TWeightPriorPtrPrVec& models,
@@ -174,7 +162,7 @@ void modelAcceptPersistInserter(const CModelWeight& weight,
const double DERATE = 0.99999;
const double MINUS_INF = DERATE * boost::numeric::bounds<double>::lowest();
const double INF = DERATE * boost::numeric::bounds<double>::highest();
const double LOG_INITIAL_WEIGHT = std::log(1e-6);
const double MAXIMUM_LOG_BAYES_FACTOR = std::log(1e6);
const double MINIMUM_SIGNIFICANT_WEIGHT = 0.01;
}

@@ -306,81 +294,94 @@ void CMultivariateOneOfNPrior::addSamples(const TDouble10Vec1Vec& samples,

this->adjustOffset(samples, weights);

double penalty = CTools::fastLog(this->numberSamples());
double n{this->numberSamples()};
this->CMultivariatePrior::addSamples(samples, weights);
penalty = (penalty - CTools::fastLog(this->numberSamples())) / 2.0;
n = this->numberSamples() - n;

// See COneOfNPrior::addSamples for a discussion.

CScopeCanonicalizeWeights<TPriorPtr> canonicalize(m_Models);

// We need to check *before* adding samples to the constituent models.
bool isNonInformative = this->isNonInformative();
bool isNonInformative{this->isNonInformative()};

// Compute the unnormalized posterior weights and update the component
// priors. These weights are computed on the side since they are only
// updated if all marginal likelihoods can be computed.
TDouble3Vec logLikelihoods;
TMaxAccumulator maxLogLikelihood;
TBool3Vec used, uses;
TDouble3Vec minusBics;
TBool3Vec used;
TBool3Vec uses;
bool failed{false};

// If none of the models are a good fit for the new data then restrict
// the maximum Bayes Factor since the data likelihood given the model
// is most useful if one of the models is (mostly) correct.
double m{std::max(n, 1.0)};
double maxLogBayesFactor{-m * MAXIMUM_LOG_BAYES_FACTOR};

for (auto& model : m_Models) {
bool use = model.second->participatesInModelSelection();

// Update the weights with the marginal likelihoods.
double logLikelihood = 0.0;
maths_t::EFloatingPointErrorStatus status =
use ? model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood)
: maths_t::E_FpOverflowed;
if (status & maths_t::E_FpFailed) {
LOG_ERROR(<< "Failed to compute log-likelihood");
LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
} else {
if (!(status & maths_t::E_FpOverflowed)) {
logLikelihood += model.second->unmarginalizedParameters() * penalty;
logLikelihoods.push_back(logLikelihood);
maxLogLikelihood.add(logLikelihood);
} else {
logLikelihoods.push_back(MINUS_INF);

double minusBic{0.0};
maths_t::EFloatingPointErrorStatus status{maths_t::E_FpOverflowed};

used.push_back(model.second->participatesInModelSelection());
if (used.back()) {
double logLikelihood;
status = model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood);

minusBic = 2.0 * logLikelihood -
model.second->unmarginalizedParameters() * CTools::fastLog(n);

double modeLogLikelihood;
TDouble10Vec1Vec modes;
modes.reserve(weights.size());
for (const auto& weight : weights) {
modes.push_back(model.second->marginalLikelihoodMode(weight));
}
model.second->jointLogMarginalLikelihood(modes, weights, modeLogLikelihood);
maxLogBayesFactor = std::max(
maxLogBayesFactor, logLikelihood - std::max(modeLogLikelihood, logLikelihood));
}

minusBics.push_back((status & maths_t::E_FpOverflowed) ? MINUS_INF : minusBic);
failed |= (status & maths_t::E_FpFailed);

// Update the component prior distribution.
model.second->addSamples(samples, weights);
// Update the component prior distribution.
model.second->addSamples(samples, weights);

used.push_back(use);
uses.push_back(model.second->participatesInModelSelection());
uses.push_back(model.second->participatesInModelSelection());
if (!uses.back()) {
model.first.logWeight(MINUS_INF);
}
}

TDouble10Vec n(m_Dimension, 0.0);
for (const auto& weight : weights) {
add(maths_t::count(weight), n);
if (failed) {
LOG_ERROR(<< "Failed to compute log-likelihood");
LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
Contributor review comment: Would it be useful to print out the weights here as well?

LOG_ERROR(<< "weights = " << core::CContainerPrinter::print(weights));
return;
}
if (isNonInformative) {
return;
}

if (!isNonInformative && maxLogLikelihood.count() > 0) {
LOG_TRACE(<< "logLikelihoods = " << core::CContainerPrinter::print(logLikelihoods));
maxLogBayesFactor += m * MAXIMUM_LOG_BAYES_FACTOR;

// The idea here is to limit the amount which extreme samples
// affect model selection, particularly early on in the model
// life-cycle.
double l = largest(n);
double minLogLikelihood =
maxLogLikelihood[0] -
l * std::min(maxModelPenalty(this->numberSamples()), 100.0);
LOG_TRACE(<< "BICs = " << core::CContainerPrinter::print(minusBics));
LOG_TRACE(<< "max Bayes Factor = " << maxLogBayesFactor);

TMaxAccumulator maxLogWeight;
for (std::size_t i = 0; i < logLikelihoods.size(); ++i) {
CModelWeight& weight = m_Models[i].first;
if (!uses[i]) {
weight.logWeight(MINUS_INF);
} else if (used[i]) {
weight.addLogFactor(std::max(logLikelihoods[i], minLogLikelihood));
maxLogWeight.add(weight.logWeight());
}
double maxLogModelWeight{MINUS_INF + m * MAXIMUM_LOG_BAYES_FACTOR};
double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};
for (std::size_t i = 0; i < m_Models.size(); ++i) {
if (used[i] && uses[i]) {
m_Models[i].first.addLogFactor(
std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor));
maxLogModelWeight = std::max(maxLogModelWeight, m_Models[i].first.logWeight());
}
for (std::size_t i = 0u; i < m_Models.size(); ++i) {
if (!used[i] && uses[i]) {
m_Models[i].first.logWeight(maxLogWeight[0] + LOG_INITIAL_WEIGHT);
}
}
for (std::size_t i = 0; i < m_Models.size(); ++i) {
if (!used[i] && uses[i]) {
m_Models[i].first.logWeight(maxLogModelWeight - m * MAXIMUM_LOG_BAYES_FACTOR);
}
}
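For context, the new selection logic in this hunk scores each candidate residual model by minus BIC, 2 * log(likelihood) - parameters * log(n), and floors each model's per-update evidence so that one batch of samples cannot shift the log odds by more than the maximum log Bayes factor. The snippet below is a standalone sketch of that update, not ml-cpp code: the names are illustrative and the cap is shown as a constant, whereas in the diff it is derived per update from how far each model's likelihood falls below its own mode, limited to n * log(1e6).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical per-model state: log-likelihood of the new samples, number of
    // unmarginalized parameters and the current log selection weight.
    struct Model { double logLikelihood; double parameters; double logWeight; };
    std::vector<Model> models{{-12.0, 0.0, 0.0}, {-9.5, 2.0, 0.0}, {-30.0, 1.0, 0.0}};

    double n{20.0};                              // samples added in this update
    double maxLogBayesFactor{n * std::log(1e6)}; // fixed cap, for illustration only

    // Score each model by minus BIC = 2 * log(L) - k * log(n).
    std::vector<double> minusBics;
    for (const auto& model : models) {
        minusBics.push_back(2.0 * model.logLikelihood - model.parameters * std::log(n));
    }
    double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};

    // Bump each log-weight by max(-BIC/2, bestMinusBic/2 - cap), so no model can
    // lose more than the capped log Bayes factor in a single update.
    for (std::size_t i = 0; i < models.size(); ++i) {
        models[i].logWeight +=
            std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor);
        std::cout << "model " << i << " log-weight = " << models[i].logWeight << '\n';
    }
    return 0;
}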
