Skip to content

[6.5][ML] Add multi_bucket_impact label to anomalies #239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ Increased independence of anomaly scores across partitions (See {ml-pull}182[182
Avoid potential false positives at model start up when first detecting new components of the time
series decomposition. (See {ml-pull}218[218].)

Add a new label - multi_bucket_impact - to record-level anomaly results.
The value will be on a scale of -5 to +5, where -5 means the anomaly is purely single-bucket
and +5 means the anomaly is purely multi-bucket. ({ml-pull}230[230])

=== Bug Fixes

Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])
Expand Down
2 changes: 2 additions & 0 deletions include/api/CHierarchicalResultsWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
Expand Down Expand Up @@ -131,6 +132,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
double s_RawAnomalyScore;
double s_NormalizedAnomalyScore;
double s_Probability;
double s_MultiBucketImpact;
const TStoredStringPtrStoredStringPtrPrDoublePrVec& s_Influences;
int s_Identifier;
TStr1Vec s_ScheduledEventDescriptions;
Expand Down
13 changes: 10 additions & 3 deletions include/maths/CModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,12 +230,19 @@ struct MATHS_EXPORT SModelProbabilityResult {
using TSize1Vec = core::CSmallVector<std::size_t, 1>;
using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;

//! Labels for different contributions to the overall probability.
enum EFeatureProbabilityLabel {
E_SingleBucketProbability,
E_MultiBucketProbability,
E_AnomalyModelProbability,
E_UndefinedProbability
};

//! \brief Wraps up a feature label and probability.
struct MATHS_EXPORT SFeatureProbability {
using TStrCRef = boost::reference_wrapper<const std::string>;
SFeatureProbability();
SFeatureProbability(const std::string& label, double probability);
TStrCRef s_Label;
SFeatureProbability(EFeatureProbabilityLabel label, double probability);
EFeatureProbabilityLabel s_Label;
double s_Probability = 1.0;
};
using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;
Expand Down
3 changes: 3 additions & 0 deletions include/model/CAnnotatedProbability.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ struct MODEL_EXPORT SAnnotatedProbability {
//! The probability of seeing the series' sample in a time interval.
double s_Probability;

//! The impact of multi/single bucket analysis on the probability
double s_MultiBucketImpact;

//! The smallest attribute probabilities and associated data describing
//! the calculation.
TAttributeProbability1Vec s_AttributeProbabilities;
Expand Down
1 change: 1 addition & 0 deletions include/model/CAnnotatedProbabilityBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class MODEL_EXPORT CAnnotatedProbabilityBuilder : private core::CNonCopyable {
void personAttributeProbabilityPrior(const maths::CMultinomialConjugate* prior);
void personFrequency(double frequency, bool everSeenBefore);
void probability(double p);
void multiBucketImpact(double multiBucketImpact);
void addAttributeProbability(std::size_t cid,
const core::CStoredStringPtr& attribute,
double pAttribute,
Expand Down
3 changes: 3 additions & 0 deletions include/model/CAnomalyDetectorModelConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! for anomaly detection.
static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;

//! The maximum value that the multi_bucket_impact can take
static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;

//! The maximum number of times we'll update a model in a bucketing
//! interval. This only applies to our metric statistics, which are
//! computed on a fixed number of measurements rather than a fixed
Expand Down
25 changes: 25 additions & 0 deletions include/model/CProbabilityAndInfluenceCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
using TStrCRefDouble1VecDouble1VecPrPrVec = std::vector<TStrCRefDouble1VecDouble1VecPrPr>;
using TStrCRefDouble1VecDouble1VecPrPrVecVec =
std::vector<TStrCRefDouble1VecDouble1VecPrPrVec>;
using TFeatureProbabilityLabelDoubleUMap =
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, double>;
using TFeatureProbabilityLabelProbabilityAggregatorUMap =
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, CModelTools::CProbabilityAggregator>;
using TStoredStringPtrStoredStringPtrPr =
std::pair<core::CStoredStringPtr, core::CStoredStringPtr>;
using TStoredStringPtrStoredStringPtrPrVec = std::vector<TStoredStringPtrStoredStringPtrPr>;
Expand Down Expand Up @@ -308,10 +312,23 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
bool calculate(double& probability,
TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const;

//! Calculate a measure of the impact of both the single bucket and multi
//! bucket probabilities on the make up of the overall probability.
//!
//! The calculation is designed such that the impact saturates when
//! one of the probabilities is less than a small fraction of the other or
//! when one probability is close to one, i.e. when one factor is not at all anomalous.
//!
//! \param[out] multiBucketImpact Filled in with the impact of constituent probabilities.
bool calculateMultiBucketImpact(double& multiBucketImpact) const;

private:
//! Actually commit any influences we've found.
void commitInfluences(model_t::EFeature feature, double logp, double weight);

//! calculate the explaining probabilities
bool calculateExplainingProbabilities(TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const;

private:
//! The minimum value for the influence for which an influencing
//! field value is judged to have any influence on a feature value.
Expand All @@ -327,6 +344,14 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
//! The probability calculator.
CModelTools::CProbabilityAggregator m_Probability;

//! holds the probabilities of explanatory features
TFeatureProbabilityLabelProbabilityAggregatorUMap m_ExplainingProbabilities =
TFeatureProbabilityLabelProbabilityAggregatorUMap{
{maths::SModelProbabilityResult::E_SingleBucketProbability,
{CModelTools::CProbabilityAggregator::E_Min}},
{maths::SModelProbabilityResult::E_MultiBucketProbability,
{CModelTools::CProbabilityAggregator::E_Min}}};

//! The probability calculation cache if there is one.
CModelTools::CProbabilityCache* m_ProbabilityCache;

Expand Down
17 changes: 12 additions & 5 deletions lib/api/CHierarchicalResultsWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ CHierarchicalResultsWriter::SResults::SResults(
s_FunctionValue(functionValue), s_PopulationAverage(populationAverage),
s_BaselineRate(0.0), s_CurrentRate(currentRate), s_BaselineMean(1, 0.0),
s_CurrentMean(1, 0.0), s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE},
s_Influences(influences), s_Identifier(identifier) {
}

Expand All @@ -82,6 +83,7 @@ CHierarchicalResultsWriter::SResults::SResults(
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
Expand All @@ -103,7 +105,8 @@ CHierarchicalResultsWriter::SResults::SResults(
s_BaselineRate(baselineRate), s_CurrentRate(currentRate),
s_BaselineMean(baselineMean), s_CurrentMean(currentMean),
s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{multiBucketImpact},
s_Influences(influences), s_Identifier(identifier),
s_ScheduledEventDescriptions(scheduledEventDescriptions) {
}
Expand Down Expand Up @@ -238,7 +241,7 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
const model::SAttributeProbability& attributeProbability =
node.s_AnnotatedProbability.s_AttributeProbabilities[0];

m_ResultWriterFunc(TResults(
SResults individualResult = TResults(
E_Result, *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PartitionFieldValue,
*node.s_Spec.s_ByFieldName, *node.s_Spec.s_PersonFieldValue,
attributeProbability.s_CorrelatedAttributes.empty()
Expand All @@ -248,10 +251,13 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
model_t::outputFunctionName(feature), node.s_AnnotatedProbability.s_BaselineBucketCount,
node.s_AnnotatedProbability.s_CurrentBucketCount,
attributeProbability.s_BaselineBucketMean, attributeProbability.s_CurrentBucketValue,
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
node.probability(), node.s_AnnotatedProbability.s_MultiBucketImpact,
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST);

m_ResultWriterFunc(individualResult);
}

void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
Expand Down Expand Up @@ -285,6 +291,7 @@ void CHierarchicalResultsWriter::writeSimpleCountResult(const TNode& node) {
baselineCount ? TDouble1Vec(1, *baselineCount) : TDouble1Vec(),
currentCount ? TDouble1Vec(1, static_cast<double>(*currentCount)) : TDouble1Vec(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE,
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));
Expand Down
3 changes: 3 additions & 0 deletions lib/api/CJsonOutputWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const std::string RECORDS("records");
const std::string EVENT_COUNT("event_count");
const std::string IS_INTERIM("is_interim");
const std::string PROBABILITY("probability");
const std::string MULTI_BUCKET_IMPACT("multi_bucket_impact");
const std::string RAW_ANOMALY_SCORE("raw_anomaly_score");
const std::string ANOMALY_SCORE("anomaly_score");
const std::string RECORD_SCORE("record_score");
Expand Down Expand Up @@ -534,6 +535,7 @@ void CJsonOutputWriter::addMetricFields(const CHierarchicalResultsWriter::TResul
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
Expand Down Expand Up @@ -736,6 +738,7 @@ void CJsonOutputWriter::addEventRateFields(const CHierarchicalResultsWriter::TRe
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
Expand Down
Loading