Skip to content

Commit 27df373

Browse files
committed
[6.5][ML] Add multi_bucket_impact label to anomalies
Add a label indicating the impact of the multi bucket analysis on the overall probability. The value is in the range -5 to 5 where -5 indicates a wholly single bucket contribution and 5 a wholly multi bucket contribution to the final probability. Backports elastic#230
1 parent 593e97e commit 27df373

19 files changed

+317
-150
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ Increased independence of anomaly scores across partitions (See {ml-pull}182[182
5151
Avoid potential false positives at model start up when first detecting new components of the time
5252
series decomposition. (See {ml-pull}218[218].)
5353
54+
Add a new label - multi_bucket_impact - to record level anomaly results.
55+
The value will be on a scale of -5 to +5 where -5 means the anomaly is purely single bucket
56+
and +5 means the anomaly is purely multi bucket. ({ml-pull}230[230])
57+
5458
=== Bug Fixes
5559
5660
Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])

include/api/CHierarchicalResultsWriter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
9797
double rawAnomalyScore,
9898
double normalizedAnomalyScore,
9999
double probability,
100+
double multiBucketImpact,
100101
const std::string& metricValueField,
101102
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
102103
bool useNull,
@@ -131,6 +132,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
131132
double s_RawAnomalyScore;
132133
double s_NormalizedAnomalyScore;
133134
double s_Probability;
135+
double s_MultiBucketImpact;
134136
const TStoredStringPtrStoredStringPtrPrDoublePrVec& s_Influences;
135137
int s_Identifier;
136138
TStr1Vec s_ScheduledEventDescriptions;

include/maths/CModel.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,12 +230,19 @@ struct MATHS_EXPORT SModelProbabilityResult {
230230
using TSize1Vec = core::CSmallVector<std::size_t, 1>;
231231
using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;
232232

233+
//! Labels for different contributions to the overall probability.
234+
enum EFeatureProbabilityLabel {
235+
E_SingleBucketProbability,
236+
E_MultiBucketProbability,
237+
E_AnomalyModelProbability,
238+
E_UndefinedProbability
239+
};
240+
233241
//! \brief Wraps up a feature label and probability.
234242
struct MATHS_EXPORT SFeatureProbability {
235-
using TStrCRef = boost::reference_wrapper<const std::string>;
236243
SFeatureProbability();
237-
SFeatureProbability(const std::string& label, double probability);
238-
TStrCRef s_Label;
244+
SFeatureProbability(EFeatureProbabilityLabel label, double probability);
245+
EFeatureProbabilityLabel s_Label;
239246
double s_Probability = 1.0;
240247
};
241248
using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;

include/model/CAnnotatedProbability.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ struct MODEL_EXPORT SAnnotatedProbability {
137137
//! The probability of seeing the series' sample in a time interval.
138138
double s_Probability;
139139

140+
//! The impact of multi/single bucket analysis on the probability.
141+
double s_MultiBucketImpact;
142+
140143
//! The smallest attribute probabilities and associated data describing
141144
//! the calculation.
142145
TAttributeProbability1Vec s_AttributeProbabilities;

include/model/CAnnotatedProbabilityBuilder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class MODEL_EXPORT CAnnotatedProbabilityBuilder : private core::CNonCopyable {
5050
void personAttributeProbabilityPrior(const maths::CMultinomialConjugate* prior);
5151
void personFrequency(double frequency, bool everSeenBefore);
5252
void probability(double p);
53+
void multiBucketImpact(double multiBucketImpact);
5354
void addAttributeProbability(std::size_t cid,
5455
const core::CStoredStringPtr& attribute,
5556
double pAttribute,

include/model/CAnomalyDetectorModelConfig.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
166166
//! for anomaly detection.
167167
static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;
168168

169+
//! The maximum magnitude (absolute value) that the multi_bucket_impact can take.
170+
static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;
171+
169172
//! The maximum number of times we'll update a model in a bucketing
170173
//! interval. This only applies to our metric statistics, which are
171174
//! computed on a fixed number of measurements rather than a fixed

include/model/CProbabilityAndInfluenceCalculator.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
7171
using TStrCRefDouble1VecDouble1VecPrPrVec = std::vector<TStrCRefDouble1VecDouble1VecPrPr>;
7272
using TStrCRefDouble1VecDouble1VecPrPrVecVec =
7373
std::vector<TStrCRefDouble1VecDouble1VecPrPrVec>;
74+
using TFeatureProbabilityLabelDoubleUMap =
75+
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, double>;
76+
using TFeatureProbabilityLabelProbabilityAggregatorUMap =
77+
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, CModelTools::CProbabilityAggregator>;
7478
using TStoredStringPtrStoredStringPtrPr =
7579
std::pair<core::CStoredStringPtr, core::CStoredStringPtr>;
7680
using TStoredStringPtrStoredStringPtrPrVec = std::vector<TStoredStringPtrStoredStringPtrPr>;
@@ -308,10 +312,23 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
308312
bool calculate(double& probability,
309313
TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const;
310314

315+
//! Calculate a measure of the impact of both the single bucket and multi
316+
//! bucket probabilities on the makeup of the overall probability.
317+
//!
318+
//! The calculation is designed such that the impact saturates when
319+
//! one of the probabilities is less than a small fraction of the other or
320+
//! when one probability is close to one, i.e. when one factor is not at all anomalous.
321+
//!
322+
//! \param[out] multiBucketImpact Filled in with the impact of constituent probabilities.
323+
bool calculateMultiBucketImpact(double& multiBucketImpact) const;
324+
311325
private:
312326
//! Actually commit any influences we've found.
313327
void commitInfluences(model_t::EFeature feature, double logp, double weight);
314328

329+
//! Calculate the explaining probabilities.
330+
bool calculateExplainingProbabilities(TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const;
331+
315332
private:
316333
//! The minimum value for the influence for which an influencing
317334
//! field value is judged to have any influence on a feature value.
@@ -327,6 +344,9 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
327344
//! The probability calculator.
328345
CModelTools::CProbabilityAggregator m_Probability;
329346

347+
//! Holds the probabilities of explanatory features.
348+
TFeatureProbabilityLabelProbabilityAggregatorUMap m_ExplainingProbabilities;
349+
330350
//! The probability calculation cache if there is one.
331351
CModelTools::CProbabilityCache* m_ProbabilityCache;
332352

lib/api/CHierarchicalResultsWriter.cc

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ CHierarchicalResultsWriter::SResults::SResults(
6161
s_FunctionValue(functionValue), s_PopulationAverage(populationAverage),
6262
s_BaselineRate(0.0), s_CurrentRate(currentRate), s_BaselineMean(1, 0.0),
6363
s_CurrentMean(1, 0.0), s_RawAnomalyScore(rawAnomalyScore),
64-
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
64+
s_NormalizedAnomalyScore(normalizedAnomalyScore),
65+
s_Probability(probability), s_MultiBucketImpact{-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE},
6566
s_Influences(influences), s_Identifier(identifier) {
6667
}
6768

@@ -82,6 +83,7 @@ CHierarchicalResultsWriter::SResults::SResults(
8283
double rawAnomalyScore,
8384
double normalizedAnomalyScore,
8485
double probability,
86+
double multiBucketImpact,
8587
const std::string& metricValueField,
8688
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
8789
bool useNull,
@@ -103,7 +105,8 @@ CHierarchicalResultsWriter::SResults::SResults(
103105
s_BaselineRate(baselineRate), s_CurrentRate(currentRate),
104106
s_BaselineMean(baselineMean), s_CurrentMean(currentMean),
105107
s_RawAnomalyScore(rawAnomalyScore),
106-
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
108+
s_NormalizedAnomalyScore(normalizedAnomalyScore),
109+
s_Probability(probability), s_MultiBucketImpact{multiBucketImpact},
107110
s_Influences(influences), s_Identifier(identifier),
108111
s_ScheduledEventDescriptions(scheduledEventDescriptions) {
109112
}
@@ -238,7 +241,7 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
238241
const model::SAttributeProbability& attributeProbability =
239242
node.s_AnnotatedProbability.s_AttributeProbabilities[0];
240243

241-
m_ResultWriterFunc(TResults(
244+
SResults individualResult = TResults(
242245
E_Result, *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PartitionFieldValue,
243246
*node.s_Spec.s_ByFieldName, *node.s_Spec.s_PersonFieldValue,
244247
attributeProbability.s_CorrelatedAttributes.empty()
@@ -248,10 +251,13 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
248251
model_t::outputFunctionName(feature), node.s_AnnotatedProbability.s_BaselineBucketCount,
249252
node.s_AnnotatedProbability.s_CurrentBucketCount,
250253
attributeProbability.s_BaselineBucketMean, attributeProbability.s_CurrentBucketValue,
251-
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
254+
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
255+
node.probability(), node.s_AnnotatedProbability.s_MultiBucketImpact,
252256
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
253257
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
254-
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
258+
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST);
259+
260+
m_ResultWriterFunc(individualResult);
255261
}
256262

257263
void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
@@ -285,6 +291,7 @@ void CHierarchicalResultsWriter::writeSimpleCountResult(const TNode& node) {
285291
baselineCount ? TDouble1Vec(1, *baselineCount) : TDouble1Vec(),
286292
currentCount ? TDouble1Vec(1, static_cast<double>(*currentCount)) : TDouble1Vec(),
287293
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
294+
-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE,
288295
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
289296
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
290297
node.s_Spec.s_Detector, node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));

lib/api/CJsonOutputWriter.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ const std::string RECORDS("records");
3333
const std::string EVENT_COUNT("event_count");
3434
const std::string IS_INTERIM("is_interim");
3535
const std::string PROBABILITY("probability");
36+
const std::string MULTI_BUCKET_IMPACT("multi_bucket_impact");
3637
const std::string RAW_ANOMALY_SCORE("raw_anomaly_score");
3738
const std::string ANOMALY_SCORE("anomaly_score");
3839
const std::string RECORD_SCORE("record_score");
@@ -534,6 +535,7 @@ void CJsonOutputWriter::addMetricFields(const CHierarchicalResultsWriter::TResul
534535
results.s_NormalizedAnomalyScore, *docPtr);
535536
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
536537
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
538+
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
537539
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
538540
if (!results.s_ByFieldName.empty()) {
539541
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
@@ -736,6 +738,7 @@ void CJsonOutputWriter::addEventRateFields(const CHierarchicalResultsWriter::TRe
736738
results.s_NormalizedAnomalyScore, *docPtr);
737739
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
738740
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
741+
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
739742
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
740743
if (!results.s_ByFieldName.empty()) {
741744
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);

0 commit comments

Comments
 (0)