Skip to content

[6.5][ML] Add multi_bucket_impact label to anomalies #239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ Increased independence of anomaly scores across partitions (See {ml-pull}182[182
Avoid potential false positives at model start up when first detecting new components of the time
series decomposition. (See {ml-pull}218[218].)

Add a new label - multi_bucket_impact - to record-level anomaly results.
The value will be on a scale of -5 to +5, where -5 means the anomaly is purely single-bucket
and +5 means the anomaly is purely multi-bucket. ({ml-pull}230[230])

=== Bug Fixes

Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])
Expand Down
2 changes: 2 additions & 0 deletions include/api/CHierarchicalResultsWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
Expand Down Expand Up @@ -131,6 +132,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
double s_RawAnomalyScore;
double s_NormalizedAnomalyScore;
double s_Probability;
double s_MultiBucketImpact;
const TStoredStringPtrStoredStringPtrPrDoublePrVec& s_Influences;
int s_Identifier;
TStr1Vec s_ScheduledEventDescriptions;
Expand Down
13 changes: 10 additions & 3 deletions include/maths/CModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,12 +230,19 @@ struct MATHS_EXPORT SModelProbabilityResult {
using TSize1Vec = core::CSmallVector<std::size_t, 1>;
using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;

//! Labels for different contributions to the overall probability.
enum EFeatureProbabilityLabel {
E_SingleBucketProbability,
E_MultiBucketProbability,
E_AnomalyModelProbability,
E_UndefinedProbability
};

//! \brief Wraps up a feature label and probability.
struct MATHS_EXPORT SFeatureProbability {
using TStrCRef = boost::reference_wrapper<const std::string>;
SFeatureProbability();
SFeatureProbability(const std::string& label, double probability);
TStrCRef s_Label;
SFeatureProbability(EFeatureProbabilityLabel label, double probability);
EFeatureProbabilityLabel s_Label;
double s_Probability = 1.0;
};
using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;
Expand Down
3 changes: 3 additions & 0 deletions include/model/CAnnotatedProbability.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ struct MODEL_EXPORT SAnnotatedProbability {
//! The probability of seeing the series' sample in a time interval.
double s_Probability;

//! The impact of multi/single bucket analysis on the probability
double s_MultiBucketImpact;

//! The smallest attribute probabilities and associated data describing
//! the calculation.
TAttributeProbability1Vec s_AttributeProbabilities;
Expand Down
1 change: 1 addition & 0 deletions include/model/CAnnotatedProbabilityBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class MODEL_EXPORT CAnnotatedProbabilityBuilder : private core::CNonCopyable {
void personAttributeProbabilityPrior(const maths::CMultinomialConjugate* prior);
void personFrequency(double frequency, bool everSeenBefore);
void probability(double p);
void multiBucketImpact(double multiBucketImpact);
void addAttributeProbability(std::size_t cid,
const core::CStoredStringPtr& attribute,
double pAttribute,
Expand Down
3 changes: 3 additions & 0 deletions include/model/CAnomalyDetectorModelConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! for anomaly detection.
static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;

//! The maximum value that the multi_bucket_impact can take
static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;

//! The maximum number of times we'll update a model in a bucketing
//! interval. This only applies to our metric statistics, which are
//! computed on a fixed number of measurements rather than a fixed
Expand Down
25 changes: 25 additions & 0 deletions include/model/CProbabilityAndInfluenceCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
using TStrCRefDouble1VecDouble1VecPrPrVec = std::vector<TStrCRefDouble1VecDouble1VecPrPr>;
using TStrCRefDouble1VecDouble1VecPrPrVecVec =
std::vector<TStrCRefDouble1VecDouble1VecPrPrVec>;
using TFeatureProbabilityLabelDoubleUMap =
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, double>;
using TFeatureProbabilityLabelProbabilityAggregatorUMap =
boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, CModelTools::CProbabilityAggregator>;
using TStoredStringPtrStoredStringPtrPr =
std::pair<core::CStoredStringPtr, core::CStoredStringPtr>;
using TStoredStringPtrStoredStringPtrPrVec = std::vector<TStoredStringPtrStoredStringPtrPr>;
Expand Down Expand Up @@ -308,10 +312,23 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
bool calculate(double& probability,
TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const;

//! Calculate a measure of the impact of both the single bucket and multi
//! bucket probabilities on the make up of the overall probability.
//!
//! The calculation is designed such that the impact saturates when
//! one of the probabilities is less than a small fraction of the other or
//! when one probability is close to one, i.e. when one factor is not at all anomalous.
//!
//! \param[out] multiBucketImpact Filled in with the impact of constituent probabilities.
bool calculateMultiBucketImpact(double& multiBucketImpact) const;

private:
//! Actually commit any influences we've found.
void commitInfluences(model_t::EFeature feature, double logp, double weight);

//! calculate the explaining probabilities
bool calculateExplainingProbabilities(TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const;

private:
//! The minimum value for the influence for which an influencing
//! field value is judged to have any influence on a feature value.
Expand All @@ -327,6 +344,14 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
//! The probability calculator.
CModelTools::CProbabilityAggregator m_Probability;

//! holds the probabilities of explanatory features
TFeatureProbabilityLabelProbabilityAggregatorUMap m_ExplainingProbabilities =
TFeatureProbabilityLabelProbabilityAggregatorUMap{
{maths::SModelProbabilityResult::E_SingleBucketProbability,
{CModelTools::CProbabilityAggregator::E_Min}},
{maths::SModelProbabilityResult::E_MultiBucketProbability,
{CModelTools::CProbabilityAggregator::E_Min}}};

//! The probability calculation cache if there is one.
CModelTools::CProbabilityCache* m_ProbabilityCache;

Expand Down
17 changes: 12 additions & 5 deletions lib/api/CHierarchicalResultsWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ CHierarchicalResultsWriter::SResults::SResults(
s_FunctionValue(functionValue), s_PopulationAverage(populationAverage),
s_BaselineRate(0.0), s_CurrentRate(currentRate), s_BaselineMean(1, 0.0),
s_CurrentMean(1, 0.0), s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE},
s_Influences(influences), s_Identifier(identifier) {
}

Expand All @@ -82,6 +83,7 @@ CHierarchicalResultsWriter::SResults::SResults(
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
Expand All @@ -103,7 +105,8 @@ CHierarchicalResultsWriter::SResults::SResults(
s_BaselineRate(baselineRate), s_CurrentRate(currentRate),
s_BaselineMean(baselineMean), s_CurrentMean(currentMean),
s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{multiBucketImpact},
s_Influences(influences), s_Identifier(identifier),
s_ScheduledEventDescriptions(scheduledEventDescriptions) {
}
Expand Down Expand Up @@ -238,7 +241,7 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
const model::SAttributeProbability& attributeProbability =
node.s_AnnotatedProbability.s_AttributeProbabilities[0];

m_ResultWriterFunc(TResults(
SResults individualResult = TResults(
E_Result, *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PartitionFieldValue,
*node.s_Spec.s_ByFieldName, *node.s_Spec.s_PersonFieldValue,
attributeProbability.s_CorrelatedAttributes.empty()
Expand All @@ -248,10 +251,13 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
model_t::outputFunctionName(feature), node.s_AnnotatedProbability.s_BaselineBucketCount,
node.s_AnnotatedProbability.s_CurrentBucketCount,
attributeProbability.s_BaselineBucketMean, attributeProbability.s_CurrentBucketValue,
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
node.probability(), node.s_AnnotatedProbability.s_MultiBucketImpact,
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST);

m_ResultWriterFunc(individualResult);
}

void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
Expand Down Expand Up @@ -285,6 +291,7 @@ void CHierarchicalResultsWriter::writeSimpleCountResult(const TNode& node) {
baselineCount ? TDouble1Vec(1, *baselineCount) : TDouble1Vec(),
currentCount ? TDouble1Vec(1, static_cast<double>(*currentCount)) : TDouble1Vec(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE,
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));
Expand Down
3 changes: 3 additions & 0 deletions lib/api/CJsonOutputWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const std::string RECORDS("records");
const std::string EVENT_COUNT("event_count");
const std::string IS_INTERIM("is_interim");
const std::string PROBABILITY("probability");
const std::string MULTI_BUCKET_IMPACT("multi_bucket_impact");
const std::string RAW_ANOMALY_SCORE("raw_anomaly_score");
const std::string ANOMALY_SCORE("anomaly_score");
const std::string RECORD_SCORE("record_score");
Expand Down Expand Up @@ -534,6 +535,7 @@ void CJsonOutputWriter::addMetricFields(const CHierarchicalResultsWriter::TResul
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
Expand Down Expand Up @@ -736,6 +738,7 @@ void CJsonOutputWriter::addEventRateFields(const CHierarchicalResultsWriter::TRe
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
Expand Down
Loading