[ML] Add multi_bucket_impact label to anomalies #230

Merged: 7 commits, Oct 4, 2018
Changes from 3 commits
4 changes: 4 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -51,6 +51,10 @@ Increased independence of anomaly scores across partitions (See {ml-pull}182[182].)
Avoid potential false positives at model start up when first detecting new components of the time
series decomposition. (See {ml-pull}218[218].)

Add a new label, multi_bucket_impact, to record-level anomaly results.
The value is on a scale of -5 to +5, where -5 means the anomaly is purely single bucket
and +5 means it is purely multi bucket. ({ml-pull}230[230])

=== Bug Fixes

Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])
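To make the scale concrete, here is a small illustrative helper. It is not part of this PR, and the band boundaries are arbitrary assumptions; it only shows how a consumer might read the value:

```cpp
#include <string>

// Hypothetical helper (not in this PR): turn the multi_bucket_impact
// value, which runs from -5 (purely single bucket) to +5 (purely
// multi bucket), into a human-readable description. The band edges
// are arbitrary choices for illustration.
std::string describeMultiBucketImpact(double impact) {
    if (impact < -3.0) {
        return "purely single bucket";
    }
    if (impact < -1.0) {
        return "mostly single bucket";
    }
    if (impact <= 1.0) {
        return "mixed";
    }
    if (impact <= 3.0) {
        return "mostly multi bucket";
    }
    return "purely multi bucket";
}
```

For example, `describeMultiBucketImpact(4.2)` would report "purely multi bucket", meaning the anomaly only becomes visible when several adjacent buckets are considered together.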
2 changes: 2 additions & 0 deletions include/api/CHierarchicalResultsWriter.h
@@ -97,6 +97,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
@@ -131,6 +132,7 @@
double s_RawAnomalyScore;
double s_NormalizedAnomalyScore;
double s_Probability;
double s_MultiBucketImpact;
const TStoredStringPtrStoredStringPtrPrDoublePrVec& s_Influences;
int s_Identifier;
TStr1Vec s_ScheduledEventDescriptions;
3 changes: 2 additions & 1 deletion include/maths/CTimeSeriesDecompositionDetail.h
@@ -751,7 +751,8 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {

//! Adjust the values to remove any piecewise constant linear scales
//! of the component with period \p period.
void adjustValuesForPiecewiseConstantScaling(std::size_t period, TFloatMeanAccumulatorVec& values) const;
void adjustValuesForPiecewiseConstantScaling(std::size_t period,
TFloatMeanAccumulatorVec& values) const;

//! Reweight the outlier values in \p values.
//!
3 changes: 3 additions & 0 deletions include/model/CAnnotatedProbability.h
@@ -137,6 +137,9 @@ struct MODEL_EXPORT SAnnotatedProbability {
//! The probability of seeing the series' sample in a time interval.
double s_Probability;

//! The impact of multi/single bucket analysis on the probability.
double s_MultiBucketImpact;

//! The smallest attribute probabilities and associated data describing
//! the calculation.
TAttributeProbability1Vec s_AttributeProbabilities;
1 change: 1 addition & 0 deletions include/model/CAnnotatedProbabilityBuilder.h
@@ -50,6 +50,7 @@ class MODEL_EXPORT CAnnotatedProbabilityBuilder : private core::CNonCopyable {
void personAttributeProbabilityPrior(const maths::CMultinomialConjugate* prior);
void personFrequency(double frequency, bool everSeenBefore);
void probability(double p);
void multiBucketImpact(double multiBucketImpact);
void addAttributeProbability(std::size_t cid,
const core::CStoredStringPtr& attribute,
double pAttribute,
15 changes: 15 additions & 0 deletions include/model/CProbabilityAndInfluenceCalculator.h
@@ -71,6 +71,9 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
using TStrCRefDouble1VecDouble1VecPrPrVec = std::vector<TStrCRefDouble1VecDouble1VecPrPr>;
using TStrCRefDouble1VecDouble1VecPrPrVecVec =
std::vector<TStrCRefDouble1VecDouble1VecPrPrVec>;
using TStrDoubleUMap = boost::unordered_map<std::string, double>;
using TStrProbabilityAggregatorMap =
Review comment (Contributor): Should be TStrProbabilityAggregatorUMap - the U is missing
boost::unordered_map<std::string, CModelTools::CProbabilityAggregator>;
using TStoredStringPtrStoredStringPtrPr =
std::pair<core::CStoredStringPtr, core::CStoredStringPtr>;
using TStoredStringPtrStoredStringPtrPrVec = std::vector<TStoredStringPtrStoredStringPtrPr>;
@@ -308,10 +311,19 @@
bool calculate(double& probability,
TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const;

//! Calculate a measure of the impact of both the single bucket and multi
//! bucket probabilities on the makeup of the overall probability.
//!
//! \param[out] multiBucketImpact Filled in with the impact of constituent probabilities.
bool calculateMultiBucketImpact(double& multiBucketImpact) const;

private:
//! Actually commit any influences we've found.
void commitInfluences(model_t::EFeature feature, double logp, double weight);

//! Calculate the explaining probabilities.
bool calculateExplainingProbabilities(TStrDoubleUMap& explainingProbabilities) const;

private:
//! The minimum value for the influence for which an influencing
//! field value is judged to have any influence on a feature value.
Expand All @@ -327,6 +339,9 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
//! The probability calculator.
CModelTools::CProbabilityAggregator m_Probability;

//! Holds the probabilities of explanatory features.
TStrProbabilityAggregatorMap m_ExplainingProbabilities;

//! The probability calculation cache if there is one.
CModelTools::CProbabilityCache* m_ProbabilityCache;

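The header only declares the new hooks. As a rough mental model of how they could combine, and only that, here is a sketch: the aggregator internals are not shown in this diff, the function is a free-standing stand-in, and the label strings are borrowed from the lib/maths/CTimeSeriesModel.cc hunk later in this diff. The PR's actual formula may differ.

```cpp
#include <algorithm>
#include <cmath>
#include <string>

#include <boost/unordered_map.hpp>

using TStrDoubleUMap = boost::unordered_map<std::string, double>;

// Sketch under assumptions: reduce the per-feature "explaining"
// probabilities to one impact score in [-5, +5]. Negative means the
// single-bucket probability drives the anomaly, positive means the
// multi-bucket one does.
bool calculateMultiBucketImpactSketch(const TStrDoubleUMap& explainingProbabilities,
                                      double& multiBucketImpact) {
    auto single = explainingProbabilities.find("single_bucket");
    auto multi = explainingProbabilities.find("multi_bucket");
    if (single == explainingProbabilities.end() ||
        multi == explainingProbabilities.end()) {
        return false; // cannot attribute the anomaly to either feature
    }
    // Compare in log space so the smaller (more anomalous) probability
    // dominates, then normalize and clamp onto [-5, +5].
    double logS{std::log(single->second)};
    double logM{std::log(multi->second)};
    double norm{std::max(std::fabs(logS), std::fabs(logM))};
    if (norm == 0.0) {
        multiBucketImpact = 0.0; // both probabilities are 1: neutral
        return true;
    }
    multiBucketImpact = 5.0 * (logS - logM) / norm;
    multiBucketImpact = std::max(-5.0, std::min(5.0, multiBucketImpact));
    return true;
}
```

With this shape, a tiny single-bucket probability and an unremarkable multi-bucket one yields a value near -5, matching the changelog's description of the scale.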
25 changes: 16 additions & 9 deletions lib/api/CHierarchicalResultsWriter.cc
@@ -61,7 +61,8 @@ CHierarchicalResultsWriter::SResults::SResults(
s_PopulationAverage(populationAverage), s_BaselineRate(0.0),
s_CurrentRate(currentRate), s_BaselineMean{0.0}, s_CurrentMean{0.0},
s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{-5.0},
s_Influences(influences), s_Identifier(identifier) {
}

@@ -82,6 +83,7 @@ CHierarchicalResultsWriter::SResults::SResults(
double rawAnomalyScore,
double normalizedAnomalyScore,
double probability,
double multiBucketImpact,
const std::string& metricValueField,
const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
bool useNull,
@@ -103,7 +105,8 @@
s_BaselineRate(baselineRate), s_CurrentRate(currentRate),
s_BaselineMean(baselineMean), s_CurrentMean(currentMean),
s_RawAnomalyScore(rawAnomalyScore),
s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
s_NormalizedAnomalyScore(normalizedAnomalyScore),
s_Probability(probability), s_MultiBucketImpact{multiBucketImpact},
s_Influences(influences), s_Identifier(identifier),
s_ScheduledEventDescriptions(scheduledEventDescriptions) {
}
@@ -238,7 +241,7 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
const model::SAttributeProbability& attributeProbability =
node.s_AnnotatedProbability.s_AttributeProbabilities[0];

m_ResultWriterFunc(TResults(
SResults individualResult = TResults(
E_Result, *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PartitionFieldValue,
*node.s_Spec.s_ByFieldName, *node.s_Spec.s_PersonFieldValue,
attributeProbability.s_CorrelatedAttributes.empty()
@@ -248,10 +251,13 @@
model_t::outputFunctionName(feature), node.s_AnnotatedProbability.s_BaselineBucketCount,
node.s_AnnotatedProbability.s_CurrentBucketCount,
attributeProbability.s_BaselineBucketMean, attributeProbability.s_CurrentBucketValue,
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
node.probability(), node.s_AnnotatedProbability.s_MultiBucketImpact,
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST);

m_ResultWriterFunc(individualResult);
}

void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
@@ -284,10 +290,11 @@ void CHierarchicalResultsWriter::writeSimpleCountResult(const TNode& node) {
m_BucketTime, EMPTY_STRING, EMPTY_STRING, baselineCount, currentCount,
baselineCount ? TDouble1Vec(1, *baselineCount) : TDouble1Vec(),
currentCount ? TDouble1Vec(1, static_cast<double>(*currentCount)) : TDouble1Vec(),
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
*node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
node.s_Spec.s_Detector, node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));
node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
node.probability(), -5.0, *node.s_Spec.s_ValueFieldName,
node.s_AnnotatedProbability.s_Influences, node.s_Spec.s_UseNull,
model::function_t::isMetric(node.s_Spec.s_Function), node.s_Spec.s_Detector,
node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));
}

void CHierarchicalResultsWriter::findParentProbabilities(const TNode& node,
3 changes: 3 additions & 0 deletions lib/api/CJsonOutputWriter.cc
@@ -33,6 +33,7 @@ const std::string RECORDS("records");
const std::string EVENT_COUNT("event_count");
const std::string IS_INTERIM("is_interim");
const std::string PROBABILITY("probability");
const std::string MULTI_BUCKET_IMPACT("multi_bucket_impact");
const std::string RAW_ANOMALY_SCORE("raw_anomaly_score");
const std::string ANOMALY_SCORE("anomaly_score");
const std::string RECORD_SCORE("record_score");
@@ -534,6 +535,7 @@ void CJsonOutputWriter::addMetricFields(const CHierarchicalResultsWriter::TResul
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
@@ -736,6 +738,7 @@ void CJsonOutputWriter::addEventRateFields(const CHierarchicalResultsWriter::TRe
results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
if (!results.s_ByFieldName.empty()) {
m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
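Taken together with the constant added above, each metric and event-rate record document now carries the new field. A purely illustrative sample record follows; all values are invented, and only the field names are taken from the constants in this file:

```cpp
// Illustrative only: shape of a record document after this change.
// Values are made up; field names come from the string constants above.
const char* EXAMPLE_RECORD = R"({
    "record_score": 92.4,
    "probability": 1.3e-7,
    "multi_bucket_impact": 3.7,
    "field_name": "responsetime",
    "by_field_name": "airline"
})";
```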
14 changes: 7 additions & 7 deletions lib/maths/CTimeSeriesDecompositionDetail.cc
@@ -1633,7 +1633,8 @@ bool CTimeSeriesDecompositionDetail::CComponents::addCalendarComponent(const CCa
}

void CTimeSeriesDecompositionDetail::CComponents::adjustValuesForPiecewiseConstantScaling(
std::size_t period, TFloatMeanAccumulatorVec& values) const {
std::size_t period,
TFloatMeanAccumulatorVec& values) const {

// Periodicity testing detected piecewise constant linear scaling
// of the underlying seasonal component. Here, we adjust all values
@@ -1644,14 +1645,13 @@ void CTimeSeriesDecompositionDetail::CComponents::adjustValuesForPiecewiseConsta

TDoubleVec trend;
TDoubleVec scales;
TSizeVec segmentation(CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(
values, period));
TSizeVec segmentation(
CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(values, period));
std::tie(trend, scales) = CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(
values, period, segmentation);
LOG_TRACE(<< "trend = " << core::CContainerPrinter::print(trend));
LOG_TRACE(<< "scales = " << core::CContainerPrinter::print(scales));
LOG_TRACE(<< "segmentation = "
<< core::CContainerPrinter::print(segmentation));
LOG_TRACE(<< "segmentation = " << core::CContainerPrinter::print(segmentation));
values = CTimeSeriesSegmentation::removePiecewiseLinearScaledPeriodic(
values, segmentation, trend, scales);
TMeanAccumulator scale;
@@ -1666,8 +1666,8 @@
}
LOG_TRACE(<< "scale = " << CBasicStatistics::mean(scale));
for (std::size_t i = 0; i < values.size(); ++i) {
CBasicStatistics::moment<0>(values[i]) +=
CBasicStatistics::mean(scale) * trend[i % trend.size()];
CBasicStatistics::moment<0>(values[i]) += CBasicStatistics::mean(scale) *
trend[i % trend.size()];
}
}

16 changes: 9 additions & 7 deletions lib/maths/CTimeSeriesModel.cc
@@ -170,8 +170,8 @@ const std::string CORRELATION_TAG{"d"};

// Strings identifying the different features for which time series
// models compute probabilities.
const std::string BUCKET_FEATURE_LABEL{"bucket"};
const std::string MEAN_FEATURE_LABEL{"mean"};
const std::string SINGLE_BUCKET_FEATURE_LABEL{"single_bucket"};
const std::string MULTI_BUCKET_FEATURE_LABEL{"multi_bucket"};
const std::string ANOMALY_FEATURE_LABEL{"anomaly"};
}

@@ -1002,7 +1002,7 @@ bool CUnivariateTimeSeriesModel::uncorrelatedProbability(const CModelProbability
calculation, value[0], params.bucketEmpty()[0][0],
this->params().probabilityBucketEmpty(), (pl + pu) / 2.0)};
probabilities.push_back(probability);
featureProbabilities.emplace_back(BUCKET_FEATURE_LABEL, probability);
featureProbabilities.emplace_back(SINGLE_BUCKET_FEATURE_LABEL, probability);
} else {
LOG_ERROR(<< "Failed to compute P(" << sample
<< " | weight = " << weights << ", time = " << time << ")");
@@ -1033,7 +1033,7 @@
calculation, value[0], params.bucketEmpty()[0][0],
this->params().probabilityBucketEmpty(), probability);
probabilities.push_back(probability);
featureProbabilities.emplace_back(MEAN_FEATURE_LABEL, probability);
featureProbabilities.emplace_back(MULTI_BUCKET_FEATURE_LABEL, probability);
}

double probability{aggregateFeatureProbabilities(probabilities, correlation)};
@@ -1166,7 +1166,7 @@ bool CUnivariateTimeSeriesModel::correlatedProbability(const CModelProbabilityPa
aggregator.calculate(probability);
TDouble4Vec probabilities{probability};
SModelProbabilityResult::TFeatureProbability4Vec featureProbabilities;
featureProbabilities.emplace_back(BUCKET_FEATURE_LABEL, probability);
featureProbabilities.emplace_back(SINGLE_BUCKET_FEATURE_LABEL, probability);

if (m_AnomalyModel != nullptr && params.useAnomalyModel()) {
double residual{
@@ -2439,7 +2439,8 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa
}
TTail2Vec tail(coordinates.size(), maths_t::E_UndeterminedTail);

result = SModelProbabilityResult{1.0, false, {{BUCKET_FEATURE_LABEL, 1.0}}, tail, {}};
result = SModelProbabilityResult{
1.0, false, {{SINGLE_BUCKET_FEATURE_LABEL, 1.0}}, tail, {}};

std::size_t dimension{this->dimension()};
core_t::TTime time{time_[0][0]};
@@ -2534,7 +2535,8 @@
2.0);
}

TStrCRef labels[]{boost::cref(BUCKET_FEATURE_LABEL), boost::cref(MEAN_FEATURE_LABEL)};
TStrCRef labels[]{boost::cref(SINGLE_BUCKET_FEATURE_LABEL),
boost::cref(MULTI_BUCKET_FEATURE_LABEL)};
SModelProbabilityResult::TFeatureProbability4Vec featureProbabilities;
for (std::size_t i = 0u; i < probabilities.size(); ++i) {
featureProbabilities.emplace_back(labels[i], probabilities[i]);
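The renames above change both the identifiers and the label strings ("bucket" becomes "single_bucket", "mean" becomes "multi_bucket"). An inference from this diff is that downstream code, such as the explaining-probabilities map in CProbabilityAndInfluenceCalculator, can then pick out the two features by name. A minimal lookup sketch, with the pair type simplified from SModelProbabilityResult::TFeatureProbability4Vec as an assumption for brevity:

```cpp
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for the feature-probability container.
using TStrDoublePrVec = std::vector<std::pair<std::string, double>>;

// Sketch: fetch the probability recorded for one feature label,
// falling back to 1.0 (nothing anomalous) when the label is absent.
double featureProbability(const TStrDoublePrVec& featureProbabilities,
                          const std::string& label) {
    for (const auto& labelAndProbability : featureProbabilities) {
        if (labelAndProbability.first == label) {
            return labelAndProbability.second;
        }
    }
    return 1.0;
}
```

Looking up "single_bucket" and "multi_bucket" this way pairs naturally with the impact calculation described in the changelog.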
3 changes: 3 additions & 0 deletions lib/model/CAnnotatedProbabilityBuilder.cc
@@ -68,6 +68,9 @@ void CAnnotatedProbabilityBuilder::personAttributeProbabilityPrior(const maths::
void CAnnotatedProbabilityBuilder::probability(double p) {
m_Result.s_Probability = p;
}
void CAnnotatedProbabilityBuilder::multiBucketImpact(double multiBucketImpact) {
m_Result.s_MultiBucketImpact = multiBucketImpact;
}

void CAnnotatedProbabilityBuilder::addAttributeProbability(
std::size_t cid,
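The setter simply overwrites the stored value, which makes it optional for callers: the model changes below invoke it only when the impact computation succeeds. A tiny self-contained mock of those semantics; the names and the -5.0 default are assumptions drawn from the SResults constructor earlier in this diff:

```cpp
#include <iostream>

// Mock of the builder semantics added above: multiBucketImpact()
// overwrites a defaulted field, so callers that skip it leave the
// default in place. Names are stand-ins, not the real classes.
struct MockResult {
    double s_MultiBucketImpact{-5.0}; // default: purely single bucket
};

struct MockBuilder {
    MockResult m_Result;
    void multiBucketImpact(double multiBucketImpact) {
        m_Result.s_MultiBucketImpact = multiBucketImpact;
    }
};

int main() {
    MockBuilder builder;
    // A caller whose impact computation failed never calls the setter,
    // so the record keeps -5.0, matching the pre-PR single-bucket view.
    std::cout << builder.m_Result.s_MultiBucketImpact << '\n'; // -5
    builder.multiBucketImpact(2.5);
    std::cout << builder.m_Result.s_MultiBucketImpact << '\n'; // 2.5
}
```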
6 changes: 6 additions & 0 deletions lib/model/CEventRateModel.cc
@@ -431,6 +431,12 @@ bool CEventRateModel::computeProbability(std::size_t pid,
LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);

resultBuilder.probability(p);

double multiBucketImpact{-5.0};
if (pJoint.calculateMultiBucketImpact(multiBucketImpact)) {
resultBuilder.multiBucketImpact(multiBucketImpact);
}

bool everSeenBefore = this->firstBucketTimes()[pid] != startTime;
resultBuilder.personFrequency(this->personFrequency(pid), everSeenBefore);
resultBuilder.build();
6 changes: 6 additions & 0 deletions lib/model/CMetricModel.cc
@@ -396,6 +396,12 @@ bool CMetricModel::computeProbability(const std::size_t pid,
LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);

resultBuilder.probability(p);

double multiBucketImpact{-5.0};
if (pJoint.calculateMultiBucketImpact(multiBucketImpact)) {
resultBuilder.multiBucketImpact(multiBucketImpact);
}

resultBuilder.build();

return true;