diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index ba04e944fa..e33c924860 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -51,6 +51,10 @@ Increased independence of anomaly scores across partitions (See {ml-pull}182[182
 Avoid potential false positives at model start up when first detecting new
 components of the time series decomposition. (See {ml-pull}218[218].)
 
+Add a new label - multi_bucket_impact - to record-level anomaly results.
+The value will be on a scale of -5 to +5, where -5 means the anomaly is purely single-bucket
+and +5 means the anomaly is purely multi-bucket. ({ml-pull}230[230])
+
 === Bug Fixes
 
 Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])
diff --git a/include/api/CHierarchicalResultsWriter.h b/include/api/CHierarchicalResultsWriter.h
index 610e373eee..9b77024cf3 100644
--- a/include/api/CHierarchicalResultsWriter.h
+++ b/include/api/CHierarchicalResultsWriter.h
@@ -97,6 +97,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
                  double rawAnomalyScore,
                  double normalizedAnomalyScore,
                  double probability,
+                 double multiBucketImpact,
                  const std::string& metricValueField,
                  const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
                  bool useNull,
@@ -131,6 +132,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
         double s_RawAnomalyScore;
         double s_NormalizedAnomalyScore;
         double s_Probability;
+        double s_MultiBucketImpact;
         const TStoredStringPtrStoredStringPtrPrDoublePrVec& s_Influences;
         int s_Identifier;
         TStr1Vec s_ScheduledEventDescriptions;
diff --git a/include/maths/CModel.h b/include/maths/CModel.h
index 4f564c12c3..625b4d9378 100644
--- a/include/maths/CModel.h
+++ b/include/maths/CModel.h
@@ -230,12 +230,19 @@ struct MATHS_EXPORT SModelProbabilityResult {
     using TSize1Vec = core::CSmallVector<std::size_t, 1>;
     using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;
 
+    //! Labels for different contributions to the overall probability.
+    enum EFeatureProbabilityLabel {
+        E_SingleBucketProbability,
+        E_MultiBucketProbability,
+        E_AnomalyModelProbability,
+        E_UndefinedProbability
+    };
+
     //! \brief Wraps up a feature label and probability.
     struct MATHS_EXPORT SFeatureProbability {
-        using TStrCRef = boost::reference_wrapper<const std::string>;
         SFeatureProbability();
-        SFeatureProbability(const std::string& label, double probability);
-        TStrCRef s_Label;
+        SFeatureProbability(EFeatureProbabilityLabel label, double probability);
+        EFeatureProbabilityLabel s_Label;
         double s_Probability = 1.0;
     };
     using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;
diff --git a/include/maths/CTimeSeriesDecompositionDetail.h b/include/maths/CTimeSeriesDecompositionDetail.h
index 365b680e07..5063831787 100644
--- a/include/maths/CTimeSeriesDecompositionDetail.h
+++ b/include/maths/CTimeSeriesDecompositionDetail.h
@@ -751,7 +751,8 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
         //! Adjust the values to remove any piecewise constant linear scales
         //! of the component with period \p period.
-        void adjustValuesForPiecewiseConstantScaling(std::size_t period, TFloatMeanAccumulatorVec& values) const;
+        void adjustValuesForPiecewiseConstantScaling(std::size_t period,
+                                                     TFloatMeanAccumulatorVec& values) const;
 
         //! Reweight the outlier values in \p values.
         //!
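For orientation before the mechanical changes below: the patch replaces the string feature labels ("bucket", "mean", "anomaly") with the EFeatureProbabilityLabel enum above, which lets later code key per-feature probabilities in a hash map. A minimal, self-contained sketch of that pattern (illustrative only, not code from the patch; std::unordered_map stands in for the boost::unordered_map used in the real headers):

    #include <iostream>
    #include <unordered_map>

    // Mirrors SModelProbabilityResult::EFeatureProbabilityLabel from this patch.
    enum EFeatureProbabilityLabel {
        E_SingleBucketProbability,
        E_MultiBucketProbability,
        E_AnomalyModelProbability,
        E_UndefinedProbability
    };

    int main() {
        // Enum keys hash and compare cheaply and cannot dangle, unlike the
        // boost::reference_wrapper<const std::string> labels they replace.
        std::unordered_map<EFeatureProbabilityLabel, double> explaining{
            {E_SingleBucketProbability, 1e-6}, {E_MultiBucketProbability, 0.2}};
        std::cout << explaining[E_SingleBucketProbability] << '\n';
        return 0;
    }

(std::hash is defined for enumeration types since C++14, so the enum can key the map directly.)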
diff --git a/include/model/CAnnotatedProbability.h b/include/model/CAnnotatedProbability.h
index 9604b0eb28..032cda11a6 100644
--- a/include/model/CAnnotatedProbability.h
+++ b/include/model/CAnnotatedProbability.h
@@ -137,6 +137,9 @@ struct MODEL_EXPORT SAnnotatedProbability {
     //! The probability of seeing the series' sample in a time interval.
     double s_Probability;
 
+    //! The impact of multi/single bucket analysis on the probability.
+    double s_MultiBucketImpact;
+
     //! The smallest attribute probabilities and associated data describing
     //! the calculation.
     TAttributeProbability1Vec s_AttributeProbabilities;
diff --git a/include/model/CAnnotatedProbabilityBuilder.h b/include/model/CAnnotatedProbabilityBuilder.h
index dcb1865ed6..e91d82280c 100644
--- a/include/model/CAnnotatedProbabilityBuilder.h
+++ b/include/model/CAnnotatedProbabilityBuilder.h
@@ -50,6 +50,7 @@ class MODEL_EXPORT CAnnotatedProbabilityBuilder : private core::CNonCopyable {
     void personAttributeProbabilityPrior(const maths::CMultinomialConjugate* prior);
     void personFrequency(double frequency, bool everSeenBefore);
     void probability(double p);
+    void multiBucketImpact(double multiBucketImpact);
     void addAttributeProbability(std::size_t cid,
                                  const core::CStoredStringPtr& attribute,
                                  double pAttribute,
diff --git a/include/model/CAnomalyDetectorModelConfig.h b/include/model/CAnomalyDetectorModelConfig.h
index 9ace314f34..b47b7b8851 100644
--- a/include/model/CAnomalyDetectorModelConfig.h
+++ b/include/model/CAnomalyDetectorModelConfig.h
@@ -166,6 +166,9 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     //! for anomaly detection.
     static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;
 
+    //! The maximum value that the multi_bucket_impact can take.
+    static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;
+
     //! The maximum number of times we'll update a model in a bucketing
     //! interval. This only applies to our metric statistics, which are
     //! computed on a fixed number of measurements rather than a fixed
diff --git a/include/model/CProbabilityAndInfluenceCalculator.h b/include/model/CProbabilityAndInfluenceCalculator.h
index e20684e782..9f3bce2099 100644
--- a/include/model/CProbabilityAndInfluenceCalculator.h
+++ b/include/model/CProbabilityAndInfluenceCalculator.h
@@ -71,6 +71,10 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
     using TStrCRefDouble1VecDouble1VecPrPrVec = std::vector;
     using TStrCRefDouble1VecDouble1VecPrPrVecVec = std::vector;
+    using TFeatureProbabilityLabelDoubleUMap =
+        boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, double>;
+    using TFeatureProbabilityLabelProbabilityAggregatorUMap =
+        boost::unordered_map<maths::SModelProbabilityResult::EFeatureProbabilityLabel, CModelTools::CProbabilityAggregator>;
     using TStoredStringPtrStoredStringPtrPr = std::pair<core::CStoredStringPtr, core::CStoredStringPtr>;
     using TStoredStringPtrStoredStringPtrPrVec = std::vector<TStoredStringPtrStoredStringPtrPr>;
@@ -308,10 +312,23 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
     bool calculate(double& probability,
                    TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const;
 
+    //! Calculate a measure of the impact of both the single-bucket and multi-bucket
+    //! probabilities on the makeup of the overall probability.
+    //!
+    //! The calculation is designed such that the impact saturates when
+    //! one of the probabilities is less than a small fraction of the other or
+    //! when one probability is close to one, i.e. when one factor is not at all anomalous.
+    //!
+    //! \param[out] multiBucketImpact Filled in with the impact of constituent probabilities.
+    bool calculateMultiBucketImpact(double& multiBucketImpact) const;
+
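For intuition about the saturation described in this doc comment: the formula, implemented later in this patch in lib/model/CProbabilityAndInfluenceCalculator.cc, reduces to the following, writing m for MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE (5.0) and ls, lm for the logs of the single-bucket and multi-bucket probabilities, each floored at the smallest representable probability:

    scale  = m * min(ls, lm) / min(max(ls, lm), -0.001) / log(1000)
    impact = clamp(scale * (ls - lm), -m, +m)

Since ls - lm measures which factor dominates and scale grows with the more anomalous factor, the impact pins at -5 or +5 once one probability is roughly a thousand-fold smaller than the other, and the -0.001 floor keeps the division well defined when neither factor is anomalous (both logs near zero).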
private:
    //! Actually commit any influences we've found.
    void commitInfluences(model_t::EFeature feature, double logp, double weight);
 
+    //! Calculate the explaining probabilities.
+    bool calculateExplainingProbabilities(TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const;
+
 private:
     //! The minimum value for the influence for which an influencing
     //! field value is judged to have any influence on a feature value.
@@ -327,6 +344,9 @@ class MODEL_EXPORT CProbabilityAndInfluenceCalculator {
     //! The probability calculator.
     CModelTools::CProbabilityAggregator m_Probability;
 
+    //! Holds the probabilities of explanatory features.
+    TFeatureProbabilityLabelProbabilityAggregatorUMap m_ExplainingProbabilities;
+
     //! The probability calculation cache if there is one.
     CModelTools::CProbabilityCache* m_ProbabilityCache;
 
diff --git a/lib/api/CHierarchicalResultsWriter.cc b/lib/api/CHierarchicalResultsWriter.cc
index 18b81ddef6..05da9531fd 100644
--- a/lib/api/CHierarchicalResultsWriter.cc
+++ b/lib/api/CHierarchicalResultsWriter.cc
@@ -61,7 +61,8 @@ CHierarchicalResultsWriter::SResults::SResults(
       s_PopulationAverage(populationAverage), s_BaselineRate(0.0),
       s_CurrentRate(currentRate), s_BaselineMean{0.0}, s_CurrentMean{0.0},
       s_RawAnomalyScore(rawAnomalyScore),
-      s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
+      s_NormalizedAnomalyScore(normalizedAnomalyScore),
+      s_Probability(probability), s_MultiBucketImpact{-1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE},
       s_Influences(influences), s_Identifier(identifier) {
 }
@@ -82,6 +83,7 @@ CHierarchicalResultsWriter::SResults::SResults(
     double rawAnomalyScore,
     double normalizedAnomalyScore,
     double probability,
+    double multiBucketImpact,
     const std::string& metricValueField,
     const TStoredStringPtrStoredStringPtrPrDoublePrVec& influences,
     bool useNull,
@@ -103,7 +105,8 @@ CHierarchicalResultsWriter::SResults::SResults(
       s_BaselineRate(baselineRate), s_CurrentRate(currentRate),
       s_BaselineMean(baselineMean), s_CurrentMean(currentMean),
       s_RawAnomalyScore(rawAnomalyScore),
-      s_NormalizedAnomalyScore(normalizedAnomalyScore), s_Probability(probability),
+      s_NormalizedAnomalyScore(normalizedAnomalyScore),
+      s_Probability(probability), s_MultiBucketImpact{multiBucketImpact},
       s_Influences(influences), s_Identifier(identifier),
       s_ScheduledEventDescriptions(scheduledEventDescriptions) {
 }
@@ -238,7 +241,7 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
     const model::SAttributeProbability& attributeProbability =
         node.s_AnnotatedProbability.s_AttributeProbabilities[0];
 
-    m_ResultWriterFunc(TResults(
+    SResults individualResult = TResults(
         E_Result, *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PartitionFieldValue,
         *node.s_Spec.s_ByFieldName, *node.s_Spec.s_PersonFieldValue,
         attributeProbability.s_CorrelatedAttributes.empty()
            ? EMPTY_STRING
            : *attributeProbability.s_CorrelatedAttributes[0],
         model_t::outputFunctionName(feature), node.s_AnnotatedProbability.s_BaselineBucketCount,
         node.s_AnnotatedProbability.s_CurrentBucketCount,
         attributeProbability.s_BaselineBucketMean, attributeProbability.s_CurrentBucketValue,
-        node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
+        node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore,
+        node.probability(), node.s_AnnotatedProbability.s_MultiBucketImpact,
         *node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
         node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
-        node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
+        node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST);
+
+    m_ResultWriterFunc(individualResult);
 }
 
 void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
@@ -285,6 +291,7 @@ void CHierarchicalResultsWriter::writeSimpleCountResult(const TNode& node) {
         baselineCount ? TDouble1Vec(1, *baselineCount) : TDouble1Vec(),
         currentCount ? TDouble1Vec(1, static_cast<double>(*currentCount)) : TDouble1Vec(),
         node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
+        -1.0 * model::CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE,
         *node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
         node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
         node.s_Spec.s_Detector, node.s_BucketLength, node.s_Spec.s_ScheduledEventDescriptions));
diff --git a/lib/api/CJsonOutputWriter.cc b/lib/api/CJsonOutputWriter.cc
index a075f2639e..696a8a705a 100644
--- a/lib/api/CJsonOutputWriter.cc
+++ b/lib/api/CJsonOutputWriter.cc
@@ -33,6 +33,7 @@ const std::string RECORDS("records");
 const std::string EVENT_COUNT("event_count");
 const std::string IS_INTERIM("is_interim");
 const std::string PROBABILITY("probability");
+const std::string MULTI_BUCKET_IMPACT("multi_bucket_impact");
 const std::string RAW_ANOMALY_SCORE("raw_anomaly_score");
 const std::string ANOMALY_SCORE("anomaly_score");
 const std::string RECORD_SCORE("record_score");
@@ -534,6 +535,7 @@ void CJsonOutputWriter::addMetricFields(const CHierarchicalResultsWriter::TResul
                                  results.s_NormalizedAnomalyScore, *docPtr);
     m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
     m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
+    m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
     m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
     if (!results.s_ByFieldName.empty()) {
         m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
@@ -736,6 +738,7 @@ void CJsonOutputWriter::addEventRateFields(const CHierarchicalResultsWriter::TRe
                                  results.s_NormalizedAnomalyScore, *docPtr);
     m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
     m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
+    m_Writer.addDoubleFieldToObj(MULTI_BUCKET_IMPACT, results.s_MultiBucketImpact, *docPtr);
     m_Writer.addStringFieldCopyToObj(FIELD_NAME, results.s_MetricValueField, *docPtr);
     if (!results.s_ByFieldName.empty()) {
         m_Writer.addStringFieldCopyToObj(BY_FIELD_NAME, results.s_ByFieldName, *docPtr);
diff --git a/lib/api/unittest/CJsonOutputWriterTest.cc b/lib/api/unittest/CJsonOutputWriterTest.cc
index 6dc129fb20..135f9267fa 100644
--- a/lib/api/unittest/CJsonOutputWriterTest.cc
+++ b/lib/api/unittest/CJsonOutputWriterTest.cc
@@ -365,20 +365,20 @@ void CJsonOutputWriterTest::testBucketWriteHelper(bool isInterim) {
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         1, function, functionDescription, 42.0, 79,
         TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.8, 0.0,
-        fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
+        -5.0, fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result13(
         ml::api::CHierarchicalResultsWriter::E_SimpleCountResult,
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         correlatedByFieldValue, 1, function, functionDescription,
        42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.5, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.5, 0.0, -5.0,
         fieldName, influences, false, false, 3, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result14(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         1, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0,
         fieldName, influences, false, false, 4, 100, EMPTY_STRING_LIST);
 
     // 1st bucket
@@ -414,20 +414,20 @@ void CJsonOutputWriterTest::testBucketWriteHelper(bool isInterim) {
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         2, function, functionDescription, 42.0, 79,
         TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.8, 0.0,
-        fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
+        -5.0, fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result23(
         ml::api::CHierarchicalResultsWriter::E_SimpleCountResult,
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         correlatedByFieldValue, 2, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0,
         fieldName, influences, false, false, 3, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result24(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         2, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0,
         fieldName, influences, false, false, 4, 100, EMPTY_STRING_LIST);
 
     // 2nd bucket
@@ -463,20 +463,20 @@ void CJsonOutputWriterTest::testBucketWriteHelper(bool isInterim) {
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         3, function, functionDescription, 42.0, 79,
         TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
-        fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
+        -5.0, fieldName, influences, false, true, 2, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result33(
         ml::api::CHierarchicalResultsWriter::E_SimpleCountResult,
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         correlatedByFieldValue, 3, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0,
         fieldName, influences, false, false, 3, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result34(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         3, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0,
         fieldName, influences, false, false, 4, 100, EMPTY_STRING_LIST);
 
     // 3rd bucket
@@ -817,32 +817,32 @@ void CJsonOutputWriterTest::testLimitedRecordsWriteHelper(bool isInterim) {
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 1,
        function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result111));
 
     ml::api::CHierarchicalResultsWriter::SResults result112(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 1,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.2, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.2, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result112));
 
     ml::api::CHierarchicalResultsWriter::SResults result113(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 1,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 2.0, 0.0, 0.4, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 2.0, 0.0, 0.4, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result113));
 
     ml::api::CHierarchicalResultsWriter::SResults result114(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 1,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 12.0, 0.0, 0.4, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 12.0, 0.0, 0.4, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result114));
     CPPUNIT_ASSERT(writer.acceptResult(result114));
 
@@ -901,24 +901,24 @@ void CJsonOutputWriterTest::testLimitedRecordsWriteHelper(bool isInterim) {
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 2,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 1.0, 0.0, 0.05, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 1.0, 0.0, 0.05, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result211));
 
     ml::api::CHierarchicalResultsWriter::SResults result212(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 2,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 7.0, 0.0, 0.001, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 7.0, 0.0, 0.001, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result212));
 
     ml::api::CHierarchicalResultsWriter::SResults result213(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 2,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 0.6, 0.0, 0.1, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 0.6, 0.0, 0.1, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result213));
     CPPUNIT_ASSERT(writer.acceptResult(result213));
 
@@ -964,8 +964,8 @@ void CJsonOutputWriterTest::testLimitedRecordsWriteHelper(bool isInterim) {
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 3,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 30.0, 0.0, 0.02, fieldName, influences,
-        false, true, 1, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 30.0, 0.0, 0.02, -5.0, fieldName,
+        influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result311));
 
     overFieldName = "ofn";
@@ -1287,9 +1287,10 @@ void CJsonOutputWriterTest::testWriteInfluencersWithLimit() {
     std::string emptyStr;
     ml::api::CHierarchicalResultsWriter::TStoredStringPtrStoredStringPtrPrDoublePrVec influences;
     ml::api::CHierarchicalResultsWriter::SResults result(
-        ml::api::CHierarchicalResultsWriter::E_Result, pfn, pfv, bfn, bfv, emptyStr,
-        0, fun, fund, 42.0, 79, TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0),
-        0.0, 0.1, 0.1, fn, influences, false, true, 1, 100, EMPTY_STRING_LIST);
+        ml::api::CHierarchicalResultsWriter::E_Result, pfn, pfv, bfn, bfv,
+        emptyStr, 0, fun, fund, 42.0, 79, TDouble1Vec(1, 6953.0),
+        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1, -5.0, fn, influences, false,
+        true, 1, 100, EMPTY_STRING_LIST);
 
     CPPUNIT_ASSERT(writer.acceptResult(result));
 
@@ -1435,7 +1436,7 @@ void CJsonOutputWriterTest::testWriteWithInfluences() {
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, emptyString, 1,
         function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1, fieldName, influences,
+        TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1, -5.0, fieldName, influences,
         false, true, 1, 100, EMPTY_STRING_LIST);
 
     ml::core::CJsonOutputStreamWrapper outputStream(sstream);
@@ -1620,7 +1621,7 @@ void CJsonOutputWriterTest::testWriteScheduledEvent() {
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         emptyString, 100, function, functionDescription, 42.0, 79,
         TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1,
-        fieldName, influences, false, true, 1, 100, EMPTY_STRING_LIST);
+        -5.0, fieldName, influences, false, true, 1, 100, EMPTY_STRING_LIST);
     CPPUNIT_ASSERT(writer.acceptResult(result));
 
     // This result has 2 scheduled events
@@ -1630,7 +1631,7 @@ void CJsonOutputWriterTest::testWriteScheduledEvent() {
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         emptyString, 200, function, functionDescription, 42.0, 79,
         TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 0.0, 0.1, 0.1,
-        fieldName, influences, false, true, 1, 100, eventDescriptions);
+        -5.0, fieldName, influences, false, true, 1, 100, eventDescriptions);
 
     CPPUNIT_ASSERT(writer.acceptResult(result2));
     CPPUNIT_ASSERT(writer.endOutputBatch(false, 1U));
@@ -1711,22 +1712,22 @@ void CJsonOutputWriterTest::testThroughputHelper(bool useScopedAllocator) {
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         1, function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 2.24, 0.8, 0.0, fieldName, influences, false,
-        true, 2, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 2.24, 0.8, 0.0, -5.0, fieldName, influences,
+        false, true, 2, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result13(
        ml::api::CHierarchicalResultsWriter::E_SimpleCountResult,
         partitionFieldName, partitionFieldValue, byFieldName, byFieldValue,
         correlatedByFieldValue, 1, function, functionDescription, 42.0, 79,
-        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.5, 0.0,
+        TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 2.24, 0.5, 0.0, -5.0,
         fieldName, influences, false, false, 3, 100, EMPTY_STRING_LIST);
 
     ml::api::CHierarchicalResultsWriter::SResults result14(
         ml::api::CHierarchicalResultsWriter::E_Result, partitionFieldName,
         partitionFieldValue, byFieldName, byFieldValue, correlatedByFieldValue,
         1, function, functionDescription, 42.0, 79, TDouble1Vec(1, 6953.0),
-        TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, fieldName, influences, false,
-        false, 4, 100, EMPTY_STRING_LIST);
+        TDouble1Vec(1, 10090.0), 2.24, 0.0, 0.0, -5.0, fieldName, influences,
+        false, false, 4, 100, EMPTY_STRING_LIST);
 
     // 1st bucket
     writer.acceptBucketTimeInfluencer(1, 0.01, 13.44, 70.0);
diff --git a/lib/maths/CModel.cc b/lib/maths/CModel.cc
index 7adaaff3c2..adc953f9e0 100644
--- a/lib/maths/CModel.cc
+++ b/lib/maths/CModel.cc
@@ -276,12 +276,12 @@ bool CModelProbabilityParams::useAnomalyModel() const {
 //////// SModelProbabilityResult::SFeatureProbability ////////
 
 SModelProbabilityResult::SFeatureProbability::SFeatureProbability()
-    : s_Label{boost::cref(EMPTY_STRING)} {
+    : s_Label{E_UndefinedProbability} {
 }
 
-SModelProbabilityResult::SFeatureProbability::SFeatureProbability(const std::string& label,
+SModelProbabilityResult::SFeatureProbability::SFeatureProbability(EFeatureProbabilityLabel label,
                                                                   double probability)
-    : s_Label{boost::cref(label)}, s_Probability{probability} {
+    : s_Label{label}, s_Probability{probability} {
 }
 
 //////// CModel ////////
diff --git a/lib/maths/CTimeSeriesDecompositionDetail.cc b/lib/maths/CTimeSeriesDecompositionDetail.cc
index 3cc3ff191c..1baf6a99d3 100644
--- a/lib/maths/CTimeSeriesDecompositionDetail.cc
+++ b/lib/maths/CTimeSeriesDecompositionDetail.cc
@@ -1633,7 +1633,8 @@ bool CTimeSeriesDecompositionDetail::CComponents::addCalendarComponent(const CCa
 }
 
 void CTimeSeriesDecompositionDetail::CComponents::adjustValuesForPiecewiseConstantScaling(
-    std::size_t period, TFloatMeanAccumulatorVec& values) const {
+    std::size_t period,
+    TFloatMeanAccumulatorVec& values) const {
 
     // Periodicity testing detected piecewise constant linear scaling
     // of the underlying seasonal component.
    // Here, we adjust all values
@@ -1644,14 +1645,13 @@ void CTimeSeriesDecompositionDetail::CComponents::adjustValuesForPiecewiseConsta
 
     TDoubleVec trend;
     TDoubleVec scales;
-    TSizeVec segmentation(CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(
-        values, period));
+    TSizeVec segmentation(
+        CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(values, period));
     std::tie(trend, scales) = CTimeSeriesSegmentation::piecewiseLinearScaledPeriodic(
         values, period, segmentation);
     LOG_TRACE(<< "trend = " << core::CContainerPrinter::print(trend));
     LOG_TRACE(<< "scales = " << core::CContainerPrinter::print(scales));
-    LOG_TRACE(<< "segmentation = "
-              << core::CContainerPrinter::print(segmentation));
+    LOG_TRACE(<< "segmentation = " << core::CContainerPrinter::print(segmentation));
     values = CTimeSeriesSegmentation::removePiecewiseLinearScaledPeriodic(
         values, segmentation, trend, scales);
     TMeanAccumulator scale;
@@ -1666,8 +1666,8 @@ void CTimeSeriesDecompositionDetail::CComponents::adjustValuesForPiecewiseConsta
     }
     LOG_TRACE(<< "scale = " << CBasicStatistics::mean(scale));
     for (std::size_t i = 0; i < values.size(); ++i) {
-        CBasicStatistics::moment<0>(values[i]) +=
-            CBasicStatistics::mean(scale) * trend[i % trend.size()];
+        CBasicStatistics::moment<0>(values[i]) += CBasicStatistics::mean(scale) *
+                                                  trend[i % trend.size()];
     }
 }
 
diff --git a/lib/maths/CTimeSeriesModel.cc b/lib/maths/CTimeSeriesModel.cc
index 406eee27c7..f70a47c3e8 100644
--- a/lib/maths/CTimeSeriesModel.cc
+++ b/lib/maths/CTimeSeriesModel.cc
@@ -167,12 +167,6 @@ const std::string FIRST_CORRELATE_ID_TAG{"a"};
 const std::string SECOND_CORRELATE_ID_TAG{"b"};
 const std::string CORRELATION_MODEL_TAG{"c"};
 const std::string CORRELATION_TAG{"d"};
-
-// Strings identifying the different features for which time series
-// models compute probabilities.
-const std::string BUCKET_FEATURE_LABEL{"bucket"};
-const std::string MEAN_FEATURE_LABEL{"mean"};
-const std::string ANOMALY_FEATURE_LABEL{"anomaly"};
 }
 
 namespace forecast {
@@ -1002,7 +996,8 @@ bool CUnivariateTimeSeriesModel::uncorrelatedProbability(const CModelProbability
             calculation, value[0], params.bucketEmpty()[0][0],
             this->params().probabilityBucketEmpty(), (pl + pu) / 2.0)};
         probabilities.push_back(probability);
-        featureProbabilities.emplace_back(BUCKET_FEATURE_LABEL, probability);
+        featureProbabilities.emplace_back(
+            SModelProbabilityResult::E_SingleBucketProbability, probability);
     } else {
         LOG_ERROR(<< "Failed to compute P(" << sample
                   << " | weight = " << weights << ", time = " << time << ")");
@@ -1033,7 +1028,8 @@ bool CUnivariateTimeSeriesModel::uncorrelatedProbability(const CModelProbability
             calculation, value[0], params.bucketEmpty()[0][0],
             this->params().probabilityBucketEmpty(), probability);
         probabilities.push_back(probability);
-        featureProbabilities.emplace_back(MEAN_FEATURE_LABEL, probability);
+        featureProbabilities.emplace_back(
+            SModelProbabilityResult::E_MultiBucketProbability, probability);
     }
 
     double probability{aggregateFeatureProbabilities(probabilities, correlation)};
@@ -1047,7 +1043,8 @@ bool CUnivariateTimeSeriesModel::uncorrelatedProbability(const CModelProbability
         std::tie(probability, anomalyProbability) =
             m_AnomalyModel->probability(time, probability);
         probabilities.push_back(anomalyProbability);
-        featureProbabilities.emplace_back(ANOMALY_FEATURE_LABEL, anomalyProbability);
+        featureProbabilities.emplace_back(
+            SModelProbabilityResult::E_AnomalyModelProbability, anomalyProbability);
     }
 
     result.s_Probability = probability;
@@ -1166,7 +1163,8 @@ bool CUnivariateTimeSeriesModel::correlatedProbability(const CModelProbabilityPa
     aggregator.calculate(probability);
     TDouble4Vec probabilities{probability};
     SModelProbabilityResult::TFeatureProbability4Vec featureProbabilities;
-    featureProbabilities.emplace_back(BUCKET_FEATURE_LABEL, probability);
+    featureProbabilities.emplace_back(
+        SModelProbabilityResult::E_SingleBucketProbability, probability);
 
     if (m_AnomalyModel != nullptr && params.useAnomalyModel()) {
         double residual{
@@ -1178,7 +1176,8 @@ bool CUnivariateTimeSeriesModel::correlatedProbability(const CModelProbabilityPa
         std::tie(probability, anomalyProbability) =
             m_AnomalyModel->probability(mostAnomalousTime, probability);
         probabilities.push_back(anomalyProbability);
-        featureProbabilities.emplace_back(ANOMALY_FEATURE_LABEL, anomalyProbability);
+        featureProbabilities.emplace_back(
+            SModelProbabilityResult::E_AnomalyModelProbability, anomalyProbability);
     }
 
     aggregator.calculate(probability);
@@ -2439,7 +2438,8 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa
     }
 
     TTail2Vec tail(coordinates.size(), maths_t::E_UndeterminedTail);
-    result = SModelProbabilityResult{1.0, false, {{BUCKET_FEATURE_LABEL, 1.0}}, tail, {}};
+    result = SModelProbabilityResult{
+        1.0, false, {{SModelProbabilityResult::E_SingleBucketProbability, 1.0}}, tail, {}};
 
     std::size_t dimension{this->dimension()};
     core_t::TTime time{time_[0][0]};
@@ -2534,7 +2534,9 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa
                 2.0);
     }
 
-    TStrCRef labels[]{boost::cref(BUCKET_FEATURE_LABEL), boost::cref(MEAN_FEATURE_LABEL)};
+    SModelProbabilityResult::EFeatureProbabilityLabel labels[]{
+        SModelProbabilityResult::E_SingleBucketProbability,
+        SModelProbabilityResult::E_MultiBucketProbability};
     SModelProbabilityResult::TFeatureProbability4Vec
featureProbabilities;
     for (std::size_t i = 0u; i < probabilities.size(); ++i) {
         featureProbabilities.emplace_back(labels[i], probabilities[i]);
     }
@@ -2554,7 +2556,8 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa
         std::tie(probability, anomalyProbability) =
             m_AnomalyModel->probability(time, probability);
         probabilities.push_back(anomalyProbability);
-        featureProbabilities.emplace_back(ANOMALY_FEATURE_LABEL, anomalyProbability);
+        featureProbabilities.emplace_back(
+            SModelProbabilityResult::E_AnomalyModelProbability, anomalyProbability);
     }
 
     result.s_Probability = probability;
diff --git a/lib/model/CAnnotatedProbabilityBuilder.cc b/lib/model/CAnnotatedProbabilityBuilder.cc
index 23c1621500..aafb8c4fe0 100644
--- a/lib/model/CAnnotatedProbabilityBuilder.cc
+++ b/lib/model/CAnnotatedProbabilityBuilder.cc
@@ -68,6 +68,9 @@ void CAnnotatedProbabilityBuilder::personAttributeProbabilityPrior(const maths::
 void CAnnotatedProbabilityBuilder::probability(double p) {
     m_Result.s_Probability = p;
 }
+void CAnnotatedProbabilityBuilder::multiBucketImpact(double multiBucketImpact) {
+    m_Result.s_MultiBucketImpact = multiBucketImpact;
+}
 
 void CAnnotatedProbabilityBuilder::addAttributeProbability(
     std::size_t cid,
diff --git a/lib/model/CAnomalyDetectorModelConfig.cc b/lib/model/CAnomalyDetectorModelConfig.cc
index 373884df63..077f431f47 100644
--- a/lib/model/CAnomalyDetectorModelConfig.cc
+++ b/lib/model/CAnomalyDetectorModelConfig.cc
@@ -74,6 +74,7 @@ const core_t::TTime
 const core_t::TTime
     CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE(core::constants::DAY);
 const std::size_t CAnomalyDetectorModelConfig::MULTIBUCKET_FEATURES_WINDOW_LENGTH(12);
+const double CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE(5.0);
 const double CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_UPDATES_PER_BUCKET(1.0);
 const double CAnomalyDetectorModelConfig::DEFAULT_INFLUENCE_CUTOFF(0.4);
 const double CAnomalyDetectorModelConfig::DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM(0.25);
diff --git a/lib/model/CEventRateModel.cc b/lib/model/CEventRateModel.cc
index dae114ca0b..ceed534bd1 100644
--- a/lib/model/CEventRateModel.cc
+++ b/lib/model/CEventRateModel.cc
@@ -23,6 +23,7 @@
 #include
 #include
+#include <model/CAnomalyDetectorModelConfig.h>
 #include
 #include
 #include
@@ -431,6 +432,12 @@ bool CEventRateModel::computeProbability(std::size_t pid,
     LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);
 
     resultBuilder.probability(p);
+
+    double multiBucketImpact{-1.0 * CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE};
+    if (pJoint.calculateMultiBucketImpact(multiBucketImpact)) {
+        resultBuilder.multiBucketImpact(multiBucketImpact);
+    }
+
     bool everSeenBefore = this->firstBucketTimes()[pid] != startTime;
     resultBuilder.personFrequency(this->personFrequency(pid), everSeenBefore);
     resultBuilder.build();
diff --git a/lib/model/CMetricModel.cc b/lib/model/CMetricModel.cc
index 36b3ac4601..4bb195535a 100644
--- a/lib/model/CMetricModel.cc
+++ b/lib/model/CMetricModel.cc
@@ -22,6 +22,7 @@
 #include
 #include
+#include <model/CAnomalyDetectorModelConfig.h>
 #include
 #include
 #include
@@ -396,6 +397,12 @@ bool CMetricModel::computeProbability(const std::size_t pid,
     LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);
 
     resultBuilder.probability(p);
+
+    double multiBucketImpact{-1.0 * CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE};
+    if (pJoint.calculateMultiBucketImpact(multiBucketImpact)) {
+        resultBuilder.multiBucketImpact(multiBucketImpact);
+    }
+
     resultBuilder.build();
 
     return true;
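Note the convention shared by CEventRateModel and CMetricModel above: the impact is seeded with -1.0 * MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE, i.e. -5, the "purely single bucket" end of the scale, and is only forwarded to the result builder when calculateMultiBucketImpact() succeeds. A condensed restatement of the call pattern (names exactly as in the patch):

    double multiBucketImpact{-1.0 * CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE};
    if (pJoint.calculateMultiBucketImpact(multiBucketImpact)) {
        resultBuilder.multiBucketImpact(multiBucketImpact);
    }
    // Writer paths that cannot compute an impact, such as simple count
    // results, pass the -5 sentinel explicitly (see CHierarchicalResultsWriter).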
diff --git a/lib/model/CProbabilityAndInfluenceCalculator.cc b/lib/model/CProbabilityAndInfluenceCalculator.cc
index 0f8192426e..21fa677175 100644
--- a/lib/model/CProbabilityAndInfluenceCalculator.cc
+++ b/lib/model/CProbabilityAndInfluenceCalculator.cc
@@ -17,6 +17,7 @@
 #include
 #include
+#include <model/CAnomalyDetectorModelConfig.h>
 #include
 
 namespace ml {
@@ -573,6 +574,10 @@ CProbabilityAndInfluenceCalculator::CProbabilityAndInfluenceCalculator(double cu
     : m_Cutoff(cutoff), m_InfluenceCalculator(nullptr),
       m_ProbabilityTemplate(CModelTools::CProbabilityAggregator::E_Min),
       m_Probability(CModelTools::CProbabilityAggregator::E_Min),
+      m_ExplainingProbabilities{{maths::SModelProbabilityResult::E_SingleBucketProbability,
+                                 {CModelTools::CProbabilityAggregator::E_Min}},
+                                {maths::SModelProbabilityResult::E_MultiBucketProbability,
+                                 {CModelTools::CProbabilityAggregator::E_Min}}},
       m_ProbabilityCache(nullptr) {
 }
@@ -592,11 +597,17 @@ void CProbabilityAndInfluenceCalculator::addAggregator(
     const maths::CJointProbabilityOfLessLikelySamples& aggregator) {
     m_ProbabilityTemplate.add(aggregator);
     m_Probability.add(aggregator);
+    for (auto& ep : m_ExplainingProbabilities) {
+        ep.second.add(aggregator);
+    }
 }
 
 void CProbabilityAndInfluenceCalculator::addAggregator(const maths::CProbabilityOfExtremeSample& aggregator) {
     m_ProbabilityTemplate.add(aggregator);
     m_Probability.add(aggregator);
+    for (auto& ep : m_ExplainingProbabilities) {
+        ep.second.add(aggregator);
+    }
 }
 
 void CProbabilityAndInfluenceCalculator::addCache(CModelTools::CProbabilityCache& cache) {
@@ -612,6 +623,16 @@ void CProbabilityAndInfluenceCalculator::add(const CProbabilityAndInfluenceCalcu
     if (!other.m_Probability.empty()) {
         m_Probability.add(p, weight);
     }
+
+    for (const auto& ep : other.m_ExplainingProbabilities) {
+        if (ep.second.calculate(p) && !ep.second.empty()) {
+            auto ret = m_ExplainingProbabilities.insert(ep);
+            if (ret.second == false) {
+                ret.first->second.add(p, weight);
+            }
+        }
+    }
+
     for (const auto& aggregator : other.m_InfluencerProbabilities) {
         if (aggregator.second.calculate(p)) {
             auto& aggregator_ = m_InfluencerProbabilities
@@ -693,18 +714,34 @@ bool CProbabilityAndInfluenceCalculator::addProbability(model_t::EFeature featur
         return false;
     }
 
+    auto readResult = [&](const maths::SModelProbabilityResult& result) {
+        for (const auto& fp : result.s_FeatureProbabilities) {
+            auto itr = m_ExplainingProbabilities.find(fp.s_Label);
+            if (itr != m_ExplainingProbabilities.end()) {
+                double featureProbability = fp.s_Probability;
+                featureProbability = model_t::adjustProbability(
+                    feature, elapsedTime, featureProbability);
+                itr->second.add(featureProbability, weight);
+            }
+        }
+
+        probability = result.s_Probability;
+        probability = model_t::adjustProbability(feature, elapsedTime, probability);
+        tail = std::move(result.s_Tail);
+        type.set(result.s_Conditional ? model_t::CResultType::E_Conditional
+                                      : model_t::CResultType::E_Unconditional);
+        mostAnomalousCorrelate = std::move(result.s_MostAnomalousCorrelate);
+        m_Probability.add(probability, weight);
+    };
+
     // Check the cache.
     if (model_t::isConstant(feature) == false && m_ProbabilityCache) {
         TDouble2Vec1Vec values(model_t::stripExtraStatistics(feature, values_));
         model.detrend(time, computeProbabilityParams.seasonalConfidenceInterval(), values);
         maths::SModelProbabilityResult cached;
+
         if (m_ProbabilityCache->lookup(feature, id, values, cached)) {
-            probability = cached.s_Probability;
-            tail = std::move(cached.s_Tail);
-            type.set(cached.s_Conditional ? model_t::CResultType::E_Conditional
-                                          : model_t::CResultType::E_Unconditional);
-            mostAnomalousCorrelate = std::move(cached.s_MostAnomalousCorrelate);
-            m_Probability.add(cached.s_Probability, weight);
+            readResult(cached);
             return true;
         }
     }
@@ -715,13 +752,7 @@ bool CProbabilityAndInfluenceCalculator::addProbability(model_t::EFeature featur
     maths::SModelProbabilityResult result;
     if (model.probability(computeProbabilityParams, time, values, result)) {
         if (model_t::isConstant(feature) == false) {
-            probability = result.s_Probability;
-            probability = model_t::adjustProbability(feature, elapsedTime, probability);
-            tail = std::move(result.s_Tail);
-            type.set(result.s_Conditional ? model_t::CResultType::E_Conditional
-                                          : model_t::CResultType::E_Unconditional);
-            mostAnomalousCorrelate = std::move(result.s_MostAnomalousCorrelate);
-            m_Probability.add(probability, weight);
+            readResult(result);
             if (m_ProbabilityCache) {
                 m_ProbabilityCache->addModes(feature, id, model);
                 m_ProbabilityCache->addProbability(feature, id, values, result);
@@ -827,6 +858,47 @@ bool CProbabilityAndInfluenceCalculator::calculate(double& probability) const {
     return m_Probability.calculate(probability);
 }
 
+bool CProbabilityAndInfluenceCalculator::calculateExplainingProbabilities(
+    TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const {
+
+    double probability{0.0};
+    for (const auto& ep : m_ExplainingProbabilities) {
+        if (!ep.second.calculate(probability)) {
+            return false;
+        } else {
+            explainingProbabilities.emplace(ep.first, probability);
+        }
+    }
+
+    return true;
+}
+
+bool CProbabilityAndInfluenceCalculator::calculateMultiBucketImpact(double& multiBucketImpact) const {
+    TFeatureProbabilityLabelDoubleUMap explainingProbabilities;
+    if (!this->calculateExplainingProbabilities(explainingProbabilities)) {
+        LOG_INFO(<< "Failed to compute explaining probabilities");
+        return false;
+    }
+
+    double sbProbability =
+        explainingProbabilities[maths::SModelProbabilityResult::E_SingleBucketProbability];
+    double mbProbability =
+        explainingProbabilities[maths::SModelProbabilityResult::E_MultiBucketProbability];
+
+    double ls = std::log(std::max(sbProbability, ml::maths::CTools::smallestProbability()));
+    double lm = std::log(std::max(mbProbability, ml::maths::CTools::smallestProbability()));
+
+    double scale = CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE *
+                   std::min(ls, lm) / std::min(std::max(ls, lm), -0.001) /
+                   std::log(1000);
+
+    multiBucketImpact = std::max(
+        std::min(scale * (ls - lm), CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE),
+        -1.0 * CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE);
+
+    return true;
+}
+
 bool CProbabilityAndInfluenceCalculator::calculate(
     double& probability,
     TStoredStringPtrStoredStringPtrPrDoublePrVec& influences) const {
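To see the saturation in action, here is a standalone sketch (not part of the patch) that reproduces the arithmetic above for a few representative probability pairs; the 1e-300 stand-in for maths::CTools::smallestProbability() is an assumption made only to keep the example self-contained:

    #include <algorithm>
    #include <cmath>
    #include <iostream>

    // Mirrors the formula in CProbabilityAndInfluenceCalculator::calculateMultiBucketImpact.
    double multiBucketImpact(double sbProbability, double mbProbability) {
        const double maxMagnitude = 5.0; // MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE
        double ls = std::log(std::max(sbProbability, 1e-300));
        double lm = std::log(std::max(mbProbability, 1e-300));
        double scale = maxMagnitude * std::min(ls, lm) /
                       std::min(std::max(ls, lm), -0.001) / std::log(1000.0);
        return std::max(std::min(scale * (ls - lm), maxMagnitude), -maxMagnitude);
    }

    int main() {
        std::cout << multiBucketImpact(1e-10, 0.5) << '\n'; // -5: purely single bucket
        std::cout << multiBucketImpact(0.5, 1e-10) << '\n'; // +5: purely multi bucket
        std::cout << multiBucketImpact(1e-4, 1e-4) << '\n'; //  0: equal contributions
        std::cout << multiBucketImpact(1e-4, 1e-3) << '\n'; // about -2.2: leaning single bucket
        return 0;
    }

The clamp guarantees the reported value stays on the documented -5 to +5 scale whatever the ratio of the two probabilities.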
diff --git a/lib/model/unittest/CEventRateModelTest.cc b/lib/model/unittest/CEventRateModelTest.cc
index 672ad66f78..176fc81400 100644
--- a/lib/model/unittest/CEventRateModelTest.cc
+++ b/lib/model/unittest/CEventRateModelTest.cc
@@ -280,7 +280,7 @@ const TSizeDoublePr1Vec NO_CORRELATES;
 
 } // unnamed::
 
-void CEventRateModelTest::testOnlineCountSample() {
+void CEventRateModelTest::testCountSample() {
     const core_t::TTime startTime = 1346968800;
     const core_t::TTime bucketLength = 3600;
     SModelParams params(bucketLength);
@@ -373,7 +373,7 @@ void CEventRateModelTest::testOnlineCountSample() {
     CPPUNIT_ASSERT_EQUAL(origXml, newXml);
 }
 
-void CEventRateModelTest::testOnlineNonZeroCountSample() {
+void CEventRateModelTest::testNonZeroCountSample() {
     const core_t::TTime startTime = 1346968800;
     const core_t::TTime bucketLength = 3600;
     SModelParams params(bucketLength);
@@ -441,7 +441,7 @@ void CEventRateModelTest::testOnlineNonZeroCountSample() {
     }
 }
 
-void CEventRateModelTest::testOnlineRare() {
+void CEventRateModelTest::testRare() {
     const core_t::TTime startTime = 1346968800;
     const core_t::TTime bucketLength = 3600;
     SModelParams params(bucketLength);
@@ -517,60 +517,89 @@ void CEventRateModelTest::testOnlineRare() {
     CPPUNIT_ASSERT_EQUAL(origXml, newXml);
 }
 
-void CEventRateModelTest::testOnlineProbabilityCalculation() {
-    using TDoubleSizePr = std::pair;
-    using TMinAccumulator = maths::CBasicStatistics::COrderStatisticsHeap;
+void CEventRateModelTest::testProbabilityCalculation() {
+    using TDoubleSizeAnnotatedProbabilityTr =
+        core::CTriple<double, std::size_t, SAnnotatedProbability>;
+    using TMinAccumulator = maths::CBasicStatistics::COrderStatisticsHeap<
+        TDoubleSizeAnnotatedProbabilityTr,
+        std::function<bool(const TDoubleSizeAnnotatedProbabilityTr&,
+                           const TDoubleSizeAnnotatedProbabilityTr&)>>;
 
     const core_t::TTime startTime = 1346968800;
     const core_t::TTime bucketLength = 3600;
-    const std::size_t anomalousBucket = 25u;
-    SModelParams params(bucketLength);
-    params.s_DecayRate = 0.001;
-    this->makeModel(params, {model_t::E_IndividualCountByBucketAndPerson}, startTime, 1);
-    CEventRateModel* model = dynamic_cast<CEventRateModel*>(m_Model.get());
+    TSizeVec anomalousBuckets[]{TSizeVec{25}, TSizeVec{24, 25, 26, 27}};
+    double anomalousBucketsRateMultipliers[]{3.0, 1.3};
 
-    TMinAccumulator minProbabilities(2u);
+    for (std::size_t t = 0; t < 2; ++t) {
 
-    // Generate some events.
-    TTimeVec eventTimes;
-    TUInt64Vec expectedEventCounts = rawEventCounts(2);
-    expectedEventCounts[anomalousBucket] *= 3;
-    generateEvents(startTime, bucketLength, expectedEventCounts, eventTimes);
-    core_t::TTime endTime = (eventTimes.back() / bucketLength + 1) * bucketLength;
-    LOG_DEBUG(<< "startTime = " << startTime << ", endTime = " << endTime
-              << ", # events = " << eventTimes.size());
-
-    std::size_t i = 0u, j = 0u;
-    for (core_t::TTime bucketStartTime = startTime; bucketStartTime < endTime;
-         bucketStartTime += bucketLength, ++j) {
-        core_t::TTime bucketEndTime = bucketStartTime + bucketLength;
+        // Create the model.
+        SModelParams params(bucketLength);
+        params.s_DecayRate = 0.001;
+        this->makeModel(params, {model_t::E_IndividualCountByBucketAndPerson}, startTime, 1);
+        CEventRateModel* model = dynamic_cast<CEventRateModel*>(m_Model.get());
 
-        double count = 0.0;
-        for (; i < eventTimes.size() && eventTimes[i] < bucketEndTime; ++i) {
-            addArrival(*m_Gatherer, m_ResourceMonitor, eventTimes[i], "p1");
-            count += 1.0;
+        // Generate some events.
+        TTimeVec eventTimes;
+        TUInt64Vec expectedEventCounts = rawEventCounts(2);
+        for (auto i : anomalousBuckets[t]) {
+            expectedEventCounts[i] =
+                static_cast<std::uint64_t>(static_cast<double>(expectedEventCounts[i]) *
                                            anomalousBucketsRateMultipliers[t]);
         }
+        generateEvents(startTime, bucketLength, expectedEventCounts, eventTimes);
+        core_t::TTime endTime = (eventTimes.back() / bucketLength + 1) * bucketLength;
+        LOG_DEBUG(<< "startTime = " << startTime << ", endTime = " << endTime
+                  << ", # events = " << eventTimes.size());
 
-        LOG_DEBUG(<< "bucket count = " << count);
+        // Play the data through the model and get the lowest probability buckets.
+        TMinAccumulator minProbabilities(
+            2, [](const TDoubleSizeAnnotatedProbabilityTr& lhs,
+                  const TDoubleSizeAnnotatedProbabilityTr& rhs) {
+                return lhs.first < rhs.first;
+            });
 
-        model->sample(bucketStartTime, bucketEndTime, m_ResourceMonitor);
+        std::size_t i = 0;
+        for (core_t::TTime j = 0, bucketStartTime = startTime;
+             bucketStartTime < endTime; bucketStartTime += bucketLength, ++j) {
+            core_t::TTime bucketEndTime = bucketStartTime + bucketLength;
 
-        SAnnotatedProbability p;
-        CPartitioningFields partitioningFields(EMPTY_STRING, EMPTY_STRING);
-        CPPUNIT_ASSERT(model->computeProbability(
-            0 /*pid*/, bucketStartTime, bucketEndTime, partitioningFields, 1, p));
-        LOG_DEBUG(<< "probability = " << p.s_Probability);
-        minProbabilities.add(TDoubleSizePr(p.s_Probability, j));
-    }
+            double count = 0.0;
+            for (; i < eventTimes.size() && eventTimes[i] < bucketEndTime; ++i) {
+                addArrival(*m_Gatherer, m_ResourceMonitor, eventTimes[i], "p1");
+                count += 1.0;
+            }
 
-    minProbabilities.sort();
-    LOG_DEBUG(<< "minProbabilities = " << core::CContainerPrinter::print(minProbabilities));
-    CPPUNIT_ASSERT_EQUAL(anomalousBucket, minProbabilities[0].second);
-    CPPUNIT_ASSERT(minProbabilities[0].first / minProbabilities[1].first < 0.1);
+            model->sample(bucketStartTime, bucketEndTime, m_ResourceMonitor);
+
+            SAnnotatedProbability p;
+            CPartitioningFields partitioningFields(EMPTY_STRING, EMPTY_STRING);
+            CPPUNIT_ASSERT(model->computeProbability(0 /*pid*/, bucketStartTime, bucketEndTime,
                                                      partitioningFields, 1, p));
+            LOG_DEBUG(<< "bucket count = " << count << ", probability = " << p.s_Probability);
+            minProbabilities.add({p.s_Probability, j, p});
+        }
+
+        minProbabilities.sort();
+
+        if (anomalousBuckets[t].size() == 1) {
+            // Check the one anomalous bucket has the lowest probability by a significant margin.
+            CPPUNIT_ASSERT_EQUAL(anomalousBuckets[0][0], minProbabilities[0].second);
+            CPPUNIT_ASSERT(minProbabilities[0].first / minProbabilities[1].first < 0.1);
+        } else {
+            // Check the multi-bucket impact values are relatively high,
+            // indicating a large contribution from multi-bucket analysis.
+            double expectedMultiBucketImpactThresholds[2]{0.3, 2.5};
+            for (int j = 0; j < 2; ++j) {
+                double multiBucketImpact = minProbabilities[j].third.s_MultiBucketImpact;
+                LOG_DEBUG(<< "multi_bucket_impact = " << multiBucketImpact);
+                CPPUNIT_ASSERT(multiBucketImpact > expectedMultiBucketImpactThresholds[j]);
+                CPPUNIT_ASSERT(multiBucketImpact <= CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE);
+            }
+        }
+    }
 }
 
-void CEventRateModelTest::testOnlineProbabilityCalculationForLowNonZeroCount() {
+void CEventRateModelTest::testProbabilityCalculationForLowNonZeroCount() {
     core_t::TTime startTime(0);
     core_t::TTime bucketLength(100);
     std::size_t lowNonZeroCountBucket = 6u;
@@ -616,7 +645,7 @@ void CEventRateModelTest::testProbabilityCalculationForLowNonZeroCount() {
     CPPUNIT_ASSERT(probabilities[highNonZeroCountBucket] > 0.9);
 }
 
-void CEventRateModelTest::testOnlineProbabilityCalculationForHighNonZeroCount() {
+void CEventRateModelTest::testProbabilityCalculationForHighNonZeroCount() {
     core_t::TTime startTime(0);
     core_t::TTime bucketLength(100);
     std::size_t lowNonZeroCountBucket = 6u;
@@ -662,7 +691,7 @@ void CEventRateModelTest::testProbabilityCalculationForHighNonZeroCount()
     CPPUNIT_ASSERT(probabilities[highNonZeroCountBucket] > 0.9);
 }
 
-void CEventRateModelTest::testOnlineCorrelatedNoTrend() {
+void CEventRateModelTest::testCorrelatedNoTrend() {
     // Check we find the correct correlated variables, and identify
     // correlate and marginal anomalies.
 
@@ -860,7 +889,7 @@ void CEventRateModelTest::testCorrelatedNoTrend() {
     }
 }
 
-void CEventRateModelTest::testOnlineCorrelatedTrend() {
+void CEventRateModelTest::testCorrelatedTrend() {
     // Check we find the correct correlated variables, and identify
     // correlate and marginal anomalies.
@@ -1990,7 +2019,7 @@ void CEventRateModelTest::testDistinctCountProbabilityCalculationWithInfluence()
     }
 }
 
-void CEventRateModelTest::testOnlineRareWithInfluence() {
+void CEventRateModelTest::testRareWithInfluence() {
     const core_t::TTime startTime = 1346968800;
     const core_t::TTime bucketLength = 3600;
     SModelParams params(bucketLength);
@@ -2904,29 +2933,25 @@ CppUnit::Test* CEventRateModelTest::suite() {
     CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CEventRateModelTest");
 
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineCountSample", &CEventRateModelTest::testOnlineCountSample));
+        "CEventRateModelTest::testCountSample", &CEventRateModelTest::testCountSample));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineNonZeroCountSample",
-        &CEventRateModelTest::testOnlineNonZeroCountSample));
+        "CEventRateModelTest::testNonZeroCountSample",
+        &CEventRateModelTest::testNonZeroCountSample));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineRare", &CEventRateModelTest::testOnlineRare));
+        "CEventRateModelTest::testRare", &CEventRateModelTest::testRare));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineProbabilityCalculation",
-        &CEventRateModelTest::testOnlineProbabilityCalculation));
+        "CEventRateModelTest::testProbabilityCalculation",
+        &CEventRateModelTest::testProbabilityCalculation));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::"
-        "testOnlineProbabilityCalculationForLowNonZeroCount",
-        &CEventRateModelTest::testOnlineProbabilityCalculationForLowNonZeroCount));
+        "CEventRateModelTest::testProbabilityCalculationForLowNonZeroCount",
+        &CEventRateModelTest::testProbabilityCalculationForLowNonZeroCount));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::"
-        "testOnlineProbabilityCalculationForHighNonZeroCount",
-        &CEventRateModelTest::testOnlineProbabilityCalculationForHighNonZeroCount));
+        "CEventRateModelTest::testProbabilityCalculationForHighNonZeroCount",
+        &CEventRateModelTest::testProbabilityCalculationForHighNonZeroCount));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineCorrelatedNoTrend",
-        &CEventRateModelTest::testOnlineCorrelatedNoTrend));
+        "CEventRateModelTest::testCorrelatedNoTrend", &CEventRateModelTest::testCorrelatedNoTrend));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineCorrelatedTrend",
-        &CEventRateModelTest::testOnlineCorrelatedTrend));
+        "CEventRateModelTest::testCorrelatedTrend", &CEventRateModelTest::testCorrelatedTrend));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
         "CEventRateModelTest::testPrune", &CEventRateModelTest::testPrune));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
@@ -2938,12 +2963,10 @@ CppUnit::Test* CEventRateModelTest::suite() {
         "CEventRateModelTest::testCountProbabilityCalculationWithInfluence",
         &CEventRateModelTest::testCountProbabilityCalculationWithInfluence));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::"
-        "testDistinctCountProbabilityCalculationWithInfluence",
+        "CEventRateModelTest::testDistinctCountProbabilityCalculationWithInfluence",
         &CEventRateModelTest::testDistinctCountProbabilityCalculationWithInfluence));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
-        "CEventRateModelTest::testOnlineRareWithInfluence",
-        &CEventRateModelTest::testOnlineRareWithInfluence));
+        "CEventRateModelTest::testRareWithInfluence", &CEventRateModelTest::testRareWithInfluence));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
         "CEventRateModelTest::testSkipSampling", &CEventRateModelTest::testSkipSampling));
     suiteOfTests->addTest(new CppUnit::TestCaller<CEventRateModelTest>(
diff --git a/lib/model/unittest/CEventRateModelTest.h b/lib/model/unittest/CEventRateModelTest.h
index 31f7d4e92a..b9ef457c3b 100644
--- a/lib/model/unittest/CEventRateModelTest.h
+++ b/lib/model/unittest/CEventRateModelTest.h
@@ -25,20 +25,20 @@ struct SModelParams;
 
 class CEventRateModelTest : public CppUnit::TestFixture {
 public:
-    void testOnlineCountSample();
-    void testOnlineNonZeroCountSample();
-    void testOnlineRare();
-    void testOnlineProbabilityCalculation();
-    void testOnlineProbabilityCalculationForLowNonZeroCount();
-    void testOnlineProbabilityCalculationForHighNonZeroCount();
-    void testOnlineCorrelatedNoTrend();
-    void testOnlineCorrelatedTrend();
+    void testCountSample();
+    void testNonZeroCountSample();
+    void testRare();
+    void testProbabilityCalculation();
+    void testProbabilityCalculationForLowNonZeroCount();
+    void testProbabilityCalculationForHighNonZeroCount();
+    void testCorrelatedNoTrend();
+    void testCorrelatedTrend();
     void testPrune();
     void testKey();
     void testModelsWithValueFields();
     void testCountProbabilityCalculationWithInfluence();
     void testDistinctCountProbabilityCalculationWithInfluence();
-    void testOnlineRareWithInfluence();
+    void testRareWithInfluence();
     void testSkipSampling();
     void testExplicitNulls();
     void testInterimCorrections();