Skip to content

[ML] Refactor initialization of the bucket gatherer #2844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Checks: >
-modernize-use-trailing-return-type,
-modernize-concat-nested-namespaces,
-modernize-use-nodiscard,
-modernize-use-ranges,

performance-*,

Expand Down
29 changes: 24 additions & 5 deletions include/model/CBucketGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,26 @@ class MODEL_EXPORT CBucketGatherer {
using TTimeVec = std::vector<core_t::TTime>;
using TTimeVecCItr = TTimeVec::const_iterator;

struct SBucketGathererInitData {
// The name of the field holding the summary count.
const std::string& s_SummaryCountFieldName;
// The name of the field which identifies people.
const std::string& s_PersonFieldName;
// The name of the field which defines the person attributes.
const std::string& s_AttributeFieldName;
// The name of the field which contains the metric values.
const std::string& s_ValueFieldName;
// The field names for which we will compute influences.
const TStrVec& s_InfluenceFieldNames;
// The start of the time interval for which to gather data.
core_t::TTime s_StartTime;
// Override for the number of measurements
// in a statistic. (Note that this is intended for testing only.)
// A zero value means that the data gatherer class will determine
// an appropriate value for the bucket length and data rate.
unsigned int s_SampleOverrideCount;
};

public:
static const std::string EVENTRATE_BUCKET_GATHERER_TAG;
static const std::string METRIC_BUCKET_GATHERER_TAG;
Expand All @@ -142,11 +162,10 @@ class MODEL_EXPORT CBucketGatherer {
//! Create a new data series gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] numberInfluencers The number of result influencers
//! for which to gather data.
CBucketGatherer(CDataGatherer& dataGatherer, core_t::TTime startTime, std::size_t numberInfluencers);
//! \param[in] bucketGathererInitData The parameter initialization object
//! for the bucket gatherer.
CBucketGatherer(CDataGatherer& dataGatherer,
const SBucketGathererInitData& bucketGathererInitData);

//! Create a copy that will result in the same persisted state as the
//! original. This is effectively a copy constructor that creates a
Expand Down
52 changes: 7 additions & 45 deletions include/model/CDataGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,51 +146,26 @@ class MODEL_EXPORT CDataGatherer {
//! \param[in] summaryMode Indicates whether the data being gathered
//! are already summarized by an external aggregation process.
//! \param[in] modelParams The global configuration parameters.
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] partitionFieldValue The value of the field which splits
//! the data.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] key The key of the search for which to gatherer data.
//! \param[in] features The features of the data to model.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] sampleCountOverride for the number of measurements
//! in a statistic. (Note that this is intended for testing only.)
//! A zero value means that the data gatherer class will determine
//! an appropriate value for the bucket length and data rate.
//! \param[in] bucketGathererInitData The parameter initialization object for the bucket gatherer.
CDataGatherer(model_t::EAnalysisCategory gathererType,
model_t::ESummaryMode summaryMode,
const SModelParams& modelParams,
const std::string& summaryCountFieldName,
const std::string& partitionFieldValue,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const CSearchKey& key,
const TFeatureVec& features,
core_t::TTime startTime,
int sampleCountOverride);
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData);

//! Construct from a state document.
CDataGatherer(model_t::EAnalysisCategory gathererType,
model_t::ESummaryMode summaryMode,
const SModelParams& modelParams,
const std::string& summaryCountFieldName,
const std::string& partitionFieldValue,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const CSearchKey& key,
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
Expand Down Expand Up @@ -330,6 +305,7 @@ class MODEL_EXPORT CDataGatherer {
//! containing \p time.
//!
//! \param[in] time The time of interest.
//! \param[in] bucketLength The length of the bucketing interval.
//! \param[out] result Filled in with the feature data at \p time.
//! \tparam T The type of the feature data.
template<typename T>
Expand Down Expand Up @@ -675,19 +651,11 @@ class MODEL_EXPORT CDataGatherer {

private:
//! Restore state from supplied traverser.
bool acceptRestoreTraverser(const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
bool acceptRestoreTraverser(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Restore a bucket gatherer from the supplied traverser.
bool restoreBucketGatherer(const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
bool restoreBucketGatherer(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Persist a bucket gatherer by passing information to the supplied
Expand All @@ -696,13 +664,7 @@ class MODEL_EXPORT CDataGatherer {

//! Create the bucket specific data gatherer.
void createBucketGatherer(model_t::EAnalysisCategory gathererType,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime,
unsigned int sampleCountOverride);
const CBucketGatherer::SBucketGathererInitData& initData);

private:
//! The type of the bucket gatherer(s) used.
Expand Down
32 changes: 4 additions & 28 deletions include/model/CEventRateBucketGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,33 +111,13 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
//! Create an event rate bucket gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] summaryCountFieldName If summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] bucketGathererInitData The parameter initialization object.
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime);
const SBucketGathererInitData& bucketGathererInitData);

//! Construct from a state document.
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
Expand Down Expand Up @@ -444,11 +424,7 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
void startNewBucket(core_t::TTime time, bool skipUpdates) override;

//! Initialize the field names collection.
void initializeFieldNames(const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const std::string& summaryCountFieldName,
const TStrVec& influenceFieldNames);
void initializeFieldNames(const CBucketGatherer::SBucketGathererInitData& initData);

//! Initialize the feature data gatherers.
void initializeFeatureData();
Expand Down
35 changes: 6 additions & 29 deletions include/model/CMetricBucketGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,33 +53,13 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! Create a new population metric data gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
CMetricBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime);
//! \param[in] initData The parameter initialization object for the bucket
//! gatherer.
CMetricBucketGatherer(CDataGatherer& dataGatherer, const SBucketGathererInitData& initData);

//! Construct from a state document.
CMetricBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const SBucketGathererInitData& initData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
Expand Down Expand Up @@ -266,9 +246,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! 1) initializeFieldNamesPart1()
//! 2) restore state
//! 3) initializeFieldNamesPart2()
void initializeFieldNamesPart1(const std::string& personFieldName,
const std::string& attributeFieldName,
const TStrVec& influenceFieldNames);
void initializeFieldNamesPart1(const SBucketGathererInitData& initData);

//! Initialize the field names collection.
//! initializeFieldNamesPart1() must be called before this.
Expand All @@ -277,8 +255,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! 1) initializeFieldNamesPart1()
//! 2) restore state
//! 3) initializeFieldNamesPart2()
void initializeFieldNamesPart2(const std::string& valueFieldName,
const std::string& summaryCountFieldName);
void initializeFieldNamesPart2(const SBucketGathererInitData& initData);

//! Initialize the feature data gatherers.
void initializeFeatureData();
Expand Down
4 changes: 3 additions & 1 deletion include/model/CModelFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
#include <maths/common/COrderings.h>
#include <maths/common/MathsTypes.h>

#include <model/CResourceMonitor.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>
#include <model/SModelParams.h>

#include <map>
#include <memory>
#include <optional>
#include <utility>
#include <vector>

namespace ml {
Expand Down Expand Up @@ -104,6 +104,8 @@ class MODEL_EXPORT CModelFactory {
using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
using TStrDetectionRulePrVecCRef = std::reference_wrapper<const TStrDetectionRulePrVec>;
using TResourceMonitorCRef = std::reference_wrapper<const CResourceMonitor>;
using TOptionalResourceMonitorCRef = std::optional<TResourceMonitorCRef>;

public:
//! Wrapper around the model initialization data.
Expand Down
15 changes: 8 additions & 7 deletions lib/model/CBucketGatherer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,21 +202,22 @@ const std::string CBucketGatherer::EVENTRATE_BUCKET_GATHERER_TAG("a");
const std::string CBucketGatherer::METRIC_BUCKET_GATHERER_TAG("b");

CBucketGatherer::CBucketGatherer(CDataGatherer& dataGatherer,
core_t::TTime startTime,
std::size_t numberInfluencers)
: m_DataGatherer(dataGatherer), m_EarliestTime(startTime), m_BucketStart(startTime),
const SBucketGathererInitData& initData)
: m_DataGatherer(dataGatherer), m_EarliestTime(initData.s_StartTime),
m_BucketStart(initData.s_StartTime),
m_PersonAttributeCounts(dataGatherer.params().s_LatencyBuckets,
dataGatherer.params().s_BucketLength,
startTime,
initData.s_StartTime,
TSizeSizePrUInt64UMap(1)),
m_PersonAttributeExplicitNulls(dataGatherer.params().s_LatencyBuckets,
dataGatherer.params().s_BucketLength,
startTime,
initData.s_StartTime,
TSizeSizePrUSet(1)),
m_InfluencerCounts(dataGatherer.params().s_LatencyBuckets + 3,
dataGatherer.params().s_BucketLength,
startTime,
TSizeSizePrOptionalStrPrUInt64UMapVec(numberInfluencers)) {
initData.s_StartTime,
TSizeSizePrOptionalStrPrUInt64UMapVec(
initData.s_InfluenceFieldNames.size())) {
}

CBucketGatherer::CBucketGatherer(bool isForPersistence, const CBucketGatherer& other)
Expand Down
22 changes: 15 additions & 7 deletions lib/model/CCountingModelFactory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,27 @@ CCountingModelFactory::makeModel(const SModelInitializationData& initData,

CDataGatherer*
CCountingModelFactory::makeDataGatherer(const SGathererInitializationData& initData) const {
return new CDataGatherer(model_t::E_EventRate, m_SummaryMode, this->modelParams(),
m_SummaryCountFieldName, initData.s_PartitionFieldValue,
m_PersonFieldName, EMPTY_STRING, EMPTY_STRING, {},
this->searchKey(), m_Features, initData.s_StartTime, 0);
const CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
m_SummaryCountFieldName,
m_PersonFieldName,
EMPTY_STRING,
EMPTY_STRING,
{},
initData.s_StartTime,
0};
return new CDataGatherer(model_t::E_EventRate, m_SummaryMode,
this->modelParams(), initData.s_PartitionFieldValue,
this->searchKey(), m_Features, bucketGathererInitData);
}

CDataGatherer*
CCountingModelFactory::makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const {
CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
m_SummaryCountFieldName, m_PersonFieldName, EMPTY_STRING, EMPTY_STRING, {}, 0, 0};
return new CDataGatherer(model_t::E_EventRate, m_SummaryMode,
this->modelParams(), m_SummaryCountFieldName,
partitionFieldValue, m_PersonFieldName, EMPTY_STRING,
EMPTY_STRING, {}, this->searchKey(), traverser);
this->modelParams(), partitionFieldValue,
this->searchKey(), bucketGathererInitData, traverser);
}

CCountingModelFactory::TPriorPtr
Expand Down
Loading