Skip to content

Commit 395f57f

Browse files
authored
[6.5][ML] Improve time series decomposition in the presence of change points (#231)
Backports #198.
1 parent 9404c0d commit 395f57f

25 files changed

+2843
-743
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,20 @@
4040
4141
Perform anomaly detection on features derived from multiple bucket values to improve robustness
4242
of detection with respect to misconfigured bucket lengths and improve detection of long lasting
43-
anomalies. (See {pull}175[#175].)
43+
anomalies. (See {ml-pull}175[#175].)
4444
45-
Increased independence of anomaly scores across partitions ({pull}182[182])
45+
Support decomposing a time series into a piecewise linear trend with piecewise constant
46+
scaling of the periodic components. This extends our decomposition functionality to handle the
47+
same types of change points that our modelling capabilities do. (See {ml-pull}198[198].)
48+
49+
Increased independence of anomaly scores across partitions (See {ml-pull}182[182].)
4650
4751
Avoid potential false positives at model start up when first detecting new components of the time
48-
series decomposition. (See {pull}218[218].)
52+
series decomposition. (See {ml-pull}218[218].)
4953
5054
=== Bug Fixes
5155
52-
Fix cause of "Bad density value..." log errors whilst forecasting. ({pull}207[207])
56+
Fix cause of "Bad density value..." log errors whilst forecasting. ({ml-pull}207[207])
5357
5458
Fix incorrectly missing influencers when the influence field is one of the detector's partitioning
5559
fields and the bucket is empty. ({pull}219[#219])

include/maths/CPeriodicityHypothesisTests.h

Lines changed: 94 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,18 @@ class CSeasonalTime;
2727

2828
//! \brief Represents the result of running the periodicity
2929
//! hypothesis tests.
30-
// clang-format off
31-
class MATHS_EXPORT CPeriodicityHypothesisTestsResult : boost::equality_comparable<CPeriodicityHypothesisTestsResult,
32-
boost::addable<CPeriodicityHypothesisTestsResult> > {
33-
// clang-format on
30+
class MATHS_EXPORT CPeriodicityHypothesisTestsResult
31+
: boost::equality_comparable<CPeriodicityHypothesisTestsResult> {
3432
public:
3533
using TTimeTimePr = std::pair<core_t::TTime, core_t::TTime>;
34+
using TSizeVec = std::vector<std::size_t>;
3635

37-
public:
3836
//! \brief Component data.
3937
struct MATHS_EXPORT SComponent {
40-
SComponent();
38+
SComponent() = default;
4139
SComponent(const std::string& description,
4240
bool diurnal,
41+
bool piecewiseScaled,
4342
core_t::TTime startOfPartition,
4443
core_t::TTime period,
4544
const TTimeTimePr& window,
@@ -56,41 +55,45 @@ class MATHS_EXPORT CPeriodicityHypothesisTestsResult : boost::equality_comparabl
5655
//! An identifier for the component used by the test.
5756
std::string s_Description;
5857
//! True if this is a diurnal component false otherwise.
59-
bool s_Diurnal;
58+
bool s_Diurnal = false;
59+
//! The segmentation of the window into intervals of constant
60+
//! scaling.
61+
bool s_PiecewiseScaled = false;
6062
//! The start of the partition.
61-
core_t::TTime s_StartOfPartition;
63+
core_t::TTime s_StartOfPartition = 0;
6264
//! The period of the component.
63-
core_t::TTime s_Period;
65+
core_t::TTime s_Period = 0;
6466
//! The component window.
6567
TTimeTimePr s_Window;
66-
//! The precedence to apply to this component when
67-
//! deciding which to keep.
68-
double s_Precedence;
68+
//! The precedence to apply to this component when deciding
69+
//! which to keep.
70+
double s_Precedence = 0.0;
6971
};
7072

7173
using TComponent5Vec = core::CSmallVector<SComponent, 5>;
74+
using TRemoveCondition = std::function<bool(const SComponent&)>;
7275

7376
public:
7477
//! Check if this is equal to \p other.
7578
bool operator==(const CPeriodicityHypothesisTestsResult& other) const;
7679

77-
//! Sets to the union of the periodic components present.
78-
//!
79-
//! \warning This only makes sense if the this and the
80-
//! other result share the start of the partition time.
81-
const CPeriodicityHypothesisTestsResult&
82-
operator+=(const CPeriodicityHypothesisTestsResult& other);
83-
8480
//! Add a component.
8581
void add(const std::string& description,
8682
bool diurnal,
83+
bool piecewiseScaled,
8784
core_t::TTime startOfWeek,
8885
core_t::TTime period,
8986
const TTimeTimePr& window,
9087
double precedence = 1.0);
9188

9289
//! Remove the components for which \p condition is true.
93-
void remove(const std::string& description);
90+
void remove(const TRemoveCondition& condition);
91+
92+
//! Set if this is a piecewise linear trend.
93+
void piecewiseLinearTrend(bool value);
94+
95+
//! Check if this is a piecewise linear trend.
96+
bool piecewiseLinearTrend() const;
9497

9598
//! Check if there are any periodic components.
9699
bool periodic() const;
@@ -102,6 +105,9 @@ class MATHS_EXPORT CPeriodicityHypothesisTestsResult : boost::equality_comparabl
102105
std::string print() const;
103106

104107
private:
108+
//! If true then the hypothesis used a piecewise linear trend.
109+
bool m_PiecewiseLinearTrend = false;
110+
105111
//! The periodic components.
106112
TComponent5Vec m_Components;
107113
};
@@ -174,14 +180,17 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
174180
using TComponent = CPeriodicityHypothesisTestsResult::SComponent;
175181

176182
public:
177-
CPeriodicityHypothesisTests();
183+
CPeriodicityHypothesisTests() = default;
178184
explicit CPeriodicityHypothesisTests(const CPeriodicityHypothesisTestsConfig& config);
179185

180186
//! Check if the test is initialized.
181187
bool initialized() const;
182188

183189
//! Initialize the bucket values.
184-
void initialize(core_t::TTime bucketLength, core_t::TTime window, core_t::TTime period);
190+
void initialize(core_t::TTime startTime,
191+
core_t::TTime bucketLength,
192+
core_t::TTime window,
193+
core_t::TTime period);
185194

186195
//! Add \p value at \p time.
187196
void add(core_t::TTime time, double value, double weight = 1.0);
@@ -193,34 +202,39 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
193202
private:
194203
using TDoubleVec = std::vector<double>;
195204
using TDoubleVec2Vec = core::CSmallVector<TDoubleVec, 2>;
205+
using TSizeVec = std::vector<std::size_t>;
196206
using TFloatMeanAccumulatorCRng = core::CVectorRange<const TFloatMeanAccumulatorVec>;
197207
using TMinMaxAccumulator = maths::CBasicStatistics::CMinMax<core_t::TTime>;
198208

199209
//! \brief A collection of statistics used during testing.
200210
struct STestStats {
201-
STestStats();
211+
explicit STestStats(double meanMagnitude);
202212
//! Set the various test thresholds.
203213
void setThresholds(double vt, double at, double Rt);
204214
//! Check if the null hypothesis is good enough to not need an
205215
//! alternative.
206216
bool nullHypothesisGoodEnough() const;
217+
//! The number of segments in the trend.
218+
double s_TrendSegments;
207219
//! True if a known periodic component is tested.
208220
bool s_HasPeriod;
209221
//! True if a known repeating partition is tested.
210222
bool s_HasPartition;
211223
//! The maximum variance to accept the alternative hypothesis.
212-
double s_Vt;
224+
double s_VarianceThreshold;
213225
//! The minimum amplitude to accept the alternative hypothesis.
214-
double s_At;
226+
double s_AmplitudeThreshold;
215227
//! The minimum autocorrelation to accept the alternative
216228
//! hypothesis.
217-
double s_Rt;
229+
double s_AutocorrelationThreshold;
218230
//! The data range.
219231
double s_Range;
220232
//! The number of buckets with at least one measurement.
221-
double s_B;
233+
double s_NonEmptyBuckets;
222234
//! The average number of measurements per bucket value.
223-
double s_M;
235+
double s_MeasurementsPerBucket;
236+
//! The mean magnitude of the bucket values.
237+
double s_MeanMagnitude;
224238
//! The null hypothesis periodic components.
225239
CPeriodicityHypothesisTestsResult s_H0;
226240
//! The variance estimate of H0.
@@ -231,10 +245,14 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
231245
double s_DF0;
232246
//! The trend for the null hypothesis.
233247
TDoubleVec2Vec s_T0;
248+
//! The linear scales if any.
249+
TDoubleVec s_Scales;
234250
//! The partition for the null hypothesis.
235251
TTimeTimePr2Vec s_Partition;
236252
//! The start of the repeating partition.
237253
core_t::TTime s_StartOfPartition;
254+
//! The segmentation of the interval if any.
255+
TSizeVec s_Segmentation;
238256
};
239257

240258
//! \brief Manages the testing of a set of nested hypotheses.
@@ -268,13 +286,19 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
268286
CNestedHypotheses& addNested(TTestFunc test);
269287
//! Test the hypotheses.
270288
CPeriodicityHypothesisTestsResult test(STestStats& stats) const;
289+
//! Set the number of segments in the trend.
290+
void trendSegments(std::size_t segments);
291+
//! Get the number of segments in the trend.
292+
std::size_t trendSegments() const;
271293

272294
private:
273295
using THypothesisVec = std::vector<CNestedHypotheses>;
274296

275297
private:
276298
//! The test.
277299
TTestFunc m_Test;
300+
//! The number of segments in the trend.
301+
std::size_t m_TrendSegments;
278302
//! If true always test the nested hypotheses.
279303
bool m_AlwaysTestNested;
280304
//! The nested hypotheses to test.
@@ -314,11 +338,13 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
314338
//! Test for a daily periodic component.
315339
CPeriodicityHypothesisTestsResult testForDaily(const TTimeTimePr2Vec& window,
316340
const TFloatMeanAccumulatorCRng& buckets,
341+
bool scaling,
317342
STestStats& stats) const;
318343

319344
//! Test for a weekly periodic component.
320345
CPeriodicityHypothesisTestsResult testForWeekly(const TTimeTimePr2Vec& window,
321346
const TFloatMeanAccumulatorCRng& buckets,
347+
bool scaling,
322348
STestStats& stats) const;
323349

324350
//! Test for a weekday/end partition.
@@ -336,6 +362,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
336362
//! periodicity.
337363
CPeriodicityHypothesisTestsResult testForPeriod(const TTimeTimePr2Vec& window,
338364
const TFloatMeanAccumulatorCRng& buckets,
365+
bool scaling,
339366
STestStats& stats) const;
340367

341368
//! Check we've seen sufficient data to test accurately.
@@ -344,7 +371,8 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
344371

345372
//! Check if there are enough non-empty buckets which are repeated
346373
//! at at least one \p period in \p buckets.
347-
bool seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng& buckets,
374+
template<typename CONTAINER>
375+
bool seenSufficientPeriodicallyPopulatedBucketsToTest(const CONTAINER& buckets,
348376
std::size_t period) const;
349377

350378
//! Compute various ancillary statistics for testing.
@@ -373,6 +401,13 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
373401
core_t::TTime period,
374402
STestStats& stats) const;
375403

404+
//! Test to see if there is significant evidence for a component
405+
//! with period \p period which is piecewise linearly scaled.
406+
bool testPeriodWithScaling(const TTimeTimePr2Vec& windows,
407+
const TFloatMeanAccumulatorCRng& buckets,
408+
core_t::TTime period,
409+
STestStats& stats) const;
410+
376411
//! Test to see if there is significant evidence for a repeating
377412
//! partition of the data into windows defined by \p partition.
378413
bool testPartition(const TTimeTimePr2Vec& partition,
@@ -381,6 +416,29 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
381416
double correction,
382417
STestStats& stats) const;
383418

419+
//! Run the explained variance test on an alternative hypothesis.
420+
bool testVariance(const TTimeTimePr2Vec& window,
421+
const TFloatMeanAccumulatorVec& buckets,
422+
core_t::TTime period,
423+
double df1,
424+
double v1,
425+
STestStats& stats,
426+
double& R,
427+
double& meanRepeats,
428+
double& pVariance,
429+
const TSizeVec& segmentation = TSizeVec{}) const;
430+
431+
//! Run the component amplitude test on the alternative hypothesis.
432+
bool testAmplitude(const TTimeTimePr2Vec& window,
433+
const TFloatMeanAccumulatorVec& buckets,
434+
core_t::TTime period,
435+
double b,
436+
double v,
437+
double R,
438+
double meanRepeats,
439+
double pVariance,
440+
STestStats& stats) const;
441+
384442
private:
385443
//! The minimum proportion of populated buckets for which
386444
//! the test is accurate.
@@ -393,14 +451,17 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
393451
//! Configures the tests to run.
394452
CPeriodicityHypothesisTestsConfig m_Config;
395453

454+
//! The start time of the window.
455+
core_t::TTime m_StartTime = 0;
456+
396457
//! The bucketing interval.
397-
core_t::TTime m_BucketLength;
458+
core_t::TTime m_BucketLength = 0;
398459

399460
//! The window length for which to maintain bucket values.
400-
core_t::TTime m_WindowLength;
461+
core_t::TTime m_WindowLength = 0;
401462

402463
//! The specified period to test.
403-
core_t::TTime m_Period;
464+
core_t::TTime m_Period = 0;
404465

405466
//! The time range of values added to the test.
406467
TMinMaxAccumulator m_TimeRange;
@@ -409,16 +470,13 @@ class MATHS_EXPORT CPeriodicityHypothesisTests {
409470
TFloatMeanAccumulatorVec m_BucketValues;
410471
};
411472

412-
using TFloatMeanAccumulator = CBasicStatistics::SSampleMean<CFloatStorage>::TAccumulator;
413-
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;
414-
415473
//! Test for periodic components in \p values.
416474
MATHS_EXPORT
417475
CPeriodicityHypothesisTestsResult
418476
testForPeriods(const CPeriodicityHypothesisTestsConfig& config,
419477
core_t::TTime startTime,
420478
core_t::TTime bucketLength,
421-
const TFloatMeanAccumulatorVec& values);
479+
const std::vector<CBasicStatistics::SSampleMean<CFloatStorage>::TAccumulator>& values);
422480
}
423481
}
424482

include/maths/CRegression.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,11 @@ class MATHS_EXPORT CRegression {
9595
//! is at a premium.
9696
//!
9797
//! \tparam N_ The degree of the polynomial.
98+
// clang-format off
9899
template<std::size_t N_, typename T = CFloatStorage>
99-
class CLeastSquaresOnline : boost::addable<CLeastSquaresOnline<N_, T>> {
100+
class CLeastSquaresOnline : boost::addable<CLeastSquaresOnline<N_, T>,
101+
boost::subtractable<CLeastSquaresOnline<N_, T>>> {
102+
// clang-format on
100103
public:
101104
static const std::size_t N = N_ + 1;
102105
using TArray = boost::array<double, N>;

include/maths/CTimeSeriesDecomposition.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt
125125
//! Get the value of the time series at \p time.
126126
//!
127127
//! \param[in] time The time of interest.
128-
//! \param[in] confidence The symmetric confidence interval for the prediction
129-
//! the baseline as a percentage.
128+
//! \param[in] confidence The symmetric confidence interval for the
129+
//! prediction of the baseline as a percentage.
130130
//! \param[in] components The components to include in the baseline.
131131
virtual maths_t::TDoubleDoublePr value(core_t::TTime time,
132132
double confidence = 0.0,

0 commit comments

Comments
 (0)