Skip to content

Commit 975c802

Browse files
authored
[ML] Handle NaNs when detrending seasonal components (#408)
Although it is not yet known how they arise, NaN values have been observed in seasonal components when detrending, resulting in a chain of errors in the logs as the NaN values percolate up the call stack. This PR attempts to detect the NaN values as close as possible to their source and to ensure that they do not unduly affect the analysis. The approach is to check for bad values in seasonal and calendar components from within CTimeSeriesDecompositionDetail::CComponents::interpolate, which is called periodically. If invalid values are detected in a component, that component is removed from the decomposition.
1 parent df87984 commit 975c802

19 files changed

+263
-47
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,11 @@
3939
4040
== {es} version 7.1.0
4141
42-
Remove hard limit for maximum forecast interval and limit based on the time interval of data added
42+
* Remove hard limit for maximum forecast interval and limit based on the time interval of data added
4343
to the model. (See {pull}214[#214].)
4444
45+
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
46+
4547
== {es} version 7.0.0-alpha1
4648
4749
== {es} version 6.7.0

include/maths/CAdaptiveBucketing.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ class MATHS_EXPORT CAdaptiveBucketing {
133133
//! Name of component
134134
virtual std::string name() const = 0;
135135

136+
//! Check that the state is valid.
137+
virtual bool isBad() const = 0;
138+
136139
protected:
137140
using TRestoreFunc = std::function<bool(core::CStateRestoreTraverser&)>;
138141
using TPersistFunc = std::function<void(core::CStatePersistInserter&)>;

include/maths/CCalendarComponent.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ class MATHS_EXPORT CCalendarComponent : private CDecompositionComponent {
140140
//! Get the memory used by this component.
141141
std::size_t memoryUsage() const;
142142

143+
//! Check that the state is valid.
144+
bool isBad() const;
145+
143146
private:
144147
//! Create by traversing a state document.
145148
bool acceptRestoreTraverser(double decayRate,

include/maths/CCalendarComponentAdaptiveBucketing.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucketing final : public CAdaptiveB
100100
//! Name of component
101101
std::string name() const override;
102102

103+
//! Check that the state is valid.
104+
bool isBad() const override;
105+
103106
private:
104107
using TFloatMeanVarVec = std::vector<TFloatMeanVarAccumulator>;
105108

include/maths/CMathsFuncsDetail.h

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,6 @@ bool CMathsFuncs::isNan(const CSymmetricMatrixNxN<double, N>& val) {
6767
return anElement(static_cast<bool (*)(double)>(&isNan), val);
6868
}
6969

70-
//! Check a small vector for NaN elements.
//!
//! \param[in] val The vector whose elements are tested one by one.
//! \return True if isNan is true for at least one element of \p val and
//! false otherwise (including when \p val is empty).
template<std::size_t N>
71-
bool CMathsFuncs::isNan(const core::CSmallVector<double, N>& val) {
72-
for (std::size_t i = 0u; i < val.size(); ++i) {
73-
if (isNan(val[i])) {
74-
// One NaN element is enough, no need to scan the rest.
return true;
75-
}
76-
}
77-
return false;
78-
}
79-
8070
template<std::size_t N>
8171
bool CMathsFuncs::isInf(const CVectorNx1<double, N>& val) {
8272
return aComponent(static_cast<bool (*)(double)>(&isInf), val);
@@ -87,16 +77,6 @@ bool CMathsFuncs::isInf(const CSymmetricMatrixNxN<double, N>& val) {
8777
return anElement(static_cast<bool (*)(double)>(&isInf), val);
8878
}
8979

90-
//! Check a small vector for infinite elements.
//!
//! \param[in] val The vector whose elements are tested one by one.
//! \return True if isInf is true for at least one element of \p val and
//! false otherwise (including when \p val is empty).
template<std::size_t N>
91-
bool CMathsFuncs::isInf(const core::CSmallVector<double, N>& val) {
92-
for (std::size_t i = 0u; i < val.size(); ++i) {
93-
if (isInf(val[i])) {
94-
// One infinite element is enough, no need to scan the rest.
return true;
95-
}
96-
}
97-
return false;
98-
}
99-
10080
template<std::size_t N>
10181
bool CMathsFuncs::isFinite(const CVectorNx1<double, N>& val) {
10282
return everyComponent(static_cast<bool (*)(double)>(&isFinite), val);
@@ -106,16 +86,6 @@ template<std::size_t N>
10686
bool CMathsFuncs::isFinite(const CSymmetricMatrixNxN<double, N>& val) {
10787
return everyElement(static_cast<bool (*)(double)>(&isFinite), val);
10888
}
109-
110-
//! Check that every element of a small vector is finite.
//!
//! \param[in] val The vector whose elements are tested one by one.
//! \return False if isFinite is false for at least one element of \p val
//! and true otherwise (including when \p val is empty).
template<std::size_t N>
111-
bool CMathsFuncs::isFinite(const core::CSmallVector<double, N>& val) {
112-
for (std::size_t i = 0u; i < val.size(); ++i) {
113-
if (!isFinite(val[i])) {
114-
// A single non-finite element makes the whole vector non-finite.
return false;
115-
}
116-
}
117-
return true;
118-
}
11989
}
12090
}
12191

include/maths/CRegression.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <cstddef>
2323
#include <cstdint>
2424

25+
class CNanInjector;
26+
2527
namespace ml {
2628
namespace core {
2729
class CStatePersistInserter;
@@ -400,6 +402,9 @@ class MATHS_EXPORT CRegression {
400402
//! regression. There are 3N - 1 in total, for the distinct
401403
//! values in the design matrix and vector.
402404
TVectorMeanAccumulator m_S;
405+
406+
//! Befriend a helper class used by the unit tests
407+
friend class ::CNanInjector;
403408
};
404409

405410
//! Get the predicted value of \p r at \p x.

include/maths/CSeasonalComponent.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include <string>
2020
#include <vector>
2121

22+
class CNanInjector;
23+
2224
namespace ml {
2325
namespace core {
2426
class CStatePersistInserter;
@@ -186,6 +188,9 @@ class MATHS_EXPORT CSeasonalComponent : private CDecompositionComponent {
186188
//! Get the memory used by this component.
187189
std::size_t memoryUsage() const;
188190

191+
//! Check whether the state is invalid.
//!
//! \return The result of m_Bucketing.isBad(), i.e. true means the
//! bucketing reported bad state.
192+
bool isBad() const { return m_Bucketing.isBad(); }
193+
189194
private:
190195
//! Create by traversing a state document.
191196
bool acceptRestoreTraverser(double decayRate,
@@ -202,6 +207,9 @@ class MATHS_EXPORT CSeasonalComponent : private CDecompositionComponent {
202207

203208
//! Regression models for a collection of buckets covering the period.
204209
CSeasonalComponentAdaptiveBucketing m_Bucketing;
210+
211+
//! Befriend a helper class used by the unit tests
212+
friend class ::CNanInjector;
205213
};
206214

207215
//! Create a free function which will be picked up in Koenig lookup.

include/maths/CSeasonalComponentAdaptiveBucketing.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
#include <stdint.h>
2323

24+
class CNanInjector;
25+
2426
namespace ml {
2527
namespace core {
2628
class CStatePersistInserter;
@@ -133,6 +135,9 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing final : public CAdaptiveB
133135
//! Name of component
134136
std::string name() const override;
135137

138+
//! Check that the state is valid.
139+
bool isBad() const override;
140+
136141
private:
137142
using TSeasonalTimePtr = std::unique_ptr<CSeasonalTime>;
138143

@@ -149,6 +154,9 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing final : public CAdaptiveB
149154

150155
uint64_t checksum(uint64_t seed) const;
151156

157+
//! Check that the state is valid.
158+
bool isBad() const;
159+
152160
TRegression s_Regression;
153161
CFloatStorage s_Variance;
154162
core_t::TTime s_FirstUpdate;
@@ -198,6 +206,9 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing final : public CAdaptiveB
198206

199207
//! The buckets.
200208
TBucketVec m_Buckets;
209+
210+
//! Befriend a helper class used by the unit tests
211+
friend class ::CNanInjector;
201212
};
202213

203214
//! Create a free function which will be found by Koenig lookup.

include/maths/CTimeSeriesDecomposition.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
#include <memory>
1616

17-
class CTimeSeriesDecompositionTest;
17+
class CNanInjector;
1818

1919
namespace ml {
2020
namespace core {
@@ -252,6 +252,9 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt
252252

253253
//! The state for modeling the components of the decomposition.
254254
CComponents m_Components;
255+
256+
//! Befriend a helper class used by the unit tests
257+
friend class ::CNanInjector;
255258
};
256259
}
257260
}

include/maths/CTimeSeriesDecompositionDetail.h

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <memory>
2929
#include <vector>
3030

31+
class CNanInjector;
32+
3133
namespace ml {
3234
namespace maths {
3335
class CExpandingWindow;
@@ -373,12 +375,12 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
373375
//! Set whether or not we're testing for a change.
374376
void testingForChange(bool value);
375377

376-
//! Start observing for new components.
377-
void observeComponentsAdded();
378+
//! Start observing for modifications to the components.
379+
void observeComponentsModified();
378380

379-
//! Check if any components were added since observeComponentsAdded
381+
//! Check if any components were added or removed since observeComponentsModified
380382
//! was last called.
381-
bool componentsAdded();
383+
bool componentsModified();
382384

383385
//! Apply \p shift to the level at \p time and \p value.
384386
void shiftLevel(core_t::TTime time, double value, double shift);
@@ -653,12 +655,18 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
653655
//! Get the memory used by this object.
654656
std::size_t memoryUsage() const;
655657

658+
//! Remove any components with invalid values
659+
bool removeComponentsWithBadValues(core_t::TTime);
660+
656661
private:
657662
//! The components.
658663
maths_t::TSeasonalComponentVec m_Components;
659664

660665
//! The components' prediction errors.
661666
TComponentErrorsVec m_PredictionErrors;
667+
668+
//! Befriend a helper class used by the unit tests
669+
friend class ::CNanInjector;
662670
};
663671

664672
using TSeasonalPtr = std::unique_ptr<CSeasonal>;
@@ -728,6 +736,9 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
728736
//! Get the memory used by this object.
729737
std::size_t memoryUsage() const;
730738

739+
//! Remove any components with invalid values
740+
bool removeComponentsWithBadValues(core_t::TTime time);
741+
731742
private:
732743
//! The calendar components.
733744
maths_t::TCalendarComponentVec m_Components;
@@ -846,8 +857,11 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
846857
//! Set to true if the trend model should be used for prediction.
847858
bool m_UsingTrendForPrediction = false;
848859

849-
//! Set to true when new components are added.
850-
bool m_ComponentsAdded = false;
860+
//! Set to true when components are added or removed.
861+
bool m_ComponentsModified = false;
862+
863+
//! Befriend a helper class used by the unit tests
864+
friend class ::CNanInjector;
851865
};
852866
};
853867

lib/maths/CCalendarComponent.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,5 +181,9 @@ std::size_t CCalendarComponent::memoryUsage() const {
181181
return core::CMemory::dynamicSize(m_Bucketing) +
182182
core::CMemory::dynamicSize(this->splines());
183183
}
184+
185+
//! Report whether the bucketing holds bad state.
//!
//! \return The result of m_Bucketing.isBad(); this function adds no
//! checks of its own.
bool CCalendarComponent::isBad() const {
186+
return m_Bucketing.isBad();
187+
}
184188
}
185189
}

lib/maths/CCalendarComponentAdaptiveBucketing.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <maths/CBasicStatisticsPersist.h>
1717
#include <maths/CChecksum.h>
18+
#include <maths/CMathsFuncs.h>
1819
#include <maths/CTools.h>
1920

2021
#include <boost/bind.hpp>
@@ -356,5 +357,13 @@ std::string CCalendarComponentAdaptiveBucketing::name() const {
356357
return "Calendar[" + std::to_string(this->decayRate()) + "," +
357358
std::to_string(this->minimumBucketLength()) + "]";
358359
}
360+
361+
//! Report whether any bucket value is not finite.
//!
//! \return True if at least one entry of m_Values has a mean or a
//! variance for which CMathsFuncs::isFinite returns false, and false
//! otherwise (including when m_Values is empty).
bool CCalendarComponentAdaptiveBucketing::isBad() const {
362+
// A value is bad if either its mean or its variance is not finite.
363+
return std::any_of(m_Values.begin(), m_Values.end(), [](const auto& value) {
364+
return ((CMathsFuncs::isFinite(CBasicStatistics::mean(value)) == false) ||
365+
(CMathsFuncs::isFinite(CBasicStatistics::variance(value))) == false);
366+
});
367+
}
359368
}
360369
}

lib/maths/CDecompositionComponent.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ void CDecompositionComponent::interpolate(const TDoubleVec& knots,
107107
const TDoubleVec& values,
108108
const TDoubleVec& variances) {
109109
m_Splines.interpolate(knots, values, variances, m_BoundaryCondition);
110+
110111
m_MeanValue = this->valueSpline().mean();
111112
m_MeanVariance = this->varianceSpline().mean();
112113
}
@@ -123,9 +124,9 @@ TDoubleDoublePr CDecompositionComponent::value(double offset, double n, double c
123124
// asymptotically normal with mean equal to the sample mean
124125
// and variance equal to the sample variance divided by root
125126
// of the number of samples.
126-
127127
if (this->initialized()) {
128128
double m{this->valueSpline().value(offset)};
129+
129130
if (confidence == 0.0) {
130131
return {m, m};
131132
}

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1981,6 +1981,13 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
19811981
return false;
19821982
}
19831983

1984+
// It's possible that none of the candidates are <= 1.05 times the minimum;
1985+
// this would be the case if, say, a NaN were present in the values.
1986+
// NaNs are detected and purged elsewhere, so we simply return false here.
1987+
if (best.count() == 0) {
1988+
return false;
1989+
}
1990+
19841991
startOfPartition = (m_StartTime + best[0].second) % repeat;
19851992
double v1{varianceAtPercentile(correction * minimum[0].first, df1,
19861993
50.0 + CONFIDENCE_INTERVAL / 2.0)};

lib/maths/CSeasonalComponentAdaptiveBucketing.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include <maths/CLinearAlgebra.h>
2020
#include <maths/CLinearAlgebraPersist.h>
2121
#include <maths/CLinearAlgebraTools.h>
22+
#include <maths/CMathsFuncs.h>
23+
#include <maths/CMathsFuncsDetail.h>
2224
#include <maths/CRegression.h>
2325
#include <maths/CRegressionDetail.h>
2426
#include <maths/CSeasonalTime.h>
@@ -54,6 +56,13 @@ double gradient(const TRegression& r) {
5456
return params[1];
5557
}
5658

59+
//! Safely promote the type of the vector statistics value returned by
60+
//! CBasicStatistics::mean to a type accepted by CMathsFuncs::isFinite
//!
//! \param[in] vectorStat The value to convert.
//! \return \p vectorStat converted to SPromoted<T>::Type.
61+
template<typename T>
62+
typename SPromoted<T>::Type promote(const T& vectorStat) {
63+
// The conversion to SPromoted<T>::Type happens implicitly on return.
return (vectorStat);
64+
}
65+
5766
// Version 6.3
5867
const std::string VERSION_6_3_TAG("6.3");
5968
const std::string ADAPTIVE_BUCKETING_6_3_TAG{"a"};
@@ -578,6 +587,11 @@ std::string CSeasonalComponentAdaptiveBucketing::name() const {
578587
std::to_string(this->minimumBucketLength()) + "]";
579588
}
580589

590+
//! Report whether any bucket holds bad state.
//!
//! \return True if isBad() is true for at least one element of
//! m_Buckets and false otherwise (including when m_Buckets is empty).
bool CSeasonalComponentAdaptiveBucketing::isBad() const {
591+
return std::any_of(m_Buckets.begin(), m_Buckets.end(),
592+
[](const auto& bucket) { return bucket.isBad(); });
593+
}
594+
581595
double CSeasonalComponentAdaptiveBucketing::observedInterval(core_t::TTime time) const {
582596
return m_Time->regressionInterval(
583597
std::min_element(m_Buckets.begin(), m_Buckets.end(),
@@ -627,5 +641,11 @@ uint64_t CSeasonalComponentAdaptiveBucketing::SBucket::checksum(uint64_t seed) c
627641
seed = CChecksum::calculate(seed, s_FirstUpdate);
628642
return CChecksum::calculate(seed, s_LastUpdate);
629643
}
644+
645+
//! Report whether this bucket holds a non-finite value.
//!
//! \return True if CMathsFuncs::isFinite returns false for either the
//! promoted mean of s_Regression.statistic() or s_Variance, and false
//! otherwise.
bool CSeasonalComponentAdaptiveBucketing::SBucket::isBad() const {
646+
// The mean is passed through promote() before being handed to isFinite.
return ((CMathsFuncs::isFinite(
647+
promote(CBasicStatistics::mean(s_Regression.statistic()))) == false) ||
648+
(CMathsFuncs::isFinite(s_Variance) == false));
649+
}
630650
}
631651
}

0 commit comments

Comments
 (0)