From 567fa80c690b1b6502a6d1b41dd41d5a19ef38e8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 11 May 2020 16:17:13 +0100 Subject: [PATCH] [ML] Fix possible cause for "Bad variance scale nan" log errors (#1225) --- docs/CHANGELOG.asciidoc | 1 + lib/maths/CAdaptiveBucketing.cc | 29 +++++++++++++------ .../CCalendarComponentAdaptiveBucketing.cc | 6 +++- lib/maths/CTimeSeriesDecomposition.cc | 2 +- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 058ef6e004..45511945ec 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -38,6 +38,7 @@ the number of classes present in the training data. (See {ml-pull}1144[#1144].) * Fix underlying cause for "Failed to calculate splitting significance" log errors. (See {ml-pull}1157[#1157].) +* Fix possible root cause for "Bad variance scale nan" log errors. (See {ml-pull}1225[#1225].) * Change data frame analytics instrumentation timestamp resolution to milliseconds. (See {ml-pull}1237[#1237].) 
* Fix "autodetect process stopped unexpectedly: Fatal error: 'terminate called after diff --git a/lib/maths/CAdaptiveBucketing.cc b/lib/maths/CAdaptiveBucketing.cc index 195ce62ff2..88443a7246 100644 --- a/lib/maths/CAdaptiveBucketing.cc +++ b/lib/maths/CAdaptiveBucketing.cc @@ -246,7 +246,7 @@ bool CAdaptiveBucketing::initialize(double a, double b, std::size_t n) { void CAdaptiveBucketing::initialValues(core_t::TTime start, core_t::TTime end, const TFloatMeanAccumulatorVec& values) { - if (!this->initialized()) { + if (this->initialized() == false) { return; } @@ -406,7 +406,7 @@ void CAdaptiveBucketing::refine(core_t::TTime time) { LOG_TRACE(<< "totalAveragingError = " << totalAveragingError); double n_{static_cast<double>(n)}; - double step{(1 - n_ * EPS) * totalAveragingError / n_}; + double step{(1.0 - n_ * EPS) * totalAveragingError / n_}; TFloatVec endpoints{m_Endpoints}; LOG_TRACE(<< "step = " << step); @@ -505,7 +505,7 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, double a{m_Endpoints[i]}; double b{m_Endpoints[i + 1]}; double c{m_Centres[i]}; - double c0{c}; + double c0{c - m_Endpoints[0]}; knots.push_back(m_Endpoints[0]); values.push_back(this->predict(i, time, c)); variances.push_back(this->variance(i)); @@ -549,26 +549,37 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, double alpha{m_Endpoints[n] - m_Centres[j]}; double beta{c0}; double Z{alpha + beta}; + if (Z == 0.0) { + alpha = beta = 0.5; + } else { + alpha /= Z; + beta /= Z; + } double lastPeriodValue{ this->predict(j, time, m_Centres[j] - m_Endpoints[n])}; double lastPeriodVariance{this->variance(j)}; knots[0] = m_Endpoints[0]; - values[0] = (alpha * values[0] + beta * lastPeriodValue) / Z; - variances[0] = (alpha * variances[0] + beta * lastPeriodVariance) / Z; + values[0] = alpha * values[0] + beta * lastPeriodValue; + variances[0] = alpha * variances[0] + beta * lastPeriodVariance; break; } } - for (std::size_t j = 0u; j < n; ++j) { + for (std::size_t j = 0; j < n; ++j) { if 
(this->bucketCount(j) > 0.0) { double alpha{m_Centres[j]}; double beta{m_Endpoints[n] - knots.back()}; double Z{alpha + beta}; + if (Z == 0.0) { + alpha = beta = 0.5; + } else { + alpha /= Z; + beta /= Z; + } double nextPeriodValue{ this->predict(j, time, m_Endpoints[n] + m_Centres[j])}; double nextPeriodVariance{this->variance(j)}; - values.push_back((alpha * values.back() + beta * nextPeriodValue) / Z); - variances.push_back( - (alpha * variances.back() + beta * nextPeriodVariance) / Z); + values.push_back(alpha * values.back() + beta * nextPeriodValue); + variances.push_back(alpha * variances.back() + beta * nextPeriodVariance); knots.push_back(m_Endpoints[n]); break; } diff --git a/lib/maths/CCalendarComponentAdaptiveBucketing.cc b/lib/maths/CCalendarComponentAdaptiveBucketing.cc index 96ad17c33f..4a12ad8e4c 100644 --- a/lib/maths/CCalendarComponentAdaptiveBucketing.cc +++ b/lib/maths/CCalendarComponentAdaptiveBucketing.cc @@ -276,7 +276,11 @@ void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) static_cast<double>(oldCentres[l - 1])); largeErrorCount += w * oldLargeErrorCounts[l - 1]; count += w * w * CBasicStatistics::count(m_Values[l - 1]); - double scale{count / CBasicStatistics::count(value)}; + // Defend against 0 / 0: if CBasicStatistics::count(value) + // is zero then count must be too. + double scale{count == CBasicStatistics::count(value) + ? 
1.0 + : count / CBasicStatistics::count(value)}; newValues.push_back(CBasicStatistics::scaled(value, scale)); newCentres.push_back(CTools::truncate(CBasicStatistics::mean(centre), yl, yr)); newLargeErrorCounts.push_back(largeErrorCount); diff --git a/lib/maths/CTimeSeriesDecomposition.cc b/lib/maths/CTimeSeriesDecomposition.cc index 9480e00d76..9224444004 100644 --- a/lib/maths/CTimeSeriesDecomposition.cc +++ b/lib/maths/CTimeSeriesDecomposition.cc @@ -419,7 +419,7 @@ TDoubleDoublePr CTimeSeriesDecomposition::scale(core_t::TTime time, } double mean{this->meanVariance()}; - if (mean == 0.0) { + if (mean == 0.0 || variance == 0.0) { return {1.0, 1.0}; }