From 567fa80c690b1b6502a6d1b41dd41d5a19ef38e8 Mon Sep 17 00:00:00 2001
From: Tom Veasey <tveasey@users.noreply.github.com>
Date: Mon, 11 May 2020 16:17:13 +0100
Subject: [PATCH] [ML] Fix possible cause for "Bad variance scale nan" log
 errors (#1225)

---
 docs/CHANGELOG.asciidoc                       |  1 +
 lib/maths/CAdaptiveBucketing.cc               | 29 +++++++++++++------
 .../CCalendarComponentAdaptiveBucketing.cc    |  6 +++-
 lib/maths/CTimeSeriesDecomposition.cc         |  2 +-
 4 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 058ef6e004..45511945ec 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -38,6 +38,7 @@
   the number of classes present in the training data. (See {ml-pull}1144[#1144].)
 * Fix underlying cause for "Failed to calculate splitting significance" log errors.
   (See {ml-pull}1157[#1157].)
+* Fix possible root cause for "Bad variance scale nan" log errors. (See {ml-pull}1225[#1225].)
 * Change data frame analytics instrumentation timestamp resolution to milliseconds. (See
   {ml-pull}1237[#1237].)
 * Fix "autodetect process stopped unexpectedly: Fatal error: 'terminate called after
diff --git a/lib/maths/CAdaptiveBucketing.cc b/lib/maths/CAdaptiveBucketing.cc
index 195ce62ff2..88443a7246 100644
--- a/lib/maths/CAdaptiveBucketing.cc
+++ b/lib/maths/CAdaptiveBucketing.cc
@@ -246,7 +246,7 @@ bool CAdaptiveBucketing::initialize(double a, double b, std::size_t n) {
 void CAdaptiveBucketing::initialValues(core_t::TTime start,
                                        core_t::TTime end,
                                        const TFloatMeanAccumulatorVec& values) {
-    if (!this->initialized()) {
+    if (this->initialized() == false) {
         return;
     }
 
@@ -406,7 +406,7 @@ void CAdaptiveBucketing::refine(core_t::TTime time) {
     LOG_TRACE(<< "totalAveragingError = " << totalAveragingError);
 
     double n_{static_cast<double>(n)};
-    double step{(1 - n_ * EPS) * totalAveragingError / n_};
+    double step{(1.0 - n_ * EPS) * totalAveragingError / n_};
     TFloatVec endpoints{m_Endpoints};
     LOG_TRACE(<< "step = " << step);
 
@@ -505,7 +505,7 @@ bool CAdaptiveBucketing::knots(core_t::TTime time,
             double a{m_Endpoints[i]};
             double b{m_Endpoints[i + 1]};
             double c{m_Centres[i]};
-            double c0{c};
+            double c0{c - m_Endpoints[0]};
             knots.push_back(m_Endpoints[0]);
             values.push_back(this->predict(i, time, c));
             variances.push_back(this->variance(i));
@@ -549,26 +549,37 @@ bool CAdaptiveBucketing::knots(core_t::TTime time,
                         double alpha{m_Endpoints[n] - m_Centres[j]};
                         double beta{c0};
                         double Z{alpha + beta};
+                        if (Z == 0.0) {
+                            alpha = beta = 0.5;
+                        } else {
+                            alpha /= Z;
+                            beta /= Z;
+                        }
                         double lastPeriodValue{
                             this->predict(j, time, m_Centres[j] - m_Endpoints[n])};
                         double lastPeriodVariance{this->variance(j)};
                         knots[0] = m_Endpoints[0];
-                        values[0] = (alpha * values[0] + beta * lastPeriodValue) / Z;
-                        variances[0] = (alpha * variances[0] + beta * lastPeriodVariance) / Z;
+                        values[0] = alpha * values[0] + beta * lastPeriodValue;
+                        variances[0] = alpha * variances[0] + beta * lastPeriodVariance;
                         break;
                     }
                 }
-                for (std::size_t j = 0u; j < n; ++j) {
+                for (std::size_t j = 0; j < n; ++j) {
                     if (this->bucketCount(j) > 0.0) {
                         double alpha{m_Centres[j]};
                         double beta{m_Endpoints[n] - knots.back()};
                         double Z{alpha + beta};
+                        if (Z == 0.0) {
+                            alpha = beta = 0.5;
+                        } else {
+                            alpha /= Z;
+                            beta /= Z;
+                        }
                         double nextPeriodValue{
                             this->predict(j, time, m_Endpoints[n] + m_Centres[j])};
                         double nextPeriodVariance{this->variance(j)};
-                        values.push_back((alpha * values.back() + beta * nextPeriodValue) / Z);
-                        variances.push_back(
-                            (alpha * variances.back() + beta * nextPeriodVariance) / Z);
+                        values.push_back(alpha * values.back() + beta * nextPeriodValue);
+                        variances.push_back(alpha * variances.back() + beta * nextPeriodVariance);
                         knots.push_back(m_Endpoints[n]);
                         break;
                     }
diff --git a/lib/maths/CCalendarComponentAdaptiveBucketing.cc b/lib/maths/CCalendarComponentAdaptiveBucketing.cc
index 96ad17c33f..4a12ad8e4c 100644
--- a/lib/maths/CCalendarComponentAdaptiveBucketing.cc
+++ b/lib/maths/CCalendarComponentAdaptiveBucketing.cc
@@ -276,7 +276,11 @@ void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints)
                 static_cast<double>(oldCentres[l - 1]));
             largeErrorCount += w * oldLargeErrorCounts[l - 1];
             count += w * w * CBasicStatistics::count(m_Values[l - 1]);
-            double scale{count / CBasicStatistics::count(value)};
+            // Guard against 0 / 0, which yields NaN: if CBasicStatistics::count(value)
+            // is zero then count must be zero too, so the equality test covers it.
+            double scale{count == CBasicStatistics::count(value)
+                             ? 1.0
+                             : count / CBasicStatistics::count(value)};
             newValues.push_back(CBasicStatistics::scaled(value, scale));
             newCentres.push_back(CTools::truncate(CBasicStatistics::mean(centre), yl, yr));
             newLargeErrorCounts.push_back(largeErrorCount);
diff --git a/lib/maths/CTimeSeriesDecomposition.cc b/lib/maths/CTimeSeriesDecomposition.cc
index 9480e00d76..9224444004 100644
--- a/lib/maths/CTimeSeriesDecomposition.cc
+++ b/lib/maths/CTimeSeriesDecomposition.cc
@@ -419,7 +419,7 @@ TDoubleDoublePr CTimeSeriesDecomposition::scale(core_t::TTime time,
     }
 
     double mean{this->meanVariance()};
-    if (mean == 0.0) {
+    if (mean == 0.0 || variance == 0.0) {
         return {1.0, 1.0};
     }