From d48ee89ab3c93cb25b50d26fb0eb6183f63a10e7 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Wed, 3 Jun 2020 14:27:45 +0800 Subject: [PATCH 01/15] create class PeriodDetectUtils --- src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs diff --git a/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs b/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs new file mode 100644 index 0000000000..b7ba59fe8d --- /dev/null +++ b/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.TimeSeries +{ + internal class PeriodDetectUtils + { + } +} From 34567ba8391b8261d11f13aaff1f6fad616d3041 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Wed, 3 Jun 2020 14:50:42 +0800 Subject: [PATCH 02/15] Test period detect --- .../PeriodDetectUtils.cs | 279 +++++++++++++++++- .../TimeSeriesDirectApi.cs | 6 + 2 files changed, 283 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs b/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs index b7ba59fe8d..8e384224e3 100644 --- a/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs +++ b/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs @@ -4,11 +4,286 @@ using System; using System.Collections.Generic; -using System.Text; +using System.Linq; +using System.Numerics; +using Microsoft.ML.Transforms.TimeSeries; namespace Microsoft.ML.TimeSeries { - internal class PeriodDetectUtils + /// + /// this class is used to detect the periodicity automatically + /// + public class PeriodDetectUtils { + /// + /// the minimum value that is considered as a valid period. + /// + private const int MinPeriod = 4; + + /// + /// in practice, the max lag very rarely exceed 365, which lacks of strong interpretation, and which also brings performance overhead. + /// + private const int MaxLag = 400; + + /// + /// suppose the length of time series is 651, now we found an period is 128, then 651/128 = 5, which means there are at most 5 recurrent period. this is too small, the significance build upon this is not trustable. + /// + private const int MinRecurrentCount = 8; + + /// + /// when input time series is with very close values (i.e., different is smaller than E-20), the accuracy of double could distort the + /// final trend signal. any seasonal signal under such circumstance becomes unreliable. + /// so use this threshold to eliminate such kind of time series. here set to 1e-10 is for conservative consideration. + /// + private const double MinEnergyThreshold = 1e-10; + + public static int DetectSeasonality(double[] y) + { + int length = y.Length; + int newLength = Get2Power(y.Length); + double[] fftRe = new double[newLength]; + double[] fftIm = new double[newLength]; + double[] inputRe = new double[newLength]; + + double mean = 0; + double std = 0; + + foreach (double value in y) + mean += value; + mean /= length; + + for (int i = 0; i < length; ++i) + { + inputRe[i] = y[i] - mean; + std += inputRe[i] * inputRe[i]; + } + if (std / length < 1e-8) + { + return -1; + } + + for (int i = length; i < newLength; ++i) + { + inputRe[i] = 0; + } + + FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0.0, newLength).ToArray(), fftRe, fftIm, newLength); + + var z = fftRe.Select((m, i) => new Complex(m, fftIm[i])).ToArray(); + var w = z.Select((t, i) => t * Complex.Conjugate(t)).ToArray(); + FindBestTwoFrequencies(w, length, out var bestFreq, out var secondFreq); + + double[] ifftRe = new double[newLength]; + double[] ifftIm = new double[newLength]; + FftUtils.ComputeBackwardFft( + w.Select(t => (double)t.Real).ToArray(), + w.Select(t => (double)t.Imaginary).ToArray(), ifftRe, ifftIm, newLength); + var r = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); + int period = FindTruePeriod(r, bestFreq, secondFreq, newLength); + + if (period < MinPeriod) + { + period = -1; + } + + return period; + } + + private static int FindTruePeriod(Complex[] r, int bestFreq, int secondFreq, int timeSeriesLength) + { + int firstPeriod = -1; + int secondPeriod = -1; + double firstTimeDomainEnergy = -1; + double secondTimeDomainEnergy = -1; + firstPeriod = FindBestPeriod(r, bestFreq, timeSeriesLength, out firstTimeDomainEnergy); + if (secondFreq != -1) + { + secondPeriod = FindBestPeriod(r, secondFreq, timeSeriesLength, out secondTimeDomainEnergy); + } + if (firstPeriod == -1 && secondPeriod == -1) + return -1; + int truePeriod; + double trueTimeDomainEnergy; + if (firstPeriod == -1) + { + truePeriod = secondPeriod; + trueTimeDomainEnergy = secondTimeDomainEnergy; + } + else if (secondPeriod == -1) + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + else + { + if (firstPeriod == secondPeriod) + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + else + { + // hueristic: if the second frequency is with somewhat higher energy in time domain, we think it is a better candidate + if (secondTimeDomainEnergy > firstTimeDomainEnergy * 1.05) + { + truePeriod = secondPeriod; + trueTimeDomainEnergy = secondTimeDomainEnergy; + } + else + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + } + } + trueTimeDomainEnergy /= r[0].Real; + + // this is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf + // actually, 1.96 is for the 2-sigma, which has 95% statistical confidence. 2.58 is for 99% confidence, 2.85 for 99.5% confidence + /* increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, + * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); + */ + + double threshold = 4 / Math.Sqrt(timeSeriesLength); + + if (trueTimeDomainEnergy < threshold || r[truePeriod].Real < MinEnergyThreshold) + return -1; + + return truePeriod; + } + + /// + /// in order to pick up a proper frequency robustly (this is useful especially for large frequency, or small period, e.g., period = 2), + /// this method aims to pick up the top two frequencies for further evaluation. + /// of course, the energy of the second frequency (in frequency domain) must be at similar magnitude compared with the energy of the first + /// frequency. + /// + /// the energy list in the frequency domain, the index is the frequency. + /// the original time series length + /// the frequency with highest energy + /// the frequency with second highest energy + private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, out int bestFreq, out int secondFreq) + { + bestFreq = -1; + double bestEnergy = -1.0; + secondFreq = -1; + double secondEnergy = -1.0; + + if (w.Length < 2) + return; + + List energies = new List(); + + /* length of time series divided by frequency is period. it is obvious that the period should be larger than 1 and smaller than the total length, and is an integer */ + for (int i = w.Length / timeSeriesLength; i < w.Length / 2 + 1; i++) + { + double nextWeight = w[i].Magnitude; + energies.Add(nextWeight); + + if (nextWeight > bestEnergy) + { + bestEnergy = nextWeight; + bestFreq = i; + } + } + + // once we found a best frequency, the region formed by lower bound to upper bound corresponding to this frequency will not be inspected anymore. because they all share the same period. + int period = w.Length / bestFreq; + double lowerBound = w.Length * 1.0 / (period + 1); + double upperBound = w.Length * 1.0 / (period - 1); + + for (int i = w.Length / timeSeriesLength; i < w.Length / 2 + 1; i++) + { + if (i > lowerBound && i < upperBound) + continue; + double weight = w[i].Magnitude; + if (weight > secondEnergy) + { + double prevWeight = 0; + if (i > 0) + prevWeight = w[i - 1].Magnitude; + double nextWeight = 0; + if (i < w.Length - 1) + nextWeight = w[i + 1].Magnitude; + + // should be a local maximum + if (weight >= prevWeight && weight >= nextWeight) + { + secondEnergy = nextWeight; + secondFreq = i; + } + } + } + double typycalEnergy = MathUtils.QuickMedian(energies); + + // the second energy must be at least significantly large enough than typical energies, and also similar to best energy at magnitude level. + if (typycalEnergy * 6.0 < secondEnergy && secondEnergy * 10.0 > bestEnergy) + return; + + // set the second frequency to -1, since it is obviously not strong enought to compete with the best energy. + secondFreq = -1; + } + + /// + /// given a frequency F represented by an integer, we aim to find the best period by inspecting the auto-correlation function in time domain. + /// since either frequency or the period is an integer, so the possible period located within + /// [N/(F+1), N/(F-1)], we need to check this domain, and pick the best one. where N is the length of the augmented time series + /// + /// the auto-correlation function of the augmented time series + /// the input frequency candidate + /// the length of the original time series, this is used for post processing to reduce false positive + /// output the energy on the auto-correlation function + /// return the best period estimated + private static int FindBestPeriod(Complex[] r, int frequency, int timeSeriesLength, out double energy) + { + energy = -1; + + // this will never make sense of a seasonal signal + if (frequency <= 1) + return -1; + + int lowerBound = r.Length / (frequency + 1); + int upperBound = r.Length / (frequency - 1); + int bestPeriod = -1; + for (int i = lowerBound; i <= upperBound && i < r.Length; i++) + { + var currentEnergy = r[i].Real; + if (currentEnergy > energy) + { + energy = currentEnergy; + bestPeriod = i; + } + } + + /* condition1: period does not make sense, since the corresponding zone in the auto-correlation energy list are all negative. + * condition2: for real dataset, we do not think there will exist such long period. this is used to reduce false-positive + * condition3: the number of repeats under this period is too few. this is used to reduce false-positive + */ + if (bestPeriod <= 1 || bestPeriod > MaxLag || timeSeriesLength < MinRecurrentCount * bestPeriod) + { + energy = -1; + return -1; + } + return bestPeriod; + } + + /// + /// get the smallest 2^k which is equal or greater than n + /// + private static int Get2Power(int n) + { + int result = 1; + bool meet1 = false; // check is n is just equals to 2^k for some k + while (n > 1) + { + if ((n & 1) != 0) + meet1 = true; + result <<= 1; + n >>= 1; + } + if (meet1) + result <<= 1; + return result; + } } } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index d25b614d37..b4a499b185 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -408,6 +408,12 @@ public void SsaForecast() } + [Fact] + public void TestSeasonality() + { + + } + [Fact] public void SsaForecastPredictionEngine() { From a7b3bd4fceb608b9863be0cd6886c11169b01fc5 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Wed, 3 Jun 2020 14:51:20 +0800 Subject: [PATCH 03/15] math utils --- src/Microsoft.ML.TimeSeries/MathUtils.cs | 117 +++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 src/Microsoft.ML.TimeSeries/MathUtils.cs diff --git a/src/Microsoft.ML.TimeSeries/MathUtils.cs b/src/Microsoft.ML.TimeSeries/MathUtils.cs new file mode 100644 index 0000000000..df118eea09 --- /dev/null +++ b/src/Microsoft.ML.TimeSeries/MathUtils.cs @@ -0,0 +1,117 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.TimeSeries +{ + internal class MathUtils + { + /// + /// use quick-sort like method to obtain the median value. + /// the complexity in expectation is O(n), which is faster than using quickSort. + /// + /// the input list of values. note that this list will be modified after calling this method + /// returns the median value + public static double QuickMedian(List values) + { + if (values == null || values.Count == 0) + return double.NaN; + + // here the third parameter is start from 1. so we need to plus 1 to compliant. + return QuickSelect(values, values.Count / 2 + 1); + } + + public static double QuickSelect(IReadOnlyList values, int k) + { + var nums = values; + double[] left = new double[values.Count]; + double[] right = new double[values.Count]; + int numsCount = nums.Count; + + while (true) + { + if (numsCount == 1) + return nums[0]; + + int idx = FindMedianIndex(nums, 0, numsCount - 1); + double key = nums[idx]; + + int leftIdx = 0; + int rightIdx = 0; + for (int i = 0; i < numsCount; i++) + { + if (i == idx) + continue; + + if (nums[i] < key) + left[leftIdx++] = nums[i]; + else + right[rightIdx++] = nums[i]; + } + + if (leftIdx == k - 1) + return key; + + if (leftIdx >= k) + { + nums = left; + numsCount = leftIdx; + } + else + { + nums = right; + k = k - leftIdx - 1; + numsCount = rightIdx; + } + } + } + + public static int FindMedianIndex(IReadOnlyList values, int start, int end) + { + // use the middle value among first/middle/end as the guard value, to make sure the average performance good. + // according to unit test, this fix will improve the average performance 10%. and works normally when input list is ordered. + double first = values[start]; + double last = values[end]; + int midIndex = (start + end) / 2; + int medianIndex = -1; + double middleValue = values[midIndex]; + if (first < last) + { + if (middleValue > last) + { + // last is the middle value + medianIndex = end; + } + else if (middleValue > first) + { + // middleValue is the middle value + medianIndex = midIndex; + } + else + { + // first is the middle value + medianIndex = start; + } + } + else + { + if (middleValue > first) + { + // first is the middle value + medianIndex = start; + } + else if (middleValue < last) + { + // last is the middle value + medianIndex = end; + } + else + { + // middleValue is the middle value + medianIndex = midIndex; + } + } + return medianIndex; + } + } +} From 88d8824a14976ef02ebe444c3c1afe0a4eeed286 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Wed, 3 Jun 2020 15:15:27 +0800 Subject: [PATCH 04/15] restore file --- test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index b4a499b185..d25b614d37 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -408,12 +408,6 @@ public void SsaForecast() } - [Fact] - public void TestSeasonality() - { - - } - [Fact] public void SsaForecastPredictionEngine() { From 614695cc97da44d6fdbfcd6532ee78ecf1c6092d Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Wed, 3 Jun 2020 16:17:08 +0800 Subject: [PATCH 05/15] update license --- src/Microsoft.ML.TimeSeries/MathUtils.cs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.TimeSeries/MathUtils.cs b/src/Microsoft.ML.TimeSeries/MathUtils.cs index df118eea09..312d73aa9b 100644 --- a/src/Microsoft.ML.TimeSeries/MathUtils.cs +++ b/src/Microsoft.ML.TimeSeries/MathUtils.cs @@ -1,4 +1,8 @@ -using System; +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; using System.Collections.Generic; using System.Text; From 9f8281a359d9bbbc96e0cc5ddcf41888db3940ca Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Thu, 11 Jun 2020 09:06:12 -0700 Subject: [PATCH 06/15] 1. Add DetectSeasonality as a helper method in ExtensionsCatalog, 2. Remove the MathUtils and use MedianDblAggregator (make it BestFriend) 3. Add Unit Tests --- .../TimeSeries/DetectSeasonality.cs | 46 +++++ .../Transforms/NormalizeColumnDbl.cs | 3 +- .../ExtensionsCatalog.cs | 22 +++ src/Microsoft.ML.TimeSeries/MathUtils.cs | 121 ------------ ...dDetectUtils.cs => SeasonalityDetector.cs} | 172 ++++++++++-------- .../TimeSeriesDirectApi.cs | 28 ++- 6 files changed, 190 insertions(+), 202 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs delete mode 100644 src/Microsoft.ML.TimeSeries/MathUtils.cs rename src/Microsoft.ML.TimeSeries/{PeriodDetectUtils.cs => SeasonalityDetector.cs} (59%) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs new file mode 100644 index 0000000000..6c2fb1b8a6 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.TimeSeries; + +namespace Samples.Dynamic +{ + public static class DetectSeasonality + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for + // exception tracking and logging, as well as the source of randomness. + var mlContext = new MLContext(); + + var dataView = mlContext.Data.LoadFromEnumerable(GetPointsWithSeasonality()); + // Create a seasonal data as input for DetectSeasonality. + var input = GetPointsWithSeasonality(); + SeasonalityDetector seasonalityDetector = new SeasonalityDetector(); + int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, "Input"); + + // Print the Seasonality Period result. + Console.WriteLine($"Seasonality Period: #{period}"); + } + + private static IEnumerable GetPointsWithSeasonality() + { + return new List() {18.004, 87.401, 87.411, 18.088, 18.017, 87.759, 33.996, 18.043, 87.853, 18.364, 18.004, 86.992, 87.555, + 18.088, 18.029, 87.906, 87.471, 18.039, 18.099, 87.403, 18.030, 72.991, 87.804, 18.381, 18.016, 87.145, 87.771, + 18.029, 18.084, 87.976, 34.913, 18.064, 18.302, 87.723, 18.001, 86.401, 87.344, 18.295, 18.002, 87.793, + 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesData(t)); + } + + private class TimeSeriesData + { + public double Value; + + public TimeSeriesData(double value) + { + Value = value; + } + } + + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs index cd1d6a8b60..7a5d42518c 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs @@ -597,6 +597,7 @@ internal static void GetMedianSoFar(in double num, ref double median, ref MaxHea /// It tracks median values of non-sparse values (vCount). /// NaNs are ignored when updating min and max. /// + [BestFriend] internal sealed class MedianDblAggregator : IColumnAggregator { private MedianAggregatorUtils.MaxHeap _belowMedianHeap; @@ -1213,7 +1214,7 @@ private void GetResult(ref TFloat input, ref TFloat value) } public override NormalizingTransformer.NormalizerModelParametersBase GetNormalizerModelParams() - => new NormalizingTransformer.BinNormalizerModelParameters(ImmutableArray.Create(_binUpperBounds), _den,_offset); + => new NormalizingTransformer.BinNormalizerModelParameters(ImmutableArray.Create(_binUpperBounds), _den, _offset); } public sealed class ImplVec : BinColumnFunction diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index 161f8bc27b..8f645dffe0 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -210,6 +210,28 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, return dst; } + /// + /// Obtain the period by adopting techniques of spectral analysis. which is founded by + /// the fourier analysis. returns -1 means there's no significant period. otherwise, a period + /// is returned. + /// + /// The detect seasonality catalog. + /// Input DataView.The data is an instance of . + /// Name of column to process. The column data must be . + /// An upper bound on the largest relevant seasonality in the input time-series. + /// When set to -1, use the whole input to fit model, when set to a positive integer, use this number as batch size. + /// Default value is -1. + /// The detected period if seasonality period exists, otherwise return -1. + /// + /// + /// + /// + /// + public static int DetectSeasonality(this AnomalyDetectionCatalog catalog, IDataView input, string inputColumnName, int seasonalityWindowSize = -1) + => new SeasonalityDetector().DetectSeasonality(CatalogUtils.GetEnvironment(catalog), input, inputColumnName, seasonalityWindowSize); + private static void CheckRootCauseInput(IHostEnvironment host, RootCauseLocalizationInput src) { host.CheckUserArg(src.Slices.Count >= 1, nameof(src.Slices), "Must has more than one item"); diff --git a/src/Microsoft.ML.TimeSeries/MathUtils.cs b/src/Microsoft.ML.TimeSeries/MathUtils.cs deleted file mode 100644 index 312d73aa9b..0000000000 --- a/src/Microsoft.ML.TimeSeries/MathUtils.cs +++ /dev/null @@ -1,121 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace Microsoft.ML.TimeSeries -{ - internal class MathUtils - { - /// - /// use quick-sort like method to obtain the median value. - /// the complexity in expectation is O(n), which is faster than using quickSort. - /// - /// the input list of values. note that this list will be modified after calling this method - /// returns the median value - public static double QuickMedian(List values) - { - if (values == null || values.Count == 0) - return double.NaN; - - // here the third parameter is start from 1. so we need to plus 1 to compliant. - return QuickSelect(values, values.Count / 2 + 1); - } - - public static double QuickSelect(IReadOnlyList values, int k) - { - var nums = values; - double[] left = new double[values.Count]; - double[] right = new double[values.Count]; - int numsCount = nums.Count; - - while (true) - { - if (numsCount == 1) - return nums[0]; - - int idx = FindMedianIndex(nums, 0, numsCount - 1); - double key = nums[idx]; - - int leftIdx = 0; - int rightIdx = 0; - for (int i = 0; i < numsCount; i++) - { - if (i == idx) - continue; - - if (nums[i] < key) - left[leftIdx++] = nums[i]; - else - right[rightIdx++] = nums[i]; - } - - if (leftIdx == k - 1) - return key; - - if (leftIdx >= k) - { - nums = left; - numsCount = leftIdx; - } - else - { - nums = right; - k = k - leftIdx - 1; - numsCount = rightIdx; - } - } - } - - public static int FindMedianIndex(IReadOnlyList values, int start, int end) - { - // use the middle value among first/middle/end as the guard value, to make sure the average performance good. - // according to unit test, this fix will improve the average performance 10%. and works normally when input list is ordered. - double first = values[start]; - double last = values[end]; - int midIndex = (start + end) / 2; - int medianIndex = -1; - double middleValue = values[midIndex]; - if (first < last) - { - if (middleValue > last) - { - // last is the middle value - medianIndex = end; - } - else if (middleValue > first) - { - // middleValue is the middle value - medianIndex = midIndex; - } - else - { - // first is the middle value - medianIndex = start; - } - } - else - { - if (middleValue > first) - { - // first is the middle value - medianIndex = start; - } - else if (middleValue < last) - { - // last is the middle value - medianIndex = end; - } - else - { - // middleValue is the middle value - medianIndex = midIndex; - } - } - return medianIndex; - } - } -} diff --git a/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs similarity index 59% rename from src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs rename to src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 8e384224e3..912806d6db 100644 --- a/src/Microsoft.ML.TimeSeries/PeriodDetectUtils.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -6,82 +6,95 @@ using System.Collections.Generic; using System.Linq; using System.Numerics; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.TimeSeries; namespace Microsoft.ML.TimeSeries { /// - /// this class is used to detect the periodicity automatically + /// This class is used to detect the periodicity. /// - public class PeriodDetectUtils + public class SeasonalityDetector { /// - /// the minimum value that is considered as a valid period. - /// - private const int MinPeriod = 4; - - /// - /// in practice, the max lag very rarely exceed 365, which lacks of strong interpretation, and which also brings performance overhead. + /// In practice, the max lag very rarely exceed 365, which lacks of strong interpretation, and which also brings performance overhead. /// private const int MaxLag = 400; /// - /// suppose the length of time series is 651, now we found an period is 128, then 651/128 = 5, which means there are at most 5 recurrent period. this is too small, the significance build upon this is not trustable. + /// Suppose the length of time series is 651, now we found an period is 128, then 651/128 = 5, which means there are at most 5 recurrent period. this is too small, the significance build upon this is not trustable. /// private const int MinRecurrentCount = 8; /// - /// when input time series is with very close values (i.e., different is smaller than E-20), the accuracy of double could distort the - /// final trend signal. any seasonal signal under such circumstance becomes unreliable. - /// so use this threshold to eliminate such kind of time series. here set to 1e-10 is for conservative consideration. + /// When input time series is with very close values (i.e., different is smaller than E-20), the accuracy of double could distort the + /// final trend signal. Any seasonal signal under such circumstance becomes unreliable. + /// So use this threshold to eliminate such kind of time series. Here set to 1e-10 is for conservative consideration. /// private const double MinEnergyThreshold = 1e-10; - public static int DetectSeasonality(double[] y) + /// + /// Obtain the period by adopting techniques of spectral analysis. which is founded by + /// the fourier analysis. returns -1 means there's no significant period. otherwise, a period + /// is returned. + /// + /// The detect seasonality host environment. + /// Input DataView.The data is an instance of . + /// Name of column to process. The column data must be . + /// An upper bound on the largest relevant seasonality in the input time-series. + /// When set to -1, use the whole input to fit model, when set to a positive integer, use this number as batch size. + /// Default value is -1. + /// The detected period if seasonality period exists, otherwise return -1. + public int DetectSeasonality(IHostEnvironment host, IDataView input, string inputColumnName, int seasonalityWindowSize) { - int length = y.Length; - int newLength = Get2Power(y.Length); - double[] fftRe = new double[newLength]; - double[] fftIm = new double[newLength]; - double[] inputRe = new double[newLength]; + host.CheckValue(input, nameof(input)); + host.CheckValue(inputColumnName, nameof(inputColumnName)); + host.CheckUserArg(seasonalityWindowSize == -1 || seasonalityWindowSize >= 0, nameof(seasonalityWindowSize)); - double mean = 0; - double std = 0; + var column = input.Schema.GetColumnOrNull(inputColumnName); + host.CheckUserArg(column.HasValue, nameof(inputColumnName)); - foreach (double value in y) - mean += value; - mean /= length; + var rowCursor = input.GetRowCursor(new List() { column.Value }); + var valueDelegate = rowCursor.GetGetter(column.Value); - for (int i = 0; i < length; ++i) - { - inputRe[i] = y[i] - mean; - std += inputRe[i] * inputRe[i]; - } - if (std / length < 1e-8) + int length = 0; + float valueRef = 0; + List valueCache = new List(seasonalityWindowSize == -1 ? 1024 : seasonalityWindowSize); + while (rowCursor.MoveNext()) { - return -1; + valueDelegate.Invoke(ref valueRef); + length++; + valueCache.Add(valueRef); + if (seasonalityWindowSize != -1 && length >= seasonalityWindowSize) + break; } - for (int i = length; i < newLength; ++i) - { - inputRe[i] = 0; - } + float[] fftRe = new float[length]; + float[] fftIm = new float[length]; + float[] inputRe = valueCache.ToArray(); - FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0.0, newLength).ToArray(), fftRe, fftIm, newLength); + FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0f, length).ToArray(), fftRe, fftIm, length); var z = fftRe.Select((m, i) => new Complex(m, fftIm[i])).ToArray(); + + /* Here w is the periodogram, which indicates the square of "energy" on the frequency domain. + * Specifically, w[j] = a[j]^2+b[j]^2, where a and b are Fourier Coefficients for cosine and sine, + * x(t) = a0+sum(a[j]cos(2Pi * f[j]t)+b[j]sin(2Pi * f[j]t) + */ var w = z.Select((t, i) => t * Complex.Conjugate(t)).ToArray(); FindBestTwoFrequencies(w, length, out var bestFreq, out var secondFreq); - double[] ifftRe = new double[newLength]; - double[] ifftIm = new double[newLength]; + double[] ifftRe = new double[length]; + double[] ifftIm = new double[length]; FftUtils.ComputeBackwardFft( w.Select(t => (double)t.Real).ToArray(), - w.Select(t => (double)t.Imaginary).ToArray(), ifftRe, ifftIm, newLength); + w.Select(t => (double)t.Imaginary).ToArray(), ifftRe, ifftIm, length); var r = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); - int period = FindTruePeriod(r, bestFreq, secondFreq, newLength); - if (period < MinPeriod) + int period = FindActualPeriod(r, bestFreq, secondFreq, length); + + if (period < 0) { period = -1; } @@ -89,7 +102,19 @@ public static int DetectSeasonality(double[] y) return period; } - private static int FindTruePeriod(Complex[] r, int bestFreq, int secondFreq, int timeSeriesLength) + /// + /// Find the actual period following the steps: + /// Pick the best frequency by inspecting the auto-correlation energy (pick the highest) in time-domain. + /// In the normal case, usually, when the time series is with period T, then the best frequency is N/T, + /// while the second frequency would be N/2T, because period = T implies period = nT, where n is an integer. + /// In such a case, smaller period will win out on the autu-correlation energy list, due to the property of auto-correlation. + /// + /// + /// + /// + /// + /// + private static int FindActualPeriod(Complex[] r, int bestFreq, int secondFreq, int timeSeriesLength) { int firstPeriod = -1; int secondPeriod = -1; @@ -138,12 +163,11 @@ private static int FindTruePeriod(Complex[] r, int bestFreq, int secondFreq, int } trueTimeDomainEnergy /= r[0].Real; - // this is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf - // actually, 1.96 is for the 2-sigma, which has 95% statistical confidence. 2.58 is for 99% confidence, 2.85 for 99.5% confidence - /* increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, + /* This is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf + * 1.96 is for the 2-sigma, which has 95% statistical confidence. 2.58 is for 99% confidence, 2.85 for 99.5% confidence + * increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); */ - double threshold = 4 / Math.Sqrt(timeSeriesLength); if (trueTimeDomainEnergy < threshold || r[truePeriod].Real < MinEnergyThreshold) @@ -153,7 +177,7 @@ private static int FindTruePeriod(Complex[] r, int bestFreq, int secondFreq, int } /// - /// in order to pick up a proper frequency robustly (this is useful especially for large frequency, or small period, e.g., period = 2), + /// In order to pick up a proper frequency robustly (this is useful especially for large frequency, or small period, e.g., period = 2), /// this method aims to pick up the top two frequencies for further evaluation. /// of course, the energy of the second frequency (in frequency domain) must be at similar magnitude compared with the energy of the first /// frequency. @@ -174,7 +198,9 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou List energies = new List(); - /* length of time series divided by frequency is period. it is obvious that the period should be larger than 1 and smaller than the total length, and is an integer */ + /* Length of time series divided by frequency is period. + * It is obvious that the period should be larger than 1 and smaller than the total length, and is an integer. + */ for (int i = w.Length / timeSeriesLength; i < w.Length / 2 + 1; i++) { double nextWeight = w[i].Magnitude; @@ -187,7 +213,9 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou } } - // once we found a best frequency, the region formed by lower bound to upper bound corresponding to this frequency will not be inspected anymore. because they all share the same period. + /* Once we found a best frequency, the region formed by lower bound to upper bound corresponding to this frequency + * will not be inspected anymore. because they all share the same period. + */ int period = w.Length / bestFreq; double lowerBound = w.Length * 1.0 / (period + 1); double upperBound = w.Length * 1.0 / (period - 1); @@ -214,9 +242,18 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou } } } - double typycalEnergy = MathUtils.QuickMedian(energies); - // the second energy must be at least significantly large enough than typical energies, and also similar to best energy at magnitude level. + var medianAggregator = new MedianDblAggregator(energies.Count); + foreach (var value in energies) + { + medianAggregator.ProcessValue(value); + } + + var typycalEnergy = medianAggregator.Median; + + /* The second energy must be at least significantly large enough than typical energies, + * and also similar to best energy at magnitude level. + */ if (typycalEnergy * 6.0 < secondEnergy && secondEnergy * 10.0 > bestEnergy) return; @@ -225,15 +262,15 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou } /// - /// given a frequency F represented by an integer, we aim to find the best period by inspecting the auto-correlation function in time domain. - /// since either frequency or the period is an integer, so the possible period located within - /// [N/(F+1), N/(F-1)], we need to check this domain, and pick the best one. where N is the length of the augmented time series + /// Given a frequency F represented by an integer, we aim to find the best period by inspecting the auto-correlation function in time domain. + /// Since either frequency or the period is an integer, the possible period located within + /// [N/(F+1), N/(F-1)], we need to check this domain, and pick the best one, where N is the length of the augmented time series /// - /// the auto-correlation function of the augmented time series - /// the input frequency candidate - /// the length of the original time series, this is used for post processing to reduce false positive - /// output the energy on the auto-correlation function - /// return the best period estimated + /// The auto-correlation function of the augmented time series + /// The input frequency candidate + /// The length of the original time series, this is used for post processing to reduce false positive + /// Output the energy on the auto-correlation function + /// The best period estimated private static int FindBestPeriod(Complex[] r, int frequency, int timeSeriesLength, out double energy) { energy = -1; @@ -266,24 +303,5 @@ private static int FindBestPeriod(Complex[] r, int frequency, int timeSeriesLeng } return bestPeriod; } - - /// - /// get the smallest 2^k which is equal or greater than n - /// - private static int Get2Power(int n) - { - int result = 1; - bool meet1 = false; // check is n is just equals to 2^k for some k - while (n > 1) - { - if ((n & 1) != 0) - meet1 = true; - result <<= 1; - n >>= 1; - } - if (meet1) - result <<= 1; - return result; - } } } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index d25b614d37..fb032f7ec4 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Microsoft.ML.TimeSeries; @@ -527,10 +528,10 @@ public void AnomalyDetectionWithSrCnn(bool loadDataFromFile) { var ml = new MLContext(1); IDataView dataView; - if(loadDataFromFile) + if (loadDataFromFile) { var dataPath = GetDataPath(Path.Combine("Timeseries", "anomaly_detection.csv")); - + // Load data from file into the dataView dataView = ml.Data.LoadFromTextFile(dataPath, new[] { new TextLoader.Column("Value", DataKind.Single, 0), @@ -596,7 +597,7 @@ public void TestSrCnnBatchAnomalyDetector( var data = new List(); for (int index = 0; index < 20; index++) { - data.Add(new TimeSeriesDataDouble { Value = 5 } ); + data.Add(new TimeSeriesDataDouble { Value = 5 }); } data.Add(new TimeSeriesDataDouble { Value = 10 }); for (int index = 0; index < 5; index++) @@ -688,6 +689,27 @@ public void RootCauseLocalization() } } + [Theory] + [InlineData(-1, 3)] + [InlineData(40, 3)] + [InlineData(20, -1)] + public void TestDetectSeasonality(int seasonalityWindowSize, int expectedPeriod) + { + // Create a detect seasonality input + var input = new List() {18.004, 87.401, 87.411, 18.088, 18.017, 87.759, 33.996, 18.043, 87.853, 18.364, 18.004, 86.992, 87.555, + 18.088, 18.029, 87.906, 87.471, 18.039, 18.099, 87.403, 18.030, 72.991, 87.804, 18.381, 18.016, 87.145, 87.771, + 18.029, 18.084, 87.976, 34.913, 18.064, 18.302, 87.723, 18.001, 86.401, 87.344, 18.295, 18.002, 87.793, + 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesData((float)t)); + + var mlContext = new MLContext(); + + var dataView = mlContext.Data.LoadFromEnumerable(input); + SeasonalityDetector seasonalityDetector = new SeasonalityDetector(); + + int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, "Value", seasonalityWindowSize); + Assert.Equal(expectedPeriod, period); + } + private static List GetRootCauseLocalizationPoints() { List points = new List(); From 2b982458e928c80e3ccd8c5f53560f165afecafc Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Thu, 11 Jun 2020 09:54:25 -0700 Subject: [PATCH 07/15] Change SeasonalityDetector to be internal class --- src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 912806d6db..0feebae478 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -15,7 +15,7 @@ namespace Microsoft.ML.TimeSeries /// /// This class is used to detect the periodicity. /// - public class SeasonalityDetector + internal class SeasonalityDetector { /// /// In practice, the max lag very rarely exceed 365, which lacks of strong interpretation, and which also brings performance overhead. From 177502ef9b2e08352d4dac0247d7892d556bd736 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Fri, 12 Jun 2020 12:25:37 -0700 Subject: [PATCH 08/15] 1. Introduce randomnessThreshold as an optional parameter 2. Update comments and polish SeasonalityDetector for readability. --- .../TimeSeries/DetectSeasonality.cs | 22 ++- .../ExtensionsCatalog.cs | 45 ++++- .../SeasonalityDetector.cs | 160 ++++++++++-------- 3 files changed, 137 insertions(+), 90 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs index 6c2fb1b8a6..547ed40f09 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs @@ -10,15 +10,25 @@ public static class DetectSeasonality { public static void Example() { - // Create a new ML context, for ML.NET operations. It can be used for - // exception tracking and logging, as well as the source of randomness. + /* Create a new ML context, for ML.NET operations. It can be used for + exception tracking and logging, as well as the source of randomness.*/ var mlContext = new MLContext(); - var dataView = mlContext.Data.LoadFromEnumerable(GetPointsWithSeasonality()); // Create a seasonal data as input for DetectSeasonality. - var input = GetPointsWithSeasonality(); - SeasonalityDetector seasonalityDetector = new SeasonalityDetector(); - int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, "Input"); + var dataView = mlContext.Data.LoadFromEnumerable(GetPointsWithSeasonality()); + + /* Two option parameters: + * seasonalityWindowSize: Default value is -1. When set to -1, use the whole input to fit model; + * when set to a positive integer, only the first windowSize number of values will be considered. + * randomnessThreshold: Randomness threashold that specifies how confidence the input values follows + * a predictable pattern recurring over an interval as seasonal data. By default, it is set as 2.81 + * for 99.5% confidence. The higher the threshold is set, the more strict recurring pattern the + * input values should follow to be determined as seasonal data. + */ + int period = mlContext.AnomalyDetection.DetectSeasonality( + dataView, + nameof(TimeSeriesData.Value), + seasonalityWindowSize: 40); // Print the Seasonality Period result. Console.WriteLine($"Seasonality Period: #{period}"); diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index 8f645dffe0..e841b93fdc 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -211,17 +211,34 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, } /// - /// Obtain the period by adopting techniques of spectral analysis. which is founded by - /// the fourier analysis. returns -1 means there's no significant period. otherwise, a period - /// is returned. + /// + /// In time series data, seasonality (or periodicity) is the presence of variations that occur at specific regular intervals, + /// such as weekly, monthly, or quarterly. + /// + /// + /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. + /// Assuming the input values have the same time interval (e.g., sensor data collected at every second ordered by timestamps), + /// this method takes a list of time-series data, and returns the regular period for the input seasonal data, + /// if a predictable fluctuation or pattern can be found that recurs or repeats over this period throughout the input values. + /// + /// + /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. + /// /// /// The detect seasonality catalog. /// Input DataView.The data is an instance of . /// Name of column to process. The column data must be . - /// An upper bound on the largest relevant seasonality in the input time-series. - /// When set to -1, use the whole input to fit model, when set to a positive integer, use this number as batch size. - /// Default value is -1. - /// The detected period if seasonality period exists, otherwise return -1. + /// An upper bound on the number of values to be considered in the input values. + /// When set to -1, use the whole input to fit model; when set to a positive integer, only the first windowSize number + /// of values will be considered. Default value is -1. + /// Randomness flutuation threashold + /// that specifies how confidence the input values follows a predictable pattern recurring over an interval as seasonal data. + /// By default, it is set as 2.81: two-sided t-distribution + /// for 99.5% confidence interval with infinit degree of freedom. + /// The higher the threshold is set, the more strict recurring pattern the input values should follow to be determined as + /// seasonal data. + /// + /// The regular interval for the input as seasonal data, otherwise return -1. /// /// /// /// /// - public static int DetectSeasonality(this AnomalyDetectionCatalog catalog, IDataView input, string inputColumnName, int seasonalityWindowSize = -1) - => new SeasonalityDetector().DetectSeasonality(CatalogUtils.GetEnvironment(catalog), input, inputColumnName, seasonalityWindowSize); + public static int DetectSeasonality( + this AnomalyDetectionCatalog catalog, + IDataView input, + string inputColumnName, + int seasonalityWindowSize = -1, + double randomnessThreshold = 2.81) + => new SeasonalityDetector().DetectSeasonality( + CatalogUtils.GetEnvironment(catalog), + input, + inputColumnName, + seasonalityWindowSize, + randomnessThreshold); private static void CheckRootCauseInput(IHostEnvironment host, RootCauseLocalizationInput src) { diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 0feebae478..7a62b368a5 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -35,18 +35,26 @@ internal class SeasonalityDetector private const double MinEnergyThreshold = 1e-10; /// - /// Obtain the period by adopting techniques of spectral analysis. which is founded by - /// the fourier analysis. returns -1 means there's no significant period. otherwise, a period - /// is returned. + /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. + /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. /// /// The detect seasonality host environment. /// Input DataView.The data is an instance of . /// Name of column to process. The column data must be . - /// An upper bound on the largest relevant seasonality in the input time-series. - /// When set to -1, use the whole input to fit model, when set to a positive integer, use this number as batch size. + /// An upper bound on the number of values to be considered in the input values. + /// When set to -1, use the whole input to fit model; when set to a positive integer, use this number as batch size. /// Default value is -1. + /// Randomness flutuation threashold that specifies how confidence the input + /// follows a predictable pattern recurring over an interval as seasonal data. + /// By default, it is set as 2.81 for 99.5% confidence interval + /// /// The detected period if seasonality period exists, otherwise return -1. - public int DetectSeasonality(IHostEnvironment host, IDataView input, string inputColumnName, int seasonalityWindowSize) + public int DetectSeasonality( + IHostEnvironment host, + IDataView input, + string inputColumnName, + int seasonalityWindowSize, + double randomessThreshold) { host.CheckValue(input, nameof(input)); host.CheckValue(inputColumnName, nameof(inputColumnName)); @@ -60,7 +68,7 @@ public int DetectSeasonality(IHostEnvironment host, IDataView input, string inpu int length = 0; float valueRef = 0; - List valueCache = new List(seasonalityWindowSize == -1 ? 1024 : seasonalityWindowSize); + var valueCache = seasonalityWindowSize == -1 ? new List() : new List(seasonalityWindowSize); while (rowCursor.MoveNext()) { valueDelegate.Invoke(ref valueRef); @@ -70,60 +78,61 @@ public int DetectSeasonality(IHostEnvironment host, IDataView input, string inpu break; } - float[] fftRe = new float[length]; - float[] fftIm = new float[length]; - float[] inputRe = valueCache.ToArray(); + double[] fftRe = new double[length]; + double[] fftIm = new double[length]; + double[] inputRe = valueCache.ToArray(); - FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0f, length).ToArray(), fftRe, fftIm, length); + FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0.0, length).ToArray(), fftRe, fftIm, length); - var z = fftRe.Select((m, i) => new Complex(m, fftIm[i])).ToArray(); + var energies = fftRe.Select((m, i) => new Complex(m, fftIm[i])).ToArray(); - /* Here w is the periodogram, which indicates the square of "energy" on the frequency domain. - * Specifically, w[j] = a[j]^2+b[j]^2, where a and b are Fourier Coefficients for cosine and sine, + /* Periodogram indicates the square of "energy" on the frequency domain. + * Specifically, periodogram[j] = a[j]^2+b[j]^2, where a and b are Fourier Coefficients for cosine and sine, * x(t) = a0+sum(a[j]cos(2Pi * f[j]t)+b[j]sin(2Pi * f[j]t) */ - var w = z.Select((t, i) => t * Complex.Conjugate(t)).ToArray(); - FindBestTwoFrequencies(w, length, out var bestFreq, out var secondFreq); + var periodogram = energies.Select((t, i) => t * Complex.Conjugate(t)).ToArray(); + FindBestTwoFrequencies(periodogram, length, out var bestFreq, out var secondFreq); double[] ifftRe = new double[length]; double[] ifftIm = new double[length]; FftUtils.ComputeBackwardFft( - w.Select(t => (double)t.Real).ToArray(), - w.Select(t => (double)t.Imaginary).ToArray(), ifftRe, ifftIm, length); - var r = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); - - int period = FindActualPeriod(r, bestFreq, secondFreq, length); - - if (period < 0) - { - period = -1; - } - - return period; + periodogram.Select(t => t.Real).ToArray(), + periodogram.Select(t => t.Imaginary).ToArray(), + ifftRe, + ifftIm, + length); + var values = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); + + int period = FindActualPeriod(values, bestFreq, secondFreq, length, randomessThreshold); + return period < 0 ? -1 : period; } /// - /// Find the actual period following the steps: + /// Find the actual period based on best frequency and second best frequency: /// Pick the best frequency by inspecting the auto-correlation energy (pick the highest) in time-domain. /// In the normal case, usually, when the time series is with period T, then the best frequency is N/T, /// while the second frequency would be N/2T, because period = T implies period = nT, where n is an integer. - /// In such a case, smaller period will win out on the autu-correlation energy list, due to the property of auto-correlation. + /// In such a case, smaller period will win out on the autu-correlation energy list, due to the property + /// of auto-correlation. /// - /// - /// - /// - /// - /// - private static int FindActualPeriod(Complex[] r, int bestFreq, int secondFreq, int timeSeriesLength) + /// The auto-correlation function of the augmented time series + /// The best frequency candidate + /// The second best frequency candidate + /// The length of the original time series, this is used for post processing to reduce false positive + /// Randomness flutuation threashold that specifies how confidence the input + /// follows a predictable pattern recurring over an interval as seasonal data. + /// + /// The period detected by check best frequency and second best frequency + private static int FindActualPeriod(Complex[] values, int bestFrequency, int secondFrequency, int timeSeriesLength, double randomnessThreshold) { int firstPeriod = -1; int secondPeriod = -1; double firstTimeDomainEnergy = -1; double secondTimeDomainEnergy = -1; - firstPeriod = FindBestPeriod(r, bestFreq, timeSeriesLength, out firstTimeDomainEnergy); - if (secondFreq != -1) + firstPeriod = FindBestPeriod(values, bestFrequency, timeSeriesLength, out firstTimeDomainEnergy); + if (secondFrequency != -1) { - secondPeriod = FindBestPeriod(r, secondFreq, timeSeriesLength, out secondTimeDomainEnergy); + secondPeriod = FindBestPeriod(values, secondFrequency, timeSeriesLength, out secondTimeDomainEnergy); } if (firstPeriod == -1 && secondPeriod == -1) return -1; @@ -161,39 +170,39 @@ private static int FindActualPeriod(Complex[] r, int bestFreq, int secondFreq, i } } } - trueTimeDomainEnergy /= r[0].Real; + trueTimeDomainEnergy /= values[0].Real; /* This is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf * 1.96 is for the 2-sigma, which has 95% statistical confidence. 2.58 is for 99% confidence, 2.85 for 99.5% confidence * increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); */ - double threshold = 4 / Math.Sqrt(timeSeriesLength); + double threshold = randomnessThreshold / Math.Sqrt(timeSeriesLength); - if (trueTimeDomainEnergy < threshold || r[truePeriod].Real < MinEnergyThreshold) + if (trueTimeDomainEnergy < threshold || values[truePeriod].Real < MinEnergyThreshold) return -1; return truePeriod; } /// - /// In order to pick up a proper frequency robustly (this is useful especially for large frequency, or small period, e.g., period = 2), - /// this method aims to pick up the top two frequencies for further evaluation. - /// of course, the energy of the second frequency (in frequency domain) must be at similar magnitude compared with the energy of the first - /// frequency. + /// In order to pick up a proper frequency robustly (this is useful especially for large frequency, + /// or small period, e.g., period = 2), this method aims to pick up the top two frequencies for + /// further evaluation. The energy of the second frequency (in frequency domain) must be at similar + /// magnitude compared with the energy of the first frequency. /// - /// the energy list in the frequency domain, the index is the frequency. + /// the energy list in the frequency domain, the index is the frequency. /// the original time series length - /// the frequency with highest energy - /// the frequency with second highest energy - private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, out int bestFreq, out int secondFreq) + /// the frequency with highest energy + /// the frequency with second highest energy + private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLength, out int bestFrequency, out int secondFrequency) { - bestFreq = -1; + bestFrequency = -1; double bestEnergy = -1.0; - secondFreq = -1; + secondFrequency = -1; double secondEnergy = -1.0; - if (w.Length < 2) + if (values.Length < 2) return; List energies = new List(); @@ -201,44 +210,44 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou /* Length of time series divided by frequency is period. * It is obvious that the period should be larger than 1 and smaller than the total length, and is an integer. */ - for (int i = w.Length / timeSeriesLength; i < w.Length / 2 + 1; i++) + for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) { - double nextWeight = w[i].Magnitude; + double nextWeight = values[i].Magnitude; energies.Add(nextWeight); if (nextWeight > bestEnergy) { bestEnergy = nextWeight; - bestFreq = i; + bestFrequency = i; } } /* Once we found a best frequency, the region formed by lower bound to upper bound corresponding to this frequency * will not be inspected anymore. because they all share the same period. */ - int period = w.Length / bestFreq; - double lowerBound = w.Length * 1.0 / (period + 1); - double upperBound = w.Length * 1.0 / (period - 1); + int period = values.Length / bestFrequency; + double lowerBound = values.Length * 1.0 / (period + 1); + double upperBound = values.Length * 1.0 / (period - 1); - for (int i = w.Length / timeSeriesLength; i < w.Length / 2 + 1; i++) + for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) { if (i > lowerBound && i < upperBound) continue; - double weight = w[i].Magnitude; + double weight = values[i].Magnitude; if (weight > secondEnergy) { double prevWeight = 0; if (i > 0) - prevWeight = w[i - 1].Magnitude; + prevWeight = values[i - 1].Magnitude; double nextWeight = 0; - if (i < w.Length - 1) - nextWeight = w[i + 1].Magnitude; + if (i < values.Length - 1) + nextWeight = values[i + 1].Magnitude; // should be a local maximum if (weight >= prevWeight && weight >= nextWeight) { secondEnergy = nextWeight; - secondFreq = i; + secondFrequency = i; } } } @@ -258,20 +267,21 @@ private static void FindBestTwoFrequencies(Complex[] w, int timeSeriesLength, ou return; // set the second frequency to -1, since it is obviously not strong enought to compete with the best energy. - secondFreq = -1; + secondFrequency = -1; } /// - /// Given a frequency F represented by an integer, we aim to find the best period by inspecting the auto-correlation function in time domain. - /// Since either frequency or the period is an integer, the possible period located within - /// [N/(F+1), N/(F-1)], we need to check this domain, and pick the best one, where N is the length of the augmented time series + /// Given a frequency F represented by an integer, we aim to find the best period by inspecting the + /// auto-correlation function in time domain. Since either frequency or the period is an integer, + /// the possible period located within [N/(F+1), N/(F-1)], we need to check this domain, and pick + /// the best one, where N is the length of the augmented time series. /// - /// The auto-correlation function of the augmented time series + /// The auto-correlation function of the augmented time series /// The input frequency candidate /// The length of the original time series, this is used for post processing to reduce false positive /// Output the energy on the auto-correlation function /// The best period estimated - private static int FindBestPeriod(Complex[] r, int frequency, int timeSeriesLength, out double energy) + private static int FindBestPeriod(Complex[] values, int frequency, int timeSeriesLength, out double energy) { energy = -1; @@ -279,12 +289,12 @@ private static int FindBestPeriod(Complex[] r, int frequency, int timeSeriesLeng if (frequency <= 1) return -1; - int lowerBound = r.Length / (frequency + 1); - int upperBound = r.Length / (frequency - 1); + int lowerBound = values.Length / (frequency + 1); + int upperBound = values.Length / (frequency - 1); int bestPeriod = -1; - for (int i = lowerBound; i <= upperBound && i < r.Length; i++) + for (int i = lowerBound; i <= upperBound && i < values.Length; i++) { - var currentEnergy = r[i].Real; + var currentEnergy = values[i].Real; if (currentEnergy > energy) { energy = currentEnergy; From b5022120b78c6c57d2830270b548f79a0e4d36b2 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Fri, 12 Jun 2020 12:27:34 -0700 Subject: [PATCH 09/15] minor float to double type change --- src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 7a62b368a5..62c72de1a1 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -64,10 +64,10 @@ public int DetectSeasonality( host.CheckUserArg(column.HasValue, nameof(inputColumnName)); var rowCursor = input.GetRowCursor(new List() { column.Value }); - var valueDelegate = rowCursor.GetGetter(column.Value); + var valueDelegate = rowCursor.GetGetter(column.Value); int length = 0; - float valueRef = 0; + double valueRef = 0; var valueCache = seasonalityWindowSize == -1 ? new List() : new List(seasonalityWindowSize); while (rowCursor.MoveNext()) { From 53d5c0dd13b76b48035bdd5fc37e40e3b3399547 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Mon, 15 Jun 2020 12:57:09 -0700 Subject: [PATCH 10/15] fix unit tests --- test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index fb032f7ec4..78dcee8fb9 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -699,14 +699,14 @@ public void TestDetectSeasonality(int seasonalityWindowSize, int expectedPeriod) var input = new List() {18.004, 87.401, 87.411, 18.088, 18.017, 87.759, 33.996, 18.043, 87.853, 18.364, 18.004, 86.992, 87.555, 18.088, 18.029, 87.906, 87.471, 18.039, 18.099, 87.403, 18.030, 72.991, 87.804, 18.381, 18.016, 87.145, 87.771, 18.029, 18.084, 87.976, 34.913, 18.064, 18.302, 87.723, 18.001, 86.401, 87.344, 18.295, 18.002, 87.793, - 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesData((float)t)); + 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesDataDouble() { Value = t }); var mlContext = new MLContext(); var dataView = mlContext.Data.LoadFromEnumerable(input); SeasonalityDetector seasonalityDetector = new SeasonalityDetector(); - int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, "Value", seasonalityWindowSize); + int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, nameof(TimeSeriesDataDouble.Value), seasonalityWindowSize); Assert.Equal(expectedPeriod, period); } From 860915849aaccdd64f807051dcf7f0dc5dfabe38 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Tue, 16 Jun 2020 21:25:09 -0700 Subject: [PATCH 11/15] address Harish's comments: 1. Change Randomness threshold to [0, 1] range as confidence internal and map to inverse normal cumulative distribution 2. Update unit tests to use sin(2pi + x) 3. Other formatting issues --- .../TimeSeries/DetectSeasonality.cs | 21 +++--- .../ExtensionsCatalog.cs | 11 ++- .../SeasonalityDetector.cs | 70 ++++++++++++++----- .../TimeSeriesDirectApi.cs | 18 ++--- 4 files changed, 76 insertions(+), 44 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs index 547ed40f09..f786e58e76 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs @@ -14,15 +14,18 @@ public static void Example() exception tracking and logging, as well as the source of randomness.*/ var mlContext = new MLContext(); - // Create a seasonal data as input for DetectSeasonality. - var dataView = mlContext.Data.LoadFromEnumerable(GetPointsWithSeasonality()); + // Create a seasonal data as input: y = sin(2 * Pi + x) + var seasonalData = Enumerable.Range(0, 100).Select(x => new TimeSeriesData(Math.Sin(2 * Math.PI + x))); + + // Load the input data as a DataView. + var dataView = mlContext.Data.LoadFromEnumerable(seasonalData); /* Two option parameters: * seasonalityWindowSize: Default value is -1. When set to -1, use the whole input to fit model; * when set to a positive integer, only the first windowSize number of values will be considered. - * randomnessThreshold: Randomness threashold that specifies how confidence the input values follows - * a predictable pattern recurring over an interval as seasonal data. By default, it is set as 2.81 - * for 99.5% confidence. The higher the threshold is set, the more strict recurring pattern the + * randomnessThreshold: Randomness threshold that specifies how confidence the input values follows + * a predictable pattern recurring as seasonal data. By default, it is set as 0.99. + * The higher the threshold is set, the more strict recurring pattern the * input values should follow to be determined as seasonal data. */ int period = mlContext.AnomalyDetection.DetectSeasonality( @@ -34,14 +37,6 @@ public static void Example() Console.WriteLine($"Seasonality Period: #{period}"); } - private static IEnumerable GetPointsWithSeasonality() - { - return new List() {18.004, 87.401, 87.411, 18.088, 18.017, 87.759, 33.996, 18.043, 87.853, 18.364, 18.004, 86.992, 87.555, - 18.088, 18.029, 87.906, 87.471, 18.039, 18.099, 87.403, 18.030, 72.991, 87.804, 18.381, 18.016, 87.145, 87.771, - 18.029, 18.084, 87.976, 34.913, 18.064, 18.302, 87.723, 18.001, 86.401, 87.344, 18.295, 18.002, 87.793, - 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesData(t)); - } - private class TimeSeriesData { public double Value; diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index e841b93fdc..e6f509eda4 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -231,12 +231,9 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, /// An upper bound on the number of values to be considered in the input values. /// When set to -1, use the whole input to fit model; when set to a positive integer, only the first windowSize number /// of values will be considered. Default value is -1. - /// Randomness flutuation threashold - /// that specifies how confidence the input values follows a predictable pattern recurring over an interval as seasonal data. - /// By default, it is set as 2.81: two-sided t-distribution - /// for 99.5% confidence interval with infinit degree of freedom. - /// The higher the threshold is set, the more strict recurring pattern the input values should follow to be determined as - /// seasonal data. + /// Randomness threshold + /// that specifies how confidence the input values follows a predictable pattern recurring as seasonal data. + /// The range is between [0, 1]. By default, it is set as 0.99. /// /// The regular interval for the input as seasonal data, otherwise return -1. /// @@ -251,7 +248,7 @@ public static int DetectSeasonality( IDataView input, string inputColumnName, int seasonalityWindowSize = -1, - double randomnessThreshold = 2.81) + double randomnessThreshold = 0.99) => new SeasonalityDetector().DetectSeasonality( CatalogUtils.GetEnvironment(catalog), input, diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 62c72de1a1..4777306616 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -34,6 +34,18 @@ internal class SeasonalityDetector /// private const double MinEnergyThreshold = 1e-10; + /// + /// The inverse normal cumulative distribution table that maps the confidence interval (randomness threshold) to the inverse of + /// normal cumulative distribution value. + /// + private static readonly SortedDictionary _confidenceToInverseNormalDistribution = new SortedDictionary + { + { 0.95, 1.96 }, + { 0.99, 2.58 }, + { 0.995, 2.81 }, + { 0.999, 3.29 }, + }; + /// /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. @@ -44,9 +56,8 @@ internal class SeasonalityDetector /// An upper bound on the number of values to be considered in the input values. /// When set to -1, use the whole input to fit model; when set to a positive integer, use this number as batch size. /// Default value is -1. - /// Randomness flutuation threashold that specifies how confidence the input - /// follows a predictable pattern recurring over an interval as seasonal data. - /// By default, it is set as 2.81 for 99.5% confidence interval + /// Randomness threshold, ranging from [0, 1]. It specifies how confidence the input + /// follows a predictable pattern recurring as seasonal data. By default, it is set as 0.99. /// /// The detected period if seasonality period exists, otherwise return -1. public int DetectSeasonality( @@ -119,7 +130,7 @@ public int DetectSeasonality( /// The best frequency candidate /// The second best frequency candidate /// The length of the original time series, this is used for post processing to reduce false positive - /// Randomness flutuation threashold that specifies how confidence the input + /// Randomness threshold that specifies how confidence the input /// follows a predictable pattern recurring over an interval as seasonal data. /// /// The period detected by check best frequency and second best frequency @@ -134,8 +145,12 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec { secondPeriod = FindBestPeriod(values, secondFrequency, timeSeriesLength, out secondTimeDomainEnergy); } + if (firstPeriod == -1 && secondPeriod == -1) + { return -1; + } + int truePeriod; double trueTimeDomainEnergy; if (firstPeriod == -1) @@ -157,7 +172,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec } else { - // hueristic: if the second frequency is with somewhat higher energy in time domain, we think it is a better candidate + // hueristic: if the second frequency is with higher energy in time domain, we think it is a better candidate if (secondTimeDomainEnergy > firstTimeDomainEnergy * 1.05) { truePeriod = secondPeriod; @@ -170,6 +185,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec } } } + trueTimeDomainEnergy /= values[0].Real; /* This is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf @@ -177,10 +193,23 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec * increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); */ - double threshold = randomnessThreshold / Math.Sqrt(timeSeriesLength); + double randomnessValue = _confidenceToInverseNormalDistribution.First().Value; + foreach (var entry in _confidenceToInverseNormalDistribution) + { + if (randomnessThreshold < entry.Key) + { + break; + } + + randomnessValue = entry.Value; + } + + double threshold = randomnessValue / Math.Sqrt(timeSeriesLength); if (trueTimeDomainEnergy < threshold || values[truePeriod].Real < MinEnergyThreshold) + { return -1; + } return truePeriod; } @@ -203,9 +232,11 @@ private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLengt double secondEnergy = -1.0; if (values.Length < 2) + { return; + } - List energies = new List(); + var medianAggregator = new MedianDblAggregator(values.Length / 2 + 1 - values.Length / timeSeriesLength); /* Length of time series divided by frequency is period. * It is obvious that the period should be larger than 1 and smaller than the total length, and is an integer. @@ -213,7 +244,7 @@ private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLengt for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) { double nextWeight = values[i].Magnitude; - energies.Add(nextWeight); + medianAggregator.ProcessValue(nextWeight); if (nextWeight > bestEnergy) { @@ -232,16 +263,24 @@ private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLengt for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) { if (i > lowerBound && i < upperBound) + { continue; + } + double weight = values[i].Magnitude; if (weight > secondEnergy) { double prevWeight = 0; if (i > 0) + { prevWeight = values[i - 1].Magnitude; + } + double nextWeight = 0; if (i < values.Length - 1) + { nextWeight = values[i + 1].Magnitude; + } // should be a local maximum if (weight >= prevWeight && weight >= nextWeight) @@ -252,19 +291,15 @@ private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLengt } } - var medianAggregator = new MedianDblAggregator(energies.Count); - foreach (var value in energies) - { - medianAggregator.ProcessValue(value); - } - - var typycalEnergy = medianAggregator.Median; + var typicalEnergy = medianAggregator.Median; /* The second energy must be at least significantly large enough than typical energies, * and also similar to best energy at magnitude level. */ - if (typycalEnergy * 6.0 < secondEnergy && secondEnergy * 10.0 > bestEnergy) + if (typicalEnergy * 6.0 < secondEnergy && secondEnergy * 10.0 > bestEnergy) + { return; + } // set the second frequency to -1, since it is obviously not strong enought to compete with the best energy. secondFrequency = -1; @@ -287,7 +322,9 @@ private static int FindBestPeriod(Complex[] values, int frequency, int timeSerie // this will never make sense of a seasonal signal if (frequency <= 1) + { return -1; + } int lowerBound = values.Length / (frequency + 1); int upperBound = values.Length / (frequency - 1); @@ -311,6 +348,7 @@ private static int FindBestPeriod(Complex[] values, int frequency, int timeSerie energy = -1; return -1; } + return bestPeriod; } } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index 78dcee8fb9..1ab0a70f73 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -690,17 +690,19 @@ public void RootCauseLocalization() } [Theory] - [InlineData(-1, 3)] - [InlineData(40, 3)] + [InlineData(-1, 6)] + [InlineData(60, 6)] [InlineData(20, -1)] public void TestDetectSeasonality(int seasonalityWindowSize, int expectedPeriod) { - // Create a detect seasonality input - var input = new List() {18.004, 87.401, 87.411, 18.088, 18.017, 87.759, 33.996, 18.043, 87.853, 18.364, 18.004, 86.992, 87.555, - 18.088, 18.029, 87.906, 87.471, 18.039, 18.099, 87.403, 18.030, 72.991, 87.804, 18.381, 18.016, 87.145, 87.771, - 18.029, 18.084, 87.976, 34.913, 18.064, 18.302, 87.723, 18.001, 86.401, 87.344, 18.295, 18.002, 87.793, - 87.531, 18.055, 18.005, 87.947, 18.003, 72.743, 87.722, 18.142 }.Select(t => new TimeSeriesDataDouble() { Value = t }); - + // Create a detect seasonality input: y = sin(2 * Pi + x) + var input = Enumerable.Range(0, 100).Select(x => + new TimeSeriesDataDouble() + { + Value = Math.Sin(2 * Math.PI + x), + }); + foreach (var data in input) + Console.WriteLine(data.Value); var mlContext = new MLContext(); var dataView = mlContext.Data.LoadFromEnumerable(input); From d2eeb718bab455850c50f47343920ce3c2dc10f4 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Tue, 16 Jun 2020 21:35:43 -0700 Subject: [PATCH 12/15] minor format update --- src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 4777306616..0e98eff565 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -80,6 +80,7 @@ public int DetectSeasonality( int length = 0; double valueRef = 0; var valueCache = seasonalityWindowSize == -1 ? new List() : new List(seasonalityWindowSize); + while (rowCursor.MoveNext()) { valueDelegate.Invoke(ref valueRef); @@ -115,6 +116,7 @@ public int DetectSeasonality( var values = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); int period = FindActualPeriod(values, bestFreq, secondFreq, length, randomessThreshold); + return period < 0 ? -1 : period; } @@ -141,6 +143,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec double firstTimeDomainEnergy = -1; double secondTimeDomainEnergy = -1; firstPeriod = FindBestPeriod(values, bestFrequency, timeSeriesLength, out firstTimeDomainEnergy); + if (secondFrequency != -1) { secondPeriod = FindBestPeriod(values, secondFrequency, timeSeriesLength, out secondTimeDomainEnergy); @@ -153,6 +156,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec int truePeriod; double trueTimeDomainEnergy; + if (firstPeriod == -1) { truePeriod = secondPeriod; @@ -194,6 +198,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); */ double randomnessValue = _confidenceToInverseNormalDistribution.First().Value; + foreach (var entry in _confidenceToInverseNormalDistribution) { if (randomnessThreshold < entry.Key) From beed798a02e654df3acec0c5a9f7a2d9628a4155 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Wed, 17 Jun 2020 14:24:31 -0700 Subject: [PATCH 13/15] update comments --- src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index e6f509eda4..a58be55e34 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -232,7 +232,7 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, /// When set to -1, use the whole input to fit model; when set to a positive integer, only the first windowSize number /// of values will be considered. Default value is -1. /// Randomness threshold - /// that specifies how confidence the input values follows a predictable pattern recurring as seasonal data. + /// that specifies how confidently the input values follow a predictable pattern recurring as seasonal data. /// The range is between [0, 1]. By default, it is set as 0.99. /// /// The regular interval for the input as seasonal data, otherwise return -1. From 16e7e07eb994331be486d625181baa7806ed5754 Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Wed, 17 Jun 2020 14:28:38 -0700 Subject: [PATCH 14/15] minor follow up comment update --- src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 0e98eff565..7bcfd28096 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -131,9 +131,11 @@ public int DetectSeasonality( /// The auto-correlation function of the augmented time series /// The best frequency candidate /// The second best frequency candidate - /// The length of the original time series, this is used for post processing to reduce false positive - /// Randomness threshold that specifies how confidence the input - /// follows a predictable pattern recurring over an interval as seasonal data. + /// The length of the original time series, this is used for post + /// processing to reduce false positive + /// + /// Randomness threshold that specifies how confidently the input + /// values follow a predictable pattern recurring as seasonal data. /// /// The period detected by check best frequency and second best frequency private static int FindActualPeriod(Complex[] values, int bestFrequency, int secondFrequency, int timeSeriesLength, double randomnessThreshold) From 080e08ff4d39a8abcd2337316a31f04f560f53ac Mon Sep 17 00:00:00 2001 From: Lisa Hua Date: Mon, 22 Jun 2020 11:17:13 -0700 Subject: [PATCH 15/15] update threshold to p value --- .../ExtensionsCatalog.cs | 4 +-- .../SeasonalityDetector.cs | 28 ++----------------- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index a58be55e34..dc254d2043 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -233,7 +233,7 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, /// of values will be considered. Default value is -1. /// Randomness threshold /// that specifies how confidently the input values follow a predictable pattern recurring as seasonal data. - /// The range is between [0, 1]. By default, it is set as 0.99. + /// The range is between [0, 1]. By default, it is set as 0.95. /// /// The regular interval for the input as seasonal data, otherwise return -1. /// @@ -248,7 +248,7 @@ public static int DetectSeasonality( IDataView input, string inputColumnName, int seasonalityWindowSize = -1, - double randomnessThreshold = 0.99) + double randomnessThreshold = 0.95) => new SeasonalityDetector().DetectSeasonality( CatalogUtils.GetEnvironment(catalog), input, diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs index 7bcfd28096..1df4228b9f 100644 --- a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.Linq; using System.Numerics; +using Microsoft.ML.Internal.CpuMath; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.TimeSeries; @@ -34,18 +35,6 @@ internal class SeasonalityDetector /// private const double MinEnergyThreshold = 1e-10; - /// - /// The inverse normal cumulative distribution table that maps the confidence interval (randomness threshold) to the inverse of - /// normal cumulative distribution value. - /// - private static readonly SortedDictionary _confidenceToInverseNormalDistribution = new SortedDictionary - { - { 0.95, 1.96 }, - { 0.99, 2.58 }, - { 0.995, 2.81 }, - { 0.999, 3.29 }, - }; - /// /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. @@ -57,7 +46,7 @@ internal class SeasonalityDetector /// When set to -1, use the whole input to fit model; when set to a positive integer, use this number as batch size. /// Default value is -1. /// Randomness threshold, ranging from [0, 1]. It specifies how confidence the input - /// follows a predictable pattern recurring as seasonal data. By default, it is set as 0.99. + /// follows a predictable pattern recurring as seasonal data. By default, it is set as 0.95. /// /// The detected period if seasonality period exists, otherwise return -1. public int DetectSeasonality( @@ -199,18 +188,7 @@ private static int FindActualPeriod(Complex[] values, int bestFrequency, int sec * increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); */ - double randomnessValue = _confidenceToInverseNormalDistribution.First().Value; - - foreach (var entry in _confidenceToInverseNormalDistribution) - { - if (randomnessThreshold < entry.Key) - { - break; - } - - randomnessValue = entry.Value; - } - + double randomnessValue = ProbabilityFunctions.Probit(randomnessThreshold); double threshold = randomnessValue / Math.Sqrt(timeSeriesLength); if (trueTimeDomainEnergy < threshold || values[truePeriod].Real < MinEnergyThreshold)