diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs new file mode 100644 index 0000000000..f786e58e76 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectSeasonality.cs @@ -0,0 +1,51 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.TimeSeries; + +namespace Samples.Dynamic +{ + public static class DetectSeasonality + { + public static void Example() + { + /* Create a new ML context, for ML.NET operations. It can be used for + exception tracking and logging, as well as the source of randomness.*/ + var mlContext = new MLContext(); + + // Create a seasonal data as input: y = sin(2 * Pi + x) + var seasonalData = Enumerable.Range(0, 100).Select(x => new TimeSeriesData(Math.Sin(2 * Math.PI + x))); + + // Load the input data as a DataView. + var dataView = mlContext.Data.LoadFromEnumerable(seasonalData); + + /* Two option parameters: + * seasonalityWindowSize: Default value is -1. When set to -1, use the whole input to fit model; + * when set to a positive integer, only the first windowSize number of values will be considered. + * randomnessThreshold: Randomness threshold that specifies how confidence the input values follows + * a predictable pattern recurring as seasonal data. By default, it is set as 0.99. + * The higher the threshold is set, the more strict recurring pattern the + * input values should follow to be determined as seasonal data. + */ + int period = mlContext.AnomalyDetection.DetectSeasonality( + dataView, + nameof(TimeSeriesData.Value), + seasonalityWindowSize: 40); + + // Print the Seasonality Period result. + Console.WriteLine($"Seasonality Period: #{period}"); + } + + private class TimeSeriesData + { + public double Value; + + public TimeSeriesData(double value) + { + Value = value; + } + } + + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs index cd1d6a8b60..7a5d42518c 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs @@ -597,6 +597,7 @@ internal static void GetMedianSoFar(in double num, ref double median, ref MaxHea /// It tracks median values of non-sparse values (vCount). /// NaNs are ignored when updating min and max. /// + [BestFriend] internal sealed class MedianDblAggregator : IColumnAggregator { private MedianAggregatorUtils.MaxHeap _belowMedianHeap; @@ -1213,7 +1214,7 @@ private void GetResult(ref TFloat input, ref TFloat value) } public override NormalizingTransformer.NormalizerModelParametersBase GetNormalizerModelParams() - => new NormalizingTransformer.BinNormalizerModelParameters(ImmutableArray.Create(_binUpperBounds), _den,_offset); + => new NormalizingTransformer.BinNormalizerModelParameters(ImmutableArray.Create(_binUpperBounds), _den, _offset); } public sealed class ImplVec : BinColumnFunction diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index 161f8bc27b..dc254d2043 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -210,6 +210,52 @@ public static RootCause LocalizeRootCause(this AnomalyDetectionCatalog catalog, return dst; } + /// + /// + /// In time series data, seasonality (or periodicity) is the presence of variations that occur at specific regular intervals, + /// such as weekly, monthly, or quarterly. + /// + /// + /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. + /// Assuming the input values have the same time interval (e.g., sensor data collected at every second ordered by timestamps), + /// this method takes a list of time-series data, and returns the regular period for the input seasonal data, + /// if a predictable fluctuation or pattern can be found that recurs or repeats over this period throughout the input values. + /// + /// + /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. + /// + /// + /// The detect seasonality catalog. + /// Input DataView.The data is an instance of . + /// Name of column to process. The column data must be . + /// An upper bound on the number of values to be considered in the input values. + /// When set to -1, use the whole input to fit model; when set to a positive integer, only the first windowSize number + /// of values will be considered. Default value is -1. + /// Randomness threshold + /// that specifies how confidently the input values follow a predictable pattern recurring as seasonal data. + /// The range is between [0, 1]. By default, it is set as 0.95. + /// + /// The regular interval for the input as seasonal data, otherwise return -1. + /// + /// + /// + /// + /// + public static int DetectSeasonality( + this AnomalyDetectionCatalog catalog, + IDataView input, + string inputColumnName, + int seasonalityWindowSize = -1, + double randomnessThreshold = 0.95) + => new SeasonalityDetector().DetectSeasonality( + CatalogUtils.GetEnvironment(catalog), + input, + inputColumnName, + seasonalityWindowSize, + randomnessThreshold); + private static void CheckRootCauseInput(IHostEnvironment host, RootCauseLocalizationInput src) { host.CheckUserArg(src.Slices.Count >= 1, nameof(src.Slices), "Must has more than one item"); diff --git a/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs new file mode 100644 index 0000000000..1df4228b9f --- /dev/null +++ b/src/Microsoft.ML.TimeSeries/SeasonalityDetector.cs @@ -0,0 +1,340 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using Microsoft.ML.Internal.CpuMath; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.TimeSeries; + +namespace Microsoft.ML.TimeSeries +{ + /// + /// This class is used to detect the periodicity. + /// + internal class SeasonalityDetector + { + /// + /// In practice, the max lag very rarely exceed 365, which lacks of strong interpretation, and which also brings performance overhead. + /// + private const int MaxLag = 400; + + /// + /// Suppose the length of time series is 651, now we found an period is 128, then 651/128 = 5, which means there are at most 5 recurrent period. this is too small, the significance build upon this is not trustable. + /// + private const int MinRecurrentCount = 8; + + /// + /// When input time series is with very close values (i.e., different is smaller than E-20), the accuracy of double could distort the + /// final trend signal. Any seasonal signal under such circumstance becomes unreliable. + /// So use this threshold to eliminate such kind of time series. Here set to 1e-10 is for conservative consideration. + /// + private const double MinEnergyThreshold = 1e-10; + + /// + /// This method detects this predictable interval (or period) by adopting techniques of fourier analysis. + /// Returns -1 if no such pattern is found, that is, the input values do not follow a seasonal fluctuation. + /// + /// The detect seasonality host environment. + /// Input DataView.The data is an instance of . + /// Name of column to process. The column data must be . + /// An upper bound on the number of values to be considered in the input values. + /// When set to -1, use the whole input to fit model; when set to a positive integer, use this number as batch size. + /// Default value is -1. + /// Randomness threshold, ranging from [0, 1]. It specifies how confidence the input + /// follows a predictable pattern recurring as seasonal data. By default, it is set as 0.95. + /// + /// The detected period if seasonality period exists, otherwise return -1. + public int DetectSeasonality( + IHostEnvironment host, + IDataView input, + string inputColumnName, + int seasonalityWindowSize, + double randomessThreshold) + { + host.CheckValue(input, nameof(input)); + host.CheckValue(inputColumnName, nameof(inputColumnName)); + host.CheckUserArg(seasonalityWindowSize == -1 || seasonalityWindowSize >= 0, nameof(seasonalityWindowSize)); + + var column = input.Schema.GetColumnOrNull(inputColumnName); + host.CheckUserArg(column.HasValue, nameof(inputColumnName)); + + var rowCursor = input.GetRowCursor(new List() { column.Value }); + var valueDelegate = rowCursor.GetGetter(column.Value); + + int length = 0; + double valueRef = 0; + var valueCache = seasonalityWindowSize == -1 ? new List() : new List(seasonalityWindowSize); + + while (rowCursor.MoveNext()) + { + valueDelegate.Invoke(ref valueRef); + length++; + valueCache.Add(valueRef); + if (seasonalityWindowSize != -1 && length >= seasonalityWindowSize) + break; + } + + double[] fftRe = new double[length]; + double[] fftIm = new double[length]; + double[] inputRe = valueCache.ToArray(); + + FftUtils.ComputeForwardFft(inputRe, Enumerable.Repeat(0.0, length).ToArray(), fftRe, fftIm, length); + + var energies = fftRe.Select((m, i) => new Complex(m, fftIm[i])).ToArray(); + + /* Periodogram indicates the square of "energy" on the frequency domain. + * Specifically, periodogram[j] = a[j]^2+b[j]^2, where a and b are Fourier Coefficients for cosine and sine, + * x(t) = a0+sum(a[j]cos(2Pi * f[j]t)+b[j]sin(2Pi * f[j]t) + */ + var periodogram = energies.Select((t, i) => t * Complex.Conjugate(t)).ToArray(); + FindBestTwoFrequencies(periodogram, length, out var bestFreq, out var secondFreq); + + double[] ifftRe = new double[length]; + double[] ifftIm = new double[length]; + FftUtils.ComputeBackwardFft( + periodogram.Select(t => t.Real).ToArray(), + periodogram.Select(t => t.Imaginary).ToArray(), + ifftRe, + ifftIm, + length); + var values = ifftRe.Select((t, i) => new Complex(t, ifftIm[i])).ToArray(); + + int period = FindActualPeriod(values, bestFreq, secondFreq, length, randomessThreshold); + + return period < 0 ? -1 : period; + } + + /// + /// Find the actual period based on best frequency and second best frequency: + /// Pick the best frequency by inspecting the auto-correlation energy (pick the highest) in time-domain. + /// In the normal case, usually, when the time series is with period T, then the best frequency is N/T, + /// while the second frequency would be N/2T, because period = T implies period = nT, where n is an integer. + /// In such a case, smaller period will win out on the autu-correlation energy list, due to the property + /// of auto-correlation. + /// + /// The auto-correlation function of the augmented time series + /// The best frequency candidate + /// The second best frequency candidate + /// The length of the original time series, this is used for post + /// processing to reduce false positive + /// + /// Randomness threshold that specifies how confidently the input + /// values follow a predictable pattern recurring as seasonal data. + /// + /// The period detected by check best frequency and second best frequency + private static int FindActualPeriod(Complex[] values, int bestFrequency, int secondFrequency, int timeSeriesLength, double randomnessThreshold) + { + int firstPeriod = -1; + int secondPeriod = -1; + double firstTimeDomainEnergy = -1; + double secondTimeDomainEnergy = -1; + firstPeriod = FindBestPeriod(values, bestFrequency, timeSeriesLength, out firstTimeDomainEnergy); + + if (secondFrequency != -1) + { + secondPeriod = FindBestPeriod(values, secondFrequency, timeSeriesLength, out secondTimeDomainEnergy); + } + + if (firstPeriod == -1 && secondPeriod == -1) + { + return -1; + } + + int truePeriod; + double trueTimeDomainEnergy; + + if (firstPeriod == -1) + { + truePeriod = secondPeriod; + trueTimeDomainEnergy = secondTimeDomainEnergy; + } + else if (secondPeriod == -1) + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + else + { + if (firstPeriod == secondPeriod) + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + else + { + // hueristic: if the second frequency is with higher energy in time domain, we think it is a better candidate + if (secondTimeDomainEnergy > firstTimeDomainEnergy * 1.05) + { + truePeriod = secondPeriod; + trueTimeDomainEnergy = secondTimeDomainEnergy; + } + else + { + truePeriod = firstPeriod; + trueTimeDomainEnergy = firstTimeDomainEnergy; + } + } + } + + trueTimeDomainEnergy /= values[0].Real; + + /* This is a key equation, which is named the "testing for randomness with the correlogram". /ref: http://www.ltrr.arizona.edu/~dmeko/notes_3.pdf + * 1.96 is for the 2-sigma, which has 95% statistical confidence. 2.58 is for 99% confidence, 2.85 for 99.5% confidence + * increasing the threshold aims to mitigate the fake seasonal component caused by outliers. in practice, if there exist true seasonal component, + * such as BirdStrike/Appdownloads, the energy is far larger than threshold, hence change threshold from 2.85 to 4.0 have no impact (tested); + */ + double randomnessValue = ProbabilityFunctions.Probit(randomnessThreshold); + double threshold = randomnessValue / Math.Sqrt(timeSeriesLength); + + if (trueTimeDomainEnergy < threshold || values[truePeriod].Real < MinEnergyThreshold) + { + return -1; + } + + return truePeriod; + } + + /// + /// In order to pick up a proper frequency robustly (this is useful especially for large frequency, + /// or small period, e.g., period = 2), this method aims to pick up the top two frequencies for + /// further evaluation. The energy of the second frequency (in frequency domain) must be at similar + /// magnitude compared with the energy of the first frequency. + /// + /// the energy list in the frequency domain, the index is the frequency. + /// the original time series length + /// the frequency with highest energy + /// the frequency with second highest energy + private static void FindBestTwoFrequencies(Complex[] values, int timeSeriesLength, out int bestFrequency, out int secondFrequency) + { + bestFrequency = -1; + double bestEnergy = -1.0; + secondFrequency = -1; + double secondEnergy = -1.0; + + if (values.Length < 2) + { + return; + } + + var medianAggregator = new MedianDblAggregator(values.Length / 2 + 1 - values.Length / timeSeriesLength); + + /* Length of time series divided by frequency is period. + * It is obvious that the period should be larger than 1 and smaller than the total length, and is an integer. + */ + for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) + { + double nextWeight = values[i].Magnitude; + medianAggregator.ProcessValue(nextWeight); + + if (nextWeight > bestEnergy) + { + bestEnergy = nextWeight; + bestFrequency = i; + } + } + + /* Once we found a best frequency, the region formed by lower bound to upper bound corresponding to this frequency + * will not be inspected anymore. because they all share the same period. + */ + int period = values.Length / bestFrequency; + double lowerBound = values.Length * 1.0 / (period + 1); + double upperBound = values.Length * 1.0 / (period - 1); + + for (int i = values.Length / timeSeriesLength; i < values.Length / 2 + 1; i++) + { + if (i > lowerBound && i < upperBound) + { + continue; + } + + double weight = values[i].Magnitude; + if (weight > secondEnergy) + { + double prevWeight = 0; + if (i > 0) + { + prevWeight = values[i - 1].Magnitude; + } + + double nextWeight = 0; + if (i < values.Length - 1) + { + nextWeight = values[i + 1].Magnitude; + } + + // should be a local maximum + if (weight >= prevWeight && weight >= nextWeight) + { + secondEnergy = nextWeight; + secondFrequency = i; + } + } + } + + var typicalEnergy = medianAggregator.Median; + + /* The second energy must be at least significantly large enough than typical energies, + * and also similar to best energy at magnitude level. + */ + if (typicalEnergy * 6.0 < secondEnergy && secondEnergy * 10.0 > bestEnergy) + { + return; + } + + // set the second frequency to -1, since it is obviously not strong enought to compete with the best energy. + secondFrequency = -1; + } + + /// + /// Given a frequency F represented by an integer, we aim to find the best period by inspecting the + /// auto-correlation function in time domain. Since either frequency or the period is an integer, + /// the possible period located within [N/(F+1), N/(F-1)], we need to check this domain, and pick + /// the best one, where N is the length of the augmented time series. + /// + /// The auto-correlation function of the augmented time series + /// The input frequency candidate + /// The length of the original time series, this is used for post processing to reduce false positive + /// Output the energy on the auto-correlation function + /// The best period estimated + private static int FindBestPeriod(Complex[] values, int frequency, int timeSeriesLength, out double energy) + { + energy = -1; + + // this will never make sense of a seasonal signal + if (frequency <= 1) + { + return -1; + } + + int lowerBound = values.Length / (frequency + 1); + int upperBound = values.Length / (frequency - 1); + int bestPeriod = -1; + for (int i = lowerBound; i <= upperBound && i < values.Length; i++) + { + var currentEnergy = values[i].Real; + if (currentEnergy > energy) + { + energy = currentEnergy; + bestPeriod = i; + } + } + + /* condition1: period does not make sense, since the corresponding zone in the auto-correlation energy list are all negative. + * condition2: for real dataset, we do not think there will exist such long period. this is used to reduce false-positive + * condition3: the number of repeats under this period is too few. this is used to reduce false-positive + */ + if (bestPeriod <= 1 || bestPeriod > MaxLag || timeSeriesLength < MinRecurrentCount * bestPeriod) + { + energy = -1; + return -1; + } + + return bestPeriod; + } + } +} diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index d25b614d37..1ab0a70f73 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Microsoft.ML.TimeSeries; @@ -527,10 +528,10 @@ public void AnomalyDetectionWithSrCnn(bool loadDataFromFile) { var ml = new MLContext(1); IDataView dataView; - if(loadDataFromFile) + if (loadDataFromFile) { var dataPath = GetDataPath(Path.Combine("Timeseries", "anomaly_detection.csv")); - + // Load data from file into the dataView dataView = ml.Data.LoadFromTextFile(dataPath, new[] { new TextLoader.Column("Value", DataKind.Single, 0), @@ -596,7 +597,7 @@ public void TestSrCnnBatchAnomalyDetector( var data = new List(); for (int index = 0; index < 20; index++) { - data.Add(new TimeSeriesDataDouble { Value = 5 } ); + data.Add(new TimeSeriesDataDouble { Value = 5 }); } data.Add(new TimeSeriesDataDouble { Value = 10 }); for (int index = 0; index < 5; index++) @@ -688,6 +689,29 @@ public void RootCauseLocalization() } } + [Theory] + [InlineData(-1, 6)] + [InlineData(60, 6)] + [InlineData(20, -1)] + public void TestDetectSeasonality(int seasonalityWindowSize, int expectedPeriod) + { + // Create a detect seasonality input: y = sin(2 * Pi + x) + var input = Enumerable.Range(0, 100).Select(x => + new TimeSeriesDataDouble() + { + Value = Math.Sin(2 * Math.PI + x), + }); + foreach (var data in input) + Console.WriteLine(data.Value); + var mlContext = new MLContext(); + + var dataView = mlContext.Data.LoadFromEnumerable(input); + SeasonalityDetector seasonalityDetector = new SeasonalityDetector(); + + int period = mlContext.AnomalyDetection.DetectSeasonality(dataView, nameof(TimeSeriesDataDouble.Value), seasonalityWindowSize); + Assert.Equal(expectedPeriod, period); + } + private static List GetRootCauseLocalizationPoints() { List points = new List();