diff --git a/docs/api-reference/io-time-series-change-point.md b/docs/api-reference/io-time-series-change-point.md new file mode 100644 index 0000000000..b33bde3345 --- /dev/null +++ b/docs/api-reference/io-time-series-change-point.md @@ -0,0 +1,7 @@ +### Input and Output Columns +There is only one input column and its type is [System.Single](xref:System.Single). +This estimator adds the following output columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Prediction` | 4-element vector of [System.Double](xref:System.Double) | It sequentially contains alert level (non-zero value means a change point), score, p-value, and martingale value. | diff --git a/docs/api-reference/io-time-series-spike.md b/docs/api-reference/io-time-series-spike.md new file mode 100644 index 0000000000..877672f7c3 --- /dev/null +++ b/docs/api-reference/io-time-series-spike.md @@ -0,0 +1,7 @@ +### Input and Output Columns +There is only one input column and its type is [System.Single](xref:System.Single). +This estimator adds the following output columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Prediction` | 3-element vector of [System.Double](xref:System.Double) | It sequentially contains alert level (non-zero value means a spike), score, and p-value. | diff --git a/docs/api-reference/time-series-iid.md b/docs/api-reference/time-series-iid.md new file mode 100644 index 0000000000..b366c5e2d2 --- /dev/null +++ b/docs/api-reference/time-series-iid.md @@ -0,0 +1,4 @@ +### Training Algorithm Details +This trainer assumes that data points collected in the time series are independently sampled from the same distribution (independent identically distributed). +Thus, the value at the current timestamp can be viewed as the value at the next timestamp in expectation. +If the observed value at timestamp $t-1$ is $p$, the predicted value at timestamp $t$ would be $p$ as well. 
diff --git a/docs/api-reference/time-series-props.md b/docs/api-reference/time-series-props.md new file mode 100644 index 0000000000..6bab332edf --- /dev/null +++ b/docs/api-reference/time-series-props.md @@ -0,0 +1,7 @@ +### Estimator Characteristics +| | | +| -- | -- | +| Machine learning task | Anomaly detection | +| Is normalization required? | No | +| Is caching required? | No | +| Required NuGet in addition to Microsoft.ML | Microsoft.ML.TimeSeries | diff --git a/docs/api-reference/time-series-scorer.md b/docs/api-reference/time-series-scorer.md new file mode 100644 index 0000000000..48cc8813bc --- /dev/null +++ b/docs/api-reference/time-series-scorer.md @@ -0,0 +1,26 @@ +### Anomaly Scorer +Once the raw score at a timestamp is computed, it is fed to the anomaly scorer component to calculate the final anomaly score at that timestamp. +There are two statistics involved in this scorer, p-value and martingale score. + +#### Spike detection based on p-value +The p-value score indicates the p-value of the currently computed raw score according to a distribution of raw scores. +Here, the distribution is estimated based on the most recent raw score values up to a certain depth back in the history. +More specifically, this distribution is estimated using [kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation) +with the Gaussian [kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#In_non-parametric_statistics) of adaptive bandwidth. +The p-value score is always in $[0, 1]$, and the lower its value, the more likely the current point is an outlier (also known as a spike). +If the p-value score falls below $1 - \frac{\text{confidence}}{100}$, the associated timestamp may get a non-zero alert value in spike detection, which means a spike point is detected. 
+Note that $\text{confidence}$ is defined in the signatures of [DetectChangePointBySsa](xref:Microsoft.ML.TimeSeriesCatalog.DetectChangePointBySsa(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.ErrorFunction,Microsoft.ML.Transforms.TimeSeries.MartingaleType,System.Double)) +and [DetectIidChangePoint](xref:Microsoft.ML.TimeSeriesCatalog.DetectIidChangePoint(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.MartingaleType,System.Double)). + + +#### Change point detection based on martingale score +The martingale score is an extra level of scoring that is built upon the p-value scores. +The idea is based on the [Exchangeability Martingales](https://arxiv.org/pdf/1204.3251.pdf) that detect a change of distribution over a stream of i.i.d. values. +In short, the value of the martingale score starts increasing significantly when a sequence of small p-values is detected in a row; this indicates the change of the distribution of the underlying data generation process. +Thus, the martingale score is used for change point detection. +Given a sequence of most recently observed p-values, $p_1, \dots, p_n$, the martingale score is computed as: $s(p_1, \dots, p_n) = \prod_{i=1}^n \beta(p_i)$. +There are two choices of $\beta$: $\beta(p) = \epsilon p^{\epsilon - 1}$ for $0 < \epsilon < 1$ or $\beta(p) = \int_{0}^1 \epsilon p^{\epsilon - 1} d\epsilon$. + +If the martingale score exceeds $s(q_1, \dots, q_n)$ where $q_i=1 - \frac{\text{confidence}}{100}$, the associated timestamp may get a non-zero alert value for change point detection. 
+Note that $\text{confidence}$ is defined in the signatures of [DetectChangePointBySsa](xref:Microsoft.ML.TimeSeriesCatalog.DetectChangePointBySsa(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.ErrorFunction,Microsoft.ML.Transforms.TimeSeries.MartingaleType,System.Double)) or +[DetectIidChangePoint](xref:Microsoft.ML.TimeSeriesCatalog.DetectIidChangePoint(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.MartingaleType,System.Double)). diff --git a/docs/api-reference/time-series-ssa.md b/docs/api-reference/time-series-ssa.md new file mode 100644 index 0000000000..c5df2ef6eb --- /dev/null +++ b/docs/api-reference/time-series-ssa.md @@ -0,0 +1,5 @@ +### Training Algorithm Details +This class implements the general anomaly detection transform based on [Singular Spectrum Analysis (SSA)](https://en.wikipedia.org/wiki/Singular_spectrum_analysis). +SSA is a powerful framework for decomposing the time-series into trend, seasonality and noise components as well as forecasting the future values of the time-series. +In principle, SSA performs spectral analysis on the input time-series where each component in the spectrum corresponds to a trend, seasonal or noise component in the time-series. +For details of the Singular Spectrum Analysis (SSA), refer to [this document](http://arxiv.org/pdf/1206.6910.pdf). diff --git a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs index 681d53fe40..e4a3da6761 100644 --- a/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.TimeSeries/ExtensionsCatalog.cs @@ -10,14 +10,14 @@ namespace Microsoft.ML public static class TimeSeriesCatalog { /// - /// Create a new instance of that detects a change of in an - /// independent identically distributed (i.i.d.) time series. 
- /// Detection is based on adaptive kernel density estimations and martingale scores. + /// Create , which predicts change points in an + /// independent identically distributed (i.i.d.) + /// time series based on adaptive kernel density estimations and martingale scores. /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// Column is a vector of type double and size 4. The vector contains Alert, Raw Score, P-Value and Martingale score as first four values. - /// Name of column to transform. If set to , the value of the will be used as source. + /// The column data is a vector of . The vector contains 4 elements: alert (non-zero value means a change point), raw score, p-Value and martingale score. + /// Name of column to transform. The column data must be . If set to , the value of the will be used as source. /// The confidence for change point detection in the range [0, 100]. /// The length of the sliding window on p-values for computing the martingale score. /// The martingale used for scoring. @@ -34,13 +34,15 @@ public static IidChangePointEstimator DetectIidChangePoint(this TransformsCatalo => new IidChangePointEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, confidence, changeHistoryLength, inputColumnName, martingale, eps); /// - /// Create a new instance of that detects a spike in an - /// independent identically distributed (i.i.d.) time series. - /// Detection is based on adaptive kernel density estimations and martingale scores. + /// Create , which predicts spikes in + /// independent identically distributed (i.i.d.) + /// time series based on adaptive kernel density estimations and martingale scores. /// /// The transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The column data is a vector of . 
The vector contains 3 elements: alert (non-zero value means a spike), raw score, and p-value. + /// Name of column to transform. The column data must be . + /// If set to , the value of the will be used as source. /// The confidence for spike detection in the range [0, 100]. /// The size of the sliding window for computing the p-value. /// The argument that determines whether to detect positive or negative anomalies, or both. @@ -56,13 +58,14 @@ public static IidSpikeEstimator DetectIidSpike(this TransformsCatalog catalog, s => new IidSpikeEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, confidence, pvalueHistoryLength, inputColumnName, side); /// - /// Create a new instance of for detecting a change in a time series signal + /// Create , which predicts change points in time series /// using Singular Spectrum Analysis (SSA). /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// Column is a vector of type double and size 4. The vector contains Alert, Raw Score, P-Value and Martingale score as first four values. - /// Name of column to transform. If set to , the value of the will be used as source. + /// The column data is a vector of . The vector contains 4 elements: alert (non-zero value means a change point), raw score, p-Value and martingale score. + /// Name of column to transform. The column data must be . + /// If set to , the value of the will be used as source. /// The confidence for change point detection in the range [0, 100]. /// The number of points from the beginning of the sequence used for training. /// The size of the sliding window for computing the p-value. @@ -94,17 +97,18 @@ public static SsaChangePointEstimator DetectChangePointBySsa(this TransformsCata }); /// - /// Create a new instance of for detecting a spike in a time series signal + /// Create , which predicts spikes in time series /// using Singular Spectrum Analysis (SSA). /// /// The transform's catalog. 
- /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The column data is a vector of . The vector contains 3 elements: alert (non-zero value means a spike), raw score, and p-value. + /// Name of column to transform. The column data must be . + /// If set to , the value of the will be used as source. /// The confidence for spike detection in the range [0, 100]. /// The size of the sliding window for computing the p-value. /// The number of points from the beginning of the sequence used for training. /// An upper bound on the largest relevant seasonality in the input time-series. - /// The vector contains Alert, Raw Score, P-Value as first three values. /// The argument that determines whether to detect positive or negative anomalies, or both. /// The function used to compute the error between the expected and the observed value. /// diff --git a/src/Microsoft.ML.TimeSeries/IidAnomalyDetectionBase.cs b/src/Microsoft.ML.TimeSeries/IidAnomalyDetectionBase.cs index b42320dd76..36f569b991 100644 --- a/src/Microsoft.ML.TimeSeries/IidAnomalyDetectionBase.cs +++ b/src/Microsoft.ML.TimeSeries/IidAnomalyDetectionBase.cs @@ -23,7 +23,7 @@ public class IidAnomalyDetectionBaseWrapper : IStatefulTransformer, ICanSaveMode bool ITransformer.IsRowToRowMapper => ((ITransformer)InternalTransform).IsRowToRowMapper; /// - /// Creates a clone of the transfomer. Used for taking the snapshot of the state. + /// Create a clone of the transformer. Used for taking the snapshot of the state. 
/// /// IStatefulTransformer IStatefulTransformer.Clone() => InternalTransform.Clone(); diff --git a/src/Microsoft.ML.TimeSeries/IidChangePointDetector.cs b/src/Microsoft.ML.TimeSeries/IidChangePointDetector.cs index 1fa4b7ce6e..b9aac25f90 100644 --- a/src/Microsoft.ML.TimeSeries/IidChangePointDetector.cs +++ b/src/Microsoft.ML.TimeSeries/IidChangePointDetector.cs @@ -191,10 +191,25 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat } /// - /// The for detecting a signal change on an - /// independent identically distributed (i.i.d.) time series. - /// Detection is based on adaptive kernel density estimation and martingales. + /// The to detect a signal change on an + /// independent identically distributed (i.i.d.) + /// time series based on adaptive kernel density estimation and martingales. /// + /// + /// + /// + /// + /// public sealed class IidChangePointEstimator : TrivialEstimator { /// diff --git a/src/Microsoft.ML.TimeSeries/IidSpikeDetector.cs b/src/Microsoft.ML.TimeSeries/IidSpikeDetector.cs index 8805731c35..90f57b6311 100644 --- a/src/Microsoft.ML.TimeSeries/IidSpikeDetector.cs +++ b/src/Microsoft.ML.TimeSeries/IidSpikeDetector.cs @@ -171,10 +171,25 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat } /// - /// The for detecting a signal spike on an - /// independent identically distributed (i.i.d.) time series. - /// Detection is based on adaptive kernel density estimation. + /// The to detect a signal spike on an + /// independent identically distributed (i.i.d.) + /// time series based on adaptive kernel density estimation. 
/// + /// + /// + /// + /// + /// public sealed class IidSpikeEstimator : TrivialEstimator { /// diff --git a/src/Microsoft.ML.TimeSeries/SsaAnomalyDetectionBase.cs b/src/Microsoft.ML.TimeSeries/SsaAnomalyDetectionBase.cs index 86b0589a71..42dabcda12 100644 --- a/src/Microsoft.ML.TimeSeries/SsaAnomalyDetectionBase.cs +++ b/src/Microsoft.ML.TimeSeries/SsaAnomalyDetectionBase.cs @@ -92,7 +92,7 @@ public class SsaAnomalyDetectionBaseWrapper : IStatefulTransformer, ICanSaveMode bool ITransformer.IsRowToRowMapper => ((ITransformer)InternalTransform).IsRowToRowMapper; /// - /// Creates a clone of the transfomer. Used for taking the snapshot of the state. + /// Creates a clone of the transformer. Used for taking the snapshot of the state. /// /// IStatefulTransformer IStatefulTransformer.Clone() => InternalTransform.Clone(); @@ -340,7 +340,7 @@ private protected override void InitializeAnomalyDetector() private protected override double ComputeRawAnomalyScore(ref Single input, FixedSizeQueue windowedBuffer, long iteration) { - // Get the prediction for the next point opn the series + // Get the prediction for the next point in the series Single expectedValue = 0; _model.PredictNext(ref expectedValue); diff --git a/src/Microsoft.ML.TimeSeries/SsaChangePointDetector.cs b/src/Microsoft.ML.TimeSeries/SsaChangePointDetector.cs index ff5cb4a423..e6c72d4399 100644 --- a/src/Microsoft.ML.TimeSeries/SsaChangePointDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SsaChangePointDetector.cs @@ -26,7 +26,7 @@ namespace Microsoft.ML.Transforms.TimeSeries { /// - /// produced by fitting the to an . + /// produced by fitting the to an . /// public sealed class SsaChangePointDetector : SsaAnomalyDetectionBaseWrapper, IStatefulTransformer { @@ -200,8 +200,23 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat } /// - /// The for detecting a signal change through Singular Spectrum Analysis (SSA) of time series. 
+ /// The to predict change points in time series using Singular Spectrum Analysis. /// + /// + /// + /// + /// + /// public sealed class SsaChangePointEstimator : IEstimator { private readonly IHost _host; diff --git a/src/Microsoft.ML.TimeSeries/SsaSpikeDetector.cs b/src/Microsoft.ML.TimeSeries/SsaSpikeDetector.cs index f8af5d99dc..43f40570b1 100644 --- a/src/Microsoft.ML.TimeSeries/SsaSpikeDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SsaSpikeDetector.cs @@ -181,8 +181,23 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat } /// - /// The for detecting a signal spike through Singular Spectrum Analysis (SSA) of time series. + /// The to predict spikes in time series using Singular Spectrum Analysis. /// + /// + /// + /// + /// + /// public sealed class SsaSpikeEstimator : IEstimator { private readonly IHost _host;